├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── LICENSE-SAMPLECODE ├── LICENSE-SUMMARY ├── README.md ├── package-lock.json └── website ├── .gitignore ├── README.md ├── babel.config.js ├── docs ├── benchmarks │ ├── Analyzing │ │ ├── images │ │ │ ├── benchmark-1.png │ │ │ ├── benchmark-10.png │ │ │ ├── benchmark-11.png │ │ │ ├── benchmark-12.png │ │ │ ├── benchmark-13.png │ │ │ ├── benchmark-14.png │ │ │ ├── benchmark-2.png │ │ │ ├── benchmark-3.png │ │ │ ├── benchmark-4.png │ │ │ ├── benchmark-5.png │ │ │ ├── benchmark-6.png │ │ │ ├── benchmark-7.png │ │ │ ├── benchmark-8.png │ │ │ └── benchmark-9.png │ │ ├── read_spark_UI.md │ │ └── retrieve_event_logs.md │ ├── Resources │ │ ├── Benchmark_results.md │ │ └── Utilities.md │ ├── Running │ │ ├── benchmarking_checklist.md │ │ └── setting_up_environment.md │ ├── benchmarking_variables.md │ ├── img1.png │ ├── img2.png │ ├── introduction.md │ └── price_performance.md ├── bestpractices │ ├── Applications │ │ ├── HBase │ │ │ ├── best_practice.md │ │ │ ├── best_practice_hdfs.md │ │ │ ├── best_practice_s3.md │ │ │ ├── data_integrity.md │ │ │ ├── data_migration.md │ │ │ ├── img │ │ │ │ ├── hbase_replication_cross.png │ │ │ │ ├── hbase_replication_oneway.png │ │ │ │ ├── hbase_replication_simple.png │ │ │ │ ├── hbase_s3_replication.png │ │ │ │ ├── management_avg_payload.png │ │ │ │ ├── observability_grafana.png │ │ │ │ └── observabilty_webui.png │ │ │ ├── introduction.md │ │ │ ├── management.md │ │ │ ├── observability.md │ │ │ ├── performance_tests.md │ │ │ ├── scripts │ │ │ │ ├── hbase-snapshot-export.sh │ │ │ │ └── hbase-snapshot-import.sh │ │ │ └── security.md │ │ ├── Hadoop │ │ │ ├── img │ │ │ │ ├── emr_console_events.png │ │ │ │ ├── img.png │ │ │ │ ├── img2.png │ │ │ │ └── three.png │ │ │ ├── introduction.md │ │ │ ├── scripts │ │ │ │ ├── emr-6-ba-yarn_docker_gpu.sh │ │ │ │ ├── emr-6-yarn_docker_gpu.yaml │ │ │ │ ├── yarn_labels_scaling.sh │ │ │ │ └── yarn_labels_scaling.yaml │ │ │ ├── yarn_docker_gpu.md │ │ │ ├── yarn_labels_scaling.md │ │ │ └── yarn_node_ resilience.md │ │ ├── Hive │ │ │ ├── best_practices.md │ │ │ └── introduction.md │ │ └── Spark │ │ │ ├── best_practices.md │ │ │ ├── data_quality.md │ │ │ ├── data_skew.md │ │ │ ├── images │ │ │ ├── spark-bp-1.png │ │ │ ├── spark-bp-10.png │ │ │ ├── spark-bp-11.png │ │ │ ├── spark-bp-12.png │ │ │ ├── spark-bp-13.png │ │ │ ├── spark-bp-14.png │ │ │ ├── spark-bp-15.png │ │ │ ├── spark-bp-16.png │ │ │ ├── spark-bp-17.png │ │ │ ├── spark-bp-18.png │ │ │ ├── spark-bp-19.png │ │ │ ├── spark-bp-2.png │ │ │ ├── spark-bp-20.png │ │ │ ├── spark-bp-21.png │ │ │ ├── spark-bp-22.png │ │ │ ├── spark-bp-23.png │ │ │ ├── spark-bp-24.png │ │ │ ├── spark-bp-25.png │ │ │ ├── spark-bp-26.png │ │ │ ├── spark-bp-27.png │ │ │ ├── spark-bp-28.png │ │ │ ├── spark-bp-29.png │ │ │ ├── spark-bp-3.png │ │ │ ├── spark-bp-30.png │ │ │ ├── spark-bp-31.png │ │ │ ├── spark-bp-32.png │ │ │ ├── spark-bp-33.png │ │ │ ├── spark-bp-4.png │ │ │ ├── spark-bp-5.png │ │ │ ├── spark-bp-6.png │ │ │ ├── spark-bp-7.png │ │ │ ├── spark-bp-8.png │ │ │ ├── spark-bp-9.png │ │ │ ├── spark-bp-range-join-after.png │ │ │ ├── spark-bp-range-join-before.png │ │ │ ├── spark-tt-1.png │ │ │ └── spark-tt-2.png │ │ │ ├── introduction.md │ │ │ ├── joins.md │ │ │ ├── observability.md │ │ │ ├── performance.md │ │ │ ├── thrift.md │ │ │ └── troubleshooting.md │ ├── Cost Optimizations │ │ ├── Introduction.md │ │ ├── best_practices.md │ │ ├── images │ │ │ ├── bp-1.png │ │ │ ├── bp-10.png │ │ │ ├── bp-11.png │ │ │ ├── bp-2.png │ │ │ ├── bp-3.png │ │ │ ├── bp-4.png 
│ │ │ ├── bp-5.png │ │ │ ├── bp-6.png │ │ │ ├── bp-7.png │ │ │ ├── bp-8.png │ │ │ ├── bp-9.png │ │ │ ├── intro-1.png │ │ │ ├── intro-10.png │ │ │ ├── intro-11.png │ │ │ ├── intro-2.png │ │ │ ├── intro-3.png │ │ │ ├── intro-4.png │ │ │ ├── intro-5.png │ │ │ ├── intro-6.png │ │ │ ├── intro-7.png │ │ │ ├── intro-8.png │ │ │ ├── intro-9.png │ │ │ ├── mru-1.png │ │ │ └── mru-2.png │ │ └── maximizing-resource-utilization.md │ ├── Features │ │ ├── EMRFS │ │ │ ├── Assets │ │ │ │ └── table.png │ │ │ ├── aimd.md │ │ │ └── images │ │ │ │ ├── pic1.png │ │ │ │ └── pic2.png │ │ ├── Managed Scaling │ │ │ ├── best_practices.md │ │ │ ├── images │ │ │ │ ├── bp-1.png │ │ │ │ ├── bp-3.png │ │ │ │ └── ms-metrics.png │ │ │ └── troubleshooting.md │ │ └── Spot Usage │ │ │ └── best_practices.md │ ├── Observability │ │ ├── Assets │ │ │ ├── emr-cw_dashboard.sh │ │ │ ├── preview_1.png │ │ │ └── preview_2.png │ │ ├── best_practices.md │ │ └── intro.md │ ├── Reliability │ │ ├── best_practices.md │ │ ├── images │ │ │ ├── bp-1.png │ │ │ ├── bp-2.png │ │ │ ├── bp-3.png │ │ │ ├── bp-5.png │ │ │ └── bp-6.png │ │ └── introduction.md │ ├── Security │ │ ├── best_practices.md │ │ └── introduction.md │ ├── Troubleshooting │ │ ├── Troubleshooting EMR.md │ │ └── images │ │ │ ├── AmazonQ.png │ │ │ ├── CWagent_cpu_graph.png │ │ │ ├── CWagent_cpu_metric_list.png │ │ │ ├── CWagent_cpu_namespace.png │ │ │ ├── CWagent_disk_graph.png │ │ │ ├── CWagent_disk_namespace.png │ │ │ ├── CWagent_memory_graph.png │ │ │ ├── CWagent_memory_namespace.png │ │ │ ├── application_container_log_location.png │ │ │ ├── application_master_log_location.png │ │ │ ├── datanode_log_location.png │ │ │ ├── hdfs_fsadmin.png │ │ │ ├── hdfs_fsck.png │ │ │ ├── instance_state_log_location.png │ │ │ ├── iostat.png │ │ │ ├── iostat_output.png │ │ │ ├── namnode_log_location.png │ │ │ ├── nodemanager_log_location.htm │ │ │ ├── nodemanager_log_location.png │ │ │ ├── resourcemanager_log_location.png │ │ │ └── uptime.png │ └── introduction.md ├── migration │ └── introduction.md └── utilities │ ├── assets │ └── emr_advisor.png │ └── introduction.md ├── docusaurus.config.js ├── package-lock.json ├── package.json ├── sidebars.js ├── src ├── components │ └── HomepageFeatures │ │ ├── index.js │ │ └── styles.module.css ├── css │ └── custom.css ├── pages │ ├── index.js │ ├── index.module.css │ └── markdown-page.md └── theme │ └── SearchBar │ ├── SearchBar.js │ └── index.js └── static ├── .nojekyll └── img ├── AWS_logo_RGB.png ├── benchmark.svg ├── best_practices.svg └── utilities.svg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | #IDE 34 | .idea/ 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Scrapy stuff: 56 | .scrapy 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # IPython Notebook 62 | .ipynb_checkpoints 63 | 64 | # pyenv 65 | .python-version 66 | 67 | # virtualenv 68 | venv/ 69 | ENV/ 70 | 71 | # MkDocs documentation 72 | site/ 73 | .DS_Store 74 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 
45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE-SAMPLECODE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /LICENSE-SUMMARY: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | The documentation is made available under the Creative Commons Attribution-ShareAlike 4.0 International License. See the LICENSE file. 4 | 5 | The sample code within this documentation is made available under the MIT-0 license. See the LICENSE-SAMPLECODE file. 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon EMR on Amazon Best Practices 2 | 3 | A best practices guide for submitting spark applications, integration with hive metastore, security, storage options, debugging options and performance considerations.. 4 | 5 | Return to [Live Docs](https://aws.github.io/aws-emr-best-practices/). 6 | 7 | ## License Summary 8 | 9 | The documentation is made available under the Creative Commons Attribution-ShareAlike 4.0 International License. See the LICENSE file. 10 | 11 | The sample code within this documentation is made available under the MIT-0 license. See the LICENSE-SAMPLECODE file. 
12 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "aws-emr-best-practices", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": {} 6 | } 7 | -------------------------------------------------------------------------------- /website/.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | /node_modules 3 | 4 | # Production 5 | /build 6 | 7 | # Generated files 8 | .docusaurus 9 | .cache-loader 10 | 11 | # Misc 12 | .DS_Store 13 | .env.local 14 | .env.development.local 15 | .env.test.local 16 | .env.production.local 17 | 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | -------------------------------------------------------------------------------- /website/README.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | This website is built using [Docusaurus](https://docusaurus.io/), a modern static website generator. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | cd website 9 | npm install 10 | ``` 11 | 12 | ## Local Development 13 | 14 | ```bash 15 | npm run start 16 | ``` 17 | 18 | This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server. 19 | 20 | ## Deployment 21 | 22 | Using SSH: 23 | 24 | ```bash 25 | USE_SSH=true npm run docusaurus deploy 26 | ``` 27 | 28 | Not using SSH: 29 | 30 | ```bash 31 | GIT_USER= npm run docusaurus deploy 32 | ``` 33 | 34 | If you are using GitHub pages for hosting, this command is a convenient way to build the website and push to the `gh-pages` branch. 
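If you only need the static files without pushing them anywhere, you can run a production build locally first. This is a minimal sketch, assuming the default Docusaurus `build` and `serve` scripts from the standard scaffold are present in `package.json`:

```bash
# Generate the static site into the build/ directory (assumes the default "build" script)
npm run build

# Optionally preview the production build locally (assumes the default "serve" script)
npm run serve
```

The contents of `build/` can then be served by any static hosting service.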
35 | -------------------------------------------------------------------------------- /website/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 3 | }; 4 | -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-1.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-10.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-11.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-12.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-13.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-14.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-2.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-3.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-4.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-4.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-5.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-6.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-7.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-8.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-9.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/retrieve_event_logs.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 3 3 | sidebar_label: Retrieving Spark Event Logs 4 | --- 5 | 6 | # Retrieve Spark Event Logs 7 | 8 | When you want to analyze the performance of your workloads, you’ll typically need to check the Spark Web UI to identify areas of improvement or just to detect events that are de-gradating the performance in your application. The Spark Web UI uses the Event Logs that are generated by each job running in your cluster to provide detailed information about Jobs, Stages and Tasks of your application that provides aggregated metrics that can help you to troubleshoot performance issues. 9 | 10 | These files are extremely portable, as they can be collected across different engines or environments and stored in the same Spark History Server to have a single interface where you can review results of different benchmark results across different environment or cloud providers. 
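For example, to review runs from two different environments in a single Spark History Server, you can copy the exported event log files into the directory the history server reads from (`spark.history.fs.logDirectory`). The snippet below is a minimal sketch: the S3 bucket and application ID are placeholders, and the target path is the default EMR location described in the next paragraph.

```bash
# Download an event log that was exported from another cluster or environment
# (bucket and file names are placeholders)
aws s3 cp s3://my-benchmark-results/eventlogs/application_1694206676971_0002 .

# Copy it into the directory read by the Spark History Server on the EMR primary node
hdfs dfs -put application_1694206676971_0002 /var/log/spark/apps/
```

After a refresh, the copied application appears in the Spark History Server UI alongside the jobs that ran on the cluster itself.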
11 | 12 | When using Amazon EMR, the Spark Event logs are enabled by default and are automatically stored on the HDFS of the cluster where the job was running under the HDFS path `/var/log/spark/apps/` 13 | 14 | ```bash 15 | $ hdfs dfs -ls -R /var/log/spark/apps/ 16 | -rw-rw---- 1 hadoop spark 408384 2023-09-08 21:00 /var/log/spark/apps/application_1694206676971_0001 17 | ``` 18 | 19 | If you have Event Logs coming from a different environment or cluster, you can easily store them in this folder, and the Spark Web History Server will automatically pick them and you’ll be able to review the information of the job on the Spark History Server. 20 | 21 | As alternative, if you want to export the Event Logs from a running cluster, you can also download them manually from the Spark Web History server from the main page as shown in the image below. 22 | 23 | ![Benchmark - 1](images/benchmark-1.png) 24 | 25 | Finally, if you’re using on premise cluster or any third-party Spark environment, you can automatically enable the Spark Event logs using the following Spark configurations: 26 | 27 | * **spark.eventLog.enabled** (Boolean) Determine if you want to enable or disable event logs collection. False by default 28 | * **spark.eventLog.dir** (String) Location where to store the event logs. Can be an Object Store as Amazon S3, Azure Filesystem, or any path recognized by the Hadoop Filesystem API (e.g. HDFS, Local Filesystem, etc.) 29 | 30 | Below an example to manually enable the Spark event logs in your Spark application. 31 | 32 | ```bash 33 | spark-submit \ 34 | --name "Example App" \ 35 | --conf spark.eventLog.enabled=true \ 36 | --conf spark.eventLog.dir=hdfs:///tmp/spark \ 37 | ... 38 | ``` 39 | -------------------------------------------------------------------------------- /website/docs/benchmarks/Resources/Benchmark_results.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 4 3 | sidebar_label: Benchmark Results 4 | --- 5 | 6 | # Benchmark Results 7 | 8 | ## Amazon EMR 9 | 10 | ### Spark 11 | 12 | * EMR 6.10: https://aws.amazon.com/blogs/big-data/amazon-emr-on-eks-widens-the-performance-gap-run-apache-spark-workloads-5-37-times-faster-and-at-4-3-times-lower-cost/ 13 | * EMR 6.9: https://aws.amazon.com/blogs/big-data/run-apache-spark-workloads-3-5-times-faster-with-amazon-emr-6-9/ 14 | * EMR 6.5: https://aws.amazon.com/blogs/big-data/amazon-emr-on-amazon-eks-provides-up-to-61-lower-costs-and-up-to-68-performance-improvement-for-spark-workloads/ 15 | 16 | ### Graviton 17 | 18 | #### EMR Graviton 2 19 | 20 | * https://aws.amazon.com/blogs/big-data/achieve-up-to-27-better-price-performance-for-spark-workloads-with-aws-graviton2-on-amazon-emr-serverless/ 21 | 22 | #### EMR Graviton 3 23 | 24 | * EMR on EKS - https://aws.amazon.com/blogs/big-data/amazon-emr-on-eks-gets-up-to-19-performance-boost-running-on-aws-graviton3-processors-vs-graviton2/ 25 | * EMR on EC2 - https://aws.amazon.com/blogs/big-data/amazon-emr-launches-support-for-amazon-ec2-c7g-graviton3-instances-to-improve-cost-performance-for-spark-workloads-by-7-13/ 26 | 27 | ### Intel 28 | 29 | * EMR Intel (C6i, M6i, I4i, R6i, and R6id): https://aws.amazon.com/blogs/big-data/amazon-emr-launches-support-for-amazon-ec2-c6i-m6i-i4i-r6i-and-r6id-instances-to-improve-cost-performance-for-spark-workloads-by-6-33/ 30 | 31 | ### AMD 32 | 33 | * EMR AMD (m6a, r6a): 
https://aws.amazon.com/blogs/big-data/amazon-emr-launches-support-for-amazon-ec2-m6a-r6a-instances-to-improve-cost-performance-for-spark-workloads-by-15-50/ 34 | 35 | ### Managed Scaling 36 | 37 | * Managed Scaling Improvements: https://aws.amazon.com/blogs/big-data/reduce-amazon-emr-cluster-costs-by-up-to-19-with-new-enhancements-in-amazon-emr-managed-scaling/ 38 | 39 | ### EMR on EKS 40 | 41 | * EMR on EKS vs OSS: https://aws.amazon.com/blogs/big-data/amazon-emr-on-amazon-eks-provides-up-to-61-lower-costs-and-up-to-68-performance-improvement-for-spark-workloads/ 42 | 43 | ### Hive 44 | 45 | * Hive Rename Feature: https://aws.amazon.com/blogs/big-data/up-to-15-times-improvement-in-hive-write-performance-with-the-amazon-emr-hive-zero-rename-feature/ 46 | 47 | ### Customer Examples 48 | 49 | * EMR Serverless: https://aws.amazon.com/blogs/big-data/godaddy-benchmarking-results-in-up-to-24-better-price-performance-for-their-spark-workloads-with-aws-graviton2-on-amazon-emr-serverless/ 50 | 51 | ## Amazon Athena 52 | 53 | * Athena V3: https://aws.amazon.com/blogs/big-data/upgrade-to-athena-engine-version-3-to-increase-query-performance-and-access-more-analytics-features/ 54 | * Athena V2: https://aws.amazon.com/blogs/big-data/run-queries-3x-faster-with-up-to-70-cost-savings-on-the-latest-amazon-athena-engine/ 55 | * Athena CBO: https://aws.amazon.com/blogs/big-data/speed-up-queries-with-cost-based-optimizer-in-amazon-athena/ -------------------------------------------------------------------------------- /website/docs/benchmarks/Resources/Utilities.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 4 3 | sidebar_label: Utilities 4 | --- 5 | 6 | # Benchmarking Utilities 7 | 8 | * EMR Spark Benchmark: https://github.com/aws-samples/emr-spark-benchmark 9 | * EMR on EKS Benchmark: https://github.com/aws-samples/emr-on-eks-benchmark 10 | -------------------------------------------------------------------------------- /website/docs/benchmarks/Running/benchmarking_checklist.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 3 3 | sidebar_label: Benchmarking Checklist 4 | --- 5 | 6 | # Benchmarking Checklist 7 | 8 | ## Environment and Infrastructure 9 | 10 | The following checklist assumes you are running benchmarks across deployment models (EC2 vs EKS vs Serverless) or vendors (EMR vs Databricks vs OSS). Comparing at the deployment model or vendor level takes into consideration a number of variables such as runtime performance, scaling and pricing model. 11 | 12 | If running a benchmark for other purposes such as difference in hardware within the same deployment model, items in the checklist will not apply. 13 | 14 | |Checklist | |Notes | 15 | |--- |--- |--- | 16 | |Are all instances On Demand | :black_square_button:|Spot interruptions are unpredictable and impacts price-performance. Only use spot when taking into consideration how your benchmark handles spot interruptions and getting spot capacity. Deployment models EMR on EC2 have product differentiators that select instances with are most likely to not get interrupted. | 17 | |Are all instances the same family, size and generation |:black_square_button: |The total amount of compute (vCPU and Memory) should be consistent across benchmark runs. Compute will determine the performance of the application. Additionally, instances can vary in network performance. 
Additionally, if using Karpenter or Instancefleet, you should ensure the set of instances provided are the same. Note that depending on when the job is submitted, your results may vary | 18 | |If cluster scaling is enabled, does each deployment model have the same scaling configurations. (min, max) |:black_square_button: |The efficiency of scaling between deployment models and vendors can differ but the configurations as it relates to compute should be consistent | 19 | |Is the EMR cluster or image using the latest EMR version? |:black_square_button: |The latest versions of EMR will contain the best runtime performance | 20 | |Are the Application versions the same across deployment models, OSS and vendors? |:black_square_button: |Spark versions should be the same or the latest version that's offered | 21 | |Is the same data catalog being used across benchmarks? |:black_square_button: |Performance between local and remote hivemetastore and glue data catalog can differ | 22 | |Is the infrastructure being deployed in the same AZ? |:black_square_button: |AZ's may have differences in network latency or instance availability. | 23 | |Are the benchmarks starting from the same state and size. For example, cold start vs warm pool and the # of starting instances |:black_square_button: |Initializing compute resources impact price-performance. When comparing benchmarking, ensure applications are starting from the same state | 24 | |Is the amount and type of local disk consistent? |:black_square_button: |Size and type of local disk volumes impact workloads, especially shuffle heavy ones | 25 | |Are the security settings consistent across deployment models ? This includes IAM role, security groups, data and in transit encryption |:black_square_button: |Security configurations such as encryption can impact performance | 26 | |Are network settings consistent across deployment models? |:black_square_button: |This includes VPC endpoints, NAT Gateways, public or private endpoints, or proxies. The flow of network traffic to access storage, catalog or endpoints impacts performance | 27 | |Are there differences in the AMI, bootstrap actions or container Image? |:black_square_button: |This can impact compute initialization as well as job startup. For example, eliminating the need to load a specific library before executing the job | 28 | |Are JDK settings consistent across deployment models |:black_square_button: |We've seen improved performance with JDK17. Ensure the versions are consistent across benchmarks | 29 | 30 | ## Workload 31 | 32 | |Checklist | |Notes | 33 | |--- |--- |--- | 34 | |Is the input and output data the same (size, location, type, structure)? |:black_square_button: |As a best practice, all benchmark runs should point to the same input data set | 35 | |Are the applications being submitted the same? |:black_square_button: |SQL file or application should be the same | 36 | |Are the applications libraries the same? |:black_square_button: |This includes external libraries, python versions, or anything the application requires to run | 37 | |Are the applications parameters the same? |:black_square_button: |These are application specific parameters passed in the job. These should be identical to ensure the same job is running | 38 | |Are the applications configurations the same? 
|:black_square_button: |This refers to Spark configuration settings such as executor size, shuffle partitions or Dynamic Resource Allocation settings | 39 | |Is EMR using EMRFS library to write to S3 |:black_square_button: |To take advantage of EMR's optimized run time, EMRFS (s3://) should be used. s3a is not supported and should only be used in OSS | 40 | |If an Open Table Format (OTF) is being used, is it consistent across benchmarks |:black_square_button: |Using OTF's can improve read, write and processing performance. | 41 | |Is the application running in isolation? |:black_square_button: |Resource contention can impact benchmark results because Spark workloads will run on any resource that is available. A best practice is to run each job independently. Also ensure that if submitting multiple jobs, jobs are submitted in the same sequence or sequentially. | 42 | |Is there any data or library caching that impacts future runs? |:black_square_button: |Generally, the first run will be slower than future runs because of caching. Keep this in mind when determining how many iterations of a run you want to do. Additional runs will negate any impact of caching but has a trade off of cost and time | 43 | |Is the applications JVM settings the same? |:black_square_button: |Performance is different across JDK version. JDK17 has seen to have the best performance. JVM settings also extend to GC settings. | 44 | |Is the applications logging configurations the same? |:black_square_button: |Logging parameters that are not the same such as level (DEBUG, INFO) can impact performance or storage requirements | 45 | |Are the applications being submitted the same way? |:black_square_button: |Ensure the entry point for job submission is the same. There are many ways to submit spark jobs such as EMR APIs, Livy, Airflow, Spark-submit. 
These can result in differences with how jobs are run | -------------------------------------------------------------------------------- /website/docs/benchmarks/Running/setting_up_environment.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 3 3 | sidebar_label: Setting Up Your Environment 4 | --- 5 | 6 | # Setting up the Benchmark Environment 7 | 8 | ## EMR on EC2 9 | 10 | * Getting started guide: [https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-gs.html](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-gs.html) 11 | * Benchmark guide: [https://github.com/aws-samples/emr-spark-benchmark](https://github.com/aws-samples/emr-spark-benchmark) 12 | 13 | ## EMR on EKS 14 | 15 | * Getting started guide: [https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up.html](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up.html) 16 | * Benchmark guide: [https://github.com/aws-samples/emr-on-eks-benchmark/tree/main](https://github.com/aws-samples/emr-on-eks-benchmark/tree/main) 17 | 18 | ## EMR Serverless 19 | 20 | * Getting started guide: [https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/getting-started.html](https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/getting-started.html) 21 | * Benchmark guide: [https://github.com/aws-samples/emr-spark-benchmark](https://github.com/aws-samples/emr-spark-benchmark) -------------------------------------------------------------------------------- /website/docs/benchmarks/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/img1.png -------------------------------------------------------------------------------- /website/docs/benchmarks/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/img2.png -------------------------------------------------------------------------------- /website/docs/benchmarks/introduction.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | sidebar_label: Benchmarks 4 | --- 5 | 6 | # Benchmarks 7 | 8 | The purpose of this guide is to provide a methodology for running Spark benchmarks on EMR. By following this guide, you will be able to identify the lowest price-performance option for running Spark workloads, considering various variables such as engine type (EMR, OSS), deployment models (EC2, EKS, Serverless), or hardware options (M, C, R, family). 9 | 10 | The focus of this guide is on price-performance. Other considerations, such as features, user experience, or compatibility with other services, are out of scope. However, it's essential to evaluate these aspects based on your customers' use cases and needs. 
11 | -------------------------------------------------------------------------------- /website/docs/benchmarks/price_performance.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 2 3 | sidebar_label: Price-Performance 4 | --- 5 | 6 | # Price Performance 7 | 8 | In the scope of this tutorial, "price-performance" signifies the monetary expense associated with executing a given workload while maintaining a specific degree of performance, expressed in terms of execution duration (seconds). Evaluating price-performance plays a vital role in understanding the impact of factors that are not easily quantifiable, such as deployment architectures, competitive offerings, container allocation strategies, and processing engines. 9 | 10 | For variables that are within our control, such as infrastructure sizing or application settings, ensuring uniformity among all benchmarks is indispensable for accurate comparisons. 11 | 12 | The following examples highlight the importance of price-performance. 13 | 14 | **Example 1:** Customer wants to compare Open Source Software (OSS) Spark vs EMR Spark with different cluster sizes 15 | 16 | | |Cluster #1 |Cluster #2 | 17 | |--- |--- |--- | 18 | |Runtime (s) |12 |30 | 19 | |# of nodes |50 |10 | 20 | |Engine |OSS Spark Runtime |EMR Spark Runtime | 21 | |Cost ($) |600 |300 | 22 | 23 | In the above example, Cluster #1 is running OSS Spark and completes in 12s with 50 nodes, while EMR Spark completes in 30s with 10 nodes. However, when we look at total cost, Cluster #2’s total cost is lower than Cluster #1’s, making it the better option. Comparing cost in relation to the work being done accounts for the differences in # of nodes and engine. Assuming performance is linear, let’s look at what happens when we increase the # of nodes in Cluster #2. 24 | 25 | **Example 2:** Customer wants to compare Open Source Software (OSS) Spark vs EMR Spark with the same cluster sizes 26 | 27 | | |Cluster #1 |Cluster #2 | 28 | |--- |--- |--- | 29 | |Runtime (s) |12 |6 | 30 | |# of nodes |50 |50 | 31 | |Engine |OSS Spark Runtime |EMR Spark Runtime | 32 | |Cost ($) |600 |300 | 33 | 34 | After increasing the # of nodes to be the same across both clusters, runtime is reduced to 6 seconds on Cluster #2 and cost remains the same at $300. Our conclusion from the first example remains the same: Cluster #2 is the best option from a price-performance perspective. 35 | 36 | It’s important to note that price-performance is not always linear. This is often seen when workloads have data skew. In these cases, adding more compute does not reduce runtime proportionally and adds cost. 37 | 38 | **Example 3:** Same workload across different # of nodes - data skew 39 | 40 | | |Run #1 |Run #2 | 41 | |--- |--- |--- | 42 | |Runtime (s) |100 |75 | 43 | |# of nodes |10 |20 | 44 | |Engine |EMR Spark Runtime |EMR Spark Runtime | 45 | |Cost ($) |1000 |1500 | 46 | 47 | In the above example, performance is not linear. While runtime was reduced to 75s, overall cost increased. In these cases, it’s important to ensure the # of nodes is the same for both comparisons. 48 | 49 | Another scenario where price-performance is useful is when comparing different pricing models or vendors.
Take the example below: 50 | 51 | **Example 4:** Same workload across different pricing models 52 | 53 | | |EMR Spark Runtime |Vendor | 54 | |--- |--- |--- | 55 | |Runtime (s) |50 |40 | 56 | |# of nodes |10 |10 | 57 | |$/s |1 |1.5 | 58 | |Cost ($) |500 |600 | 59 | 60 | In the above example, the same workload on vendor runs in 40s, while EMR runs in 50s. While vendor may seem faster, when we factor in price-performance, we see total cost is lower with EMR. If runtime is a key requirement, we can increase the # of nodes in relation to performance as illustrated in example 5. 61 | 62 | **Example 5:** Same workload across different pricing models with different # of nodes 63 | 64 | | |EMR Spark Runtime |EMR Spark Runtime linear performance |Vendor | 65 | |--- |--- |--- |--- | 66 | |Runtime (s) |50 |25 |40 | 67 | |# of nodes |10 |20 |10 | 68 | |$/s |1 |1 |1.5 | 69 | |Cost ($) |500 |500 |600 | 70 | 71 | The goal with benchmarking should always be to have like-for-like comparisons. This is especially true for factors such as application configuration settings such as executor sizes, input and output dataset, cluster size and instances. However, factors like vendor/aws pricing model, engine optimizations, and schedulers cannot be made the same. As such, it’s important to use price-performance as a key factor. 72 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/best_practice.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 2 3 | sidebar_label: Best Practices 4 | --- 5 | 6 | # Best Practice 7 | 8 | The following section describes some general HBase tuning and best practice that can be applied both when using HDFS or Amazon S3 as storage layer for HBase. 9 | 10 | ## EMR Multi Master 11 | 12 | When working with HBase on Amazon EMR, it is good practice to enable the [EMR Multi Master](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-ha.html) feature that allows you to launch three EMR master nodes. This functionality allows the HBase cluster to tolerate impairments that might occur if a single master goes down. 13 | 14 | Nevertheless, this functionality is highly recommended both when using HDFS or S3 as storage layer for your HBase cluster. Enabling this, allows you to serve HBase requests (both writes and reads) in case of a master failure. Please note that if you launch the EMR cluster with a single master and this node is terminated for any reason (e.g. human error, hardware impairment, etc.), it will not be possible to recover any data from the HDFS storage on the cluster as the HDFS metadata will be lost after the termination of the EMR master. 15 | 16 | ## EMR Termination Protection 17 | 18 | [Using termination protection](https://docs.aws.amazon.com/emr/latest/ManagementGuide/UsingEMR_TerminationProtection.html) in Amazon EMR is highly recommended both when using HDFS or Amazon S3 for your HBase cluster. 19 | 20 | Amazon EMR periodically checks the Apache Hadoop YARN status of nodes running on CORE and TASK nodes in a cluster. The health status is reported by the [YARN NodeManager health checker service](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/NodeManager.html#Health_checker_service). If a node reports an UNHEALTHY status, it will not be possible to allocate YARN containers to it until it becomes healthy again. A common reason for unhealthy nodes is that disk utilization goes above 90%. 
If the node stays in this state for more than 45 minutes and Termination Protection is disabled, the EMR service terminates the node and launches a new one as a replacement. 21 | 22 | When a node is in an UNHEALTHY state and termination protection is enabled, the node will not be terminated and replaced by the EMR service. This prevents the loss of HDFS data blocks when the disk utilization of a CORE node goes above 90%, thereby preventing data integrity issues in HBase tables. 23 | 24 | ## HBase RPC Listeners 25 | 26 | One of the most important parameters to configure in your HBase cluster is the number of active RPC listeners defined per Region Server. Tuning the parameter *`hbase.regionserver.handler.count`* (default: 30) can increase the number of requests that you can concurrently serve in each region server, and so the overall throughput of your cluster. To modify the default number of RPC listeners, you can use the following EMR configuration: 27 | 28 | ```json 29 | [ 30 | { 31 | "Classification": "hbase-site", 32 | "Properties": { 33 | "hbase.regionserver.handler.count": "120" 34 | } 35 | } 36 | ] 37 | ``` 38 | 39 | However, please be mindful that this parameter should be tuned according to the average size of data stored in or retrieved from your tables. As a rule of thumb, you should increase this number when the payload of your data is lower than 100KB, and stick to the default or decrease it when the payload size is `>= 1MB`. For small payloads (`<= 1KB`), you can push this value up to 4 times the number of vCPUs available in your Region Servers. 40 | 41 | To determine the average payload of data stored in your tables, see [Determine average row size](./management#determine-average-row-size). 42 | 43 | ## HBase Heap Memory 44 | 45 | On Amazon EMR, when you install HBase, the memory is split evenly between the Hadoop YARN and HBase services. For a list of the default memory settings used per instance type, see [Task configuration](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-task-config.html#emr-hadoop-task-jvm) in the EMR documentation. 46 | 47 | However, when working with HBase it might be convenient to override the default parameters and increase the available memory for the HBase services. This might be required if we want to host a higher number of Regions per Region Server. To modify the default memory, you should modify the HBase environment variables defined in the *hbase-env* classification, which define the default heap memory available for each HBase service. The following list highlights the variables to modify for each service: 48 | 49 | * **`HBASE_MASTER_OPTS`** JVM options for the HBase master 50 | * **`HBASE_REGIONSERVER_OPTS`** JVM options for the HBase Region Servers 51 | * **`HBASE_THRIFT_OPTS`** JVM options for the HBase Thrift service 52 | * **`HBASE_REST_OPTS`** JVM options for the HBase REST service 53 | 54 | It’s best practice to modify the memory of each component using its own dedicated variable, rather than using the more general **HBASE_OPTS**, which is used to apply common JVM options across all HBase services. 55 | 56 | To override the default memory, we should specify the following Java parameter in the corresponding environment variable: `-Xmx[g|G|m|M|k|K]`. Please also make sure to add a self-reference in the environment variable to avoid losing other parameters that are set in the script.
Besides, if we modify the default HBase memory, we should also lower accordingly the memory specified for the YARN Node Manager service to avoid incurring in Out Of Memory errors. 57 | 58 | Please note that either if you’re just installing HBase, it might still be convenient to keep some memory reserved for YARN. This can be useful as some HBase utility runs on YARN (e.g. HBase export utility). 59 | 60 | The example below highlights the configurations that should be modified in an EMR cluster while tuning the HBase heap memory. Please make sure that the sum of the YARN and HBase memory is not greater than the memory available on the node. Also make sure to keep at least 2GB of available memory for the Operating System and other internal components running on the node. 61 | 62 | ```json 63 | [ 64 | { 65 | "Classification": "yarn-site", 66 | "Properties": { 67 | "yarn.scheduler.maximum-allocation-mb": "MAX_MEMORY_BYTES", 68 | "yarn.nodemanager.resource.memory-mb": "MAX_MEMORY_BYTES" 69 | } 70 | }, 71 | { 72 | "Classification": "hbase-env", 73 | "Configurations": [ 74 | { 75 | "Classification": "export", 76 | "Properties": { 77 | "HBASE_MASTER_OPTS": "\"$HBASE_MASTER_OPTS -Xmx30g\"", 78 | "HBASE_REGIONSERVER_OPTS": "\"$HBASE_REGIONSERVER_OPTS -Xmx30g\"" 79 | } 80 | } 81 | ], 82 | "Properties": {} 83 | } 84 | ] 85 | ``` 86 | 87 | ## HBase MultiWal Provider 88 | 89 | By default, HBase uses a single [Write Ahead Log](https://hbase.apache.org/book.html#wal) file (WAL) per Region Server to persist mutate operations that are performed against Regions hosted on the node. This implementation can be a bottleneck as WALs are stored on the HDFS and each operation is performed sequentially against the same file. 90 | 91 | In write intensive clusters, you might increase the HBase throughput by adopting a multiwal strategy. In this scenario is recommended to have multiple disks attached to the node to get the most out of this feature. This configuration can be enabled specifying the following properties while launching an EMR cluster: 92 | 93 | ```json 94 | [ 95 | { 96 | "Classification": "hbase-site", 97 | "Properties": { 98 | "hbase.wal.provider": "multiwal", 99 | "hbase.wal.regiongrouping.numgroups": "2" 100 | } 101 | } 102 | ] 103 | ``` 104 | 105 | The parameter *`hbase.wal.regiongrouping.numgroups`* determines the number of WALs that will be created per Region Server. By default, this parameter is set to two, but you can tune this parameter accordingly to the number of disks attached to the node for better performance. 106 | 107 | ## HBase OffHeap Caching 108 | 109 | The following example, shows how to enable OffHeap memory caching on HBase. This configuration, can be used both when using Amazon S3 or HDFS as storage layer. The example below sets an offheap memory of 5GB while the bucket cache allocated for this memory will be 4GB. 110 | 111 | ```json 112 | [ 113 | { 114 | "Classification": "hbase-env", 115 | "Properties": {}, 116 | "Configurations": [ 117 | { 118 | "Classification": "export", 119 | "Properties": { 120 | "HBASE_OFFHEAPSIZE": "5G" 121 | }, 122 | "Configurations": [] 123 | } 124 | ] 125 | }, 126 | { 127 | "Classification": "hbase-site", 128 | "Properties": { 129 | "hbase.bucketcache.size": "4096", 130 | "hbase.bucketcache.ioengine": "offheap" 131 | } 132 | } 133 | ] 134 | ``` 135 | 136 | In order to use the configured cache, make sure to enable the following configurations in the tables you want to cache. 
For example, from the HBase shell: 137 | 138 | ```bash 139 | # creating new table t with column family info0 140 | hbase> create 't', {NAME => 'info0', CONFIGURATION => {CACHE_DATA_IN_L1 => 'true'}} 141 | 142 | # modify existing table t with column family info0 143 | hbase> alter 't', {NAME => 'info0', CONFIGURATION => {CACHE_DATA_IN_L1 => 'true'}} 144 | ``` 145 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/best_practice_s3.md: -------------------------------------------------------------------------------- 1 | # Best Practices for Amazon S3 2 | 3 | This section highlights some of the features / best practice that you can use to improve the performance in your cluster when using Amazon S3 as storage layer for HBase. For additional best practice / tuning parameters, see [Apache HBase on Amazon S3 configuration properties](https://docs.aws.amazon.com/whitepapers/latest/migrate-apache-hbase-s3/identifying-apache-hbase-and-emrfs-tuning-options.html). 4 | 5 | ## Bucket Cache 6 | 7 | When using Amazon S3 as storage layer for HBase, EMR configures the service to use a Bucket Cache for persisting data blocks on the L2 Cache of each region server. The default cache implementation used for Amazon S3 persists blocks on the local volumes of the node as defined by the *`hbase.bucketcache.ioengine`* property. This parameter defines the location of the files used to store the cached data. For example, the following snippet shows the default configurations for a node with 4 EBS volumes attached. 8 | 9 | ```xml 10 | 11 | hbase.bucketcache.ioengine 12 | files:/mnt1/hbase/bucketcache,/mnt2/hbase/bucketcache,/mnt3/hbase/bucketcache 13 | 14 | ``` 15 | 16 | By default, EMR configures N - 1 volumes for caching data, so in our example only 3 volumes out of 4 will be used for the cache. This feature can be useful to persist HOT data on the local disks of the cluster to reduce the latency introduced when accessing HFiles stored on S3. However, by default the cache size is set as 8GB, so you might need to increase it depending on the amount of data you want to store on each node. To modify the default cache value, you can set the following property: 17 | 18 | ``` 19 | hbase.bucketcache.size: 98304 # defined as MB 20 | ``` 21 | 22 | In the above example, we set the cache size for each node to 98GB. In each volume only 32GB (98304 / 3) are used, as the total cache size will be evenly distributed across the volumes defined in the *`hbase.bucketcache.ioengine`*. 23 | 24 | Besides, when using S3 it might be convenient to pre-warm the cache during the region opening to avoid performance degradation when the cache is still not fully initialized. In this case to enable blocks prefetch, you should enable the following configuration. 25 | 26 | ``` 27 | hbase.rs.prefetchblocksonopen: true 28 | ``` 29 | 30 | This configuration can also be set for individual Column Family of an HBase table. In this case you should specify the configuration through the HBase shell using the following command: 31 | 32 | ``` 33 | hbase> create 'MyTable', { NAME => 'myCF', PREFETCH_BLOCKS_ON_OPEN => 'true' } 34 | ``` 35 | 36 | Finally, in write intensive use cases, it might be useful to also enable the following configurations to automatically persist blocks in the cache as they are written, and to repopulate the cache following a compaction (compaction operations invalidate cache blocks). 
In this case we can set the following additional properties: 37 | 38 | ``` 39 | hbase.rs.cacheblocksonwrite: true 40 | hbase.rs.cachecompactedblocksonwrite: true 41 | ``` 42 | 43 | Below a sample configuration to tune the Bucket Cache in an Amazon EMR cluster: 44 | 45 | ```json 46 | [ 47 | { 48 | "Classification": "hbase-site", 49 | "Properties": { 50 | "hbase.bucketcache.size": "98304", 51 | "hbase.rs.prefetchblocksonopen": "true", 52 | "hbase.rs.cacheblocksonwrite": "true", 53 | "hbase.rs.cachecompactedblocksonwrite": "true" 54 | } 55 | } 56 | ] 57 | ``` 58 | 59 | ## Memstore flush size 60 | 61 | When using Amazon S3 in HBase, it might be convenient to increase the default memstore flush size to avoid performance degradation, or an excessive number of small compaction operations in write intensive clusters. This can be useful if you have manually disabled the [Persistent File Tracking](#persistent-file-tracking) feature that is enabled on EMR greater than 6.2.0 or if you're using an EMR 5.x cluster. 62 | 63 | In this case, you can increase the memstore flush size to 256MB or 512MB (default 128MB). Below an example of how you can change this configuration in an Amazon EMR cluster: 64 | 65 | 66 | ```json 67 | [ 68 | { 69 | "Classification": "hbase-site", 70 | "Properties": { 71 | "hbase.hregion.memstore.flush.size": "268435456" # 256 * 1024 * 1024 72 | } 73 | } 74 | ] 75 | ``` 76 | 77 | ## Region Split Policy 78 | 79 | Depending on the HBase version that you’re using, you will use different region split policies. By default, you’ll have: 80 | 81 | * **HBase 1.x** *`org.apache.hadoop.hbase.regionserver.IncreasingToUpperBoundRegionSplitPolicy`* 82 | * **HBase 2.x** *`org.apache.hadoop.hbase.regionserver.SteppingSplitPolicy`* 83 | 84 | These specific implementations aims to quickly increase the number of regions when you have a fresh new table that wasn’t pre-partitioned. This might be a good strategy for new tables in a cluster. 85 | 86 | However, it might be more convenient for a cluster using S3 as storage layer to use the old split strategy *`org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy`* that performs a split operation only when the overall size of a region goes above a threshold as defined by the parameter: *`hbase.hregion.max.filesize`* (default: 10GB) 87 | 88 | This can help if you want to have more control on the number of regions, as it will allow you to control the growth of the number of regions by a fixed size that you specify. Additionally, this can also be handy in case you’re leveraging Apache Phoenix to query HBase and you have a constant flow of new data. Setting a constant size region split policy will prevent excessive splitting operations. These operations can cause temporary region cache boundaries exceptions while using Phoenix, due to the time required to refresh internal metadata about regions boundaries. This problem might be more frequent when using S3 as storage layer than when using HDFS. 
89 | 90 | Below is an example of how to modify the region split policy on an Amazon EMR cluster: 91 | 92 | ```json 93 | [ 94 | { 95 | "Classification": "hbase-site", 96 | "Properties": { 97 | "hbase.regionserver.region.split.policy": "org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy", 98 | "hbase.hregion.max.filesize": "10737418240" 99 | } 100 | } 101 | ] 102 | ``` 103 | 104 | ## Persistent File Tracking 105 | 106 | When using EMR versions greater than 6.2.0 with Amazon S3 as the storage layer, EMR enables a feature called Persistent File Tracking. This feature is enabled by default and provides performance benefits, as it avoids the HFile rename operations that might delay write operations due to S3 latencies. However, please note that this feature does not support the native [HBase replication](https://hbase.apache.org/book.html#_cluster_replication) feature. So if you want to use replication to implement a highly available setup on Amazon S3, you’ll have to disable it. This applies only to S3 and is not required when using HDFS as the storage layer. 107 | 108 | For more details on this feature, see [Persistent HFile tracking](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hbase-s3.html#emr-hbase-s3-hfile-tracking). 109 | 110 | ## Speed up region assignment / opening / closing 111 | 112 | ### HBase 1.x 113 | 114 | Set the configurations below to speed up region assignment, opening, and closing on HBase 1.x clusters. These configurations disable the use of ZooKeeper for region assignment by setting the property *`hbase.assignment.usezk`* to false. Additionally, you can increase the thread pools the Region Servers use for opening the assigned regions. For Region Servers handling many regions (in the order of thousands), you can set the thread pools to up to 10 times the number of vCPUs available on the Region Server. Below is an example EMR configuration: 115 | 116 | ```json 117 | [ 118 | { 119 | "Classification": "hbase-site", 120 | "Properties": { 121 | "hbase.assignment.usezk": "false", 122 | "hbase.regionserver.executor.openregion.threads": "120", 123 | "hbase.regionserver.executor.closeregion.threads": "120" 124 | } 125 | } 126 | ] 127 | ``` 128 | 129 | ### HBase 2.x 130 | 131 | HBase 2.x introduced a more robust and efficient workflow to manage region transitions, which leverages the ProcedureV2 framework introduced in [HBASE-14614](https://issues.apache.org/jira/browse/HBASE-14614). In this case, it is sufficient to increase the default Region Server thread pools to speed up the initialization of the regions.
132 | 133 | ```json 134 | [ 135 | { 136 | "Classification": "hbase-site", 137 | "Properties": { 138 | "hbase.regionserver.executor.openregion.threads": "120", 139 | "hbase.regionserver.executor.closeregion.threads": "120" 140 | } 141 | } 142 | ] 143 | ``` 144 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/hbase_replication_cross.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/hbase_replication_cross.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/hbase_replication_oneway.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/hbase_replication_oneway.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/hbase_replication_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/hbase_replication_simple.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/hbase_s3_replication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/hbase_s3_replication.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/management_avg_payload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/management_avg_payload.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/observability_grafana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/observability_grafana.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/observabilty_webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/observabilty_webui.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/introduction.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | sidebar_label: Introduction 4 | --- 5 | 6 | # Introduction 7 | 8 | When working with Amazon 
EMR on EC2, you can choose between two deployment options for the underlying storage layer used by HBase: [Hadoop HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) or [Amazon S3](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html). 9 | 10 | Although there are no restrictions on the use of these storage options, they serve different purposes, and both have pros and cons with related performance implications. In this document, we review the main aspects of each storage option. 11 | 12 | ## Which storage layer should I use? 13 | 14 | Typically, to understand which storage layer you should use for your HBase cluster, you must determine your application requirements and decide which of these two main decision drivers matters most: performance or cost. Generally speaking, on a large cluster setup, HDFS provides better performance in most cases, while Amazon S3 provides better cost savings due to the reduced amount of storage required to persist all your data, and is the right option when you want to decouple your storage from compute. 15 | 16 | Using HDFS allows you to achieve the best latency performance. This is true if you need millisecond / sub-millisecond read responses from HBase. You can also achieve similar results using Amazon S3 as the storage layer, but this requires relying on HBase caching features. Depending on your table sizes, this can increase costs when provisioning resources for the cache, as you’ll have to provision more EBS volumes or use bigger instances to cache your data locally on the nodes, thus losing the main advantages of using Amazon S3. This requires fine-tuning HBase to find the right balance between performance and cost for your workload. 17 | 18 | Another common reason to choose HDFS over S3 is a data migration from an on-premises cluster. This is typically recommended as a first migration step, as this solution provides similar performance compared to your existing cluster. You can more easily migrate your infrastructure to the cloud, and later decide if it makes sense to use Amazon S3. 19 | Besides, using HDFS for a data migration can be a requirement before moving to Amazon S3. Specifically, this can help to optimize the underlying layout of your HBase tables if they have a considerable number of small HBase regions and you want to merge them. This operation can be performed more quickly on an HDFS cluster, and you can later migrate the data to Amazon S3. For more details, see the sections [Reduce number of Regions](./management#reduce-number-of-regions) and [Data Migration](./data_migration). 20 | 21 | Finally, using HDFS is also the right choice if you have a cluster that is mostly used for write intensive workloads. This is because write intensive clusters are subject to intensive compaction and region splitting operations that are performed internally by HBase to manage the underlying data storage. In these cases, Amazon S3 might not be the right option, because of the data movement that occurs between Amazon S3 and the cluster to perform compaction processes. This increases the time required to perform such operations, impacting overall cluster performance and resulting in higher latencies. 22 | 23 | On the other hand, Amazon S3 is a good option for read-intensive HBase clusters.
One of the use cases where S3 excels is when the most frequently accessed data (read or modified) is also the most recent, while old data is rarely modified. You can use the pre-configured bucket cache to store a hot copy of the most recent data on the local disks of your cluster, thus maintaining a good compromise between costs and performance. For more details, see [Bucket Cache](./best_practice_s3#bucket-cache). 24 | 25 | Another good use case for Amazon S3 is when you have tables that rarely change over time, and you need to serve a large volume of read requests. In this case, you can opt for Amazon S3 in combination with the [EMR HBase read-replica](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hbase-s3.html#emr-hbase-s3-read-replica) feature, to distribute your read requests across multiple clusters. For more details about this approach, see [Data Integrity](./data_integrity#amazon-emr---read-replica). Moreover, Amazon S3 provides stronger SLAs for data durability and availability transparently at the storage level, and it is not impacted by failures of EMR instances. 26 | 27 | Finally, one major benefit of relying on S3 for storage is cost savings. If your cluster incurs significant costs due to a large amount of data stored on EBS volumes, moving to S3 can reduce costs drastically. Moreover, HDFS uses block replication to provide fault tolerance, which increases the footprint of data stored locally in your cluster. In Amazon EMR, the default [HDFS replication](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hdfs-config.html) factor is defined automatically when launching the cluster (or you can override it manually using the [EMR configuration API](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html)). For large table sizes this can drastically increase EBS storage costs, so you might want to leverage S3, where replication is handled natively by the service at a lower cost. 28 | 29 | ## Which instance should I use? 30 | 31 | When talking about hardware requirements for HBase, it is very important to choose the right EC2 instance type when using HDFS as the storage layer, as it might be prohibitive to change it once you have a live production cluster. On the other hand, changing instances for an HBase cluster running on Amazon S3 is much easier, as data is persisted on S3. This allows you to terminate an EMR cluster without losing data and launch a new one using a different instance type. Below you can find some details that can help you choose the right instances based on your use case and workload requirements. 32 | 33 | HBase typically performs better with small instances and when you spread the overall requests across multiple instances. This is because there are limits to the number of HBase regions a single Region Server can handle, and having a huge number of regions on a single node can lead to issues and unexpected behavior. For more details on determining the right number of regions for a specific instance, see the section [Number of HBase Regions](#number-of-hbase-regions). 34 | 35 | Generally speaking, if you want to achieve the best possible performance in your HBase cluster, it’s highly recommended to use EC2 instances equipped with [Instance Store](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html) volumes. This is especially true for write intensive / mixed (50% writes 50% reads) workloads.
For such use cases, if you have significant write traffic, you’ll need disks that can provide a large number of IOPS in order to accommodate all the background operations performed by HBase (compactions, WAL writes). Using storage optimized instances allows you to sustain high volumes of write operations even while HBase is performing compactions or other background operations on disk. Some examples of instance families recommended for such workloads are: 36 | 37 | * [i3](https://aws.amazon.com/ec2/instance-types/i3/) / [i3en](https://aws.amazon.com/ec2/instance-types/i3en/) provide dense SSD storage for data-intensive workloads. They deliver the best performance for write intensive workloads but can be prohibitive depending on the amount of storage you want to use. They are recommended if you want to achieve the best possible performance and cache a significant amount of data in memory. 38 | * [m5d](https://aws.amazon.com/ec2/instance-types/m5/) / [r5d](https://aws.amazon.com/ec2/instance-types/r5/) / [c5d](https://aws.amazon.com/ec2/instance-types/c5/) all provide NVMe SSD disks that deliver high random I/O performance. They can be used in different ways to exploit HBase features. For example, r5d can be used in combination with HBase off-heap caching to keep a significant amount of data cached in fast memory (instead of reading it from disk). On the other hand, c5d comes with a higher proportion of vCPUs compared to memory, so it can be a better match if you need to serve huge volumes of requests on a single region server. 39 | 40 | To decide on the right instance size, it’s important to understand how many regions you’re going to serve on a single region server. As a general rule, however, for large HBase tables it’s recommended to choose an instance type that can provide at least 32GB of memory dedicated to the HBase services (HMaster and Region Servers). Please note that by default Amazon EMR splits the available memory of an instance between the YARN Node Manager and the HBase Region Server. For a list of default memory settings, see [Default values for task configuration settings](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-task-config.html#emr-hadoop-task-jvm). You can always override the default EMR behavior using the EMR Configuration API. For more details, see [Modify Heap Memory](./best_practice#hbase-heap-memory). 41 | 42 | 43 | ## Number of HBase Regions 44 | 45 | As described in the [HBase documentation](https://hbase.apache.org/book.html#ops.capacity.regions.count), you can use the following formula to compute the number of HBase regions that should be hosted on a single region server.
Note that this formula gives only a guideline for the number of regions; you should investigate and experiment with your workload to tune the final number: 46 | 47 | ``` 48 | (REGION_SERVER_MEM_SIZE * MEMSTORE_FRACTION) / (MEMSTORE_SIZE * NUM_COLUMN_FAMILIES) 49 | ``` 50 | 51 | * **REGION_SERVER_MEM_SIZE** Memory allocated for the Region Server, as defined by the parameter -Xmx in *hbase-env.sh* 52 | * **MEMSTORE_FRACTION** Memstore memory fraction, defined by *hbase.regionserver.global.memstore.size* (default 0.4) 53 | * **MEMSTORE_SIZE** Memstore flush size (default 128MB) 54 | * **NUM_COLUMN_FAMILIES** Number of column families defined for the table 55 | 56 | 57 | For example, for a Region Server configured with 32GB of heap memory and hosting a table with a single column family with the default HBase settings, we'll have an ideal allocation of regions equal to: 58 | 59 | ``` 60 | # Number Recommended Regions 61 | (32GB * 0.4) / (128MB * 1) ≈ 100 62 | ``` 63 | 64 | As previously mentioned, this is a recommendation that you can use as a starting point. For example, it is not infrequent to have a region server with 3 to 4 times the recommended value. However, to avoid impacting performance, it’s better not to use these extra regions extensively for write operations, so you avoid heavy GC activity that might degrade performance or, in the worst case, cause failures that force a Region Server restart. 65 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/management.md: -------------------------------------------------------------------------------- 1 | # Management 2 | 3 | This section highlights some commands and best practices that can help you manage your HBase clusters on Amazon EMR. 4 | 5 | ## Create command alias 6 | 7 | If you administer your HBase cluster mainly from the shell of the EMR master, it might be convenient to define a command alias to avoid permission issues you might incur by erroneously typing commands as a different user (e.g. root). 8 | 9 | As a best practice, you should always run HBase commands as the `hbase` user. To do that, you can add the following alias to the `~/.bashrc` profile of the user that you use to administer your cluster (e.g. hadoop) 10 | 11 | ```bash 12 | alias hbase='sudo -u hbase hbase' 13 | ``` 14 | 15 | Once done, you can safely run HBase commands as usual 16 | 17 | ```bash 18 | hbase shell 19 | ``` 20 | 21 | ## Determine average row size 22 | 23 | If you want to determine the average size of a row stored in an HBase table, you can use the following commands to retrieve the payload from a store file of the table. For example: 24 | 25 | ```bash 26 | # Simple notation 27 | hbase hfile -m -f $HBASE_PATH 28 | 29 | # Extended notation 30 | hbase org.apache.hadoop.hbase.io.hfile.HFile -m -f $HBASE_PATH 31 | ``` 32 | 33 | ![Average Payload Size](./img/management_avg_payload.png "Average Payload Size") 34 | 35 | The class `org.apache.hadoop.hbase.io.hfile.HFile` allows you to analyze HBase store files persisted on HDFS or S3. The option `-m` prints the metadata of the analyzed file, which reports the average size (bytes) of the row keys in that particular file, and the average size (bytes) of the values stored in that file.
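For reference, the metadata printed with `-m` contains the two fields used in the estimate below. The following snippet is trimmed and purely illustrative; the S3 path and the values are hypothetical placeholders:

```
reader=s3://my-bucket/hbase/data/default/MyTable/<region>/<cf>/<hfile>,
    compression=none, cacheConf=CacheConfig:disabled,
    avgKeyLen=19, avgValueLen=7,
    entries=1048576, length=134217728
```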
36 | 37 | To get a rough estimate of the average payload of a single row, you can sum the **avgKeyLen** and **avgValueLen** parameters returned by the previous command to get the average size in bytes of a row. For example: 38 | 39 | ``` 40 | # row_avg_size = avgKeyLen + avgValueLen 41 | row_avg_size = 19 + 7 = 26 42 | ``` 43 | 44 | This command can be useful to get a rough estimate of your data payload when you are not sure about it. You can later use this value to fine-tune your cluster (e.g. [increase/decrease RPC Listeners](./best_practice#hbase-rpc-listeners)). 45 | 46 | ## Reduce number of Regions 47 | 48 | The HBase community introduced a utility in the [hbase-tools](https://github.com/apache/hbase-operator-tools/blob/master/hbase-tools/) package that helps reduce the number of regions of tables stored in HBase. This utility is available in the class `org.apache.hbase.RegionsMerger` and can automatically merge regions down to a target count that you define, if you have a high region count in your cluster (e.g. due to a wrong table pre-split, or a high split rate caused by incorrect settings). 49 | 50 | ```bash 51 | # copy the library into the HBase classpath 52 | sudo cp /usr/lib/hbase-operator-tools/hbase-tools-*.jar /usr/lib/hbase/lib/ 53 | 54 | # merge regions (pass the table name and the target number of regions) 55 | hbase org.apache.hbase.RegionsMerger TABLE_NAME TARGET_NUMBER_OF_REGIONS 56 | ``` 57 | 58 | This tool is available in HBase versions >= 2.x.x and should only be used with these versions. 59 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/performance_tests.md: -------------------------------------------------------------------------------- 1 | # Performance Tests 2 | 3 | One of the most important operations to perform before starting to use an HBase cluster is a stress test to verify that the provisioned infrastructure meets the latency and throughput requirements of your applications. 4 | 5 | In this section, we’re going to explore some tools that can help validate the provisioned infrastructure in terms of performance. For optimal results, it’s recommended to set up a monitoring tool as described in the [Observability](./observability#monitoring-hbase) section to collect advanced metrics during the tests. 6 | 7 | ## Evaluation Framework 8 | 9 | Typically, there are different aspects you want to check depending on how your cluster will be used. However, there are two major metrics that are important to define a baseline for the cluster performance: operation throughput (the number of requests we can serve for a specific operation in a given period of time, e.g. GET) and operation latency (the time required to acknowledge a client request). 10 | 11 | It’s very important to baseline these metrics in a production cluster. They will give you hints on when to scale nodes based on client requests during the day, and they can suggest configuration tuning if the cluster is not matching the expected performance. 12 | 13 | Typically, you can benchmark an HBase cluster following the steps below: 14 | 15 | * **Write / data load** This is always the first step in the process, as you should populate some tables with mock data to perform read tests, or simply to evaluate the maximum throughput you can achieve during write operations. For this test, it is important to mimic as much as possible the average payload size of the data that will be ingested in the cluster.
This helps evaluate the number of compactions performed for the ingested volume and observe the performance degradation that you might expect during these operations. In addition, this will also give you an idea of the maximum number of write requests you can serve with the specified cluster topology. 16 | 17 | * **Read / latency / cache** This is the next step to define our baseline. The main aim of this test should be to verify the maximum throughput that the cluster can serve and understand how well you are leveraging the HBase cache to improve response latency. 18 | 19 | As a best practice for running these tests, you can follow these rules: 20 | 21 | * Separate the clients from the HBase cluster. The goal is to collect metrics without having to worry about the resources used in our cluster. So, as a best practice, you should run your client fleet on a separate cluster. 22 | 23 | * If your clients are on a separate cluster, make sure that your fleet is co-located in the same subnet as the cluster. This will improve response latency and avoid extra costs you might incur for data transfer across Availability Zones. 24 | 25 | * Use [EMR Configurations](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html) defined as JSON files stored in an Amazon S3 bucket to launch your test clusters. This will help you more easily export configurations used in your QA environment to production. Moreover, it will be easier to track the specific configurations used in a test cluster, rather than setting them manually while launching the cluster. 26 | 27 | The following section describes some tools that can be used to baseline an HBase cluster. 28 | 29 | ## Performance Evaluation Tool 30 | 31 | The first tool we’re going to use is the HBase Performance Evaluation utility, which is already available in your Amazon EMR cluster. This utility can be invoked using the following syntax from the EMR master node: 32 | 33 | ```bash 34 | hbase pe 35 | ``` 36 | 37 | The tool allows us to perform both write and read operations, specifying different options to control several aspects of our tests (e.g. create a pre-partitioned table, disable WAL flush, etc.). 38 | 39 | For example, the following command creates a table called *MyWriteTest* that is pre-partitioned with 200 regions (--presplit), and writes 2GB (--size) of data using a single client. We also enable the *latency* parameter to report operation latencies that help us identify whether the response time meets our requirements. 40 | 41 | ```bash 42 | hbase pe --table=MyWriteTest --presplit=200 --size=2 --latency --nomapred randomWrite 1 43 | ``` 44 | 45 | As described in the [Logs section](./observability#logs), the log output will be stored in the **`/var/log/hbase/hbase.log`** file. Please make sure to run the previous command as the **`hbase`** user, or you’ll not have permission to modify this file using the standard **`hadoop`** user. The following shows a sample output for the previous command: 46 | 47 | ```log 48 | INFO [TestClient-0] hbase.PerformanceEvaluation: Latency (us) : mean=21.45, min=1.00, max=480941.00, stdDev=992.53, 50th=2.00, 75th=2.00, 95th=2.00, 99th=3.00, 99.9th=24.00, 99.99th=37550.00, 99.999th=46364.23 49 | INFO [TestClient-0] hbase.PerformanceEvaluation: Num measures (latency) : 2097151 50 | INFO [TestClient-0] hbase.PerformanceEvaluation: Mean = 21.45 51 | ...
52 | INFO [TestClient-0] hbase.PerformanceEvaluation: No valueSize statistics available 53 | INFO [TestClient-0] hbase.PerformanceEvaluation: Finished class org.apache.hadoop.hbase.PerformanceEvaluation$RandomWriteTest in 42448ms at offset 0 for 2097152 rows (48.62 MB/s) 54 | INFO [TestClient-0] hbase.PerformanceEvaluation: Finished TestClient-0 in 42448ms over 2097152 rows 55 | INFO [main] hbase.PerformanceEvaluation: [RandomWriteTest] Summary of timings (ms): [42448] 56 | INFO [main] hbase.PerformanceEvaluation: [RandomWriteTest duration ] Min: 42448ms Max: 42448ms Avg: 42448ms 57 | INFO [main] hbase.PerformanceEvaluation: [ Avg latency (us)] 21 58 | INFO [main] hbase.PerformanceEvaluation: [ Avg TPS/QPS] 49405 row per second 59 | ``` 60 | 61 | As you can see, this reports the min, max, and average response latency for our write requests, along with throughput information about the maximum number of calls served by the cluster. Please note that in our example we used the *`nomapred`* parameter, which uses a local thread to perform the test (in this case, the client resides on the EMR master node). 62 | 63 | If we want to generate a higher number of requests, it is better to remove this option, so that the utility uses a MapReduce (MR) job to perform the test. In this scenario, it might be convenient to run the MR job on a separate cluster, to avoid using resources (CPU, network bandwidth) of our HBase cluster and to gather more realistic results. 64 | 65 | For example, the same tests can be performed from a separate EMR cluster by adding the parameter **`-Dhbase.zookeeper.quorum=TARGET_HBASE_MASTER_DNS`**, replacing TARGET_HBASE_MASTER_DNS with the hostname of the EMR master we want to test. 66 | 67 | ```bash 68 | hbase pe -Dhbase.zookeeper.quorum=ip-xxx-xx-x-xxx.compute.internal --table=MyWriteTestTwo --presplit=200 --size=2 --latency randomWrite 1 69 | ``` 70 | 71 | In the same way, we can perform read test operations. For a detailed list of all the options and tests available in the utility, please check the help section of the tool from the command line. 72 | 73 | ## YCSB 74 | 75 | Another popular tool to benchmark your HBase cluster is [YCSB](https://github.com/brianfrankcooper/YCSB) (Yahoo Cloud Serving Benchmark). This utility is not available on Amazon EMR, so it should be manually installed on the EMR master itself, or on a separate EC2 instance. 76 | 77 | This tool, unlike the previous one, is more focused on testing workload patterns. It doesn’t provide as many options as the HBase PE utility, but it allows you to define different types of workloads (typically called workload A, B, C, D, etc.) where you can mix different volumes of write/read/mutate operations, along with the sizes of the data that are going to be read or modified. 78 | 79 | By default, the tool comes with pre-defined templates to test some standard workload patterns. For example, [workload A](https://github.com/brianfrankcooper/YCSB/blob/master/workloads/workloada) performs 50% read operations and 50% update operations using 1KB payloads for each row. 80 | 81 | This tool is especially useful when you know your workload patterns exactly and you want to simulate more realistic use cases. However, please note that the tool can only launch multithreaded clients on the same node. So if you have a large cluster that you want to test, you’ll have to configure a fleet of EC2 instances and run the clients from each node using some automation scripts.
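For reference, a YCSB run against an EMR HBase cluster typically involves a load phase followed by a run phase. The snippet below is only a sketch: the binding name (`hbase20`), the configuration path, and the record/thread counts are assumptions that depend on the YCSB release and on your cluster, so adjust them to your environment. The target table must exist before the load phase (e.g. `create 'usertable', 'family'` from the HBase shell), and pre-splitting it avoids hot-spotting a single region during ingestion.

```bash
# Load phase: insert the records defined by workload A into HBase.
bin/ycsb load hbase20 -P workloads/workloada \
  -cp /etc/hbase/conf \
  -p table=usertable -p columnfamily=family \
  -p recordcount=1000000 -threads 16 -s

# Run phase: execute the 50% read / 50% update mix and report latency and throughput.
bin/ycsb run hbase20 -P workloads/workloada \
  -cp /etc/hbase/conf \
  -p table=usertable -p columnfamily=family \
  -p operationcount=1000000 -threads 16 -s
```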
82 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/scripts/hbase-snapshot-export.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | #!# script: hbase-snapshot-export 4 | #!# authors: ripani 5 | #!# version: v0.2 6 | #!# 7 | #!# Generate a snapshot of all the tables present in a specific hbase namespace. 8 | #!# The snapshots created this way are then copied to an S3 bucket or HDFS path. 9 | #!# For legacy clusters, use the s3a:// schema and specify the AWS programmatic 10 | #!# keys. 11 | #!# 12 | #!# If you're transferring data between kerberized clusters, make sure the 13 | #!# clusters belong to the same Kerberos realm. 14 | #=============================================================================== 15 | #?# 16 | #?# usage: ./hbase-snapshot-export.sh HBASE_NS SNAPSHOT_PATH [AWS_ACCESS_KEY] [AWS_SECRET_KEY] 17 | #?# ./hbase-snapshot-export.sh "default" "hdfs://NN:8020/hbase" 18 | #?# ./hbase-snapshot-export.sh "default" "s3://BUCKET/PREFIX" 19 | #?# ./hbase-snapshot-export.sh "default" "s3a://BUCKET/PREFIX" "KEY" "SECRET" 20 | #?# 21 | #?# HBASE_NS HBase namespace to back up 22 | #?# SNAPSHOT_PATH HDFS or S3 path. 23 | #?# Example: s3://BUCKET or hdfs://NN:8020/user/hbase 24 | #?# AWS_ACCESS_KEY [Optional] AWS access key for s3a schema 25 | #?# AWS_SECRET_KEY [Optional] AWS secret key for s3a schema 26 | #?# 27 | #=============================================================================== 28 | 29 | # Print the usage helper using the header as source 30 | function usage() { 31 | [ "$*" ] && echo "$0: $*" 32 | sed -n '/^#?#/,/^$/s/^#?# \{0,1\}//p' "$0" 33 | exit -1 34 | } 35 | 36 | [[ $# -lt 2 ]] && echo "error: wrong parameters" && usage 37 | 38 | #=============================================================================== 39 | # Configurations 40 | #=============================================================================== 41 | HBASE_NS="$1" 42 | SNAPSHOT_PATH="$2" 43 | AWS_ACCESS_KEY="$3" 44 | AWS_SECRET_KEY="$4" 45 | 46 | if [[ -f "/emr/instance-controller/lib/info/extraInstanceData.json" ]]; then 47 | HBASE_CMD="sudo -u hbase hbase" 48 | else 49 | HBASE_CMD="hbase" 50 | fi 51 | 52 | # Retrieve the list of tables for the namespace 53 | readarray -t tables < <(echo "list_namespace_tables '$HBASE_NS'" | $HBASE_CMD shell 2> /dev/null | sed -e '1,/TABLE/d' -e '/seconds/,$d' | while IFS='' read -r line || [[ -n "$line" ]]; do echo "$line"; done) 54 | 55 | # Generate Snapshots 56 | label="$(date +"%Y%m%d")-$(date +%s)" 57 | for table in "${tables[@]}"; do 58 | echo "Creating snapshot for table $HBASE_NS:$table" 59 | $HBASE_CMD snapshot create -n "$label-$HBASE_NS-$table" -t $HBASE_NS:$table 60 | done 61 | 62 | # Copy snapshots to the destination (S3 or HDFS) 63 | snapshots=$($HBASE_CMD snapshot info -list-snapshots | grep $label | awk '{print $1}') 64 | for s in ${snapshots}; do 65 | echo "Transfer snapshot $s to $SNAPSHOT_PATH" 66 | if [[ -z "$AWS_ACCESS_KEY" && -z "$AWS_SECRET_KEY" ]]; then 67 | $HBASE_CMD org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot ${s} -copy-to $SNAPSHOT_PATH 68 | else 69 | $HBASE_CMD org.apache.hadoop.hbase.snapshot.ExportSnapshot -Dfs.s3a.access.key=$AWS_ACCESS_KEY -Dfs.s3a.secret.key=$AWS_SECRET_KEY -snapshot ${s} -copy-to $SNAPSHOT_PATH 70 | fi 71 | done 72 | --------------------------------------------------------------------------------
/website/docs/bestpractices/Applications/HBase/scripts/hbase-snapshot-import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | #!# script: hbase-snapshot-import 4 | #!# authors: ripani 5 | #!# version: v0.2 6 | #!# 7 | #!# Import and restore HBase snapshots using a label. Make sure all the HBase 8 | #!# namespaces required by the tables already exist before launching the 9 | #!# script. 10 | #!# 11 | #!# If you're transferring data between kerberized clusters, make sure the 12 | #!# clusters belong to the same Kerberos realm. 13 | #=============================================================================== 14 | #?# 15 | #?# usage: ./hbase-snapshot-import.sh