├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── LICENSE-SAMPLECODE ├── LICENSE-SUMMARY ├── README.md ├── package-lock.json └── website ├── .gitignore ├── README.md ├── babel.config.js ├── docs ├── benchmarks │ ├── Analyzing │ │ ├── images │ │ │ ├── benchmark-1.png │ │ │ ├── benchmark-10.png │ │ │ ├── benchmark-11.png │ │ │ ├── benchmark-12.png │ │ │ ├── benchmark-13.png │ │ │ ├── benchmark-14.png │ │ │ ├── benchmark-2.png │ │ │ ├── benchmark-3.png │ │ │ ├── benchmark-4.png │ │ │ ├── benchmark-5.png │ │ │ ├── benchmark-6.png │ │ │ ├── benchmark-7.png │ │ │ ├── benchmark-8.png │ │ │ └── benchmark-9.png │ │ ├── read_spark_UI.md │ │ └── retrieve_event_logs.md │ ├── Resources │ │ ├── Benchmark_results.md │ │ └── Utilities.md │ ├── Running │ │ ├── benchmarking_checklist.md │ │ └── setting_up_environment.md │ ├── benchmarking_variables.md │ ├── img1.png │ ├── img2.png │ ├── introduction.md │ └── price_performance.md ├── bestpractices │ ├── Applications │ │ ├── HBase │ │ │ ├── best_practice.md │ │ │ ├── best_practice_hdfs.md │ │ │ ├── best_practice_s3.md │ │ │ ├── data_integrity.md │ │ │ ├── data_migration.md │ │ │ ├── img │ │ │ │ ├── hbase_replication_cross.png │ │ │ │ ├── hbase_replication_oneway.png │ │ │ │ ├── hbase_replication_simple.png │ │ │ │ ├── hbase_s3_replication.png │ │ │ │ ├── management_avg_payload.png │ │ │ │ ├── observability_grafana.png │ │ │ │ └── observabilty_webui.png │ │ │ ├── introduction.md │ │ │ ├── management.md │ │ │ ├── observability.md │ │ │ ├── performance_tests.md │ │ │ ├── scripts │ │ │ │ ├── hbase-snapshot-export.sh │ │ │ │ └── hbase-snapshot-import.sh │ │ │ └── security.md │ │ ├── Hadoop │ │ │ ├── img │ │ │ │ ├── emr_console_events.png │ │ │ │ ├── img.png │ │ │ │ ├── img2.png │ │ │ │ └── three.png │ │ │ ├── introduction.md │ │ │ ├── scripts │ │ │ │ ├── emr-6-ba-yarn_docker_gpu.sh │ │ │ │ ├── emr-6-yarn_docker_gpu.yaml │ │ │ │ ├── yarn_labels_scaling.sh │ │ │ │ └── yarn_labels_scaling.yaml │ │ │ ├── yarn_docker_gpu.md │ │ │ ├── yarn_labels_scaling.md │ │ │ └── yarn_node_ resilience.md │ │ ├── Hive │ │ │ ├── best_practices.md │ │ │ └── introduction.md │ │ └── Spark │ │ │ ├── best_practices.md │ │ │ ├── data_quality.md │ │ │ ├── data_skew.md │ │ │ ├── images │ │ │ ├── spark-bp-1.png │ │ │ ├── spark-bp-10.png │ │ │ ├── spark-bp-11.png │ │ │ ├── spark-bp-12.png │ │ │ ├── spark-bp-13.png │ │ │ ├── spark-bp-14.png │ │ │ ├── spark-bp-15.png │ │ │ ├── spark-bp-16.png │ │ │ ├── spark-bp-17.png │ │ │ ├── spark-bp-18.png │ │ │ ├── spark-bp-19.png │ │ │ ├── spark-bp-2.png │ │ │ ├── spark-bp-20.png │ │ │ ├── spark-bp-21.png │ │ │ ├── spark-bp-22.png │ │ │ ├── spark-bp-23.png │ │ │ ├── spark-bp-24.png │ │ │ ├── spark-bp-25.png │ │ │ ├── spark-bp-26.png │ │ │ ├── spark-bp-27.png │ │ │ ├── spark-bp-28.png │ │ │ ├── spark-bp-29.png │ │ │ ├── spark-bp-3.png │ │ │ ├── spark-bp-30.png │ │ │ ├── spark-bp-31.png │ │ │ ├── spark-bp-32.png │ │ │ ├── spark-bp-33.png │ │ │ ├── spark-bp-4.png │ │ │ ├── spark-bp-5.png │ │ │ ├── spark-bp-6.png │ │ │ ├── spark-bp-7.png │ │ │ ├── spark-bp-8.png │ │ │ ├── spark-bp-9.png │ │ │ ├── spark-bp-range-join-after.png │ │ │ ├── spark-bp-range-join-before.png │ │ │ ├── spark-tt-1.png │ │ │ └── spark-tt-2.png │ │ │ ├── introduction.md │ │ │ ├── joins.md │ │ │ ├── observability.md │ │ │ ├── performance.md │ │ │ ├── thrift.md │ │ │ └── troubleshooting.md │ ├── Cost Optimizations │ │ ├── Introduction.md │ │ ├── best_practices.md │ │ ├── images │ │ │ ├── bp-1.png │ │ │ ├── bp-10.png │ │ │ ├── bp-11.png │ │ │ ├── bp-2.png │ │ │ ├── bp-3.png │ │ │ ├── bp-4.png 
│ │ │ ├── bp-5.png │ │ │ ├── bp-6.png │ │ │ ├── bp-7.png │ │ │ ├── bp-8.png │ │ │ ├── bp-9.png │ │ │ ├── intro-1.png │ │ │ ├── intro-10.png │ │ │ ├── intro-11.png │ │ │ ├── intro-2.png │ │ │ ├── intro-3.png │ │ │ ├── intro-4.png │ │ │ ├── intro-5.png │ │ │ ├── intro-6.png │ │ │ ├── intro-7.png │ │ │ ├── intro-8.png │ │ │ ├── intro-9.png │ │ │ ├── mru-1.png │ │ │ └── mru-2.png │ │ └── maximizing-resource-utilization.md │ ├── Features │ │ ├── EMRFS │ │ │ ├── Assets │ │ │ │ └── table.png │ │ │ ├── aimd.md │ │ │ └── images │ │ │ │ ├── pic1.png │ │ │ │ └── pic2.png │ │ ├── Managed Scaling │ │ │ ├── best_practices.md │ │ │ ├── images │ │ │ │ ├── bp-1.png │ │ │ │ ├── bp-3.png │ │ │ │ └── ms-metrics.png │ │ │ └── troubleshooting.md │ │ └── Spot Usage │ │ │ └── best_practices.md │ ├── Observability │ │ ├── Assets │ │ │ ├── emr-cw_dashboard.sh │ │ │ ├── preview_1.png │ │ │ └── preview_2.png │ │ ├── best_practices.md │ │ └── intro.md │ ├── Reliability │ │ ├── best_practices.md │ │ ├── images │ │ │ ├── bp-1.png │ │ │ ├── bp-2.png │ │ │ ├── bp-3.png │ │ │ ├── bp-5.png │ │ │ └── bp-6.png │ │ └── introduction.md │ ├── Security │ │ ├── best_practices.md │ │ └── introduction.md │ ├── Troubleshooting │ │ ├── Troubleshooting EMR.md │ │ └── images │ │ │ ├── AmazonQ.png │ │ │ ├── CWagent_cpu_graph.png │ │ │ ├── CWagent_cpu_metric_list.png │ │ │ ├── CWagent_cpu_namespace.png │ │ │ ├── CWagent_disk_graph.png │ │ │ ├── CWagent_disk_namespace.png │ │ │ ├── CWagent_memory_graph.png │ │ │ ├── CWagent_memory_namespace.png │ │ │ ├── application_container_log_location.png │ │ │ ├── application_master_log_location.png │ │ │ ├── datanode_log_location.png │ │ │ ├── hdfs_fsadmin.png │ │ │ ├── hdfs_fsck.png │ │ │ ├── instance_state_log_location.png │ │ │ ├── iostat.png │ │ │ ├── iostat_output.png │ │ │ ├── namnode_log_location.png │ │ │ ├── nodemanager_log_location.htm │ │ │ ├── nodemanager_log_location.png │ │ │ ├── resourcemanager_log_location.png │ │ │ └── uptime.png │ └── introduction.md ├── migration │ └── introduction.md └── utilities │ ├── assets │ └── emr_advisor.png │ └── introduction.md ├── docusaurus.config.js ├── package-lock.json ├── package.json ├── sidebars.js ├── src ├── components │ └── HomepageFeatures │ │ ├── index.js │ │ └── styles.module.css ├── css │ └── custom.css ├── pages │ ├── index.js │ ├── index.module.css │ └── markdown-page.md └── theme │ └── SearchBar │ ├── SearchBar.js │ └── index.js └── static ├── .nojekyll └── img ├── AWS_logo_RGB.png ├── benchmark.svg ├── best_practices.svg └── utilities.svg /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | #IDE 34 | .idea/ 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Scrapy stuff: 56 | .scrapy 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # IPython Notebook 62 | .ipynb_checkpoints 63 | 64 | # pyenv 65 | .python-version 66 | 67 | # virtualenv 68 | venv/ 69 | ENV/ 70 | 71 | # MkDocs documentation 72 | site/ 73 | .DS_Store 74 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 
45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE-SAMPLECODE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /LICENSE-SUMMARY: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | The documentation is made available under the Creative Commons Attribution-ShareAlike 4.0 International License. See the LICENSE file. 4 | 5 | The sample code within this documentation is made available under the MIT-0 license. See the LICENSE-SAMPLECODE file. 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon EMR on Amazon Best Practices 2 | 3 | A best practices guide for submitting spark applications, integration with hive metastore, security, storage options, debugging options and performance considerations.. 4 | 5 | Return to [Live Docs](https://aws.github.io/aws-emr-best-practices/). 6 | 7 | ## License Summary 8 | 9 | The documentation is made available under the Creative Commons Attribution-ShareAlike 4.0 International License. See the LICENSE file. 10 | 11 | The sample code within this documentation is made available under the MIT-0 license. See the LICENSE-SAMPLECODE file. 
12 | -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "aws-emr-best-practices", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": {} 6 | } 7 | -------------------------------------------------------------------------------- /website/.gitignore: -------------------------------------------------------------------------------- 1 | # Dependencies 2 | /node_modules 3 | 4 | # Production 5 | /build 6 | 7 | # Generated files 8 | .docusaurus 9 | .cache-loader 10 | 11 | # Misc 12 | .DS_Store 13 | .env.local 14 | .env.development.local 15 | .env.test.local 16 | .env.production.local 17 | 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | -------------------------------------------------------------------------------- /website/README.md: -------------------------------------------------------------------------------- 1 | # Website 2 | 3 | This website is built using [Docusaurus](https://docusaurus.io/), a modern static website generator. 4 | 5 | ## Installation 6 | 7 | ```bash 8 | cd website 9 | npm install 10 | ``` 11 | 12 | ## Local Development 13 | 14 | ```bash 15 | npm run start 16 | ``` 17 | 18 | This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server. 19 | 20 | ## Deployment 21 | 22 | Using SSH: 23 | 24 | ```bash 25 | USE_SSH=true npm run docusaurus deploy 26 | ``` 27 | 28 | Not using SSH: 29 | 30 | ```bash 31 | GIT_USER= npm run docusaurus deploy 32 | ``` 33 | 34 | If you are using GitHub pages for hosting, this command is a convenient way to build the website and push to the `gh-pages` branch. 
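If you only need the static files without pushing them anywhere, you can run a production build locally first. This is a minimal sketch, assuming the default Docusaurus `build` and `serve` scripts from the standard scaffold are present in `package.json`:

```bash
# Generate the static site into the build/ directory (assumes the default "build" script)
npm run build

# Optionally preview the production build locally (assumes the default "serve" script)
npm run serve
```

The contents of `build/` can then be served by any static hosting service.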
35 | -------------------------------------------------------------------------------- /website/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [require.resolve('@docusaurus/core/lib/babel/preset')], 3 | }; 4 | -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-1.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-10.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-11.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-12.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-13.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-14.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-2.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-3.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-4.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-4.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-5.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-6.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-7.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-8.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/images/benchmark-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/Analyzing/images/benchmark-9.png -------------------------------------------------------------------------------- /website/docs/benchmarks/Analyzing/retrieve_event_logs.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 3 3 | sidebar_label: Retrieving Spark Event Logs 4 | --- 5 | 6 | # Retrieve Spark Event Logs 7 | 8 | When you want to analyze the performance of your workloads, you’ll typically need to check the Spark Web UI to identify areas of improvement or just to detect events that are de-gradating the performance in your application. The Spark Web UI uses the Event Logs that are generated by each job running in your cluster to provide detailed information about Jobs, Stages and Tasks of your application that provides aggregated metrics that can help you to troubleshoot performance issues. 9 | 10 | These files are extremely portable, as they can be collected across different engines or environments and stored in the same Spark History Server to have a single interface where you can review results of different benchmark results across different environment or cloud providers. 
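For example, to review runs from two different environments in a single Spark History Server, you can copy the exported event log files into the directory the history server reads from (`spark.history.fs.logDirectory`). The snippet below is a minimal sketch: the S3 bucket and application ID are placeholders, and the target path is the default EMR location described in the next paragraph.

```bash
# Download an event log that was exported from another cluster or environment
# (bucket and file names are placeholders)
aws s3 cp s3://my-benchmark-results/eventlogs/application_1694206676971_0002 .

# Copy it into the directory read by the Spark History Server on the EMR primary node
hdfs dfs -put application_1694206676971_0002 /var/log/spark/apps/
```

After a refresh, the copied application appears in the Spark History Server UI alongside the jobs that ran on the cluster itself.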
11 | 12 | When using Amazon EMR, the Spark Event logs are enabled by default and are automatically stored on the HDFS of the cluster where the job was running under the HDFS path `/var/log/spark/apps/` 13 | 14 | ```bash 15 | $ hdfs dfs -ls -R /var/log/spark/apps/ 16 | -rw-rw---- 1 hadoop spark 408384 2023-09-08 21:00 /var/log/spark/apps/application_1694206676971_0001 17 | ``` 18 | 19 | If you have Event Logs coming from a different environment or cluster, you can easily store them in this folder, and the Spark Web History Server will automatically pick them and you’ll be able to review the information of the job on the Spark History Server. 20 | 21 | As alternative, if you want to export the Event Logs from a running cluster, you can also download them manually from the Spark Web History server from the main page as shown in the image below. 22 | 23 | ![Benchmark - 1](images/benchmark-1.png) 24 | 25 | Finally, if you’re using on premise cluster or any third-party Spark environment, you can automatically enable the Spark Event logs using the following Spark configurations: 26 | 27 | * **spark.eventLog.enabled** (Boolean) Determine if you want to enable or disable event logs collection. False by default 28 | * **spark.eventLog.dir** (String) Location where to store the event logs. Can be an Object Store as Amazon S3, Azure Filesystem, or any path recognized by the Hadoop Filesystem API (e.g. HDFS, Local Filesystem, etc.) 29 | 30 | Below an example to manually enable the Spark event logs in your Spark application. 31 | 32 | ```bash 33 | spark-submit \ 34 | --name "Example App" \ 35 | --conf spark.eventLog.enabled=true \ 36 | --conf spark.eventLog.dir=hdfs:///tmp/spark \ 37 | ... 38 | ``` 39 | -------------------------------------------------------------------------------- /website/docs/benchmarks/Resources/Benchmark_results.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 4 3 | sidebar_label: Benchmark Results 4 | --- 5 | 6 | # Benchmark Results 7 | 8 | ## Amazon EMR 9 | 10 | ### Spark 11 | 12 | * EMR 6.10: https://aws.amazon.com/blogs/big-data/amazon-emr-on-eks-widens-the-performance-gap-run-apache-spark-workloads-5-37-times-faster-and-at-4-3-times-lower-cost/ 13 | * EMR 6.9: https://aws.amazon.com/blogs/big-data/run-apache-spark-workloads-3-5-times-faster-with-amazon-emr-6-9/ 14 | * EMR 6.5: https://aws.amazon.com/blogs/big-data/amazon-emr-on-amazon-eks-provides-up-to-61-lower-costs-and-up-to-68-performance-improvement-for-spark-workloads/ 15 | 16 | ### Graviton 17 | 18 | #### EMR Graviton 2 19 | 20 | * https://aws.amazon.com/blogs/big-data/achieve-up-to-27-better-price-performance-for-spark-workloads-with-aws-graviton2-on-amazon-emr-serverless/ 21 | 22 | #### EMR Graviton 3 23 | 24 | * EMR on EKS - https://aws.amazon.com/blogs/big-data/amazon-emr-on-eks-gets-up-to-19-performance-boost-running-on-aws-graviton3-processors-vs-graviton2/ 25 | * EMR on EC2 - https://aws.amazon.com/blogs/big-data/amazon-emr-launches-support-for-amazon-ec2-c7g-graviton3-instances-to-improve-cost-performance-for-spark-workloads-by-7-13/ 26 | 27 | ### Intel 28 | 29 | * EMR Intel (C6i, M6i, I4i, R6i, and R6id): https://aws.amazon.com/blogs/big-data/amazon-emr-launches-support-for-amazon-ec2-c6i-m6i-i4i-r6i-and-r6id-instances-to-improve-cost-performance-for-spark-workloads-by-6-33/ 30 | 31 | ### AMD 32 | 33 | * EMR AMD (m6a, r6a): 
https://aws.amazon.com/blogs/big-data/amazon-emr-launches-support-for-amazon-ec2-m6a-r6a-instances-to-improve-cost-performance-for-spark-workloads-by-15-50/ 34 | 35 | ### Managed Scaling 36 | 37 | * Managed Scaling Improvements: https://aws.amazon.com/blogs/big-data/reduce-amazon-emr-cluster-costs-by-up-to-19-with-new-enhancements-in-amazon-emr-managed-scaling/ 38 | 39 | ### EMR on EKS 40 | 41 | * EMR on EKS vs OSS: https://aws.amazon.com/blogs/big-data/amazon-emr-on-amazon-eks-provides-up-to-61-lower-costs-and-up-to-68-performance-improvement-for-spark-workloads/ 42 | 43 | ### Hive 44 | 45 | * Hive Rename Feature: https://aws.amazon.com/blogs/big-data/up-to-15-times-improvement-in-hive-write-performance-with-the-amazon-emr-hive-zero-rename-feature/ 46 | 47 | ### Customer Examples 48 | 49 | * EMR Serverless: https://aws.amazon.com/blogs/big-data/godaddy-benchmarking-results-in-up-to-24-better-price-performance-for-their-spark-workloads-with-aws-graviton2-on-amazon-emr-serverless/ 50 | 51 | ## Amazon Athena 52 | 53 | * Athena V3: https://aws.amazon.com/blogs/big-data/upgrade-to-athena-engine-version-3-to-increase-query-performance-and-access-more-analytics-features/ 54 | * Athena V2: https://aws.amazon.com/blogs/big-data/run-queries-3x-faster-with-up-to-70-cost-savings-on-the-latest-amazon-athena-engine/ 55 | * Athena CBO: https://aws.amazon.com/blogs/big-data/speed-up-queries-with-cost-based-optimizer-in-amazon-athena/ -------------------------------------------------------------------------------- /website/docs/benchmarks/Resources/Utilities.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 4 3 | sidebar_label: Utilities 4 | --- 5 | 6 | # Benchmarking Utilities 7 | 8 | * EMR Spark Benchmark: https://github.com/aws-samples/emr-spark-benchmark 9 | * EMR on EKS Benchmark: https://github.com/aws-samples/emr-on-eks-benchmark 10 | -------------------------------------------------------------------------------- /website/docs/benchmarks/Running/benchmarking_checklist.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 3 3 | sidebar_label: Benchmarking Checklist 4 | --- 5 | 6 | # Benchmarking Checklist 7 | 8 | ## Environment and Infrastructure 9 | 10 | The following checklist assumes you are running benchmarks across deployment models (EC2 vs EKS vs Serverless) or vendors (EMR vs Databricks vs OSS). Comparing at the deployment model or vendor level takes into consideration a number of variables such as runtime performance, scaling and pricing model. 11 | 12 | If running a benchmark for other purposes such as difference in hardware within the same deployment model, items in the checklist will not apply. 13 | 14 | |Checklist | |Notes | 15 | |--- |--- |--- | 16 | |Are all instances On Demand | :black_square_button:|Spot interruptions are unpredictable and impacts price-performance. Only use spot when taking into consideration how your benchmark handles spot interruptions and getting spot capacity. Deployment models EMR on EC2 have product differentiators that select instances with are most likely to not get interrupted. | 17 | |Are all instances the same family, size and generation |:black_square_button: |The total amount of compute (vCPU and Memory) should be consistent across benchmark runs. Compute will determine the performance of the application. Additionally, instances can vary in network performance. 
Additionally, if using Karpenter or Instancefleet, you should ensure the set of instances provided are the same. Note that depending on when the job is submitted, your results may vary | 18 | |If cluster scaling is enabled, does each deployment model have the same scaling configurations. (min, max) |:black_square_button: |The efficiency of scaling between deployment models and vendors can differ but the configurations as it relates to compute should be consistent | 19 | |Is the EMR cluster or image using the latest EMR version? |:black_square_button: |The latest versions of EMR will contain the best runtime performance | 20 | |Are the Application versions the same across deployment models, OSS and vendors? |:black_square_button: |Spark versions should be the same or the latest version that's offered | 21 | |Is the same data catalog being used across benchmarks? |:black_square_button: |Performance between local and remote hivemetastore and glue data catalog can differ | 22 | |Is the infrastructure being deployed in the same AZ? |:black_square_button: |AZ's may have differences in network latency or instance availability. | 23 | |Are the benchmarks starting from the same state and size. For example, cold start vs warm pool and the # of starting instances |:black_square_button: |Initializing compute resources impact price-performance. When comparing benchmarking, ensure applications are starting from the same state | 24 | |Is the amount and type of local disk consistent? |:black_square_button: |Size and type of local disk volumes impact workloads, especially shuffle heavy ones | 25 | |Are the security settings consistent across deployment models ? This includes IAM role, security groups, data and in transit encryption |:black_square_button: |Security configurations such as encryption can impact performance | 26 | |Are network settings consistent across deployment models? |:black_square_button: |This includes VPC endpoints, NAT Gateways, public or private endpoints, or proxies. The flow of network traffic to access storage, catalog or endpoints impacts performance | 27 | |Are there differences in the AMI, bootstrap actions or container Image? |:black_square_button: |This can impact compute initialization as well as job startup. For example, eliminating the need to load a specific library before executing the job | 28 | |Are JDK settings consistent across deployment models |:black_square_button: |We've seen improved performance with JDK17. Ensure the versions are consistent across benchmarks | 29 | 30 | ## Workload 31 | 32 | |Checklist | |Notes | 33 | |--- |--- |--- | 34 | |Is the input and output data the same (size, location, type, structure)? |:black_square_button: |As a best practice, all benchmark runs should point to the same input data set | 35 | |Are the applications being submitted the same? |:black_square_button: |SQL file or application should be the same | 36 | |Are the applications libraries the same? |:black_square_button: |This includes external libraries, python versions, or anything the application requires to run | 37 | |Are the applications parameters the same? |:black_square_button: |These are application specific parameters passed in the job. These should be identical to ensure the same job is running | 38 | |Are the applications configurations the same? 
|:black_square_button: |This refers to Spark configuration settings such as executor size, shuffle partitions or Dynamic Resource Allocation settings | 39 | |Is EMR using EMRFS library to write to S3 |:black_square_button: |To take advantage of EMR's optimized run time, EMRFS (s3://) should be used. s3a is not supported and should only be used in OSS | 40 | |If an Open Table Format (OTF) is being used, is it consistent across benchmarks |:black_square_button: |Using OTF's can improve read, write and processing performance. | 41 | |Is the application running in isolation? |:black_square_button: |Resource contention can impact benchmark results because Spark workloads will run on any resource that is available. A best practice is to run each job independently. Also ensure that if submitting multiple jobs, jobs are submitted in the same sequence or sequentially. | 42 | |Is there any data or library caching that impacts future runs? |:black_square_button: |Generally, the first run will be slower than future runs because of caching. Keep this in mind when determining how many iterations of a run you want to do. Additional runs will negate any impact of caching but has a trade off of cost and time | 43 | |Is the applications JVM settings the same? |:black_square_button: |Performance is different across JDK version. JDK17 has seen to have the best performance. JVM settings also extend to GC settings. | 44 | |Is the applications logging configurations the same? |:black_square_button: |Logging parameters that are not the same such as level (DEBUG, INFO) can impact performance or storage requirements | 45 | |Are the applications being submitted the same way? |:black_square_button: |Ensure the entry point for job submission is the same. There are many ways to submit spark jobs such as EMR APIs, Livy, Airflow, Spark-submit. 
These can result in differences with how jobs are run | -------------------------------------------------------------------------------- /website/docs/benchmarks/Running/setting_up_environment.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 3 3 | sidebar_label: Setting Up Your Environment 4 | --- 5 | 6 | # Setting up the Benchmark Environment 7 | 8 | ## EMR on EC2 9 | 10 | * Getting started guide: [https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-gs.html](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-gs.html) 11 | * Benchmark guide: [https://github.com/aws-samples/emr-spark-benchmark](https://github.com/aws-samples/emr-spark-benchmark) 12 | 13 | ## EMR on EKS 14 | 15 | * Getting started guide: [https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up.html](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up.html) 16 | * Benchmark guide: [https://github.com/aws-samples/emr-on-eks-benchmark/tree/main](https://github.com/aws-samples/emr-on-eks-benchmark/tree/main) 17 | 18 | ## EMR Serverless 19 | 20 | * Getting started guide: [https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/getting-started.html](https://docs.aws.amazon.com/emr/latest/EMR-Serverless-UserGuide/getting-started.html) 21 | * Benchmark guide: [https://github.com/aws-samples/emr-spark-benchmark](https://github.com/aws-samples/emr-spark-benchmark) -------------------------------------------------------------------------------- /website/docs/benchmarks/img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/img1.png -------------------------------------------------------------------------------- /website/docs/benchmarks/img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/benchmarks/img2.png -------------------------------------------------------------------------------- /website/docs/benchmarks/introduction.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | sidebar_label: Benchmarks 4 | --- 5 | 6 | # Benchmarks 7 | 8 | The purpose of this guide is to provide a methodology for running Spark benchmarks on EMR. By following this guide, you will be able to identify the lowest price-performance option for running Spark workloads, considering various variables such as engine type (EMR, OSS), deployment models (EC2, EKS, Serverless), or hardware options (M, C, R, family). 9 | 10 | The focus of this guide is on price-performance. Other considerations, such as features, user experience, or compatibility with other services, are out of scope. However, it's essential to evaluate these aspects based on your customers' use cases and needs. 
11 | -------------------------------------------------------------------------------- /website/docs/benchmarks/price_performance.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 2 3 | sidebar_label: Price-Performance 4 | --- 5 | 6 | # Price Performance 7 | 8 | In the scope of this tutorial, "price-performance" signifies the monetary expense associated with executing a given workload while maintaining a specific degree of performance, expressed in terms of execution duration (seconds). Evaluating price-performance plays a vital role in understanding the impact of factors that are not easily quantifiable, such as deployment architectures, competitive offerings, container allocation strategies, and processing engines. 9 | 10 | For variables that are within our control, such as infrastructure sizing or application settings, ensuring uniformity among all benchmarks is indispensable for accurate comparisons. 11 | 12 | The following examples highlight the importance of price-performance. 13 | 14 | **Example 1:** Customer wants to compare Open Source Software (OSS) Spark vs EMR Spark with different cluster sizes 15 | 16 | | |Cluster #1 |Cluster #2 | 17 | |--- |--- |--- | 18 | |Runtime (s) |12 |30 | 19 | |# of nodes |50 |10 | 20 | |Engine |OSS Spark Runtime |EMR Spark Runtime | 21 | |Cost ($) |600 |300 | 22 | 23 | In the above example, Cluster #1 is running OSS Spark and completes in 12s with 50 nodes, while EMR Spark completes in 30s with 10 nodes. However, when we look at total cost, Cluster #2’s total cost is lower than Cluster #1’s, making it the better option. Comparing cost in relation to the work being done accounts for the differences in # of nodes and engine. Assuming performance is linear, let’s look at what happens when we increase the # of nodes in Cluster #2. 24 | 25 | **Example 2:** Customer wants to compare Open Source Software (OSS) Spark vs EMR Spark with the same cluster sizes 26 | 27 | | |Cluster #1 |Cluster #2 | 28 | |--- |--- |--- | 29 | |Runtime (s) |12 |6 | 30 | |# of nodes |50 |50 | 31 | |Engine |OSS Spark Runtime |EMR Spark Runtime | 32 | |Cost ($) |600 |300 | 33 | 34 | After increasing the # of nodes to be the same across both clusters, runtime is reduced to 6 seconds on Cluster #2 and cost remains the same at $300. Our conclusion from the first example remains the same: Cluster #2 is the best option from a price-performance perspective. 35 | 36 | It’s important to note that price-performance is not always linear. This is often seen when workloads have data skew. In these cases, adding more compute does not reduce runtime proportionally and adds cost. 37 | 38 | **Example 3:** Same workload across different # of nodes - data skew 39 | 40 | | |Run #1 |Run #2 | 41 | |--- |--- |--- | 42 | |Runtime (s) |100 |75 | 43 | |# of nodes |10 |20 | 44 | |Engine |EMR Spark Runtime |EMR Spark Runtime | 45 | |Cost ($) |1000 |1500 | 46 | 47 | In the above example, performance is not linear. While runtime was reduced to 75s, overall cost increased. In these cases, it’s important to ensure the # of nodes is the same for both comparisons. 48 | 49 | Another scenario where price-performance is useful is when comparing different pricing models or vendors.
Take the example below: 50 | 51 | **Example 4:** Same workload across different pricing models 52 | 53 | | |EMR Spark Runtime |Vendor | 54 | |--- |--- |--- | 55 | |Runtime (s) |50 |40 | 56 | |# of nodes |10 |10 | 57 | |$/s |1 |1.5 | 58 | |Cost ($) |500 |600 | 59 | 60 | In the above example, the same workload on vendor runs in 40s, while EMR runs in 50s. While vendor may seem faster, when we factor in price-performance, we see total cost is lower with EMR. If runtime is a key requirement, we can increase the # of nodes in relation to performance as illustrated in example 5. 61 | 62 | **Example 5:** Same workload across different pricing models with different # of nodes 63 | 64 | | |EMR Spark Runtime |EMR Spark Runtime linear performance |Vendor | 65 | |--- |--- |--- |--- | 66 | |Runtime (s) |50 |25 |40 | 67 | |# of nodes |10 |20 |10 | 68 | |$/s |1 |1 |1.5 | 69 | |Cost ($) |500 |500 |600 | 70 | 71 | The goal with benchmarking should always be to have like-for-like comparisons. This is especially true for factors such as application configuration settings such as executor sizes, input and output dataset, cluster size and instances. However, factors like vendor/aws pricing model, engine optimizations, and schedulers cannot be made the same. As such, it’s important to use price-performance as a key factor. 72 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/best_practice.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 2 3 | sidebar_label: Best Practices 4 | --- 5 | 6 | # Best Practice 7 | 8 | The following section describes some general HBase tuning and best practice that can be applied both when using HDFS or Amazon S3 as storage layer for HBase. 9 | 10 | ## EMR Multi Master 11 | 12 | When working with HBase on Amazon EMR, it is good practice to enable the [EMR Multi Master](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-ha.html) feature that allows you to launch three EMR master nodes. This functionality allows the HBase cluster to tolerate impairments that might occur if a single master goes down. 13 | 14 | Nevertheless, this functionality is highly recommended both when using HDFS or S3 as storage layer for your HBase cluster. Enabling this, allows you to serve HBase requests (both writes and reads) in case of a master failure. Please note that if you launch the EMR cluster with a single master and this node is terminated for any reason (e.g. human error, hardware impairment, etc.), it will not be possible to recover any data from the HDFS storage on the cluster as the HDFS metadata will be lost after the termination of the EMR master. 15 | 16 | ## EMR Termination Protection 17 | 18 | [Using termination protection](https://docs.aws.amazon.com/emr/latest/ManagementGuide/UsingEMR_TerminationProtection.html) in Amazon EMR is highly recommended both when using HDFS or Amazon S3 for your HBase cluster. 19 | 20 | Amazon EMR periodically checks the Apache Hadoop YARN status of nodes running on CORE and TASK nodes in a cluster. The health status is reported by the [YARN NodeManager health checker service](https://hadoop.apache.org/docs/current/hadoop-yarn/hadoop-yarn-site/NodeManager.html#Health_checker_service). If a node reports an UNHEALTHY status, it will not be possible to allocate YARN containers to it until it becomes healthy again. A common reason for unhealthy nodes is that disk utilization goes above 90%. 
If the node stays in this state for more than 45 minutes and Termination Protection is disabled, the EMR service terminates the node and launches a new one as a replacement. 21 | 22 | When a node is in an UNHEALTHY state and termination protection is enabled, the node will not be terminated and replaced by the EMR service. This prevents the loss of HDFS data blocks when the disk utilization of a CORE node goes above 90%, thereby preventing data integrity issues in HBase tables. 23 | 24 | ## HBase RPC Listeners 25 | 26 | One of the most important parameters to configure in your HBase cluster is the number of active RPC listeners defined per Region Server. Tuning the parameter *`hbase.regionserver.handler.count`* (default: 30) can increase the number of requests that you can concurrently serve in each region server, and so the overall throughput of your cluster. To modify the default number of RPC listeners, you can use the following EMR configuration: 27 | 28 | ```json 29 | [ 30 | { 31 | "Classification": "hbase-site", 32 | "Properties": { 33 | "hbase.regionserver.handler.count": "120" 34 | } 35 | } 36 | ] 37 | ``` 38 | 39 | However, please be mindful that this parameter should be tuned according to the average size of data stored in or retrieved from your tables. As a rule of thumb, you should increase this number when the payload of your data is lower than 100KB, and stick to the default or decrease it when the payload size is `>= 1MB`. For small payloads (`<= 1KB`), you can push this value up to 4 times the number of vCPUs available in your Region Servers. 40 | 41 | To determine the average payload of data stored in your tables, see [Determine average row size](./management#determine-average-row-size). 42 | 43 | ## HBase Heap Memory 44 | 45 | On Amazon EMR, when you install HBase, the memory is split evenly between the Hadoop YARN and HBase services. For a list of the default memory settings used per instance type, see [Task configuration](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-task-config.html#emr-hadoop-task-jvm) in the EMR documentation. 46 | 47 | However, when working with HBase it might be convenient to override the default parameters and increase the available memory for the HBase services. This might be required if we want to host a higher number of Regions per Region Server. To modify the default memory, you should modify the HBase environment variables defined in the *hbase-env* classification, which define the default heap memory available for each HBase service. The following list highlights the variables to modify for each service: 48 | 49 | * **`HBASE_MASTER_OPTS`** JVM options for the HBase master 50 | * **`HBASE_REGIONSERVER_OPTS`** JVM options for the HBase Region Servers 51 | * **`HBASE_THRIFT_OPTS`** JVM options for the HBase Thrift service 52 | * **`HBASE_REST_OPTS`** JVM options for the HBase REST service 53 | 54 | It’s best practice to modify the memory of each component using its own dedicated variable, rather than using the more general **HBASE_OPTS**, which is used to apply common JVM options across all HBase services. 55 | 56 | To override the default memory, we should specify the following Java parameter in the corresponding environment variable: `-Xmx[g|G|m|M|k|K]`. Please also make sure to add a self-reference in the environment variable to avoid losing other parameters that are set in the script.
Besides, if we modify the default HBase memory, we should also lower accordingly the memory specified for the YARN Node Manager service to avoid incurring in Out Of Memory errors. 57 | 58 | Please note that either if you’re just installing HBase, it might still be convenient to keep some memory reserved for YARN. This can be useful as some HBase utility runs on YARN (e.g. HBase export utility). 59 | 60 | The example below highlights the configurations that should be modified in an EMR cluster while tuning the HBase heap memory. Please make sure that the sum of the YARN and HBase memory is not greater than the memory available on the node. Also make sure to keep at least 2GB of available memory for the Operating System and other internal components running on the node. 61 | 62 | ```json 63 | [ 64 | { 65 | "Classification": "yarn-site", 66 | "Properties": { 67 | "yarn.scheduler.maximum-allocation-mb": "MAX_MEMORY_BYTES", 68 | "yarn.nodemanager.resource.memory-mb": "MAX_MEMORY_BYTES" 69 | } 70 | }, 71 | { 72 | "Classification": "hbase-env", 73 | "Configurations": [ 74 | { 75 | "Classification": "export", 76 | "Properties": { 77 | "HBASE_MASTER_OPTS": "\"$HBASE_MASTER_OPTS -Xmx30g\"", 78 | "HBASE_REGIONSERVER_OPTS": "\"$HBASE_REGIONSERVER_OPTS -Xmx30g\"" 79 | } 80 | } 81 | ], 82 | "Properties": {} 83 | } 84 | ] 85 | ``` 86 | 87 | ## HBase MultiWal Provider 88 | 89 | By default, HBase uses a single [Write Ahead Log](https://hbase.apache.org/book.html#wal) file (WAL) per Region Server to persist mutate operations that are performed against Regions hosted on the node. This implementation can be a bottleneck as WALs are stored on the HDFS and each operation is performed sequentially against the same file. 90 | 91 | In write intensive clusters, you might increase the HBase throughput by adopting a multiwal strategy. In this scenario is recommended to have multiple disks attached to the node to get the most out of this feature. This configuration can be enabled specifying the following properties while launching an EMR cluster: 92 | 93 | ```json 94 | [ 95 | { 96 | "Classification": "hbase-site", 97 | "Properties": { 98 | "hbase.wal.provider": "multiwal", 99 | "hbase.wal.regiongrouping.numgroups": "2" 100 | } 101 | } 102 | ] 103 | ``` 104 | 105 | The parameter *`hbase.wal.regiongrouping.numgroups`* determines the number of WALs that will be created per Region Server. By default, this parameter is set to two, but you can tune this parameter accordingly to the number of disks attached to the node for better performance. 106 | 107 | ## HBase OffHeap Caching 108 | 109 | The following example, shows how to enable OffHeap memory caching on HBase. This configuration, can be used both when using Amazon S3 or HDFS as storage layer. The example below sets an offheap memory of 5GB while the bucket cache allocated for this memory will be 4GB. 110 | 111 | ```json 112 | [ 113 | { 114 | "Classification": "hbase-env", 115 | "Properties": {}, 116 | "Configurations": [ 117 | { 118 | "Classification": "export", 119 | "Properties": { 120 | "HBASE_OFFHEAPSIZE": "5G" 121 | }, 122 | "Configurations": [] 123 | } 124 | ] 125 | }, 126 | { 127 | "Classification": "hbase-site", 128 | "Properties": { 129 | "hbase.bucketcache.size": "4096", 130 | "hbase.bucketcache.ioengine": "offheap" 131 | } 132 | } 133 | ] 134 | ``` 135 | 136 | In order to use the configured cache, make sure to enable the following configurations in the tables you want to cache. 
For example, from the HBase shell: 137 | 138 | ```bash 139 | # creating new table t with column family info0 140 | hbase> create 't', {NAME => 'info0', CONFIGURATION => {CACHE_DATA_IN_L1 => 'true'}} 141 | 142 | # modify existing table t with column family info0 143 | hbase> alter 't', {NAME => 'info0', CONFIGURATION => {CACHE_DATA_IN_L1 => 'true'}} 144 | ``` 145 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/best_practice_s3.md: -------------------------------------------------------------------------------- 1 | # Best Practices for Amazon S3 2 | 3 | This section highlights some of the features / best practice that you can use to improve the performance in your cluster when using Amazon S3 as storage layer for HBase. For additional best practice / tuning parameters, see [Apache HBase on Amazon S3 configuration properties](https://docs.aws.amazon.com/whitepapers/latest/migrate-apache-hbase-s3/identifying-apache-hbase-and-emrfs-tuning-options.html). 4 | 5 | ## Bucket Cache 6 | 7 | When using Amazon S3 as storage layer for HBase, EMR configures the service to use a Bucket Cache for persisting data blocks on the L2 Cache of each region server. The default cache implementation used for Amazon S3 persists blocks on the local volumes of the node as defined by the *`hbase.bucketcache.ioengine`* property. This parameter defines the location of the files used to store the cached data. For example, the following snippet shows the default configurations for a node with 4 EBS volumes attached. 8 | 9 | ```xml 10 | 11 | hbase.bucketcache.ioengine 12 | files:/mnt1/hbase/bucketcache,/mnt2/hbase/bucketcache,/mnt3/hbase/bucketcache 13 | 14 | ``` 15 | 16 | By default, EMR configures N - 1 volumes for caching data, so in our example only 3 volumes out of 4 will be used for the cache. This feature can be useful to persist HOT data on the local disks of the cluster to reduce the latency introduced when accessing HFiles stored on S3. However, by default the cache size is set as 8GB, so you might need to increase it depending on the amount of data you want to store on each node. To modify the default cache value, you can set the following property: 17 | 18 | ``` 19 | hbase.bucketcache.size: 98304 # defined as MB 20 | ``` 21 | 22 | In the above example, we set the cache size for each node to 98GB. In each volume only 32GB (98304 / 3) are used, as the total cache size will be evenly distributed across the volumes defined in the *`hbase.bucketcache.ioengine`*. 23 | 24 | Besides, when using S3 it might be convenient to pre-warm the cache during the region opening to avoid performance degradation when the cache is still not fully initialized. In this case to enable blocks prefetch, you should enable the following configuration. 25 | 26 | ``` 27 | hbase.rs.prefetchblocksonopen: true 28 | ``` 29 | 30 | This configuration can also be set for individual Column Family of an HBase table. In this case you should specify the configuration through the HBase shell using the following command: 31 | 32 | ``` 33 | hbase> create 'MyTable', { NAME => 'myCF', PREFETCH_BLOCKS_ON_OPEN => 'true' } 34 | ``` 35 | 36 | Finally, in write intensive use cases, it might be useful to also enable the following configurations to automatically persist blocks in the cache as they are written, and to repopulate the cache following a compaction (compaction operations invalidate cache blocks). 
In this case we can set the following additional properties: 37 | 38 | ``` 39 | hbase.rs.cacheblocksonwrite: true 40 | hbase.rs.cachecompactedblocksonwrite: true 41 | ``` 42 | 43 | Below a sample configuration to tune the Bucket Cache in an Amazon EMR cluster: 44 | 45 | ```json 46 | [ 47 | { 48 | "Classification": "hbase-site", 49 | "Properties": { 50 | "hbase.bucketcache.size": "98304", 51 | "hbase.rs.prefetchblocksonopen": "true", 52 | "hbase.rs.cacheblocksonwrite": "true", 53 | "hbase.rs.cachecompactedblocksonwrite": "true" 54 | } 55 | } 56 | ] 57 | ``` 58 | 59 | ## Memstore flush size 60 | 61 | When using Amazon S3 in HBase, it might be convenient to increase the default memstore flush size to avoid performance degradation, or an excessive number of small compaction operations in write intensive clusters. This can be useful if you have manually disabled the [Persistent File Tracking](#persistent-file-tracking) feature that is enabled on EMR greater than 6.2.0 or if you're using an EMR 5.x cluster. 62 | 63 | In this case, you can increase the memstore flush size to 256MB or 512MB (default 128MB). Below an example of how you can change this configuration in an Amazon EMR cluster: 64 | 65 | 66 | ```json 67 | [ 68 | { 69 | "Classification": "hbase-site", 70 | "Properties": { 71 | "hbase.hregion.memstore.flush.size": "268435456" # 256 * 1024 * 1024 72 | } 73 | } 74 | ] 75 | ``` 76 | 77 | ## Region Split Policy 78 | 79 | Depending on the HBase version that you’re using, you will use different region split policies. By default, you’ll have: 80 | 81 | * **HBase 1.x** *`org.apache.hadoop.hbase.regionserver.IncreasingToUpperBoundRegionSplitPolicy`* 82 | * **HBase 2.x** *`org.apache.hadoop.hbase.regionserver.SteppingSplitPolicy`* 83 | 84 | These specific implementations aims to quickly increase the number of regions when you have a fresh new table that wasn’t pre-partitioned. This might be a good strategy for new tables in a cluster. 85 | 86 | However, it might be more convenient for a cluster using S3 as storage layer to use the old split strategy *`org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy`* that performs a split operation only when the overall size of a region goes above a threshold as defined by the parameter: *`hbase.hregion.max.filesize`* (default: 10GB) 87 | 88 | This can help if you want to have more control on the number of regions, as it will allow you to control the growth of the number of regions by a fixed size that you specify. Additionally, this can also be handy in case you’re leveraging Apache Phoenix to query HBase and you have a constant flow of new data. Setting a constant size region split policy will prevent excessive splitting operations. These operations can cause temporary region cache boundaries exceptions while using Phoenix, due to the time required to refresh internal metadata about regions boundaries. This problem might be more frequent when using S3 as storage layer than when using HDFS. 
89 | 90 | Below is an example of how to modify the region split policy on an Amazon EMR cluster: 91 | 92 | ```json 93 | [ 94 | { 95 | "Classification": "hbase-site", 96 | "Properties": { 97 | "hbase.regionserver.region.split.policy": "org.apache.hadoop.hbase.regionserver.ConstantSizeRegionSplitPolicy", 98 | "hbase.hregion.max.filesize": "10737418240" 99 | } 100 | } 101 | ] 102 | ``` 103 | 104 | ## Persistent File Tracking 105 | 106 | When using EMR versions greater than 6.2.0 with Amazon S3 as the storage layer, EMR enables a feature called Persistent File Tracking. This feature is enabled by default and provides performance benefits, as it avoids the HFile rename operations that might delay write operations due to S3 latencies. However, please note that this feature does not support the native [HBase replication](https://hbase.apache.org/book.html#_cluster_replication) feature. So if you want to use replication to implement a highly available setup on Amazon S3, you’ll have to disable it. This applies only to S3 and is not required when using HDFS as the storage layer. 107 | 108 | For more details on this feature, see [Persistent HFile tracking](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hbase-s3.html#emr-hbase-s3-hfile-tracking). 109 | 110 | ## Speed up region assignment / opening / closing 111 | 112 | ### HBase 1.x 113 | 114 | Set the configurations below to speed up region assignment, opening, and closing on HBase 1.x clusters. These configurations disable the use of ZooKeeper for region assignment by setting the property *`hbase.assignment.usezk`* to false. Additionally, you can increase the thread pools the Region Servers use for opening the assigned regions. For Region Servers handling many regions (in the order of thousands), you can set the thread pools to up to 10 times the number of vCPUs available on the Region Server. Below is an example EMR configuration: 115 | 116 | ```json 117 | [ 118 | { 119 | "Classification": "hbase-site", 120 | "Properties": { 121 | "hbase.assignment.usezk": "false", 122 | "hbase.regionserver.executor.openregion.threads": "120", 123 | "hbase.regionserver.executor.closeregion.threads": "120" 124 | } 125 | } 126 | ] 127 | ``` 128 | 129 | ### HBase 2.x 130 | 131 | HBase 2.x introduced a more robust and efficient workflow to manage region transitions, which leverages the ProcedureV2 framework introduced in [HBASE-14614](https://issues.apache.org/jira/browse/HBASE-14614). In this case, it is sufficient to increase the default Region Server thread pools to speed up the initialization of the regions.
132 | 133 | ```json 134 | [ 135 | { 136 | "Classification": "hbase-site", 137 | "Properties": { 138 | "hbase.regionserver.executor.openregion.threads": "120", 139 | "hbase.regionserver.executor.closeregion.threads": "120" 140 | } 141 | } 142 | ] 143 | ``` 144 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/hbase_replication_cross.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/hbase_replication_cross.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/hbase_replication_oneway.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/hbase_replication_oneway.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/hbase_replication_simple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/hbase_replication_simple.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/hbase_s3_replication.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/hbase_s3_replication.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/management_avg_payload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/management_avg_payload.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/observability_grafana.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/observability_grafana.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/img/observabilty_webui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws/aws-emr-best-practices/48a3218fd8e8696f63a24a76986c4490d77c8ef7/website/docs/bestpractices/Applications/HBase/img/observabilty_webui.png -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/introduction.md: -------------------------------------------------------------------------------- 1 | --- 2 | sidebar_position: 1 3 | sidebar_label: Introduction 4 | --- 5 | 6 | # Introduction 7 | 8 | When working with Amazon 
EMR on EC2, you can choose between two deployment options for the underlying storage layer used by HBase: [Hadoop HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html) or [Amazon S3](https://docs.aws.amazon.com/AmazonS3/latest/userguide/Welcome.html). 9 | 10 | Although there are no restrictions on the use of these storage options, they serve different purposes, and both have pros and cons with related performance implications. In this document, we review the main aspects of each storage option. 11 | 12 | ## Which storage layer should I use? 13 | 14 | Typically, to understand which storage layer you should use for your HBase cluster, you must determine your application requirements and decide which of these two main decision drivers matters most: performance or cost. Generally speaking, on a large cluster setup, HDFS provides better performance in most cases, while Amazon S3 provides better cost savings due to the reduced amount of storage required to persist all your data, and is the right option when you want to decouple your storage from compute. 15 | 16 | Using HDFS allows you to achieve the best latency performance. This is true if you need millisecond / sub-millisecond read responses from HBase. You can also achieve similar results using Amazon S3 as the storage layer, but this requires relying on HBase caching features. Depending on your table sizes, this can increase costs when provisioning resources for the cache, as you’ll have to provision more EBS volumes or use bigger instances to cache your data locally on the nodes, thus losing the main advantages of using Amazon S3. This requires fine-tuning HBase to find the right balance between performance and cost for your workload. 17 | 18 | Another common reason to choose HDFS over S3 is a data migration from an on-premises cluster. This is typically recommended as a first migration step, as this solution provides similar performance compared to your existing cluster. You can more easily migrate your infrastructure to the cloud, and later decide if it makes sense to use Amazon S3. 19 | Besides, using HDFS for a data migration can be a requirement before moving to Amazon S3. Specifically, this can help to optimize the underlying layout of your HBase tables if they have a considerable number of small HBase regions and you want to merge them. This operation can be performed more quickly on an HDFS cluster, and you can later migrate the data to Amazon S3. For more details, see the sections [Reduce number of Regions](./management#reduce-number-of-regions) and [Data Migration](./data_migration). 20 | 21 | Finally, using HDFS is also the right choice if you have a cluster that is mostly used for write intensive workloads. This is because write intensive clusters are subject to intensive compaction and region splitting operations that are performed internally by HBase to manage the underlying data storage. In these cases, Amazon S3 might not be the right option, because of the data movement that occurs between Amazon S3 and the cluster to perform compaction processes. This increases the time required to perform such operations, impacting overall cluster performance and resulting in higher latencies. 22 | 23 | On the other hand, Amazon S3 is a good option for read-intensive HBase clusters.
One of the use cases where S3 excels is when the most frequently accessed data (read or modified) is also the most recent, while old data is rarely modified. You can use the pre-configured bucket cache to store a hot copy of the most recent data on the local disks of your cluster, thus maintaining a good compromise between costs and performance. For more details, see [Bucket Cache](./best_practice_s3#bucket-cache). 24 | 25 | Another good use case for Amazon S3 is when you have tables that rarely change over time, and you need to serve a large volume of read requests. In this case, you can opt for Amazon S3 in combination with the [EMR HBase read-replica](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hbase-s3.html#emr-hbase-s3-read-replica) feature, to distribute your read requests across multiple clusters. For more details about this approach, see [Data Integrity](./data_integrity#amazon-emr---read-replica). Moreover, Amazon S3 provides stronger SLAs for data durability and availability transparently at the storage level, and it is not impacted by failures of EMR instances. 26 | 27 | Finally, one major benefit of relying on S3 for storage is cost savings. If your cluster incurs significant costs due to a large amount of data stored on EBS volumes, moving to S3 can reduce costs drastically. Moreover, HDFS uses block replication to provide fault tolerance, which increases the footprint of data stored locally in your cluster. In Amazon EMR, the default [HDFS replication](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hdfs-config.html) factor is defined automatically when launching the cluster (or you can override it manually using the [EMR configuration API](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html)). For large table sizes this can drastically increase EBS storage costs, so you might want to leverage S3, where replication is handled natively by the service at a lower cost. 28 | 29 | ## Which instance should I use? 30 | 31 | When talking about hardware requirements for HBase, it is very important to choose the right EC2 instance type when using HDFS as the storage layer, as it might be prohibitive to change it once you have a live production cluster. On the other hand, changing instances for an HBase cluster running on Amazon S3 is much easier, as data is persisted on S3. This allows you to terminate an EMR cluster without losing data and launch a new one using a different instance type. Below you can find some details that can help you choose the right instances based on your use case and workload requirements. 32 | 33 | HBase typically performs better with small instances and when you spread the overall requests across multiple instances. This is because there are limits to the number of HBase regions a single Region Server can handle, and having a huge number of regions on a single node can lead to issues and unexpected behavior. For more details on determining the right number of regions for a specific instance, see the section [Number of HBase Regions](#number-of-hbase-regions). 34 | 35 | Generally speaking, if you want to achieve the best possible performance in your HBase cluster, it’s highly recommended to use EC2 instances equipped with [Instance Store](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html) volumes. This is especially true for write intensive / mixed (50% writes 50% reads) workloads.
For such use cases, if you have significant write traffic, you’ll need disks that can provide a large number of IOPS in order to accommodate all the background operations performed by HBase (compactions, WAL writes). Using storage optimized instances allows you to sustain high volumes of write operations even while HBase is performing compactions or other background operations on disk. Some examples of instance families recommended for such workloads are: 36 | 37 | * [i3](https://aws.amazon.com/ec2/instance-types/i3/) / [i3en](https://aws.amazon.com/ec2/instance-types/i3en/) provide dense SSD storage for data-intensive workloads. They deliver the best performance for write intensive workloads but can be prohibitive depending on the amount of storage you want to use. They are recommended if you want to achieve the best possible performance and cache a significant amount of data in memory. 38 | * [m5d](https://aws.amazon.com/ec2/instance-types/m5/) / [r5d](https://aws.amazon.com/ec2/instance-types/r5/) / [c5d](https://aws.amazon.com/ec2/instance-types/c5/) all provide NVMe SSD disks that deliver high random I/O performance. They can be used in different ways to exploit HBase features. For example, r5d can be used in combination with HBase off-heap caching to keep a significant amount of data cached in fast memory (instead of reading it from disk). On the other hand, c5d comes with a higher proportion of vCPUs compared to memory, so it can be a better match if you need to serve huge volumes of requests on a single region server. 39 | 40 | To decide on the right instance size, it’s important to understand how many regions you’re going to serve on a single region server. As a general rule, however, for large HBase tables it’s recommended to choose an instance type that can provide at least 32GB of memory dedicated to the HBase services (HMaster and Region Servers). Please note that by default Amazon EMR splits the available memory of an instance between the YARN Node Manager and the HBase Region Server. For a list of default memory settings, see [Default values for task configuration settings](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hadoop-task-config.html#emr-hadoop-task-jvm). You can always override the default EMR behavior using the EMR Configuration API. For more details, see [Modify Heap Memory](./best_practice#hbase-heap-memory). 41 | 42 | 43 | ## Number of HBase Regions 44 | 45 | As described in the [HBase documentation](https://hbase.apache.org/book.html#ops.capacity.regions.count), you can use the following formula to compute the number of HBase regions that should be hosted on a single region server.
Note that this formula gives only a guideline for the number of regions; you should investigate and experiment with your workload to tune the final number: 46 | 47 | ``` 48 | (REGION_SERVER_MEM_SIZE * MEMSTORE_FRACTION) / (MEMSTORE_SIZE * NUM_COLUMN_FAMILIES) 49 | ``` 50 | 51 | * **REGION_SERVER_MEM_SIZE** Memory allocated for the Region Server, as defined by the parameter -Xmx in *hbase-env.sh* 52 | * **MEMSTORE_FRACTION** Memstore memory fraction, defined by *hbase.regionserver.global.memstore.size* (default 0.4) 53 | * **MEMSTORE_SIZE** Memstore flush size (default 128MB) 54 | * **NUM_COLUMN_FAMILIES** Number of column families defined for the table 55 | 56 | 57 | For example, for a Region Server configured with 32GB of heap memory and hosting a table with a single column family with the default HBase settings, we'll have an ideal allocation of regions equal to: 58 | 59 | ``` 60 | # Number Recommended Regions 61 | (32GB * 0.4) / (128MB * 1) ≈ 100 62 | ``` 63 | 64 | As previously mentioned, this is a recommendation that you can use as a starting point. For example, it is not infrequent to have a region server with 3 to 4 times the recommended value. However, to avoid impacting performance, it’s better not to use these extra regions extensively for write operations, so you avoid heavy GC activity that might degrade performance or, in the worst case, cause failures that force a Region Server restart. 65 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/management.md: -------------------------------------------------------------------------------- 1 | # Management 2 | 3 | This section highlights some commands and best practices that can help you manage your HBase clusters on Amazon EMR. 4 | 5 | ## Create command alias 6 | 7 | If you administer your HBase cluster mainly from the shell of the EMR master, it might be convenient to define a command alias to avoid permission issues you might incur by erroneously typing commands as a different user (e.g. root). 8 | 9 | As a best practice, you should always run HBase commands as the `hbase` user. To do that, you can add the following alias to the `~/.bashrc` profile of the user that you use to administer your cluster (e.g. hadoop) 10 | 11 | ```bash 12 | alias hbase='sudo -u hbase hbase' 13 | ``` 14 | 15 | Once done, you can safely run HBase commands as usual 16 | 17 | ```bash 18 | hbase shell 19 | ``` 20 | 21 | ## Determine average row size 22 | 23 | If you want to determine the average size of a row stored in an HBase table, you can use the following commands to retrieve the payload from a store file of the table. For example: 24 | 25 | ```bash 26 | # Simple notation 27 | hbase hfile -m -f $HBASE_PATH 28 | 29 | # Extended notation 30 | hbase org.apache.hadoop.hbase.io.hfile.HFile -m -f $HBASE_PATH 31 | ``` 32 | 33 | ![Average Payload Size](./img/management_avg_payload.png "Average Payload Size") 34 | 35 | The class `org.apache.hadoop.hbase.io.hfile.HFile` allows you to analyze HBase store files persisted on HDFS or S3. The option `-m` prints the metadata of the analyzed file, which reports the average size (bytes) of the row keys in that particular file, and the average size (bytes) of the values stored in that file.
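For reference, the metadata printed with `-m` contains the two fields used in the estimate below. The following snippet is trimmed and purely illustrative; the S3 path and the values are hypothetical placeholders:

```
reader=s3://my-bucket/hbase/data/default/MyTable/<region>/<cf>/<hfile>,
    compression=none, cacheConf=CacheConfig:disabled,
    avgKeyLen=19, avgValueLen=7,
    entries=1048576, length=134217728
```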
36 | 37 | To get a rough estimate of the average payload of a single row, you can sum the **avgKeyLen** and **avgValueLen** parameters returned by the previous command to get the average size in bytes of a row. For example: 38 | 39 | ``` 40 | # row_avg_size = avgKeyLen + avgValueLen 41 | row_avg_size = 19 + 7 = 26 42 | ``` 43 | 44 | This command can be useful to get a rough estimate of your data payload when you are not sure about it. You can later use this value to fine-tune your cluster (e.g. [increase/decrease RPC Listeners](./best_practice#hbase-rpc-listeners)). 45 | 46 | ## Reduce number of Regions 47 | 48 | The HBase community introduced a utility in the [hbase-tools](https://github.com/apache/hbase-operator-tools/blob/master/hbase-tools/) package that helps reduce the number of regions of tables stored in HBase. This utility is available in the class `org.apache.hbase.RegionsMerger` and can automatically merge regions down to a target count that you define, if you have a high region count in your cluster (e.g. due to a wrong table pre-split, or a high split rate caused by incorrect settings). 49 | 50 | ```bash 51 | # copy the library into the HBase classpath 52 | sudo cp /usr/lib/hbase-operator-tools/hbase-tools-*.jar /usr/lib/hbase/lib/ 53 | 54 | # merge regions (pass the table name and the target number of regions) 55 | hbase org.apache.hbase.RegionsMerger TABLE_NAME TARGET_NUMBER_OF_REGIONS 56 | ``` 57 | 58 | This tool is available in HBase versions >= 2.x.x and should only be used with these versions. 59 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/performance_tests.md: -------------------------------------------------------------------------------- 1 | # Performance Tests 2 | 3 | One of the most important operations to perform before starting to use an HBase cluster is a stress test to verify that the provisioned infrastructure meets the latency and throughput requirements of your applications. 4 | 5 | In this section, we’re going to explore some tools that can help validate the provisioned infrastructure in terms of performance. For optimal results, it’s recommended to set up a monitoring tool as described in the [Observability](./observability#monitoring-hbase) section to collect advanced metrics during the tests. 6 | 7 | ## Evaluation Framework 8 | 9 | Typically, there are different aspects you want to check depending on how your cluster will be used. However, there are two major metrics that are important to define a baseline for the cluster performance: operation throughput (the number of requests we can serve for a specific operation in a given period of time, e.g. GET) and operation latency (the time required to acknowledge a client request). 10 | 11 | It’s very important to baseline these metrics in a production cluster. They will give you hints on when to scale nodes based on client requests during the day, and they can suggest configuration tuning if the cluster is not matching the expected performance. 12 | 13 | Typically, you can benchmark an HBase cluster following the steps below: 14 | 15 | * **Write / data load** This is always the first step in the process, as you should populate some tables with mock data to perform read tests, or simply to evaluate the maximum throughput you can achieve during write operations. For this test, it is important to mimic as much as possible the average payload size of the data that will be ingested in the cluster.
This helps evaluate the number of compactions performed for the ingested volume and observe the performance degradation that you might expect during these operations. In addition, this will also give you an idea of the maximum number of write requests you can serve with the specified cluster topology. 16 | 17 | * **Read / latency / cache** This is the next step to define our baseline. The main aim of this test should be to verify the maximum throughput that the cluster can serve and understand how well you are leveraging the HBase cache to improve response latency. 18 | 19 | As a best practice for running these tests, you can follow these rules: 20 | 21 | * Separate the clients from the HBase cluster. The goal is to collect metrics without having to worry about the resources used in our cluster. So, as a best practice, you should run your client fleet on a separate cluster. 22 | 23 | * If your clients are on a separate cluster, make sure that your fleet is co-located in the same subnet as the cluster. This will improve response latency and avoid extra costs you might incur for data transfer across Availability Zones. 24 | 25 | * Use [EMR Configurations](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-configure-apps.html) defined as JSON files stored in an Amazon S3 bucket to launch your test clusters. This will help you more easily export configurations used in your QA environment to production. Moreover, it will be easier to track the specific configurations used in a test cluster, rather than setting them manually while launching the cluster. 26 | 27 | The following section describes some tools that can be used to baseline an HBase cluster. 28 | 29 | ## Performance Evaluation Tool 30 | 31 | The first tool we’re going to use is the HBase Performance Evaluation utility, which is already available in your Amazon EMR cluster. This utility can be invoked using the following syntax from the EMR master node: 32 | 33 | ```bash 34 | hbase pe 35 | ``` 36 | 37 | The tool allows us to perform both write and read operations, specifying different options to control several aspects of our tests (e.g. create a pre-partitioned table, disable WAL flush, etc.). 38 | 39 | For example, the following command creates a table called *MyWriteTest* that is pre-partitioned with 200 regions (--presplit), and writes 2GB (--size) of data using a single client. We also enable the *latency* parameter to report operation latencies that help us identify whether the response time meets our requirements. 40 | 41 | ```bash 42 | hbase pe --table=MyWriteTest --presplit=200 --size=2 --latency --nomapred randomWrite 1 43 | ``` 44 | 45 | As described in the [Logs section](./observability#logs), the log output will be stored in the **`/var/log/hbase/hbase.log`** file. Please make sure to run the previous command as the **`hbase`** user, or you’ll not have permission to modify this file using the standard **`hadoop`** user. The following shows a sample output for the previous command: 46 | 47 | ```log 48 | INFO [TestClient-0] hbase.PerformanceEvaluation: Latency (us) : mean=21.45, min=1.00, max=480941.00, stdDev=992.53, 50th=2.00, 75th=2.00, 95th=2.00, 99th=3.00, 99.9th=24.00, 99.99th=37550.00, 99.999th=46364.23 49 | INFO [TestClient-0] hbase.PerformanceEvaluation: Num measures (latency) : 2097151 50 | INFO [TestClient-0] hbase.PerformanceEvaluation: Mean = 21.45 51 | ...
52 | INFO [TestClient-0] hbase.PerformanceEvaluation: No valueSize statistics available 53 | INFO [TestClient-0] hbase.PerformanceEvaluation: Finished class org.apache.hadoop.hbase.PerformanceEvaluation$RandomWriteTest in 42448ms at offset 0 for 2097152 rows (48.62 MB/s) 54 | INFO [TestClient-0] hbase.PerformanceEvaluation: Finished TestClient-0 in 42448ms over 2097152 rows 55 | INFO [main] hbase.PerformanceEvaluation: [RandomWriteTest] Summary of timings (ms): [42448] 56 | INFO [main] hbase.PerformanceEvaluation: [RandomWriteTest duration ] Min: 42448ms Max: 42448ms Avg: 42448ms 57 | INFO [main] hbase.PerformanceEvaluation: [ Avg latency (us)] 21 58 | INFO [main] hbase.PerformanceEvaluation: [ Avg TPS/QPS] 49405 row per second 59 | ``` 60 | 61 | As you can see, this reports the min, max, and average response latency for our write requests, along with throughput information about the maximum number of calls served by the cluster. Please note that in our example we used the *`nomapred`* parameter, which uses a local thread to perform the test (in this case, the client resides on the EMR master node). 62 | 63 | If we want to generate a higher number of requests, it is better to remove this option, so that the utility uses a MapReduce (MR) job to perform the test. In this scenario, it might be convenient to run the MR job on a separate cluster, to avoid using resources (CPU, network bandwidth) of our HBase cluster and to gather more realistic results. 64 | 65 | For example, the same tests can be performed from a separate EMR cluster by adding the parameter **`-Dhbase.zookeeper.quorum=TARGET_HBASE_MASTER_DNS`**, replacing TARGET_HBASE_MASTER_DNS with the hostname of the EMR master we want to test. 66 | 67 | ```bash 68 | hbase pe -Dhbase.zookeeper.quorum=ip-xxx-xx-x-xxx.compute.internal --table=MyWriteTestTwo --presplit=200 --size=2 --latency randomWrite 1 69 | ``` 70 | 71 | In the same way, we can perform read test operations. For a detailed list of all the options and tests available in the utility, please check the help section of the tool from the command line. 72 | 73 | ## YCSB 74 | 75 | Another popular tool to benchmark your HBase cluster is [YCSB](https://github.com/brianfrankcooper/YCSB) (Yahoo Cloud Serving Benchmark). This utility is not available on Amazon EMR, so it should be manually installed on the EMR master itself, or on a separate EC2 instance. 76 | 77 | This tool, unlike the previous one, is more focused on testing workload patterns. It doesn’t provide as many options as the HBase PE utility, but it allows you to define different types of workloads (typically called workload A, B, C, D, etc.) where you can mix different volumes of write/read/mutate operations, along with the sizes of the data that are going to be read or modified. 78 | 79 | By default, the tool comes with pre-defined templates to test some standard workload patterns. For example, [workload A](https://github.com/brianfrankcooper/YCSB/blob/master/workloads/workloada) performs 50% read operations and 50% update operations using 1KB payloads for each row. 80 | 81 | This tool is especially useful when you know your workload patterns exactly and you want to simulate more realistic use cases. However, please note that the tool can only launch multithreaded clients on the same node. So if you have a large cluster that you want to test, you’ll have to configure a fleet of EC2 instances and run the clients from each node using some automation scripts.
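For reference, a YCSB run against an EMR HBase cluster typically involves a load phase followed by a run phase. The snippet below is only a sketch: the binding name (`hbase20`), the configuration path, and the record/thread counts are assumptions that depend on the YCSB release and on your cluster, so adjust them to your environment. The target table must exist before the load phase (e.g. `create 'usertable', 'family'` from the HBase shell), and pre-splitting it avoids hot-spotting a single region during ingestion.

```bash
# Load phase: insert the records defined by workload A into HBase.
bin/ycsb load hbase20 -P workloads/workloada \
  -cp /etc/hbase/conf \
  -p table=usertable -p columnfamily=family \
  -p recordcount=1000000 -threads 16 -s

# Run phase: execute the 50% read / 50% update mix and report latency and throughput.
bin/ycsb run hbase20 -P workloads/workloada \
  -cp /etc/hbase/conf \
  -p table=usertable -p columnfamily=family \
  -p operationcount=1000000 -threads 16 -s
```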
82 | -------------------------------------------------------------------------------- /website/docs/bestpractices/Applications/HBase/scripts/hbase-snapshot-export.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | #!# script: hbase-snapshot-export 4 | #!# authors: ripani 5 | #!# version: v0.2 6 | #!# 7 | #!# Generate a snapshot of all the tables present in a specific hbase namespace. 8 | #!# The snapshots created this way are then copied to an S3 bucket or HDFS path. 9 | #!# For legacy clusters, use the s3a:// schema and specify the AWS programmatic 10 | #!# keys. 11 | #!# 12 | #!# If you're transferring data between kerberized clusters, make sure the 13 | #!# clusters belong to the same Kerberos realm. 14 | #=============================================================================== 15 | #?# 16 | #?# usage: ./hbase-snapshot-export.sh HBASE_NS SNAPSHOT_PATH [AWS_ACCESS_KEY] [AWS_SECRET_KEY] 17 | #?# ./hbase-snapshot-export.sh "default" "hdfs://NN:8020/hbase" 18 | #?# ./hbase-snapshot-export.sh "default" "s3://BUCKET/PREFIX" 19 | #?# ./hbase-snapshot-export.sh "default" "s3a://BUCKET/PREFIX" "KEY" "SECRET" 20 | #?# 21 | #?# HBASE_NS HBase namespace to back up 22 | #?# SNAPSHOT_PATH HDFS or S3 path. 23 | #?# Example: s3://BUCKET or hdfs://NN:8020/user/hbase 24 | #?# AWS_ACCESS_KEY [Optional] AWS access key for s3a schema 25 | #?# AWS_SECRET_KEY [Optional] AWS secret key for s3a schema 26 | #?# 27 | #=============================================================================== 28 | 29 | # Print the usage helper using the header as source 30 | function usage() { 31 | [ "$*" ] && echo "$0: $*" 32 | sed -n '/^#?#/,/^$/s/^#?# \{0,1\}//p' "$0" 33 | exit -1 34 | } 35 | 36 | [[ $# -lt 2 ]] && echo "error: wrong parameters" && usage 37 | 38 | #=============================================================================== 39 | # Configurations 40 | #=============================================================================== 41 | HBASE_NS="$1" 42 | SNAPSHOT_PATH="$2" 43 | AWS_ACCESS_KEY="$3" 44 | AWS_SECRET_KEY="$4" 45 | 46 | if [[ -f "/emr/instance-controller/lib/info/extraInstanceData.json" ]]; then 47 | HBASE_CMD="sudo -u hbase hbase" 48 | else 49 | HBASE_CMD="hbase" 50 | fi 51 | 52 | # Retrieve the list of tables for the namespace 53 | readarray -t tables < <(echo "list_namespace_tables '$HBASE_NS'" | $HBASE_CMD shell 2> /dev/null | sed -e '1,/TABLE/d' -e '/seconds/,$d' | while IFS='' read -r line || [[ -n "$line" ]]; do echo "$line"; done) 54 | 55 | # Generate Snapshots 56 | label="$(date +"%Y%m%d")-$(date +%s)" 57 | for table in "${tables[@]}"; do 58 | echo "Creating snapshot for table $HBASE_NS:$table" 59 | $HBASE_CMD snapshot create -n "$label-$HBASE_NS-$table" -t $HBASE_NS:$table 60 | done 61 | 62 | # Copy snapshots to the destination (S3 or HDFS) 63 | snapshots=$($HBASE_CMD snapshot info -list-snapshots | grep $label | awk '{print $1}') 64 | for s in ${snapshots}; do 65 | echo "Transfer snapshot $s to $SNAPSHOT_PATH" 66 | if [[ -z "$AWS_ACCESS_KEY" && -z "$AWS_SECRET_KEY" ]]; then 67 | $HBASE_CMD org.apache.hadoop.hbase.snapshot.ExportSnapshot -snapshot ${s} -copy-to $SNAPSHOT_PATH 68 | else 69 | $HBASE_CMD org.apache.hadoop.hbase.snapshot.ExportSnapshot -Dfs.s3a.access.key=$AWS_ACCESS_KEY -Dfs.s3a.secret.key=$AWS_SECRET_KEY -snapshot ${s} -copy-to $SNAPSHOT_PATH 70 | fi 71 | done 72 | --------------------------------------------------------------------------------
/website/docs/bestpractices/Applications/HBase/scripts/hbase-snapshot-import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #=============================================================================== 3 | #!# script: hbase-snapshot-import 4 | #!# authors: ripani 5 | #!# version: v0.2 6 | #!# 7 | #!# Import and restore HBase snapshots using a label. Make sure all the HBase 8 | #!# namespaces required by the tables already exist before launching the 9 | #!# script. 10 | #!# 11 | #!# If you're transferring data between kerberized clusters, make sure the 12 | #!# clusters belong to the same Kerberos realm. 13 | #=============================================================================== 14 | #?# 15 | #?# usage: ./hbase-snapshot-import.sh