├── .gitignore ├── CFN └── create_CFN_stack.sh ├── EMR ├── ACID_hive_Tables_On_EMR ├── G1GC.py ├── TERMINATED_BY_SPOT_DUE_TO_NO_CAPACITY.md ├── Unable_to_load_credentials_error.md ├── UnknownHostException-Unable to execute HTTP request_s3.md ├── check_luks_encryption.md ├── configure_emr_notebook.sh ├── copy_bootstrap.sh ├── delay_bootstrap.sh ├── distcp_oozie_memory_change.md ├── emr_arrested_state.md ├── emr_create_CFN.json ├── emrfs_diff_check.md ├── execute_steps_all_nodes.py ├── fairScheduler.md ├── hive_blank_issue.md ├── implement_kerb_on_emr.md ├── inspect_parquet_file.md ├── kill_yarn_multiple_apps.sh ├── list_steps_API.md ├── multidelim_sertde_hive_tbl.md ├── presto.md ├── presto_tuning_config.md ├── resize2fs_error.md ├── spark_bucketing.md ├── spark_on_emr.md ├── spark_submit.sh ├── vCPU_error.md └── yarn_getting_less_memory.md ├── Failed_to_establish_a_new_connection.md ├── README.md ├── athena ├── csv_comma_in_data_issue.ddl ├── float_datatype_isssue_Athena.md ├── get_file_path.md ├── partition_projection_example.md ├── query-exhausted.md ├── scheduling_athena_query.md ├── timestamp_cast_issue.md └── using_map_key.md ├── aws-boto3-sdk ├── ddb │ ├── ddb_to_s3.py │ └── put_auto_scaling.py ├── emr │ ├── add_tags.py │ └── create_emr_run_spark_add_step.py └── glue │ ├── access_glue_data_catalog.py │ ├── create_dev_endpoint.py │ ├── cross_account_data.py │ ├── execute_sql_load_to_amazonaurora_mysql.py │ ├── get_partitions_info.py │ ├── glue_convert_xls_to_csv.py │ ├── glue_decimal_issue.py │ ├── glue_params.py │ ├── postgres_to_glue_datacatalog_extract.py │ └── write_to_redshift_snippet.py ├── cli-examples ├── add_tags_objects.sh ├── create_emr_jupyterhub.sh ├── create_glue_crawler_cfn.json ├── create_glue_dev_endpoint.sh ├── decoding_encode_msg.sh ├── distcp_s3_to_emr.sh ├── glue-start-job.sh ├── install_aws_cli_jupyterhub.sh ├── pass_proxy_spark_shell.sh └── spark-submit-postgres-jars.sh ├── configs ├── enable_s3_consistency_EMR.sh └── enable_s3_consistency_dp.sh ├── data-pipeline └── ddb_to_s3.json ├── ddb ├── ddb_common_term.md ├── hot_partition.md └── latency_explanation.md ├── general-python ├── hudi_dataset_write.py └── spark_context_python.py ├── glue ├── CFN_table_serde.md ├── IAMRole_is_not_authorized_perform_glue_GetUserDefinedFunctions on resource.err ├── SkipArchive_from_hive_spark.md ├── TABLE_VERSION_ResourceNumberLimitExceededException.md ├── Unable_to_execute_HTTP_request.md ├── access_glue_data_catalog.py ├── access_glue_data_catalog_cross_region_from_emr.md ├── access_glue_dc_locally.md ├── add_new_column_in_glue.py ├── benefits_glue.md ├── boilerplate_glue_script.py ├── bookmark_jdbc.py ├── bookmark_testing.md ├── bookmarks_understanding.md ├── check_glue_job_status.py ├── check_install_pkgs.py ├── configparser.py ├── connecting_glue_oracle_db_jdbc_via_etl.md ├── control_output_partitions.py ├── control_partitions_glue.py ├── convert_csv_to_parquet.py ├── copyfiles │ ├── cpy_s3_to_s3.py │ ├── glue_pySpark.py │ ├── s3tos3_copy_python.py │ └── syncs3objects.sh ├── create_python_udf.py ├── creating_glue_connection_using_cfn_secret_manager.md ├── cross_account_copy_from_s3_to_s3.py ├── cross_account_cross_region_read_write_ddb_from_glue.py ├── cross_account_data_catalog_access_via_etl.md ├── cross_account_s3_access_using_IAMRole.md ├── cross_account_sns.md ├── cross_region_glue_connection.md ├── custom_jdbc_mysql8.md ├── datacatalog_migration.md ├── ddb_to_s3.py ├── desc_vpc.py ├── docdb.py ├── dynfrm_to_df.py ├── enable-s3-parquet-optimized-committer.md ├── 
get_all_free_ips.py ├── get_all_glue_internal_config.py ├── get_free_ips_programatically.py ├── get_partitions_cli.sh ├── glue2.0_datatype.py ├── glue_VPC_jobs.md ├── glue_and_teradata.md ├── glue_conf_check.py ├── glue_connection_ssm_secret.yaml ├── glue_cw_kms_policy.md ├── glue_etl_optimization ├── glue_gc_handling.md ├── glue_general_recom.md ├── glue_jdbc_recom.md ├── glue_job_commit.md ├── glue_logger_python.py ├── glue_logger_scala.py ├── glue_logging_debug.py ├── glue_logging_techniques.py ├── glue_no_space_left_on_device.md ├── glue_oom_container_killed.md ├── glue_rcu_ddb.md ├── glue_s3_eventual_consis_issue.md ├── glue_sigterm_executors_error.md ├── glue_spark_sql_usage.md ├── glue_storage_issue.md ├── glue_table_versioning.md ├── glue_traffic_cross_account.md ├── glue_transformations.md ├── glue_with_security_config.md ├── graphframe_glue_steps.md ├── graphframe_with_glue.py ├── hashexpression_usage.md ├── hudi_pyspark_example.py ├── insta_cart_etl.py ├── jdbc_parallel_reads_using_glue.md ├── job_bookmarks_rewind.md ├── ld.md ├── load_only_new_recs_to_redshift.py ├── load_to_redshift.md ├── log_setting_spark.md ├── metrics_glue_etl.md ├── minimum_permissions_crawler.md ├── multiple_connections_glue.md ├── no_enough_ips.md ├── no_enough_ips_in_subnet.md ├── ouput_logs_error_logs_diff.md ├── out_of_scope.md ├── pandas_vs_spark_dataframes.md ├── pre_post_action_redshift_glue.py ├── print_args_pythonshell.py ├── process_tables_glue_data_catalog_in_loop.py ├── pushdownpredicate.scala ├── read_bookmark_enabled.py ├── read_from_specific_partitions.py ├── read_gzip_glue.py ├── read_postgres_directly.py ├── recordsize_sizekey_crawler.md ├── redshift_from_catalog_and_from_options_read_with_where_clause.py ├── redshift_to_s3.py ├── sample_glue_native_spark.py ├── setting_glue_param_within_etl.md ├── solve_skew_issues.md ├── spark_errors │ ├── Not_enough space to cache rdd_52_0_in_memory.err │ ├── connection_time_out_glue.md │ ├── connection_to_endpoint._s3_timed_outerr │ ├── failed_to_allocate_x_byte_of_direct_memory.md │ ├── missing_an_output_location_for_shuffle.err │ ├── spark-sql_time_out.md │ └── unable_to_load_credentials.md ├── spark_scala_example.py ├── spark_sql_glue.py ├── start_glue_etl_via_lambda.py ├── threading_with_glue_pyspark.py ├── trigger_glue_job_cross_account_setup.md ├── troubleshooting_glue_failures.md ├── try_catch_glue.py ├── update_crawler_cli.md ├── update_glue_boto3_ssl.py ├── update_glue_jdbc_connection_using_boto3.py ├── update_table_api_example.py ├── upgrade_glue_boto3.py ├── write_DateType_SparkDataFrame.py ├── write_excel_using_ExcelWriter.py └── write_sample_partitioned_dataset.py ├── hive └── hive-on-tez │ ├── container_launced_error.md │ ├── hive-debug-mode.md │ ├── hive_container_launched.md │ ├── hive_partition_queries.hql │ ├── tez_benefits.md │ └── tez_config.md ├── lake-formation ├── lake_formation_examples.md └── setting_revoking_lf_permissions.md ├── lambda ├── load_to_rds.py └── trigger_glue_job.py ├── s3 └── s3_deny_policy_explained.md └── spark_configs ├── Estimating_memory.md ├── spark_submit_config.sh └── verbose_logs.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | 
pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /CFN/create_CFN_stack.sh: -------------------------------------------------------------------------------- 1 | aws cloudformation create-stack --template-body file:///Users/abc/Documents/isemr.json --stack-name ddd --capabilities CAPABILITY_NAMED_IAM 2 | 3 | -------------------------------------------------------------------------------- /EMR/G1GC.py: -------------------------------------------------------------------------------- 1 | ## Option 1 : 2 | 3 | spark-submit \ 4 | > --conf "spark.driver.extraJavaOptions=-XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps" \ 5 | > --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC -XX:+PrintGCDetails -XX:+PrintGCTimeStamps" \ 6 | > sample_pyspark.py 7 | 8 | ## Option 2: 9 | 10 | spark-submit \ 11 | --conf "spark.driver.extraJavaOptions=-XX:+UseG1GC -XX:+PrintFlagsFinal \ 12 | -XX:+PrintReferenceGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps \ 13 | -XX:+PrintAdaptiveSizePolicy -XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark \ 14 | -XX:InitiatingHeapOccupancyPercent=35 -XX:ConcGCThreads=20" \ 15 | --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC -XX:+PrintFlagsFinal -XX:+PrintReferenceGC \ 16 | -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+PrintAdaptiveSizePolicy \ 17 | -XX:+UnlockDiagnosticVMOptions -XX:+G1SummarizeConcMark 
-XX:InitiatingHeapOccupancyPercent=35 \
18 | -XX:ConcGCThreads=20" \
19 | sample_pyspark.py
20 |
21 | # Ref:
22 |
23 | /*
24 | http://saucam.github.io/blog/2015/10/14/tuning-g1gc-spark/
25 | https://databricks.com/blog/2015/05/28/tuning-java-garbage-collection-for-spark-applications.html
26 | https://aws.amazon.com/blogs/big-data/best-practices-for-successfully-managing-memory-for-apache-spark-applications-on-amazon-emr/
27 | https://medium.com/@sambodhi_72782/spark-tuning-manual-47b98ccb2b2c
28 | https://community.cloudera.com/t5/Support-Questions/Spark-Job-long-GC-pauses/td-p/282690
29 | https://stackoverflow.com/questions/34589051/garbage-collection-time-very-high-in-spark-application-causing-program-halt/34590161
30 | */
31 |
32 | ## Option 3:
33 |
34 | --conf "spark.executor.extraJavaOptions=-XX:+UseG1GC -XX:+PrintReferenceGC -XX:+PrintGCDetails" --conf "spark.driver.extraJavaOptions=-XX:+UseG1GC -XX:+PrintReferenceGC -XX:+PrintGCDetails"
35 |
-------------------------------------------------------------------------------- /EMR/TERMINATED_BY_SPOT_DUE_TO_NO_CAPACITY.md: --------------------------------------------------------------------------------
1 | Spot Instances are spare compute capacity in the AWS Cloud. Spot Instance capacity is interrupted when Amazon Elastic Compute Cloud (Amazon EC2) needs the capacity back. The "no Spot capacity available" error occurs when there isn't enough spare capacity to fulfill your Spot Instance or Spot Fleet request.
2 |
3 | As capacity becomes available, Amazon EC2 fulfills requests in the following order:
4 |
5 | 1. Reserved Instances
6 | 2. On-Demand Instances
7 | 3. Spot Instances
8 |
9 | The Spot request continues to automatically make the launch request until capacity becomes available. When capacity becomes available, Amazon EC2 fulfills the Spot request.
10 |
11 | When setting up your Spot Instances, keep the following best practices in mind to help limit capacity issues:
12 |
13 | 1. Use a diverse set of instance types so that you aren't reliant on one specific type. You can create an Amazon EC2 Auto Scaling group with a mix of On-Demand and Spot Instances so that you aren't completely reliant on capacity availability. You are already doing this; changing the instance type(s) to different ones will probably help. [1]
14 |
15 | 2. Use the capacity-optimized allocation strategy within your Auto Scaling group. The capacity-optimized strategy analyzes real-time capacity data in order to launch your Spot Instances into pools with the most available capacity. [2]
16 |
17 | For a complete list of best practices for utilizing Spot Instances successfully, see Best practices for EC2 Spot.[3]
18 |
19 | If you want to find the interrupted Spot Instances and their associated reasons, you can refer to this document[4]. In case you still run into this problem, please let us know and we can connect over the phone to troubleshoot it further.
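If it helps, the interruption reason described in [4] can also be pulled programmatically. Below is a minimal boto3 sketch (not part of the original note): the region is a placeholder, and the status codes you will see (for example "capacity-not-available") are the ones listed in the Spot request status documentation[4].

import boto3

ec2 = boto3.client('ec2', region_name='us-east-1')  # placeholder region

# Print each Spot request together with its status code and message,
# which is where the "no capacity" style reasons show up.
for req in ec2.describe_spot_instance_requests()['SpotInstanceRequests']:
    print(req['SpotInstanceRequestId'],
          req.get('InstanceId', '-'),
          req['Status']['Code'],
          req['Status']['Message'])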
20 |
21 | References:
22 |
23 | [1] https://docs.aws.amazon.com/autoscaling/ec2/userguide/asg-purchase-options.html
24 | [2] https://docs.aws.amazon.com/autoscaling/ec2/userguide/asg-purchase-options.html#asg-spot-strategy
25 | [3] https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-best-practices.html
26 | [4] https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-request-status.html#get-spot-instance-request-status
27 |
-------------------------------------------------------------------------------- /EMR/UnknownHostException-Unable to execute HTTP request_s3.md: --------------------------------------------------------------------------------
1 | 1. To understand the issue better, check whether the issue is intermittent when trying to connect to S3 from EMR. Are you able to reproduce this issue?
2 |
3 | 2. Normally these errors are caused by DNS issues, so please run the following commands:
4 |
5 | 1.1) dig +short <your-s3-endpoint>
6 | 1.2) telnet <your-s3-endpoint> 443
7 | 1.3) cat /etc/resolv.conf
8 |
9 | If the "dig" command returns an error, then you will need to investigate your DNS server settings and the VPC, if applicable. The DNS settings may be misconfigured, or the DNS server you are using is not operational. You may also need to implement DNS caching[1]. To troubleshoot the VPC, I would suggest using VPC flow logs[2].
10 |
11 | 3. Check the S3 request IDs[3].
12 |
13 | 4. Run a packet capture with tcpdump/wireshark[4][5] and wait for the fault to resurface.
14 |
15 | 5. Monitor the VPC[6].
16 |
17 | 6. Create a VPC endpoint for your S3 bucket; this will improve performance because the data will go over the VPC endpoint[7][8]. VPC endpoints are normally implemented for security, but they are also recommended because they improve performance to an S3 bucket from your VPC.
18 |
19 | 7. Lastly, pull the logs from the instance running the application; they are found under /var/log/messages.
20 |
21 |
22 |
23 | References:
24 | [1] https://aws.amazon.com/premiumsupport/knowledge-center/dns-resolution-failures-ec2-linux/
25 | [2] https://docs.aws.amazon.com/vpc/latest/userguide/flow-logs.html
26 | [3] https://docs.aws.amazon.com/AmazonS3/latest/userguide/get-request-ids.html
27 | [4] https://www.tcpdump.org/
28 | [5] https://www.wireshark.org/
29 | [6] https://aws.amazon.com/blogs/networking-and-content-delivery/debugging-tool-for-network-connectivity-from-amazon-vpc/
30 | [7] https://docs.aws.amazon.com/vpc/latest/privatelink/vpc-endpoints-s3.html
31 | [8] https://aws.amazon.com/premiumsupport/knowledge-center/s3-private-connection-no-authentication/
32 |
-------------------------------------------------------------------------------- /EMR/check_luks_encryption.md: --------------------------------------------------------------------------------
1 | When you use LUKS encryption, though your EBS volumes are encrypted along with any instance store volumes, you still see EBS with a "Not Encrypted" status when you use an Amazon EC2 API or the EC2 console to check the encryption status. This is because the API doesn't look into the EMR cluster to check the disk status; your auditors would need to SSH into the cluster to check for disk encryption compliance. However, with EBS encryption, you can check the encryption status from the EC2 console or through an EC2 API call.
2 |
3 |
4 | Commands to check once you SSH into the master/core/task node of an AWS EMR cluster:
5 |
6 | 1. Perform sudo su
7 | 2. Execute the command "lsblk". Running lsblk on the cluster will only check the status of LUKS encryption.
8 | 3. Running lsblk will produce output like the example below:
9 |
10 | NAME            MAJ:MIN RM SIZE RO TYPE  MOUNTPOINT
11 | nvme1n1         259:0    0  32G  0 disk
12 | ├─nvme1n1p1     259:5    0   5G  0 part
13 | │ └─nvme1n1p1   253:0    0   5G  0 crypt /emr
14 | └─nvme1n1p2     259:6    0  27G  0 part
15 |   └─nvme1n1p2   253:1    0  27G  0 crypt /mnt
16 | nvme2n1         259:1    0  32G  0 disk
17 | └─nvme2n1       253:2    0  32G  0 crypt /mnt1
18 | nvme0n1         259:2    0  17G  0 disk
19 | ├─nvme0n1p1     259:3    0  17G  0 part  /
20 | └─nvme0n1p128   259:4    0   1M  0 part
21 |
22 | 4. The output above shows that the following are LUKS encrypted: /dev/nvme1n1p1, /dev/nvme1n1p2 and /dev/nvme2n1. Like I mentioned, and as also pointed out by the previous engineer, when using LUKS encryption the additional steps provided must be taken to encrypt the root volume; as we can see above, the root volume "nvme0n1p1" is not encrypted.
23 |
24 | 5. Additionally, you can run the following commands to verify LUKS encryption on these volumes. As an example, below are the steps I took to verify this on an AWS cluster I tested:
25 |
26 | [root@ip-xxx-xx-xx-xxx hadoop]# cryptsetup isLuks /dev/nvme1n1p1 && echo "$DEV_LUKS is a LUKS Device" || echo "$DEV_LUKS is not a LUKS Device"
27 | /dev/xvda is a LUKS Device
28 | [root@ip-xxx-xx-xx-xxx hadoop]#
29 | [root@ip-xxx-xx-xx-xxx hadoop]#
30 | [root@ip-xxx-xx-xx-xxx hadoop]# cryptsetup isLuks /dev/nvme1n1p2 && echo "$DEV_LUKS is a LUKS Device" || echo "$DEV_LUKS is not a LUKS Device"
31 | /dev/xvda is a LUKS Device
32 | [root@ip-xxx-xx-xx-xxx hadoop]#
33 | [root@ip-xxx-xx-xx-xxx hadoop]#
34 | [root@ip-xxx-xx-xx-xxx hadoop]# cryptsetup isLuks /dev/nvme2n1 && echo "$DEV_LUKS is a LUKS Device" || echo "$DEV_LUKS is not a LUKS Device"
35 | /dev/xvda is a LUKS Device
36 |
37 | https://aws.amazon.com/blogs/big-data/best-practices-for-securing-amazon-emr/
38 |
39 |
40 | https://aws.amazon.com/blogs/big-data/secure-your-data-on-amazon-emr-using-native-ebs-and-per-bucket-s3-encryption-options/
-------------------------------------------------------------------------------- /EMR/configure_emr_notebook.sh: --------------------------------------------------------------------------------
1 | %%configure -f
2 | {
3 | "conf":
4 | {
5 | "spark.driver.extraClassPath": "/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/tmp/predix-connectors-common-2.0.3.jar:/tmp/spark-timeseries-connector-2.0.3.jar",
6 | "spark.executor.extraClassPath": "/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:/usr/share/aws/emr/goodies/lib/emr-spark-goodies.jar:/usr/share/aws/emr/security/conf:/usr/share/aws/emr/security/lib/*:/usr/share/aws/hmclient/lib/aws-glue-datacatalog-spark-client.jar:/usr/share/java/Hive-JSON-Serde/hive-openx-serde.jar:/usr/share/aws/sagemaker-spark-sdk/lib/sagemaker-spark-sdk.jar:/usr/share/aws/emr/s3select/lib/emr-s3-select-spark-connector.jar:/tmp/predix-connectors-common-2.0.3.jar:/tmp/spark-timeseries-connector-2.0.3.jar"
7 | }
8 | }
9 |
-------------------------------------------------------------------------------- /EMR/copy_bootstrap.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | aws s3 cp s3://aws-xxx-logs/python_jupyter/satish_install_new.sh /home/hadoop/satish_install_new.sh && sudo bash /home/hadoop/satish_install_new.sh & exit 0
4 |
-------------------------------------------------------------------------------- /EMR/delay_bootstrap.sh: --------------------------------------------------------------------------------
1 | #!/bin/bash
2 | while true; do
3 | NODEPROVISIONSTATE=`sed -n '/localInstance [{]/,/[}]/{
4 | /nodeProvisionCheckinRecord [{]/,/[}]/ {
5 | /status: / { p }
6 | /[}]/a
7 | }
8 | /[}]/a
9 | }' /emr/instance-controller/lib/info/job-flow-state.txt | awk ' { print $2 }'`
10 |
11 | if [ "$NODEPROVISIONSTATE" == "SUCCESSFUL" ]; then
12 | sleep 10;
13 | echo "Running my post provision bootstrap"
14 | # your code here
15 |
16 | sudo docker exec jupyterhub bash -c "pip install tensorflow"
17 |
18 | exit;
19 | fi
20 |
21 | sleep 10;
22 | done
23 |
-------------------------------------------------------------------------------- /EMR/distcp_oozie_memory_change.md: --------------------------------------------------------------------------------
1 | Here are the different approaches to change the mapper and reducer memory of the s3-dist-cp job that is launched as part of Hive job executions.
2 |
3 | Option 1 - set the following properties at the script level. An example Hive HQL script with the property values added is given below.
4 |
5 | ======example script=======
6 | --disable blob optimization
7 | set hive.blobstore.optimizations.enabled=false;
8 |
9 | --setting tez container size
10 | set hive.tez.container.size=4096;
11 |
12 | --setting s3-dist-cp map and reduce memory (default value is 1024MB)
13 | set mapred.job.map.memory.mb=1400;
14 | set mapred.job.reduce.memory.mb=1400;
15 |
16 | --hive script
17 | drop table ddd;
18 | create table ddd like d;
19 | alter table ddd set location 's3://bucketname/ddd';
20 | insert overwrite table ddd select * from d;
21 |
22 | ===========
23 |
24 | Option 2 - update hadoop-distcp-2.8.5-amzn-4.jar and upload it into the Oozie workflow lib path:
25 |
26 | 1. SSH to the master node
27 | 2. sudo cp /usr/lib/hadoop/hadoop-distcp-2.8.5-amzn-4.jar /home/hadoop
28 | 3. jar xf hadoop-distcp-2.8.5-amzn-4.jar distcp-default.xml
29 | 4. Edit distcp-default.xml (vi distcp-default.xml) and update the values for the below properties as follows:
30 |
31 | <property>
32 |   <name>mapred.job.map.memory.mb</name>
33 |   <value>1400</value>
34 | </property>
35 |
36 | <property>
37 |   <name>mapred.job.reduce.memory.mb</name>
38 |   <value>1400</value>
39 | </property>
40 |
41 | 5. Run: $ jar -uf hadoop-distcp-2.8.5-amzn-4.jar distcp-default.xml
42 |
43 | 6. Move the jar to the Oozie lib location in HDFS, for example: $ hdfs dfs -put hadoop-distcp-2.8.5-amzn-4.jar /user/${user.name}/share/lib
44 |
45 | ===job.properties===
46 | oozie.libpath=/user/${user.name}/share/lib
47 | oozie.use.system.libpath=true
48 |
49 | This can be validated from the container logs: search for "memory" in the log, or for the line below -
50 |
51 | ====
52 | org.apache.hadoop.mapreduce.v2.app.rm.RMContainerAllocator: mapResourceRequest:
53 |
54 | Note -
55 | The values above are only placeholder values for these properties, so please change them to values appropriate for your workload.
-------------------------------------------------------------------------------- /EMR/emr_arrested_state.md: --------------------------------------------------------------------------------
1 | An instance group goes into an arrested state if it encounters too many errors while trying to start the new cluster nodes. For example, if new nodes fail while performing bootstrap actions, the instance group goes into an ARRESTED state, rather than continuously provisioning new nodes. After you resolve the underlying issue, reset the desired number of nodes on the cluster's instance group, and then the instance group resumes allocating nodes. Modifying an instance group instructs Amazon EMR to attempt to provision nodes again. No running nodes are restarted or terminated. In the AWS CLI, the list-instances subcommand returns all instances and their states, as does the describe-cluster subcommand. In the Amazon EMR CLI, the --describe command returns all instance groups and node types, and you can see the state of the instance groups for the cluster. If Amazon EMR detects a fault with an instance group, it changes the group's state to ARRESTED.
2 |
3 |
4 | To reset a cluster in an ARRESTED state using the AWS CLI, type the describe-cluster subcommand with the --cluster-id parameter to view the state of the instances in your cluster.
5 |
6 | For example, to view information on all instances and instance groups in a cluster, type:
7 |
8 | aws emr describe-cluster --cluster-id j-3KVXXXXXXY7UG
9 |
10 | The output will display information about your instance groups and the state of the instances.
11 |
12 | To view information on a particular instance group, type the list-instances subcommand with the --cluster-id and --instance-group-types parameters. You can view information for the MASTER, CORE, or TASK groups:
13 |
14 | aws emr list-instances --cluster-id j-3KVXXXXXXY7UG --instance-group-types "CORE"
15 |
16 | Use the modify-instance-groups subcommand with the --instance-groups parameter to reset a cluster in the ARRESTED state. The instance group id is returned by the describe-cluster subcommand:
17 |
18 |
19 | aws emr modify-instance-groups --instance-groups InstanceGroupId=string,InstanceCount=integer
20 |
21 | Example:
22 |
23 | aws emr modify-instance-groups --instance-groups InstanceGroupId=ig-3SUXXXXXXQ9ZM,InstanceCount=3
24 |
25 | Note: You do not need to change the number of nodes from the original configuration to free a running cluster. Set InstanceCount to the same count as the original setting.
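The same inspect-and-reset flow can also be scripted. Below is a minimal boto3 sketch (not part of the original note): the cluster ID is a placeholder, and the reset simply re-submits the currently requested instance count once the underlying issue has been fixed.

import boto3

emr = boto3.client('emr')
cluster_id = 'j-3KVXXXXXXY7UG'   # placeholder cluster ID

# Find instance groups that are in the ARRESTED state
groups = emr.list_instance_groups(ClusterId=cluster_id)['InstanceGroups']
for group in groups:
    if group['Status']['State'] == 'ARRESTED':
        # Re-submitting the requested count tells EMR to try provisioning again
        emr.modify_instance_groups(
            ClusterId=cluster_id,
            InstanceGroups=[{
                'InstanceGroupId': group['Id'],
                'InstanceCount': group['RequestedInstanceCount']
            }]
        )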
26 | 27 | References: 28 | 29 | [1] https://docs.aws.amazon.com/cli/latest/reference/emr/modify-instance-groups.html 30 | -------------------------------------------------------------------------------- /EMR/emrfs_diff_check.md: -------------------------------------------------------------------------------- 1 | To discover discrepancies between EMRFS metadata & Amazon S3 and make sure we have the EMRFS metadata in the DynamoDB in synchronized with the actual files on the S3 path.In order to do this, please follow the steps. 2 | 3 | You need ssh into your Master node of your EMR cluster [Connect to the Master Node Using SSH https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-connect-master-node-ssh.html] 4 | 5 | ssh -i youremr.pem hadoop@ipaddress.compute-1.amazonaws.com 6 | 7 | Step 1: Run EMRFS diff to list the difference between your S3 path and the DynamoDB table --> Check for any differences 8 | 9 | Example: $ emrfs diff s3://elasticmapreduce/samples/cloudfront 10 | 11 | Sample provided here https://docs.aws.amazon.com/emr/latest/ManagementGuide/emrfs-cli-reference.html#emrfs-diff 12 | 13 | Step 2: Run EMRFS sync command to synchronize the dynamoDB table with the actual files in the S3 path 14 | 15 | Example :emrfs sync s3://elasticmapreduce/samples/cloudfront 16 | 17 | Step 3: Running EMRFS difff again shouldn't show you any difference with the DynamoDB & S3 path. If you dont find any differences , please try re-running the query again. If you still see a difference. then follow the Step 4. 18 | 19 | Step 4: Run emrfs delete "emrfs delete s3://yourS3path" and then run Step 2 and Step 3 to ensure that there is no difference and then run your query. 20 | -------------------------------------------------------------------------------- /EMR/execute_steps_all_nodes.py: -------------------------------------------------------------------------------- 1 | from boto3 import client 2 | from sys import argv 3 | 4 | try: 5 | clusterId=argv[1] 6 | script=argv[2] 7 | except: 8 | print("Syntax: librariesSsm.py [ClusterId] [S3_Script_Path]") 9 | import sys 10 | sys.exit(1) 11 | 12 | emrclient=client('emr') 13 | 14 | # Get list of core nodes 15 | instances=emrclient.list_instances(ClusterId=clusterId,InstanceGroupTypes=['CORE'])['Instances'] 16 | instance_list=[x['Ec2InstanceId'] for x in instances] 17 | 18 | # Attach tag to core nodes 19 | ec2client=client('ec2') 20 | ec2client.create_tags(Resources=instance_list,Tags=[{"Key":"environment","Value":"coreNodeLibs"}]) 21 | 22 | ssmclient=client('ssm') 23 | 24 | # Download shell script from S3 25 | command = "aws s3 cp " + script + " /home/hadoop" 26 | try: 27 | first_command=ssmclient.send_command(Targets=[{"Key":"tag:environment","Values":["coreNodeLibs"]}], 28 | DocumentName='AWS-RunShellScript', 29 | Parameters={"commands":[command]}, 30 | TimeoutSeconds=3600)['Command']['CommandId'] 31 | 32 | # Wait for command to execute 33 | import time 34 | time.sleep(15) 35 | 36 | first_command_status=ssmclient.list_commands( 37 | CommandId=first_command, 38 | Filters=[ 39 | { 40 | 'key': 'Status', 41 | 'value': 'SUCCESS' 42 | }, 43 | ] 44 | )['Commands'][0]['Status'] 45 | 46 | second_command="" 47 | second_command_status="" 48 | 49 | # Only execute second command if first command is successful 50 | 51 | if (first_command_status=='Success'): 52 | # Run shell script to install libraries 53 | 54 | second_command=ssmclient.send_command(Targets=[{"Key":"tag:environment","Values":["coreNodeLibs"]}], 55 | DocumentName='AWS-RunShellScript', 56 | 
Parameters={"commands":["bash /home/hadoop/custom_action.sh"]},
57 | TimeoutSeconds=3600)['Command']['CommandId']
58 |
59 | # Check the status of the second command (use the second command's ID here)
60 | second_command_status=ssmclient.list_commands(
61 | CommandId=second_command,
62 | Filters=[
63 | {
64 | 'key': 'Status',
65 | 'value': 'SUCCESS'
66 | },
67 | ]
68 | )['Commands'][0]['Status']
69 | time.sleep(30)
70 | print("First command, " + first_command + ": " + first_command_status)
71 | print("Second command: " + second_command + ": " + second_command_status)
72 |
73 | except Exception as e:
74 | print(e)
-------------------------------------------------------------------------------- /EMR/fairScheduler.md: --------------------------------------------------------------------------------
1 | https://medium.com/@sohamghosh/schedulers-in-emr-6445180b44f6
-------------------------------------------------------------------------------- /EMR/hive_blank_issue.md: --------------------------------------------------------------------------------
1 |
2 | <property>
3 |   <name>hive.metastore.orm.retrieveMapNullsAsEmptyStrings</name>
4 |   <value>true</value>
5 | </property>
6 |
7 | Test Data =>
8 |
9 | 251|Paris Hotel|Las Vegas|NV|
10 | 258|Tropicana Hotel|Las Vegas|NV|
11 | 300|Kennedy Center Opera House|Washington|DC|0
12 | 306|Lyric Opera House|Baltimore|MD|0
13 | 308|Metropolitan Opera|New York City|NY|0
14 | 22|Quicken Loans Arena|Cleveland||0
15 | 101|Progressive Field|Cleveland||43345
16 |
17 | Create table in Hive =>
18 |
19 | CREATE EXTERNAL TABLE `ajinkya.ajinkya_tests_new2`(
20 | `id` bigint,
21 | `name` string,
22 | `state` string,
23 | `city` string,
24 | `zip` bigint)
25 | ROW FORMAT SERDE
26 | 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
27 | WITH SERDEPROPERTIES (
28 | 'field.delim'='|',
29 | 'line.delim'='\n',
30 | 'serialization.format'='|',
31 | 'serialization.null.format'='')
32 | STORED AS INPUTFORMAT
33 | 'org.apache.hadoop.mapred.TextInputFormat'
34 | OUTPUTFORMAT
35 | 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
36 | LOCATION
37 | 's3://xx-xx/ajinkya_tests';
38 |
39 | > select * from ajinkya.ajinkya_tests_new where city is null;
40 | OK
41 | Time taken: 0.442 seconds
42 | hive>
43 |
44 | Config 1 Change -
45 |
46 | Either set the value in hive-site.xml (the path for hive-site.xml on an EMR cluster is /etc/hive/conf/hive-site.xml)
47 |
48 | or set the property at the session level using beeline or the hive session ->
49 |
50 | set hive.metastore.orm.retrieveMapNullsAsEmptyStrings=true;
51 |
52 | Config 2 => set 'serialization.null.format'='' as a SERDEPROPERTY while creating the table.
53 |
54 | Testing => > select * from ajinkya.ajinkya_tests_new2 where city is null;
55 | OK
56 | 22 Quicken Loans Arena Cleveland NULL 0
57 | 101 Progressive Field Cleveland NULL 43345
58 | Time taken: 0.355 seconds, Fetched: 2 row(s)
59 | hive>
60 |
61 | > select * from ajinkya.ajinkya_tests_new2 where length(trim(city))=0;
62 | OK
63 | Time taken: 0.353 seconds
64 | hive>
65 |
66 | > select * from ajinkya.ajinkya_tests_new2 where city ='';
67 | OK
-------------------------------------------------------------------------------- /EMR/implement_kerb_on_emr.md: --------------------------------------------------------------------------------
1 | https://medium.com/@neerajsabharwal/how-to-implement-kerberos-in-aws-emr-f56594467bd
-------------------------------------------------------------------------------- /EMR/inspect_parquet_file.md: --------------------------------------------------------------------------------
1 | Install pip:
2 | $ curl -O https://bootstrap.pypa.io/get-pip.py
3 |
4 | $ python get-pip.py --user
5 |
6 | or
7 |
8 | python3 get-pip.py --user
9 |
10 | Install the parquet-cli tools:
11 |
12 | pip install parquet-cli --user
13 |
14 |
15 | parq -h
16 |
17 |
18 | parq LOAD00000001.parquet -s
-------------------------------------------------------------------------------- /EMR/kill_yarn_multiple_apps.sh: --------------------------------------------------------------------------------
1 | Kill all applications on YARN which are in ACCEPTED state:
2 |
3 | for x in $(yarn application -list -appStates ACCEPTED | awk 'NR > 2 { print $1 }'); do yarn application -kill $x; done
4 |
5 | Kill all applications on YARN which are in RUNNING state:
6 |
7 | for x in $(yarn application -list -appStates RUNNING | awk 'NR > 2 { print $1 }'); do yarn application -kill $x; done
8 |
9 | Kill all SPARK applications on YARN:
10 |
11 | for x in $(yarn application -list -appTypes SPARK | awk 'NR > 2 { print $1 }'); do yarn application -kill $x; done
-------------------------------------------------------------------------------- /EMR/list_steps_API.md: --------------------------------------------------------------------------------
1 |
2 | I would like to mention that this rate-exceeded error occurs when more API calls are being made than the current limit allows. Since EMR is a shared service, we implement certain limits on the APIs "PER account PER region" to ensure the service is performing correctly and is not overloaded. The limit does not count per cluster or per step.
3 |
4 | Here you can see the list of the default API rate limits for the EMR service for your reference [1]. We can see the below limits for the ListSteps API call there [1]:
5 |
6 | API Action | Bucket Maximum Capacity | Bucket Refill Rate (per second)
7 | ListSteps  | 10                      | 0.5
8 |
9 | To give you a better understanding, I'll explain it briefly. As a starting point, an AWS account can use up to the amount of calls allotted for that specific API call until it runs out. At the same time, this quota is refilled at a specific rate. So if the user makes the same call, on average, at the same rate or less than this refill rate, you should be okay. Throttling can happen if your account is making calls faster than the refill rate.
10 |
11 | In more detail:
12 |
13 | For ListSteps, Bucket Size = 10 and Refill Rate (per second) = 0.5. What these numbers mean is that you initially have a bucket of 10 ListSteps calls. That number is fixed, but as you make ListSteps calls the bucket is refilled at a rate of 0.5 calls per second. 
This means every two seconds the bucket receives 1 new credit. As such, if your AWS account is making the ListSteps API calls faster than one call per two seconds, you will gradually experience throttled exceptions. 14 | 15 | Now in order to avoid this error, increasing the limit could temporarily delay the issue but you might see the same issue again when you hit the increased limits. Hence, the best way to avoid this error is to follow the below recommendation, which can also be found in this [2] blog: 16 | 17 | - Reduce the frequency of the API calls. 18 | - Stagger the intervals of the API calls so that they do not all run at once. 19 | - Implement exponential backoff (better with jitter) on making API calls. 20 | 21 | Another way of reducing List calls to the service is by making parts of your system that keep polling (that keep invoking ListSteps) the service to instead respond to a cluster's state change. A more comprehensive explanation of this can be found in reference[3][4]. 22 | 23 | If none of the above solutions help with your use case, one can request a limit increase. 24 | 25 | References:- 26 | [1] https://docs.aws.amazon.com/general/latest/gr/emr.html#limits_emr 27 | [2] https://aws.amazon.com/premiumsupport/knowledge-center/emr-cluster-status-throttling-error/ 28 | [3] https://aws.amazon.com/blogs/big-data/respond-to-state-changes-on-amazon-emr-clusters-with-amazon-cloudwatch-events/ 29 | [4] https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-manage-cloudwatch-events.html 30 | 31 | -------------------------------------------------------------------------------- /EMR/multidelim_sertde_hive_tbl.md: -------------------------------------------------------------------------------- 1 | CREATE TABLE test_multi 2 | (a string, b string, c string, d string, e string, f string) 3 | ROW FORMAT SERDE 'org.apache.hadoop.hive.contrib.serde2.MultiDelimitSerDe' 4 | WITH SERDEPROPERTIES ( 5 | "field.delim"="~|`", 6 | "collection.delim"=":", 7 | "mapkey.delim"="@" 8 | ); 9 | 10 | This is the preferred way of loading multi-character delimited data into Hive over the use of “org.apache.hadoop.hive.serde2.RegexSerDe”, as it is simpler and faster. 11 | -------------------------------------------------------------------------------- /EMR/presto.md: -------------------------------------------------------------------------------- 1 | [{ 2 | "Classification": "presto-config", 3 | "Properties": { 4 | "query.max-memory": "8000MB", 5 | "query.max-memory-per-node": "30G", 6 | "query.max-total-memory-per-node": "40G", 7 | "memory.heap-headroom-per-node": "20G", 8 | "query.initial-hash-partitions": "20" 9 | }, 10 | "Configurations": [] 11 | }] 12 | 13 | https://github.com/prestodb/presto/issues/11005 14 | 15 | 16 | 17 | To give you more detailed answer here are the memory management properties [1]: 18 | 19 | query.max-memory : 20 | 21 | This is the max amount of user memory a query can use across the entire cluster. User memory is allocated during execution for things that are directly attributable to or controllable by a user query. For example, memory used by the hash tables built during execution, memory used during sorting, etc. When the user memory allocation of a query across all workers hits this limit it will be killed. 22 | 23 | query.max-memory-per-node : 24 | 25 | This is the max amount of user memory a query can use on a worker. User memory is allocated during execution for things that are directly attributable to or controllable by a user query. 
For example, memory used by the hash tables built during execution, memory used during sorting, etc. When the user memory allocation of a query on any worker hits this limit it will be killed. 26 | 27 | There is one more param i.e. "query.max-total-memory-per-node" 28 | 29 | query.max-total-memory-per-node : 30 | 31 | This is the max amount of user and system memory a query can use on a worker. System memory is allocated during execution for things that are not directly attributable to or controllable by a user query. For example, memory allocated by the readers, writers, network buffers, etc. When the sum of the user and system memory allocated by a query on any worker hits this limit it will be killed. The value of query.max-total-memory-per-node must be greater than query.max-memory-per-node.This config. must be greater than or equal to query.max-memory-per-node (which is only the user memory). The default value of query.max-total-memory-per-node is 30% of the heap size. 32 | 33 | 34 | query.max-total-memory: 35 | 36 | This is the max amount of user and system memory a query can use across the entire cluster. System memory is allocated during execution for things that are not directly attributable to or controllable by a user query. For example, memory allocated by the readers, writers, network buffers, etc. When the sum of the user and system memory allocated by a query across all workers hits this limit it will be killed. The value of query.max-total-memory must be greater than query.max-memory. 37 | 38 | Reference : 39 | 40 | [1] https://prestodb.io/docs/current/admin/properties.html#general-properties 41 | -------------------------------------------------------------------------------- /EMR/presto_tuning_config.md: -------------------------------------------------------------------------------- 1 | It's hard to come up with exact numbers, because these numbers should be set based on your workloads. So, what I can do is to provide you some numbers to start with, and then you should experiment with these configs and your workloads to fine tune them. 2 | 3 | As an example if you have 62G per node you can start with an Xmx of, say 50G, as you should set aside some overhead for the native memory and leave some room for the OS and other daemons running on the machines, if any. In production we use a G1 region size of 32M, which is also the documented value in the deployment docs. 4 | 5 | Given that the max heap size is 50G, I think you can start experimenting with the following values and determine the right values for your workloads: 6 | 7 | query.max-memory-per-node = 20GB 8 | query.max-total-memory-per-node =20GB 9 | memory.heap-headroom-per-node = 10GB (This is the amount of heap memory to set aside as headroom/buffer (e.g., for untracked allocations)). 10 | 11 | With a headroom of 10G and a max total memory per node of 20G the general pool on each worker will be of size 50-10-20 = 20G, and that's 20G*10=200G in the entire cluster. When we determine the query.max-memory (the peak global user memory limit) we also consider the hash partition count (query.initial-hash-partitions configuration, which is the number of partitions for distributed joins and aggregations). 
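To make the starting numbers above easier to reuse, here is a rough sketch (not part of the original note) of the same values expressed as an EMR "presto-config" classification in Python, e.g. for the Configurations parameter of a boto3 run_job_flow call; query.max-memory and query.initial-hash-partitions are discussed in the next paragraph, and every value here is only the illustrative figure from this discussion.

presto_memory_config = [
    {
        "Classification": "presto-config",
        "Properties": {
            # illustrative starting values discussed above; tune them for your workload
            "query.max-memory-per-node": "20GB",
            "query.max-total-memory-per-node": "20GB",
            "memory.heap-headroom-per-node": "10GB"
        }
    }
]
# e.g. boto3.client('emr').run_job_flow(..., Configurations=presto_memory_config, ...)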
12 |
13 | Assuming you have a 10-node cluster, you can set query.initial-hash-partitions to 8. With that, if we set query.max-memory to 60G, it will result in roughly 60/8 = 7.5GB (round up to ~8GB) of memory usage per node (if there is no skew and the data is well distributed), and since we have a query.max-memory-per-node of 20GB, that means we allow a skew factor of about 20/8 = 2.5 (that is, we allow tasks on a node to consume around two and a half times as much memory when the data is not well distributed). Again, you should definitely experiment with and tune these values, and figure out what works for you.
14 |
-------------------------------------------------------------------------------- /EMR/resize2fs_error.md: --------------------------------------------------------------------------------
1 | resize2fs at times throws an error while extending the file system on AWS EMR when the EBS volume storage is extended (capacity increased):
2 |
3 | resize2fs 1.42.9 (28-Dec-2013)
4 | resize2fs: Bad magic number in super-block while trying to open /dev/nvme0n1p1
5 | Couldn't find valid filesystem superblock.
6 |
7 | A couple of commands to extend the partition and file system:
8 |
9 | Extending a partition
10 | # sudo growpart /dev/xvda 1
11 |
12 | Extending the file system
13 | # sudo resize2fs /dev/xvda1
14 |
15 | To confirm
16 | # lsblk
17 |
18 | For an XFS file system, use:
19 | # xfs_growfs /dev/nvme0n1p1
20 |
21 | Check the file system type for the EBS root volume using the "blkid" command.
22 |
23 | The reason for the above error is that the resize2fs command works on ext4 volumes, whereas an XFS file system requires xfs_growfs.
-------------------------------------------------------------------------------- /EMR/spark_bucketing.md: --------------------------------------------------------------------------------
1 |
2 | tmp = spark.read.parquet("s3://xx-xx-pds/parquet/product_category=Wireless/part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet")
3 |
4 | tmp.write.bucketBy(25,"year").saveAsTable("reviewsamazon")
5 |
6 | spark.sql("insert into reviewstest select * from reviewsamazon")
7 |
8 |
9 | Spark-supported bucketed table:
10 |
11 | CREATE TABLE `reviewstest` (`marketplace` STRING, `customer_id` STRING, `review_id` STRING, `product_id` STRING, `product_parent` STRING, `product_title` STRING, `star_rating` INT, `helpful_votes` INT, `total_votes` INT, `vine` STRING, `verified_purchase` STRING, `review_headline` STRING, `review_body` STRING, `review_date` DATE, `year` INT)
12 | USING parquet
13 | OPTIONS (
14 |   `serialization.format` '1'
15 | )
16 | CLUSTERED BY (year)
17 | INTO 1000 BUCKETS
18 |
19 |
20 |
21 | Hive-supported bucketed table:
22 |
23 | CREATE TABLE `reviewdist` (`marketplace` STRING, `customer_id` STRING, `review_id` STRING, `product_id` STRING, `product_parent` STRING, `product_title` STRING, `star_rating` INT, `helpful_votes` INT, `total_votes` INT, `vine` STRING, `verified_purchase` STRING, `review_headline` STRING, `review_body` STRING, `review_date` DATE, `year` INT)
24 |
25 | CLUSTERED BY (year)
26 | INTO 256 BUCKETS
27 | stored as parquet
28 |
29 |
30 |
31 | dataframe
32 |   .withColumn("bucket", pmod(hash($"bucketColumn"), lit(numBuckets)))
33 |   .repartition(numBuckets, $"bucket")
34 |   .write
35 |   .format(fmt)
36 |   .bucketBy(numBuckets, "bucketColumn")
37 |   .sortBy("bucketColumn")
38 |   .option("path", "/path/to/your/table")
39 |   .saveAsTable("table_name")
40 |
41 |
-------------------------------------------------------------------------------- /EMR/spark_submit.sh: --------------------------------------------------------------------------------
1 | spark-submit --master yarn --deploy-mode cluster --executor-memory 4g --conf spark.executor.memoryOverhead=512 demo.py
2 |
3 | The number of executor cores comes from --conf spark.executor.cores=4:
4 |
5 | spark-submit --deploy-mode cluster --master yarn --conf spark.executor.cores=4 demo.py
6 |
7 | The number of cores depends entirely on the parameter spark.executor.cores or --executor-cores. As long as it is defined in /etc/spark/conf/spark-defaults.conf, the default value will be picked up
8 | from this file; if you want to change this value per application, please pass it using spark-submit as mentioned above.
9 |
10 |
11 | Starting the PySpark shell interactively with more memory, etc.:
12 |
13 | pyspark --executor-cores 5 --executor-memory 36g --driver-memory 36g --driver-cores 5
-------------------------------------------------------------------------------- /EMR/vCPU_error.md: --------------------------------------------------------------------------------
1 | "errorCode": "Client.VcpuLimitExceeded",
2 |
3 | "errorMessage": "You have requested more vCPU capacity than your current vCPU limit of 1111 allows for the instance bucket that the specified instance type belongs to. Please visit http://aws.amazon.com/contact-us/ec2-request to request an adjustment to this limit."
4 |
5 | This error indicates that one needs to request a limit increase for the instance family type in order to provision On-Demand Instances. You can calculate the current EC2 limit for the R-type instances using "Calculate vCPU limit".
6 |
7 | EC2 vCPU limit increases are submitted as a vCPU value. To request an increase, kindly determine how many vCPUs your On-Demand Instances are using. You can use the vCPU limits calculator to measure the number of vCPUs that you are currently using against vCPU-based limits to determine the appropriate service limit increase to request.
8 |
9 | You can create a service limit request directly from the vCPU limits calculator.
10 |
11 |
12 |
13 | Ref: https://aws.amazon.com/premiumsupport/knowledge-center/ec2-on-demand-instance-vcpu-increase/
-------------------------------------------------------------------------------- /EMR/yarn_getting_less_memory.md: --------------------------------------------------------------------------------
1 | Regarding the behavior where the requested memory and the memory showing up on the Spark UI sometimes do not match: that is expected behavior. The memory requested with spark-submit is considered an upper limit for the application, because if other applications are running on the same cluster they can share it. Even in the case where nothing else is running and the cluster is sitting idle, YARN still decides, using an internal algorithm, how much memory to allocate to the executors and to the other daemons which run in the background and support application execution.
2 |
3 | Check out these stackoverflow posts, which I found clarify some of these doubts:
4 |
5 | 1. https://stackoverflow.com/questions/38347036/spark-on-yarn-less-executor-memory-than-set-via-spark-submit
6 | 2. https://stackoverflow.com/questions/13988328/java-memory-runtime-getruntime-maxmemory/13988748#13988748
7 | 3. https://spoddutur.github.io/spark-notes/distribution_of_executors_cores_and_memory_for_spark_application.html
8 |
-------------------------------------------------------------------------------- /Failed_to_establish_a_new_connection.md: --------------------------------------------------------------------------------
1 | Just to confirm, as we understand it there are only two ways to get Glue jobs with internet connectivity:
2 |
3 | I) Scenario 1: No AWS Glue Connection attached. Jobs that are NOT attached to a Connection will get Internet connectivity out of the box.
4 |
5 | II) Scenario 2: If an AWS Glue Connection is attached, then its associated subnet MUST include a route to a NAT Gateway (NAT GW). Public subnets (that is, subnets with a route to 0.0.0.0/0 via an Internet Gateway or IGW) are useless for Glue because the Elastic Network Interfaces (ENIs) that the service creates for the jobs will ONLY have private IP addresses assigned. As you might know, for a private IP to be able to go out to the Internet, a network address translation (NAT) must take place in order to "map" the private IP to a public one. Only then can the traffic go out to the internet. Basically, this setup would be like the one mentioned here [1].
6 |
7 |
8 | In the meantime, I would recommend that you review our Glue documentation titled "Setting Up Your Environment to Access Data Stores" [2], especially these paragraphs:
9 |
10 | """
11 | If a job needs to run in your VPC subnet—for example, transforming data from a JDBC data store in a private subnet—AWS Glue sets up elastic network interfaces that enable your jobs to connect securely to other resources within your VPC. Each elastic network interface is assigned a private IP address from the IP address range within the subnet you specified. No public IP addresses are assigned. Security groups specified in the AWS Glue connection are applied on each of the elastic network interfaces. For more information, see Setting Up a VPC to Connect to JDBC Data Stores.
12 |
13 | All JDBC data stores that are accessed by the job must be available from the VPC subnet. To access Amazon S3 from within your VPC, a VPC endpoint is required.
14 | If your job needs to access both VPC resources and the public internet, the VPC needs to have a Network Address Translation (NAT) gateway inside the VPC.
15 | """
16 |
17 | And then please verify that your Glue Connection's subnet has a route to a NAT Gateway so that your job can get internet connectivity as described above (i.e., "if your job needs to access both VPC resources and the public internet, the VPC needs to have a Network Address Translation -NAT- gateway inside the VPC").
18 |
19 | Please verify all the steps mentioned above. If you still run into issues, please provide us with the Glue connection details. You can take a screenshot of the Glue connection that is attached to the job, showing all the networking details.
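As a quick way to check the networking piece described above, a rough boto3 sketch like the one below can look up the subnet of a Glue connection and verify whether its route table has a default route through a NAT gateway. This is not part of the original note: the connection name is a placeholder, it assumes a VPC (JDBC-style) connection with PhysicalConnectionRequirements, and subnets that rely on the VPC's main route table (no explicit association) would need an extra lookup.

import boto3

glue = boto3.client('glue')
ec2 = boto3.client('ec2')

# Placeholder connection name; assumes the connection has PhysicalConnectionRequirements
conn = glue.get_connection(Name='my-glue-connection')['Connection']
subnet_id = conn['PhysicalConnectionRequirements']['SubnetId']

# Route tables explicitly associated with the connection's subnet
route_tables = ec2.describe_route_tables(
    Filters=[{'Name': 'association.subnet-id', 'Values': [subnet_id]}]
)['RouteTables']

has_nat_route = any(
    route.get('NatGatewayId') and route.get('DestinationCidrBlock') == '0.0.0.0/0'
    for rt in route_tables
    for route in rt['Routes']
)
print(subnet_id, 'has a default route via a NAT gateway:', has_nat_route)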
20 |
21 | ====== REFERENCES ======
22 |
23 | [1] VPC with public and private subnets (NAT) - https://docs.aws.amazon.com/vpc/latest/userguide/VPC_Scenario2.html
24 | [2] Setting Up Your Environment to Access Data Stores - https://docs.aws.amazon.com/glue/latest/dg/start-connecting.html
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # python-boto3-sdk, aws cli examples & BigData Blogs
2 |
3 | This repository provides code examples written in Python and Spark-Scala, primarily using boto3 SDK API methods, along with AWS CLI examples for the majority of the AWS Big Data services. There are also nicely written wiki articles for most of the common issues/challenges faced in the BigData world.
-------------------------------------------------------------------------------- /athena/csv_comma_in_data_issue.ddl: --------------------------------------------------------------------------------
1 | ======================================
2 |
3 | CREATE EXTERNAL TABLE `test`(
4 | `title` string COMMENT 'from deserializer',
5 | `field_pso_number` string COMMENT 'from deserializer',
6 | `field_address` string COMMENT 'from deserializer',
7 | `field_phone` string COMMENT 'from deserializer',
8 | `field_phone_ext` string COMMENT 'from deserializer',
9 | `field_state` string COMMENT 'from deserializer',
10 | `view_node` string COMMENT 'from deserializer')
11 | ROW FORMAT SERDE
12 | 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
13 | STORED AS INPUTFORMAT
14 | 'org.apache.hadoop.mapred.TextInputFormat'
15 | OUTPUTFORMAT
16 | 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
17 | LOCATION
18 | 's3://your-s3-bucket/path/to/csvfile';
19 |
20 | =======================================
-------------------------------------------------------------------------------- /athena/float_datatype_isssue_Athena.md: --------------------------------------------------------------------------------
1 |
2 | Since the Athena query engine is based on Presto, the FLOAT datatype is not supported, according to the Presto documentation[1]. Also, according to the Athena documentation[2], the FLOAT datatype can only be used in DDL statements.
3 |
4 | Presto supports the REAL and DOUBLE types for floating-point values, and you can cast to the DOUBLE type as you mentioned.
5 | 6 | select (cast(8765435678 as REAL)/cast(87654 as REAL)) as result 7 | 8 | References: 9 | [1] https://prestodb.io/docs/current/language/types.html#floating-point 10 | [2] https://docs.aws.amazon.com/athena/latest/ug/data-types.html 11 | -------------------------------------------------------------------------------- /athena/get_file_path.md: -------------------------------------------------------------------------------- 1 | SELECT *,"$path" FROM "my_database"."my_table" on original source table 2 | -------------------------------------------------------------------------------- /athena/partition_projection_example.md: -------------------------------------------------------------------------------- 1 | objectCount": "9282", 2 | "UPDATED_BY_CRAWLER": "xxxxx", 3 | "projection.date.type": "date", 4 | "CrawlerSchemaSerializerVersion": "1.0", 5 | "recordCount": "778694", 6 | "averageRecordSize": "81", 7 | "projection.date.format": "yyyy-MM-dd", 8 | "exclusions": "[\"s3://xx-xx-xx-xx-xx/env-p1-np/report-metrics-unique/ad-log-metrics/metrics/_spark_metadata/**\",\"s3://xx-xx-xx-bis-dmp/env-p1-np/report-metrics-unique/ad-log-metrics/metrics/hdfs/**\",\"s3://xx-xx-xx-xx-xx/env-p1-np/xx-xx-unique/ad-log-metrics/metrics/_SUCCESS\"]", 9 | "projection.hour.range": "0,23", 10 | "CrawlerSchemaDeserializerVersion": "1.0", 11 | "compressionType": "none", 12 | "classification": "parquet", 13 | "projection.enabled": "true", 14 | "projection.country.type": "enum", 15 | "projection.hour.type": "integer", 16 | "projection.country.values": "COD,SGP,NCL,MCO,ZAF,REU,GRD,FIN,GLP,PER,FRO,IMN,DZA,SPM,KEN,ALA,GMB,PAN,HUN,COG,ESP,ISR,TWN,SWZ,TUR,QAT,GUM,SYC,LKA,GIN,CHL,PRK,YEM,GHA,SEN,CXR,CCK,AFG,PAK,VCT,AIA,ISL,GTM,ALB,STP,ABW,CRI,VGB,SDN,ITA,NGA,BHS,CAF,TON,IOT,RUS,LTU,ATG,VEN,IRQ,AND,SWE,MTQ,NAM,OMN,KWT,HTI,LAO,BGD,CYP,LCA,GUF,EGY,TTO,SVK,JAM,UGA,TGO,MDA,GAB,COM,CHE,CPV,MKD,ASM,IRN,TKM,CYM,SUR,CUB,SRB,DMA,BLM,VIR,ARM,MLI,KGZ,IND,EST,JPN,DNK,GRL,FSM,AGO,MNE,BRN,PNG,SOM,MLT,CAN,MWI,LBY,KAZ,THA,TUV,MMR,IDN,CZE,BEN,NIC,ZWE,PRY,IRL,LBN,MDV,GIB,BHR,ROU,MHL,KIR,FJI,MYS,JOR,PSE,GUY,PRT,WSM,USA,SSD,MYT,MAR,AUT,TLS,ECU,NLD,ARE,COL,NPL,BFA,GEO,GRC,POL,PYF,NRU,LBR,ZMB,HRV,KHM,COK,CMR,BDI,ETH,JEY,BEL,BLZ,TCD,PRI,SJM,DOM,VAT,PHL,TZA,RWA,BOL,PLW,MNG,MAC,BRA,NZL,LIE,NIU,FRA,MAF,ESH,VUT,SLV,SYR,FLK,BRB,LUX,MEX,HKG,NER,GNB,ARG,TJK,MUS,MNP,BWA,SXM,KOR,SVN,SLE,NOR,ERI,MDG,TUN,NFK,LSO,GGY,BIH,BGR,MRT,URY,BTN,WLF,GBR,MOZ,LVA,VNM,AZE,BLR,GNQ,CIV,DEU,SMR,UZB,BMU,CHN,SLB,TCA,BES,DJI,KNA,ATA,UKR,HND,AUS,CUW,SAU", 17 | "projection.date.range": "2018-11-27,NOW", 18 | "typeOfData": "file" 19 | }, 20 | "retention": 0 21 | -------------------------------------------------------------------------------- /athena/timestamp_cast_issue.md: -------------------------------------------------------------------------------- 1 | -------------- 2 | current_timestamp as insert_timestamp, 3 | current_timestamp as update_timestamp, 4 | current_timestamp as created_at 5 | -------------- 6 | 7 | And since this field is of type "timestamp with time zone", it creates an issue because Athena doesn't support column definition of this type. 8 | 9 | As an example: 10 | 11 | SELECT current_timestamp 12 | 13 | Example Result: 2020-03-11 11:02:14.633 UTC 14 | 15 | SELECT typeof(current_timestamp) 16 | 17 | Result: timestamp with time zone 18 | 19 | Now, if we try to create a table out of this SELECT statement: 20 | 21 | CREATE TABLE test AS SELECT current_timestamp as time 22 | 23 | Result: NOT_SUPPORTED: Unsupported Hive type: timestamp with time zone. 
24 | 25 | So, to address this, we could cast this data type into one of the support types, such as: 26 | 27 | CREATE TABLE test AS SELECT cast(current_timestamp as timestamp) as time 28 | Result: Query successful. 29 | -------------------------------------------------------------------------------- /athena/using_map_key.md: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE hivejson_wdcc ( 2 | sysdate string, 3 | referrer string, 4 | clientip string, 5 | queryString map 6 | ) 7 | ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe' 8 | LOCATION 9 | 's3://pl200620-ap121142-pzn-kinesisfirehose-dev/kafkahive/'; 10 | 11 | select querystring['p0'],querystring['vsdr'] from hivejson_wdcc; 12 | --> 41 13 | -------------------------------------------------------------------------------- /aws-boto3-sdk/ddb/ddb_to_s3.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import datetime 4 | from awsglue.utils import getResolvedOptions 5 | from pyspark.context import SparkContext 6 | from awsglue.context import GlueContext 7 | 8 | args = getResolvedOptions(sys.argv,['JOB_NAME']) 9 | 10 | sc = SparkContext() 11 | glueContext = GlueContext(sc) 12 | 13 | table = glueContext.create_dynamic_frame.from_options( 14 | "dynamodb", 15 | connection_options={ 16 | "dynamodb.input.tableName": "hello", 17 | "dynamodb.throughput.read.percent": "1.0" 18 | } 19 | ) 20 | 21 | glueContext.write_dynamic_frame.from_options(frame = table, 22 | connection_type="s3", 23 | connection_options={"path": "s3://your-s3-bucket/ddbs3"}, 24 | format="parquet") 25 | -------------------------------------------------------------------------------- /aws-boto3-sdk/ddb/put_auto_scaling.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | autoscaling_client = boto3.client('application-autoscaling') 3 | 4 | percent_of_use_to_aim_for = 50.0 5 | scale_out_cooldown_in_seconds = 60 6 | scale_in_cooldown_in_seconds = 60 7 | autoscaling_client.put_scaling_policy(ServiceNamespace='dynamodb', 8 | ResourceId='table/Test_table1', 9 | PolicyType='TargetTrackingScaling', 10 | PolicyName='ScaleDynamoDBReadCapacityUtilization', 11 | ScalableDimension='dynamodb:table:ReadCapacityUnits', 12 | TargetTrackingScalingPolicyConfiguration={ 13 | 'TargetValue': percent_of_use_to_aim_for, 14 | 'PredefinedMetricSpecification': { 15 | 'PredefinedMetricType': 'DynamoDBReadCapacityUtilization' 16 | }, 17 | 'ScaleOutCooldown': scale_out_cooldown_in_seconds, 18 | 'ScaleInCooldown': scale_in_cooldown_in_seconds 19 | }) 20 | autoscaling_client.put_scaling_policy(ServiceNamespace='dynamodb', 21 | ResourceId='table/Test_table1', 22 | PolicyType='TargetTrackingScaling', 23 | PolicyName='ScaleDynamoDBWriteCapacityUtilization', 24 | ScalableDimension='dynamodb:table:WriteCapacityUnits', 25 | TargetTrackingScalingPolicyConfiguration={ 26 | 'TargetValue': percent_of_use_to_aim_for, 27 | 'PredefinedMetricSpecification': { 28 | 'PredefinedMetricType': 'DynamoDBWriteCapacityUtilization' 29 | }, 30 | 'ScaleOutCooldown': scale_out_cooldown_in_seconds, 31 | 'ScaleInCooldown': scale_in_cooldown_in_seconds 32 | }) 33 | -------------------------------------------------------------------------------- /aws-boto3-sdk/emr/add_tags.py: -------------------------------------------------------------------------------- 1 | Option 1 : Using boto3 API "put_object_tagging" method .[1] 2 | 3 | Example : 4 | 5 | The following example 
adds tags to an existing object. 6 | 7 | response = client.put_object_tagging( 8 | Bucket='examplebucket', 9 | Key='HappyFace.jpg', 10 | Tagging={ 11 | 'TagSet': [ 12 | { 13 | 'Key': 'Key3', 14 | 'Value': 'Value3', 15 | }, 16 | { 17 | 'Key': 'Key4', 18 | 'Value': 'Value4', 19 | }, 20 | ], 21 | }, 22 | ) 23 | 24 | print(response) 25 | Expected Output: 26 | 27 | { 28 | 'VersionId': 'null', 29 | 'ResponseMetadata': { 30 | '...': '...', 31 | }, 32 | } 33 | -------------------------------------------------------------------------------- /aws-boto3-sdk/glue/access_glue_data_catalog.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from pyspark.context import SparkConf 6 | from pyspark.sql import SparkSession 7 | from awsglue.context import GlueContext 8 | from pyspark.sql import HiveContext 9 | from pyspark.sql import SQLContext 10 | from awsglue.job import Job 11 | from awsglue.dynamicframe import DynamicFrame 12 | ## @params: [JOB_NAME] 13 | 14 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 15 | sc= SparkContext() 16 | spark=SparkSession.builder.config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory").enableHiveSupport().getOrCreate() 17 | glueContext = GlueContext(sc) 18 | 19 | job = Job(glueContext) 20 | job.init(args['JOB_NAME'], args) 21 | 22 | spark.sql("SHOW DATABASES").show() 23 | spark.sql("use test") 24 | spark.sql("show tables").show() 25 | 26 | job.commit() 27 | -------------------------------------------------------------------------------- /aws-boto3-sdk/glue/create_dev_endpoint.py: -------------------------------------------------------------------------------- 1 | dep = glue.create_dev_endpoint( 2 | EndpointName="testDevEndpoint", 3 | RoleArn="arn:aws:iam::123456789012", 4 | SecurityGroupIds="sg-7f5ad1ff", 5 | SubnetId="subnet-c12fdba4", 6 | PublicKey="ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCtp04H/y...", 7 | NumberOfNodes=3, 8 | ExtraPythonLibsS3Path="s3://bucket/prefix/lib_A.zip,s3://bucket_B/prefix/lib_X.zip") 9 | -------------------------------------------------------------------------------- /aws-boto3-sdk/glue/cross_account_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from awsglue.context import GlueContext 6 | from awsglue.job import Job 7 | import boto3 8 | 9 | ## @params: [JOB_NAME] 10 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 11 | 12 | sts_client = boto3.client('sts') 13 | assumed_role_object=sts_client.assume_role( 14 | RoleArn="arn:aws:iam::123456789:role/assume-access-role ", 15 | RoleSessionName="AssumeRoleSession6" 16 | ) 17 | 18 | credentials=assumed_role_object['Credentials'] 19 | aws_session_token=credentials['SessionToken'] 20 | aws_access_key_id=credentials['AccessKeyId'] 21 | aws_secret_access_key=credentials['SecretAccessKey'] 22 | 23 | sc = SparkContext() 24 | glueContext = GlueContext(sc) 25 | spark = glueContext.spark_session 26 | 27 | # Declare 'org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider' as the credential provider for 's3a' URI and set those credentials for 's3a' URI. 
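# Note that these Hadoop settings apply to every 's3a://' path read or written through this Spark session,
# and the assumed-role credentials fetched above are temporary (they expire after the role's session
# duration), so a very long-running job may need to re-assume the role and refresh them.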
28 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.TemporaryAWSCredentialsProvider") 29 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.session.token", aws_session_token) 30 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.access.key", aws_access_key_id) 31 | spark.sparkContext._jsc.hadoopConfiguration().set("fs.s3a.secret.key", aws_secret_access_key) 32 | 33 | #See here s3a is used in the URI instead of s3 34 | s3_loc = "s3a://s3-bucket-in-diff-account/pathto/data/" 35 | 36 | job = Job(glueContext) 37 | job.init(args['JOB_NAME'], args) 38 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "ff-dd", table_name = "ddd", transformation_ctx = "datasource0") 39 | applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("contactid", "int", "contactid", "int"), ("companyid", "long", "companyid", "long"), ("year", "string", "year", "string"), ("month", "string", "month", "string"), ("day", "string", "day", "string")], transformation_ctx = "applymapping1") 40 | datasink2 = glueContext.write_dynamic_frame.from_options(frame = applymapping1, connection_type = "s3", connection_options = {"path": s3_loc}, format = "CSV", transformation_ctx = "datasink2") 41 | job.commit() 42 | -------------------------------------------------------------------------------- /aws-boto3-sdk/glue/execute_sql_load_to_amazonaurora_mysql.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import sys 3 | import pymysql 4 | from awsglue.utils import getResolvedOptions 5 | 6 | 7 | args = getResolvedOptions(sys.argv,['env']) 8 | 9 | client = boto3.client('ssm', region_name='us-east-1') 10 | text = '/ishan/'+args['env']+'/mysql/' 11 | response = client.get_parameters(Names=[text+'bucketname',text+'dbuser',text+'dbpassword', text+'host', text+'dbname', text+'dbport' ],WithDecryption=False) 12 | 13 | dict = {} 14 | parameters = response['Parameters'] 15 | for i in parameters: 16 | name = i['Name'] 17 | value = i['Value'] 18 | dict[name] = value 19 | 20 | Bucket = dict[text+'bucketname'] 21 | hostName = dict[text+'host'] 22 | portNum = dict[text+'dbport'] 23 | usr = dict[text+'dbuser'] 24 | pswrd = dict[text+'dbpassword'] 25 | dbName = dict[text+'dbname'] 26 | 27 | 28 | def execSql(key): 29 | s3 = boto3.resource('s3') 30 | 31 | query=s3.Object(Bucket, key).get()['Body'].read().decode('utf-8') 32 | # conn = mysql.connector.connect( 33 | conn = pymysql.connect(host= hostName,port=portNum,user=usr,password=pswrd,database= dbName) 34 | conn = pymysql.connect(host="(); 8 | 9 | while(result.getNextToken()!=null){ 10 | if (!result.getPartitions().isEmpty()) { 11 | filteredPartitions.addAll(result.getPartitions()) 12 | } 13 | GetPartitionsResult result = glueClient.getPartitions(new GetPartitionsRequest() 14 | .withCatalogId(CATALOG_ID) 15 | .withDatabaseName(DATABASE_NAME) 16 | .withTableName(TABLE_NAME) 17 | .withExpression(expression) 18 | .withNextToken(result.getNextToken())); 19 | } 20 | 21 | if (!result.getPartitions().isEmpty()) { 22 | filteredPartitions.addAll(result.getPartitions()) 23 | } 24 | 25 | if (filteredPartitions.isEmpty()) { 26 | throw new HealthCheckException( 27 | String.format("Health Check Failed - 0 rows were generated for user case %s.", use_case)); 28 | } 29 | -------------------------------------------------------------------------------- /aws-boto3-sdk/glue/glue_convert_xls_to_csv.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | import xlrd 3 | import boto3 4 | import csv 5 | import os 6 | import os.path 7 | 8 | def upload_file(file_name, bucket, object_name=None): 9 | 10 | # If S3 object_name was not specified, use file_name 11 | if object_name is None: 12 | object_name = file_name 13 | 14 | # Upload the file 15 | s3_client = boto3.client('s3') 16 | try: 17 | response = s3_client.upload_file(file_name, bucket, object_name) 18 | except ClientError as e: 19 | logging.error(e) 20 | return False 21 | return True 22 | 23 | s3_client = boto3.client('s3') 24 | 25 | # Download the file from S3 26 | s3_client.download_file('', '', '') 27 | 28 | with xlrd.open_workbook('') as wb: 29 | print("File read successfully") 30 | sh = wb.sheet_by_index(0) 31 | with open('', 'w', newline="") as f: 32 | c = csv.writer(f) 33 | for r in range(sh.nrows): 34 | c.writerow(sh.row_values(r)) 35 | print("Program finished and now uploading file to s3 now") 36 | 37 | upload_file('','','') 38 | 39 | 40 | print("Upload successful") 41 | -------------------------------------------------------------------------------- /aws-boto3-sdk/glue/glue_params.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark import SparkContext 3 | sc =SparkContext() 4 | import sys 5 | from awsglue.utils import getResolvedOptions 6 | 7 | args = getResolvedOptions(sys.argv, 8 | ['JOB_NAME', 9 | 's3_bucket', 10 | 'config', 11 | 'password']) 12 | 13 | print(args['s3_bucket']) 14 | print(args['config']) 15 | print(args['password']) 16 | -------------------------------------------------------------------------------- /aws-boto3-sdk/glue/postgres_to_glue_datacatalog_extract.py: -------------------------------------------------------------------------------- 1 | /** 2 | 3 | 1> launched a Postgres sever and created the table using following query: 4 | 5 | CREATE TABLE ishan ("userID" VARCHAR (50) PRIMARY KEY, "UserName" VARCHAR (50)); 6 | 7 | 2> Ran crawler on the Postgres sever which will create the table. 8 | 9 | 3> Create Glue ETL job by select source table as Postgres table "ishan" and target source as S3 location. 10 | 11 | 4> creata a glue pyspark job with the following code:
 12 | 13 | **/ 14 | 15 | import sys 16 | from awsglue.transforms import * 17 | from awsglue.utils import getResolvedOptions 18 | from pyspark.context import SparkContext 19 | from awsglue.context import GlueContext 20 | from awsglue.job import Job 21 | from awsglue.dynamicframe import DynamicFrame 22 | from pyspark.sql import SQLContext 23 | ## @params: [JOB_NAME] 24 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 25 | 26 | sc = SparkContext() 27 | glueContext = GlueContext(sc) 28 | spark = glueContext.spark_session 29 | spark.sql('set spark.sql.caseSensitive=true') 30 | job = Job(glueContext) 31 | job.init(args['JOB_NAME'], args) 32 | sqlContext = SQLContext(sc) 33 | sqlContext.setConf("spark.sql.caseSensitive","true") 34 | 35 | 36 | datasource0 = sqlContext.read.format("jdbc").option("url", "jdbc:postgresql://:5432/postgres").option("query","select * from ishan").option("user", "postgres").option("password", "password").load() 37 | datasource12=datasource0.repartition(1) 38 | datasource1=DynamicFrame.fromDF(datasource12,glueContext,"test") 39 | datasink2 = glueContext.write_dynamic_frame.from_options(frame = datasource1, connection_type = "s3", connection_options = {"path": "s3:///images/"}, format = "csv", transformation_ctx = "datasink2") 40 | job.commit() 41 | -------------------------------------------------------------------------------- /aws-boto3-sdk/glue/write_to_redshift_snippet.py: -------------------------------------------------------------------------------- 1 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "mydatabase", table_name = "rs_sample", transformation_ctx = "datasource0") 2 | 3 | datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame = dropnullfields3, catalog_connection = "glue_conn", connection_options = {"preactions":"CREATE SCHEMA IF NOT EXISTS test_schema;", "dbtable": "test_schema.rs_sample", "database": "dev", "aws_iam_role": "arn:aws:iam::123456789111:role/redshift_role_servicebased", "mode": "overwrite"}, redshift_tmp_dir = "s3://s3-bucket/redshift_output/", transformation_ctx = "datasink4") 4 | job.commit() 5 | -------------------------------------------------------------------------------- /cli-examples/add_tags_objects.sh: -------------------------------------------------------------------------------- 1 | Using AWS CLI command. [2] 2 | 3 | Example: 4 | 5 | The following put-object-tagging example sets a tag with the key designation and the value confidential on the specified object. 
6 | 7 | aws s3api put-object-tagging \ 8 | --bucket my-bucket \ 9 | --key doc1.rtf \ 10 | --tagging '{"TagSet": [{ "Key": "designation", "Value": "confidential" }]}' 11 | -------------------------------------------------------------------------------- /cli-examples/create_emr_jupyterhub.sh: -------------------------------------------------------------------------------- 1 | 2 | #creates an EMR with JupyterHub as an application 3 | 4 | aws emr create-cluster --name="MyJupyterHubCluster" --release-label emr-5.29.0 \ 5 | --applications Name=JupyterHub --log-uri s3://aws-isgaur-logs/AWSLogs \ 6 | --use-default-roles --instance-type m5.xlarge --instance-count 2 --ec2-attributes KeyName=training_tst 7 | 8 | # Fetches docker Id after doing ssh to Master node of an EMR Cluster 9 | 10 | sudo docker ps - 11 | 12 | 13 | #Login to Docker as sudo 14 | 15 | sudo docker exec -it 06550d4c4cc0 /bin/bash 16 | 17 | 18 | #Restart jupyterhub inside a Docker 19 | 20 | sudo docker restart jupyterhub 21 | 22 | #Checks Installation for JupyterLab 23 | 24 | sudo docker exec jupyterhub bash -c "conda list" | grep -i "jupyterlab" 25 | 26 | # To open JupyterLab 27 | 28 | sudo vi /etc/jupyter/conf/jupyterhub_config.py 29 | 30 | 31 | -------------------------------------------------------------------------------- /cli-examples/create_glue_crawler_cfn.json: -------------------------------------------------------------------------------- 1 | { 2 | "AWSTemplateFormatVersion": "2010-09-09", 3 | "Metadata": { 4 | "AWS::CloudFormation::Designer": { 5 | "e31888bf-15d1-4df1-a45f-b42518a1a2cb": { 6 | "size": { 7 | "width": 60, 8 | "height": 60 9 | }, 10 | "position": { 11 | "x": 47.005706787109375, 12 | "y": 46.017051696777344 13 | }, 14 | "z": 0, 15 | "embeds": [] 16 | } 17 | } 18 | }, 19 | "Resources": { 20 | "crawlerishan": { 21 | "Type": "AWS::Glue::Crawler", 22 | "Properties": { 23 | "DatabaseName": "datacsv", 24 | "Name": "tblbfsbookinglog", 25 | "Role": "arn:aws:iam::XXXXXXX:role/service-role/AWSGlueServiceRole-defaultRole", 26 | "TablePrefix": "crawl_", 27 | "Tags": 28 | { 29 | "Owner": "bbuhyl", 30 | "Name": "testTag" 31 | } 32 | , 33 | "Targets": { 34 | "S3Targets": [ 35 | { 36 | "Path": "s3://aws-XX-XXX/csv_file_6848275181/" 37 | } 38 | ] 39 | } 40 | }, 41 | "Metadata": { 42 | "AWS::CloudFormation::Designer": { 43 | "id": "e31888bf-15d1-4df1-a45f-b42518a1a2cb" 44 | } 45 | } 46 | } 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /cli-examples/create_glue_dev_endpoint.sh: -------------------------------------------------------------------------------- 1 | aws glue create-dev-endpoint --endpoint-name "endpoint1" --role-arn "arn:aws:iam::account-id:role/role-name" --number-of-nodes "3" --glue-version "1.0" --arguments '{"GLUE_PYTHON_VERSION": "3"}' --region "region-name" 2 | -------------------------------------------------------------------------------- /cli-examples/decoding_encode_msg.sh: -------------------------------------------------------------------------------- 1 | aws sts decode-authorization-message --encoded-message 2 | -------------------------------------------------------------------------------- /cli-examples/glue-start-job.sh: -------------------------------------------------------------------------------- 1 | aws glue start-job-run --job-name "bug_n" --arguments='--s3_bucket="aws-isgaur-logs",--config="test.ini",--password="test"' 2 | -------------------------------------------------------------------------------- 
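For reference, the same run can also be started from Python with boto3; a minimal sketch reusing the hypothetical job name and arguments from the CLI call above (the ETL script reads them back via getResolvedOptions, as in glue_params.py):

import boto3

glue = boto3.client("glue")

# Job arguments must carry the "--" prefix; they surface in the job through getResolvedOptions.
response = glue.start_job_run(
    JobName="bug_n",
    Arguments={
        "--s3_bucket": "aws-isgaur-logs",
        "--config": "test.ini",
        "--password": "test",
    },
)
print(response["JobRunId"])

--------------------------------------------------------------------------------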
/cli-examples/install_aws_cli_jupyterhub.sh: -------------------------------------------------------------------------------- 1 | #Steps to Install AWS-CLI on JupyetrHub 2 | 3 | 4 | 1. Open JupyterLab and go to the new terminal. Use the curl command to download the installation script. The following command uses the -O (uppercase "O") parameter to specify that the downloaded file is to be stored in the current folder using the same name it has on the remote host. 5 | 6 | curl -O https://bootstrap.pypa.io/get-pip.py 7 | 8 | 2. Run the script with Python to download and install the latest version of pip and other required support packages. 9 | 10 | python get-pip.py --user 11 | 12 | Or use the following. 13 | 14 | python3 get-pip.py --user 15 | 16 | When you include the --user switch, the script installs pip to the path ~/.local/bin. 17 | 18 | 3. Ensure the folder that contains pip is part of your PATH variable. 19 | 20 | a. Find your shell's profile script in your user folder. If you're not sure which shell you have, run echo $SHELL. 21 | 22 | b. Add an export command at the end of your profile script that's similar to the following example. 23 | 24 | export PATH=~/.local/bin:$PATH 25 | 26 | 4. Now you can test to verify that pip is installed correctly. 27 | 28 | pip3 --version 29 | 30 | 5. Use pip to install the AWS CLI. 31 | 32 | pip3 install awscli --upgrade --user 33 | 34 | 6. Verify that the AWS CLI installed correctly. 35 | 36 | aws --version 37 | 38 | 7. aws configure 39 | 40 | Provide the AWS_SECRET_ACCESS_KEY and AWS_ACCESS_KEY_ID to configure it and you should be all set to use AWS s3 and sync any files from local dirs/files to AWS s3. 41 | 42 | 8. Test aws s3 access using aws s3 ls - you should be able to list s3 buckets you have access to.Once this is set up you can use aws s3 cp or aws s3 sync command to copy the data from local dir/files to aws s3 or vice versa. 43 | -------------------------------------------------------------------------------- /cli-examples/pass_proxy_spark_shell.sh: -------------------------------------------------------------------------------- 1 | ## How to pass proxy parameters for spark applications which write through EMRFS. 2 | 3 | This can be done as below for SPARK command. 
4 | 5 | spark-shell --conf "spark.driver.extraJavaOptions=-Dhttp.proxyHost=myproxy.host.com -Dhttp.proxyPort=80 -Dhttps.proxyHost=myproxy.host.com -Dhttps.proxyPort=80" --conf spark.hadoop.fs.s3a.access.key=$AWS_ACCESS_KEY_ID --conf spark.hadoop.fs.s3a.secret.key=$AWS_SECRET_ACCESS_KEY 6 | -------------------------------------------------------------------------------- /cli-examples/spark-submit-postgres-jars.sh: -------------------------------------------------------------------------------- 1 | spark-submit --master yarn --deploy-mode client --class SparkPi /home/hadoop/sparksample/target/scala-2.11/spark-sample_2.11-1.0.jar --jars /usr/lib/spark/jars/postgresql-42.2.8.jar --driver-class-path /usr/lib/spark/jars/postgresql-42.2.8.jar 2 | -------------------------------------------------------------------------------- /configs/enable_s3_consistency_EMR.sh: -------------------------------------------------------------------------------- 1 | If you would like to configure EMRFS consistent view with a configuration object during cluster launch, you can do so using the following configuration: 2 | ================================================= 3 | [ 4 | { 5 | "Classification": "emrfs-site", 6 | "Properties": { 7 | "fs.s3.maxRetries": "20", 8 | "fs.s3.consistent.retryPeriodSeconds": "10", 9 | "fs.s3.consistent": "true", 10 | "fs.s3.consistent.retryCount": "5", 11 | "fs.s3.consistent.metadata.tableName": "EmrFSMetadata" 12 | } 13 | } 14 | ] 15 | ================================================= 16 | -------------------------------------------------------------------------------- /configs/enable_s3_consistency_dp.sh: -------------------------------------------------------------------------------- 1 | 2 | 3 | In order to enable consistent view or add any other sort of configuration, there is an option in the EMR resource in Data 4 | Pipeline to add a configuration where you can add the EMRFS consistent-view property so that it is enabled on EMR. 5 | 6 | To add configurations to the EMR Cluster: 7 | 8 | 1. Click the drop-down menu next to "Add an optional field" and then select "Configuration" from the list. (Screen-shots attached) 9 | 10 | 2. Then select "Create new: EmrConfiguration". 11 | 12 | 3. Click "EmrConfiguration DefaultEmrConfiguration 1" in the flowchart, specify the classification as "emrfs-site" and then select "Property" under the "Add an Optional Field" drop-down. 13 | 14 | 4. Under Property, select "Create new: Property" from the drop-down. 15 | 16 | 5. Now select "Property DefaultProperty1" from the flowchart and then give the property values as shown. 
17 | 18 | Key: fs.s3.consistent 19 | Value: true 20 | -------------------------------------------------------------------------------- /data-pipeline/ddb_to_s3.json: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "directoryPath": "s3://xxx-xx-xx/ddb_to_s3/latest_run", 5 | "dataFormat": { 6 | "ref": "DataFormatId_pk6BE" 7 | }, 8 | "name": "DefaultDataNode2", 9 | "id": "DataNodeId_vRA7t", 10 | "type": "S3DataNode" 11 | }, 12 | { 13 | "name": "DefaultDynamoDBDataFormat1", 14 | "column": "id bigint", 15 | "id": "DynamoDBDataFormatId_InlDK", 16 | "type": "DynamoDBExportDataFormat" 17 | }, 18 | { 19 | "output": { 20 | "ref": "DataNodeId_vRA7t" 21 | }, 22 | "input": { 23 | "ref": "DataNodeId_wHAKo" 24 | }, 25 | "scriptUri": "s3://aws-isgaur-logs/ddb_to_s3/ishan.hql", 26 | "name": "DefaultHiveActivity1", 27 | "id": "HiveActivityId_qi7kk", 28 | "runsOn": { 29 | "ref": "EmrClusterId_ar4g5" 30 | }, 31 | "type": "HiveActivity" 32 | }, 33 | { 34 | "dataFormat": { 35 | "ref": "DynamoDBDataFormatId_InlDK" 36 | }, 37 | "name": "DefaultDataNode1", 38 | "id": "DataNodeId_wHAKo", 39 | "type": "DynamoDBDataNode", 40 | "region": "us-west-2", 41 | "tableName": "ProductCatalog" 42 | }, 43 | { 44 | "name": "DefaultDataFormat1", 45 | "column": "id bigint", 46 | "id": "DataFormatId_pk6BE", 47 | "type": "DynamoDBDataFormat" 48 | }, 49 | { 50 | "name": "DefaultEmrCluster1", 51 | "keyPair": "training_tst", 52 | "releaseLabel": "emr-5.29.0", 53 | "id": "EmrClusterId_ar4g5", 54 | "region": "us-west-2", 55 | "type": "EmrCluster" 56 | }, 57 | { 58 | "failureAndRerunMode": "CASCADE", 59 | "resourceRole": "DataPipelineDefaultResourceRole", 60 | "role": "DataPipelineDefaultRole", 61 | "pipelineLogUri": "s3://xx-xx-xx/ddb_to_s3/logs/", 62 | "scheduleType": "ONDEMAND", 63 | "name": "Default", 64 | "id": "Default" 65 | } 66 | ], 67 | "parameters": [] 68 | } 69 | -------------------------------------------------------------------------------- /ddb/ddb_common_term.md: -------------------------------------------------------------------------------- 1 | I further checked the back-end logs and observed a very small transient issue due to one of partition split happened on your table. I also see 'Query' operations are 'Strongly consistent read'. I would suggest only perform strongly consistent reads when is required. 2 | 3 | For future reference, 500 errors can occur because of several possible reasons and are expected in lifetime of a Dynamodb table like network issue, hardware failure, partition split causing change of Mastership between replica nodes and so on. 4 | 5 | DynamoDB is a massive-scale distributed system, with thousands of servers in the back end fleet. Each of these servers has its own workload during a different time of the day, and each of them can fail at any time due to various reasons. When the DynamoDB service receives an API call from the client, the DynamoDB service gives the API call to one of the back-end servers for processing. If the back-end server is able to process the API call, the DynamoDB service returns a 200 OK back to the client. Since the API call is successfully processing in its first attempt, the client sees very small latency. 6 | 7 | However, from time to time an API call cannot be successfully processed within a reasonable time frame in the first attempt, either because the specific back-end server is busy, or the specific back-end server has failed. 
In this case, the DynamoDB service waits for a server-side timeout, then automatically gives the same API call to another back-end server for processing. If the second attempt is successful, the client receives a 200 OK with an elevated latency, in a scenario where the second attempt also fails on the DynamoDB service side and there are more attempts needed on the service side, the client observes more latency and after the timeout setting is reached and if the API's still fail to do the job it sends the 500 status code to the client and you see SystemErrors. 8 | 9 | As mentioned, these kind of system errors are transient but happen from time to time for a short time duration. As you have noticed, the errors usually last for a brief amount of time, which would get fixed automatically by the underlying DynamoDB infrastructure [1]. This is to be expected within the lifetime of a table. When your application receives this error, if possible please retry the failed requests until successful since the underlying issue should be temporary and transient. Please note that AWS SDKs have built-in retry logic, so you don't need to implement this on your own. 10 | 11 | - https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Programming.Errors.html 12 | -------------------------------------------------------------------------------- /general-python/hudi_dataset_write.py: -------------------------------------------------------------------------------- 1 | //spark-shell --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" --conf "spark.sql.hive.convertMetastoreParquet=false" --jars /usr/lib/hudi/hudi-spark-bundle.jar,/usr/lib/spark/external/lib/spark-avro.jar 2 | 3 | import org.apache.hudi.DataSourceWriteOptions 4 | import org.apache.hudi.config.HoodieWriteConfig 5 | import org.apache.hudi.hive.MultiPartKeysValueExtractor 6 | import org.apache.spark.sql.SaveMode 7 | import org.apache.spark.sql.functions._ 8 | 9 | // Read data from S3 and create a DataFrame with Partition and Record Key 10 | 11 | val inputDF = spark.read.format("parquet").load("s3:////") 12 | val df1 = inputDF.select("timeperiod","flow1","occupancy1") 13 | 14 | //Specify common DataSourceWriteOptions in the single hudiOptions variable 15 | val hudiOptions = Map[String,String]( 16 | HoodieWriteConfig.TABLE_NAME → "my_hudi_table", 17 | DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> "timeperiod", 18 | DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY ->"flow1", 19 | DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "occupancy1" 20 | ) 21 | 22 | // Write a DataFrame as a Hudi dataset 23 | df1.write.format("org.apache.hudi").option(DataSourceWriteOptions.OPERATION_OPT_KEY, DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL).options(hudiOptions).mode(SaveMode.Overwrite).save("s3:///myhudidataset/") 24 | -------------------------------------------------------------------------------- /general-python/spark_context_python.py: -------------------------------------------------------------------------------- 1 | def init_spark(): 2 | import findspark 3 | findspark.init("/usr/lib/spark/") 4 | from pyspark.sql import SparkSession 5 | spark = ( 6 | SparkSession.builder 7 | .master("yarn") 8 | .appName("Stage.py") 9 | .getOrCreate() 10 | ) 11 | return spark 12 | 13 | spark = init_spark() 14 | -------------------------------------------------------------------------------- /glue/IAMRole_is_not_authorized_perform_glue_GetUserDefinedFunctions on resource.err: 
-------------------------------------------------------------------------------- 1 | When running spark/hive on AWS EMR using AWS Glue Data catalog - you might run into a problem where application fails with the below error : 2 | 3 | 4 | MetaException(message:User: arn:aws:sts::xx:assumed-role/SC-xx-xx-xx-Role-xx/i-xx is not authorized to perform: glue:GetUserDefinedFunctions on resource: arn:aws:glue:us-west-2:xx:catalog (Service: AWSGlue; Status Code: 400; Error Code: AccessDeniedException; Request ID: xx-xx-xx-xx-xx)) 5 | 6 | This fix is available in >= emr-5.31 and >= emr-6.1.0. 7 | 8 | When using the above releases: 9 | 10 | For Spark jobs, use the following emr configuration to disable GetUserDefinedFunctios calls. 11 | 12 | [ 13 | { 14 | "Classification": "spark-hive-site", 15 | "Properties": { 16 | "aws.glue.disable-udf": "true" 17 | } 18 | ] 19 | 20 | For Hive jobs, no additional configs needed. 21 | -------------------------------------------------------------------------------- /glue/SkipArchive_from_hive_spark.md: -------------------------------------------------------------------------------- 1 | As of today there is no way to set parameter 'skipArchive' while using Hive/Spark from AWS EMR pointing to AWS glue Data catalog. 2 | 3 | 4 | Workarounds: 5 | 6 | 1) Drop the table and recreate it. 7 | 8 | For example: 9 | ============= 10 | aws glue get-table --database-name --name --region > /tmp/table.json 11 | aws glue create-table --database-name --region --table-input file:///tmp/table-tmp.json 12 | ============ 13 | 14 | 2) Delete the versions using the delete-table-version API 15 | 16 | You can retrieve the all the versions of the a table using the following CLI command: 17 | ============ 18 | aws glue get-table-versions --database-name Your_DataBase_Name --table-name Your_Table_Name 19 | ============ 20 | 21 | And then you can run 'batch-delete-table-version' to delete multiple versions of a table as below: 22 | ============ 23 | aws glue batch-delete-table-version --database-name Your_DataBase_Name --table-name Your_Table_Name --version-ids 0 1 2 24 | ============ 25 | 26 | 3) Increase TABLE_VERSION resources service limit 27 | 28 | TABLE_VERSION resources service limit is a soft limit and can be lifted. 29 | -------------------------------------------------------------------------------- /glue/TABLE_VERSION_ResourceNumberLimitExceededException.md: -------------------------------------------------------------------------------- 1 | 2 | Known Error Message : 3 | 4 | Number of TABLE_VERSION resources exceeds the account limit 1000000 (Service: AWSGlue; Status Code: 400; Error Code: ResourceNumberLimitExceededException; Request ID: 38e9debb-42c5-43a6-b111-a53fa496b2f8) 5 | 6 | 7 | 8 | The reason one gets the above errorr is due to the limit on the number of table versions per Glue account per region which is 1,000,000 [1]. When we update a table in the Glue Data Catalog, the change history is retained as a version. If there are tables that are updated frequently, the number of versions will increase and the above error may occur. 9 | 10 | In order to overcome the issue, one can use any of the following option. 11 | 12 | Option 1 : [Long term fix] Increase the maximum number of table versions in your account by requesting a quota increase[2] . You can update double the limit as mentioned below form the service Quota[2]. 
13 | 14 | Recent quota increase → Change quota value: 2000000 → Request 15 | 16 | Option 2 : [ Short term fix ] You can delete the old unnecessary versions of tables with Glue BatchDeleteTableVersion API[3][4]. If unnecessary versions are accumulated in more than one table, consider taking a permanent measure and periodically taking inventory of the version history. 17 | 18 | → Step 1: You can retrieve the all the versions of the a table using the following CLI command: 19 | 20 | aws glue get-table-versions --database-name Your_DataBase_Name --table-name Your_Table_Name 21 | 22 | → Step 2: You can run 'batch-delete-table-version' to delete multiple versions of a table as below: 23 | 24 | aws glue batch-delete-table-version --database-name Your_DataBase_Name --table-name Your_Table_Name --version-ids 0 1 2 25 | 26 | 27 | Option 3: [ To prevent these error proactively ] Additionally, we can prevent the creation of versions during UpdateTable API call, by setting parameter 'skipArchive' to true[5]. By default, UpdateTable always creates an archived version of the table before updating it. However, if skipArchive is set to true, UpdateTable will not create the archived version. 28 | 29 | 30 | 31 | References: 32 | [1] Service Quotas :- https://docs.aws.amazon.com/general/latest/gr/glue.html#limits_glue - Number of table versions per account : 1,000,000 33 | [2] https://console.aws.amazon.com/servicequotas/home/services/glue/quotas/L-337244C9 34 | [3] BatchDeleteTableVersion action (Python: batch_delete_table_version) :- https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-tables.html#aws-glue-api-catalog-tables-BatchDeleteTableVersion 35 | [4] AWS CLI Command Reference | batch-delete-table-version :- https://docs.aws.amazon.com/cli/latest/reference/glue/batch-delete-table-version.html 36 | [5] AWS Glue SkipArchive https://docs.aws.amazon.com/glue/latest/webapi/API_UpdateTable.html#Glue-UpdateTable-request-SkipArchive 37 | -------------------------------------------------------------------------------- /glue/Unable_to_execute_HTTP_request.md: -------------------------------------------------------------------------------- 1 | When your Glue Job is facing issues with S3 communication and it does not have a Glue Connection attached, and as such the connection would take place over the Internet (from the managed Glue network). 2 | 3 | If suitable, try attempting to use a Glue Connection with your job. You can configured a Glue Connection of type "Network" and specify a Private Subnet in your VPC. This connection now gives you control over which Subnet is used in the networking, and thus you can add a VPC Endpoint for S3, to ensure your traffic to S3 (for that region) remains routed within the the VPC (not needing internet breakout). If you also require internet breakout for the job, then a NAT Gateway is the way to achieve it. 
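As a rough sketch, such a Network-type connection can also be created with boto3 (the connection name, subnet, security group and Availability Zone below are placeholders to replace with your own):

import boto3

glue = boto3.client("glue")

# A NETWORK connection carries no JDBC properties; it only pins the job's VPC networking
# (subnet + security group), which is what lets the S3 VPC endpoint route the traffic.
glue.create_connection(
    ConnectionInput={
        "Name": "my-network-connection",
        "ConnectionType": "NETWORK",
        "ConnectionProperties": {},
        "PhysicalConnectionRequirements": {
            "SubnetId": "subnet-0123456789abcdef0",
            "SecurityGroupIdList": ["sg-0123456789abcdef0"],
            "AvailabilityZone": "us-east-1a",
        },
    }
)

Attach the connection to the Glue job afterwards so that the job's ENIs are created in that private subnet.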
4 | -------------------------------------------------------------------------------- /glue/access_glue_data_catalog.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from pyspark.context import SparkConf 6 | from pyspark.sql import SparkSession 7 | from awsglue.context import GlueContext 8 | from pyspark.sql import HiveContext 9 | from pyspark.sql import SQLContext 10 | from awsglue.job import Job 11 | from awsglue.dynamicframe import DynamicFrame 12 | ## @params: [JOB_NAME] 13 | 14 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 15 | sc= SparkContext() 16 | spark=SparkSession.builder.config("hive.metastore.client.factory.class", "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory").enableHiveSupport().getOrCreate() 17 | glueContext = GlueContext(sc) 18 | 19 | job = Job(glueContext) 20 | job.init(args['JOB_NAME'], args) 21 | 22 | spark.sql("SHOW DATABASES").show() 23 | spark.sql("use test") 24 | spark.sql("show tables").show() 25 | 26 | job.commit() 27 | -------------------------------------------------------------------------------- /glue/access_glue_data_catalog_cross_region_from_emr.md: -------------------------------------------------------------------------------- 1 | Apply Configuration while Launching an EMR - 2 | 3 | [ 4 | { 5 | "Classification": "spark-hive-site", 6 | "Properties": { 7 | "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory", 8 | "aws.region": "us-east-1" 9 | } 10 | }, 11 | { 12 | "Classification": "hive-site", 13 | "Properties": { 14 | "hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory", 15 | "aws.region": "us-east-1" 16 | } 17 | }, 18 | { 19 | "classification": "hdfs-site", 20 | "properties": { 21 | "aws.glue.endpoint": "glue.us-east-1.amazonaws.com" 22 | }, 23 | "configurations": [] 24 | } 25 | ] 26 | 27 | 28 | Apply Configuration on Running EMR Cluster - 29 | 30 | Modify the "/etc/hadoop/conf/hdfs-site.xml" and add the following property 31 | 32 | 33 | aws.glue.endpoint 34 | glue..amazonaws.com 35 | 36 | 37 | Restart hive and hcatalog services - 38 | 39 | $ sudo service hive-server2 stop 40 | $ sudo service hive-hcatalog-server stop 41 | $ sudo service hive-server2 start 42 | $ sudo service hive-hcatalog-server start 43 | -------------------------------------------------------------------------------- /glue/benefits_glue.md: -------------------------------------------------------------------------------- 1 | Speed Of implementation 2 | Less code to Mantain < 10 lines versus 200 + lines for array attributes. 3 | Less overhead for Operations team within an Org. 
4 | Performance for DDB significantly better using AWS glue - about 10X 5 | Solve for patterns - deeply nested jsons , arrays 6 | makes data easy to query 7 | Generalised ETL process 8 | No cluster management 9 | -------------------------------------------------------------------------------- /glue/boilerplate_glue_script.py: -------------------------------------------------------------------------------- 1 | # Boilerplate script into the development endpoint notebook or a Glue ETL script to import the AWS Glue libraries that you need, and set up a single GlueContext: 2 | 3 | import sys 4 | from awsglue.transforms import * 5 | from awsglue.utils import getResolvedOptions 6 | from pyspark.context import SparkContext 7 | from awsglue.context import GlueContext 8 | from awsglue.job import Job 9 | 10 | glueContext = GlueContext(SparkContext.getOrCreate()) 11 | -------------------------------------------------------------------------------- /glue/bookmark_jdbc.py: -------------------------------------------------------------------------------- 1 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "new_stats", table_name = "statsdb_daily_device_usage", transformation_ctx = "datasource0",additional_options = {"hashfield":"start_time","jobBookmarkKeys":["start_time"],"jobBookmarksKeysSortOrder":"asc"}) 2 | +++++++++++ 3 | -------------------------------------------------------------------------------- /glue/bookmark_testing.md: -------------------------------------------------------------------------------- 1 | 1. First, get the job bookmark status to record the "before": 2 | aws glue get-job-bookmark --job-name "YourJobName" 3 | 4 | 2. Then, get the timestamp information of your source files: 5 | aws s3 ls s3://yourdata/sourcepath/ 6 | 7 | 3. Also get the timestamp information of your destination files (where you will be writing your data to: 8 | aws s3 ls s3://yourdata/output/ 9 | 10 | 4. Before starting the job run tests, reset the bookmark completely 11 | aws glue reset-job-bookmark --job-name "YourJobName" 12 | 13 | 5. Confirm its taken effect by looking at the bookmark properties and seeing the reset values: 14 | aws glue get-job-bookmark --job-name "YourJobName" 15 | 16 | 6. Now start your job from the CLI: 17 | aws glue start-job-run --job-name "YourJobName" 18 | 19 | 7. Once the run completes, check to see if it created new files (it should have): 20 | aws s3 ls s3://yourdata/output/ 21 | 22 | 8. Also check that the bookmarks got updated: 23 | aws glue get-job-bookmark --job-name "YourJobName" 24 | 25 | 9. Now run the job once more and let us see if the bookmark work: 26 | aws glue start-job-run --job-name "YourJobName" 27 | 28 | 10. And check status of the files. There should NOT be any new files. 29 | aws s3 ls s3://yourdata/output/ 30 | 31 | 11. But when you check the bookmark, it should have updated: 32 | aws get-job-bookmark --job-name "YourJobName" 33 | 34 | 12. In your source-folder, add a new additional S3 data file. Now run the job once more: 35 | aws glue start-job-run --job-name "YourJobName" 36 | 37 | 13. And let us see if it only processes the new data by checking the S3 output files (there should be new ones): 38 | aws s3 ls s3://yourdata/output/ 39 | 40 | 14. 
And finally, confirm the bookmark was updated: 41 | aws get-job-bookmark --job-name "YourJobName" 42 | -------------------------------------------------------------------------------- /glue/check_glue_job_status.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | client = boto3.client(service_name='glue', region_name='us-east-1', 3 | endpoint_url='https://glue.us-east-1.amazonaws.com') 4 | response = client.start_job_run(JobName='WHICH U CREATED IN CONSOLE') 5 | status = client.get_job_run(JobName=job_name, RunId=response['JobRunId']) 6 | 7 | if status: 8 | state = status['JobRun']['JobRunState'] 9 | while state not in ['SUCCEEDED']: 10 | time.sleep(30) 11 | status = client.get_job_run(JobName=job_name, RunId=response['JobRunId']) 12 | state = status['JobRun']['JobRunState'] 13 | if state in ['STOPPED', 'FAILED', 'TIMEOUT']: 14 | raise Exception('Failed to execute glue job: ' + status['JobRun']['ErrorMessage'] + '. State is : ' + state) 15 | -------------------------------------------------------------------------------- /glue/check_install_pkgs.py: -------------------------------------------------------------------------------- 1 | from google.cloud import storage 2 | import pkg_resources 3 | installed_packages = pkg_resources.working_set 4 | for package in installed_packages: 5 | print(package) 6 | -------------------------------------------------------------------------------- /glue/configparser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import configparser 3 | 4 | import boto3 5 | 6 | from io import StringIO 7 | 8 | from awsglue.utils import getResolvedOptions 9 | args = getResolvedOptions(sys.argv, ['env']) 10 | 11 | str = args['env'] 12 | list = str.split('/') 13 | key = str.split("/")[2] + "/" + str.split("/")[3] + "/" + str.split("/")[4] 14 | 15 | s3 = boto3.resource('s3') 16 | #obj = s3.Object(list[2], key) 17 | obj = s3.Object(bucket_name='configparser', key='xxx/configparser/config.ini') 18 | buf = StringIO(obj.get()['Body'].read().decode('utf-8')) 19 | 20 | config = ConfigParser.ConfigParser() 21 | config.readfp(buf) 22 | print(config.get('SectionOne', 'Status')) 23 | 24 | -------------------------------------------------------------------------------- /glue/control_output_partitions.py: -------------------------------------------------------------------------------- 1 | logs_DyF = glueContext.create_dynamic_frame.from_catalog(database="amzn_review", table_name="mydata_amazonreviews", transformation_ctx = "datasource0") 2 | logs_DF=logs_DyF.toDF() 3 | logs_DF.show() 4 | print (logs_DF.show()) 5 | print ("The number of partitions in source is") 6 | print (logs_DF.rdd.getNumPartitions()) 7 | logs_DF=logs_DF.repartition(50) 8 | 9 | logs_DyF2=DynamicFrame.fromDF(logs_DF, glueContext, "logs_DyF2") 10 | datasink2 = glueContext.write_dynamic_frame.from_options( frame = logs_DyF2, connection_type = "s3", connection_options = {"path": "s3://xxx-xxx/xxx/output/", "partitionKeys" : ["product_category"] }, format = "parquet", transformation_ctx = "datasink2") 11 | -------------------------------------------------------------------------------- /glue/control_partitions_glue.py: -------------------------------------------------------------------------------- 1 | You can control the number of files/ size-of-files being written out by choosing to reparation the data before the write. 
If the number of output files are many, I'd recommend calling dataframe.repartition(x) just before the write operation. A code snippet would look like this: 2 | '''' 3 | 4 | logs_DyF = glueContext.create_dynamic_frame.from_catalog(database="amzn_review", table_name="mydata_amazonreviews", transformation_ctx = "datasource0") 5 | logs_DF=logs_DyF.toDF() 6 | logs_DF.show() 7 | print (logs_DF.show()) 8 | print ("The number of partitions in source is") 9 | print (logs_DF.rdd.getNumPartitions()) 10 | logs_DF=logs_DF.repartition(50) 11 | 12 | logs_DyF2=DynamicFrame.fromDF(logs_DF, glueContext, "logs_DyF2") 13 | datasink2 = glueContext.write_dynamic_frame.from_options( frame = logs_DyF2, connection_type = "s3", connection_options = {"path": "s3://xx-xx/7017122531/output/", "partitionKeys" : ["product_category"] }, format = "parquet", transformation_ctx = "datasink2") 14 | ''' 15 | 16 | Test the values of the X(partition number) to see if this work with your dataset. This link[2] has a reference to how you can get the number of ideal partitions: 17 | Total input dataset size / partition size => number of partitions 18 | 19 | 20 | --- 21 | [1]https://aws.amazon.com/premiumsupport/faqs/ 22 | [2]https://dzone.com/articles/apache-spark-performance-tuning-degree-of-parallel 23 | 24 | 25 | 26 | 27 | The number of files that get written out is controlled by the parallelization of your DataFrame or RDD. So if your data is split across 10 Spark partitions you cannot write fewer than 10 files without reducing partitioning (e.g. coalesce or repartition). 28 | 29 | Now, having said that when data is read back in it could be split into smaller chunks based on your configured split size but depending on format and/or compression. 30 | 31 | If instead you want to increase the number of files written per Spark partition (e.g. to prevent files that are too large), Spark 2.2 introduces a maxRecordsPerFile option when you write data out. With this you can limit the number of records that get written per file in each partition. The other option of course would be to repartition. 32 | 33 | The following will result in 2 files being written out even though it's only got 1 partition: 34 | 35 | val df = spark.range(100).coalesce(1) 36 | df.write.option("maxRecordsPerFile", 50).save("/tmp/foo") 37 | 38 | -------------------------------------------------------------------------------- /glue/copyfiles/cpy_s3_to_s3.py: -------------------------------------------------------------------------------- 1 | import os 2 | sync_command = f"aws s3 sync " 3 | os.system(sync_command) 4 | -------------------------------------------------------------------------------- /glue/copyfiles/glue_pySpark.py: -------------------------------------------------------------------------------- 1 | ~~~~~~~~~~ 2 | # Don't forget to include these job package imports. 3 | import boto3 4 | import os 5 | from subprocess import call 6 | import sys 7 | 8 | # Initalize variables for the original S3 bucket source and the S3 bucket target destination. 9 | copy_target_path = "destination_target_bucket/target_folder" 10 | 11 | # Note: I assume that `source_s3_bucket` and `source_folder` are already initialized in this Spark Glue ETL job. This is for the sake of the example. 12 | source_s3_bucket = "original_source_bucket" 13 | source_folder = "source_folder" 14 | 15 | # Read the bash script file from S3, where the bash script location is: `s3://MYBUCKET/MYFOLDER/syncs3object.sh`. 
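# The Glue job's IAM role needs s3:ListBucket/s3:GetObject on this bucket; get_object returns the
# script body as bytes, which is what is later handed to subprocess.call below.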
16 | bash_script_s3_bucket_name = "MYBUCKET" 17 | bash_script_s3_prefix_location = "MYFOLDER/syncs3object.sh" 18 | script_contents = "" 19 | s3 = boto3.client("s3") 20 | result = s3.list_objects(Bucket=bash_script_s3_bucket_name, Prefix=bash_script_s3_prefix_location) 21 | for o in result.get("Contents"): 22 | data = s3.get_object(Bucket=source_s3_bucket, Key=o.get("Key")) 23 | script_contents = data["Body"].read() 24 | 25 | # Run the bash script to run `aws s3 sync s3://source-bucket/source-path s3://destination-bucket/destination-path`. 26 | rc = call([script_contents, "", source_s3_bucket, source_folder, copy_target_path], shell=True) 27 | 28 | print("Copied!") 29 | ~~~~~~~~~~ 30 | -------------------------------------------------------------------------------- /glue/copyfiles/s3tos3_copy_python.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import os 3 | from subprocess import call 4 | import sys 5 | 6 | ### ===== Method 1: Read from the bash script file from the job environment ===== 7 | # # Functions to help locate the file. 8 | # def _find(filename): 9 | # for dirname in sys.path: 10 | # candidate = os.path.join(dirname, filename) 11 | # if os.path.isfile(candidate): 12 | # return candidate 13 | # raise Error("Can't find file %s" % filename) 14 | 15 | # def findFile(filename): 16 | # return _find(filename) 17 | 18 | # # Locating the bash script `syncs3object.sh` in the Glue environment. 19 | # bash_script_path = findFile("syncs3object.sh") 20 | # with open(bash_script_path, 'rb') as file: 21 | # script = file.read() 22 | ### ===== End method 1. ===== 23 | 24 | ### ===== Method 2: Read from S3 object directly. ===== 25 | # Initalize variables for the original S3 bucket source and the S3 bucket target destination. 26 | source_s3_bucket = "original_source_bucket" 27 | source_folder = "source_folder" 28 | copy_target_path = "destination_target_bucket/target_folder" 29 | 30 | # Assuming the bash script location is: `s3://MYBUCKET/MYFOLDER/syncs3object.sh`. 31 | bash_script_s3_bucket_name = "MYBUCKET" 32 | bash_script_s3_prefix_location = "MYFOLDER/syncs3object.sh" 33 | script_contents = "" 34 | 35 | # Read the bash script file from S3 36 | s3 = boto3.client('s3') 37 | result = s3.list_objects(Bucket=bash_script_s3_bucket_name, Prefix=bash_script_s3_prefix_location) 38 | for o in result.get('Contents'): 39 | data = s3.get_object(Bucket=source_s3_bucket, Key=o.get('Key')) 40 | script_contents = data['Body'].read() 41 | ### ===== End method 2. ===== 42 | 43 | # Run the bash script to run `aws s3 sync s3://source-bucket/source-path s3://destination-bucket/destination-path`. 
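# With shell=True and a list argument, the first element is the command string the shell executes and the
# remaining elements become the shell's positional parameters ($0, $1, $2, ...): the empty string fills $0,
# so bucket/key/target arrive as $1/$2/$3, matching bucket=$1, key=$2, target_path=$3 in syncs3objects.sh.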
44 | rc = call([script_contents, "", source_s3_bucket, source_folder, copy_target_path], shell=True) 45 | 46 | print("Copied!") 47 | -------------------------------------------------------------------------------- /glue/copyfiles/syncs3objects.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | aws configure set default.s3.max_concurrent_requests 1000 4 | aws configure set default.s3.max_queue_size 10000 5 | 6 | bucket=$1 7 | key=$2 8 | target_path=$3 9 | 10 | # echo "aws s3 sync s3://$bucket/$key/ s3://$target_path/" 11 | aws s3 sync s3://$bucket/$key/ s3://$target_path/ 12 | -------------------------------------------------------------------------------- /glue/create_python_udf.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark import SQLContext 3 | from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType 4 | from pyspark.sql.functions import udf 5 | from pyspark.sql import Row 6 | 7 | 8 | conf = pyspark.SparkConf() 9 | 10 | 11 | sc = pyspark.SparkContext.getOrCreate(conf=conf) 12 | spark = SQLContext(sc) 13 | 14 | schema = StructType([ 15 | StructField("sales", FloatType(),True), 16 | StructField("employee", StringType(),True), 17 | StructField("ID", IntegerType(),True) 18 | ]) 19 | 20 | data = [[ 10.2, "Fred",123]] 21 | 22 | df = spark.createDataFrame(data,schema=schema) 23 | 24 | colsInt = udf(lambda z: toInt(z), IntegerType()) 25 | spark.udf.register("colsInt", colsInt) 26 | 27 | def toInt(s): 28 | if isinstance(s, str) == True: 29 | st = [str(ord(i)) for i in s] 30 | return(int(''.join(st))) 31 | else: 32 | return Null 33 | 34 | 35 | df2 = df.withColumn( 'semployee',colsInt('employee')) 36 | 37 | 38 | #Ref: https://www.bmc.com/blogs/how-to-write-spark-udf-python/ 39 | -------------------------------------------------------------------------------- /glue/cross_account_copy_from_s3_to_s3.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | s3 = boto3.resource('s3') 3 | copy_source = { 4 | 'Bucket': 'aws-logs-456690477084-us-west-2', 5 | 'Key': 'bookmark_test.py' 6 | } 7 | bucket = s3.Bucket('aws-jupyterhubtest') 8 | bucket.copy(copy_source, 'temp/bookmark_test.py') 9 | -------------------------------------------------------------------------------- /glue/cross_account_cross_region_read_write_ddb_from_glue.py: -------------------------------------------------------------------------------- 1 | #Step 1 . Create an IAM role in account A. When defining the permissions of the role, you can choose to attach existing policies such as AmazonDynamoDBFullAccess. 2 | 3 | #Step 2: Follow step 2 in the tutorial to allow account B to switch to the newly-created role. 
The following example creates a new policy with the following #statement: 4 | 5 | #{ 6 | # "Version": "2012-10-17", 7 | # "Statement": { 8 | # "Effect": "Allow", 9 | # "Action": "sts:AssumeRole", 10 | # "Resource": "" 11 | # } 12 | #} 13 | 14 | #Step 3: Assume the Role in the AWS Glue Job Script 15 | 16 | #For a cross-account read across regions: 17 | 18 | import sys 19 | from pyspark.context import SparkContext 20 | from awsglue.context import GlueContext 21 | from awsglue.job import Job 22 | from awsglue.utils import getResolvedOptions 23 | 24 | args = getResolvedOptions(sys.argv, ["JOB_NAME"]) 25 | glue_context= GlueContext(SparkContext.getOrCreate()) 26 | job = Job(glue_context) 27 | job.init(args["JOB_NAME"], args) 28 | 29 | dyf = glue_context.create_dynamic_frame_from_options( 30 | connection_type="dynamodb", 31 | connection_options={ 32 | "dynamodb.region": "us-east-1", 33 | "dynamodb.input.tableName": "test_source", 34 | "dynamodb.sts.roleArn": "" 35 | } 36 | ) 37 | dyf.show() 38 | job.commit() 39 | 40 | 41 | #For a read and cross-account write across regions: 42 | 43 | 44 | import sys 45 | from pyspark.context import SparkContext 46 | from awsglue.context import GlueContext 47 | from awsglue.job import Job 48 | from awsglue.utils import getResolvedOptions 49 | 50 | args = getResolvedOptions(sys.argv, ["JOB_NAME"]) 51 | glue_context= GlueContext(SparkContext.getOrCreate()) 52 | job = Job(glue_context) 53 | job.init(args["JOB_NAME"], args) 54 | 55 | dyf = glue_context.create_dynamic_frame_from_options( 56 | connection_type="dynamodb", 57 | connection_options={ 58 | "dynamodb.region": "us-east-1", 59 | "dynamodb.input.tableName": "test_source" 60 | } 61 | ) 62 | dyf.show() 63 | 64 | glue_context.write_dynamic_frame_from_options( 65 | frame=dyf, 66 | connection_type="dynamodb", 67 | connection_options={ 68 | "dynamodb.region": "us-west-2", 69 | "dynamodb.output.tableName": "test_sink", 70 | "dynamodb.sts.roleArn": "" 71 | } 72 | ) 73 | 74 | job.commit() 75 | -------------------------------------------------------------------------------- /glue/cross_account_s3_access_using_IAMRole.md: -------------------------------------------------------------------------------- 1 | S3 bucket Example policy : 2 | 3 | 4 | { 5 | "Version": "2012-10-17", 6 | "Statement": [ 7 | { 8 | "Sid": "DelegateS3Access", 9 | "Effect": "Allow", 10 | "Principal": { 11 | "AWS": "arn:aws:iam::destination-aws-account-id:role/destination-IAM-role-arn" 12 | }, 13 | "Action": [ 14 | "s3:Get*", 15 | "s3:List*", 16 | "s3:Put*" 17 | ], 18 | "Resource": [ 19 | "arn:aws:s3:::source-s3-bucket/*", 20 | "arn:aws:s3:::source-s3-bucket" 21 | ] 22 | } 23 | ] 24 | } 25 | 26 | 27 | IAM Role Inline Policy Example : 28 | 29 | { 30 | "Version": "2012-10-17", 31 | "Statement": [ 32 | { 33 | "Effect": "Allow", 34 | "Action": [ 35 | "s3:ListBucket", 36 | "s3:GetObject*", 37 | "s3:GetBucketLocation", 38 | "s3:GetObjectTagging", 39 | "s3:PutObject", 40 | "s3:PutObjectAcl", 41 | "s3:PutObjectTagging" 42 | ], 43 | "Resource": [ 44 | "arn:aws:s3:::source-s3-bucket", 45 | "arn:aws:s3:::source-s3-bucket/*" 46 | ] 47 | }, 48 | { 49 | "Effect": "Allow", 50 | "Action": [ 51 | "s3:ListBucket", 52 | "s3:PutObject", 53 | "s3:PutObjectAcl" 54 | ], 55 | "Resource": [ 56 | "arn:aws:s3:::destination-s3-bucket", 57 | "arn:aws:s3:::destination-s3-bucket/*" 58 | ] 59 | } 60 | ] 61 | } 62 | -------------------------------------------------------------------------------- /glue/cross_region_glue_connection.md: 
-------------------------------------------------------------------------------- 1 | Use case : If you have a glue "connection" failure in say us-west-2 region and you try to connect to the redshift/postgres etc. cluster running in different region say us-east-1. 2 | 3 | * when a glue connection is created using the VPC id (vpc-a) in us-west-2 region whereas the redshift cluster uses VPC id (vpc-b) in us-east-1. Since the resources are in different VPCs, one must enable VPC peering so that the glue jobs in us-west-2 can connect to the redshift cluster in us-east-1. 4 | 5 | * I would like to highlight that, VPC peering connection is a networking connection between two VPCs that enables you to route traffic between them using private IPv4 addresses or IPv6 addresses[1]. 6 | 7 | * After creating the VPC peering connection[2], one must update the VPC subnets route tables to allow traffic from the accepted VPC CIDR. Please refer the documentation[3] for more details to configure route tables. 8 | 9 | * Update the redshift cluster security group to allow traffic from the us-west-2 VPC CIDR. 10 | 11 | * To test the connection, one can test EC2 instance using the same network properties(subnet & security group) used for glue connection and use the below telnet command to check the connectivity to redshift. 12 | 13 | $ sudo yum install telnet -y 14 | $ telnet 15 | 16 | * Using telnet command one should be able to connect to the redshift cluster and then the glue "test connection" will work too. 17 | 18 | -References: 19 | ============= 20 | [1] https://docs.aws.amazon.com/vpc/latest/peering/what-is-vpc-peering.html 21 | [2] https://docs.aws.amazon.com/vpc/latest/peering/create-vpc-peering-connection.html 22 | [3] https://docs.aws.amazon.com/vpc/latest/peering/peering-configurations-full-access.html 23 | 24 | -------------------------------------------------------------------------------- /glue/custom_jdbc_mysql8.md: -------------------------------------------------------------------------------- 1 | 1. Create Glue JDBC connection for JDBC. Fill the correct syntax, but do not worry about the details because they will not be used. The important part is the Security Group, VPC and Subnet to ensure the cluster networking is created correctly to reach the database. 2 | 3 | 2. Upload the attached Jar into an S3 directory and please make sure your Glue Role have access to that s3 directory. 4 | 5 | 3. Edit your Glue Job and under the Security Configuration, select the jar under the "Dependent jars path". 6 | 7 | 4. Now edit your job script. To use the driver to read your table[1], you will need to use the following additional code: 8 | 9 | Code Snippet - Please modify the below. 10 | ======================================================== 11 | sc = SparkContext() 12 | glueContext = GlueContext(sc) 13 | spark = glueContext.spark_session 14 | connection_mysql8_options = { 15 | "url": "jdbc:mysql://:3306/db", 16 | "dbtable": "test", 17 | "user": "admin", 18 | "password": "pwd", 19 | "customJdbcDriverS3Path": "s3://path/mysql-connector-java-8.0.17.jar", 20 | "customJdbcDriverClassName": "com.mysql.cj.jdbc.Driver"} 21 | 22 | df_mysql8 = glueContext.create_dynamic_frame.from_options(connection_type="mysql",connection_options=connection_mysql8_options) 23 | ### rest of the code 24 | ======================================================== 25 | 26 | 6. Run this ETL job and please test it. 
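As a quick sanity check for the final step, the DynamicFrame returned above can be inspected before wiring up the rest of the ETL logic. This is only a sketch and reuses the df_mysql8 name from the snippet above.

========================================================
# Optional sanity check - confirm the custom MySQL 8 driver read worked
print("Row count: ", df_mysql8.count())
df_mysql8.printSchema()
df_mysql8.toDF().show(5)
========================================================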
27 | -------------------------------------------------------------------------------- /glue/datacatalog_migration.md: -------------------------------------------------------------------------------- 1 | Unfortunately, there is no built-in solution to migrate Glue databases and tables across accounts; it requires a custom solution. Following are some of the options you can explore: 2 | 3 | 4 | 1-1. Manually migrate the metadata using Glue APIs 5 | 6 |
In this two-part process, you need to use Glue APIs such as GetDatabases, GetTables, GetPartitions, GetConnections and GetUserDefinedFunctions to retrieve the metadata and store it in intermediate storage such as S3.
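For illustration, here is a minimal boto3 sketch of this Get*/Create* pattern for tables only. The profile names, region and database name are placeholder assumptions, and the target database is assumed to already exist.

import boto3

# Placeholder profiles/region/database - adjust to your two accounts
src = boto3.Session(profile_name="source-account").client("glue", region_name="us-east-1")
dst = boto3.Session(profile_name="target-account").client("glue", region_name="us-east-1")

database = "mydb"
allowed = ("Name", "Description", "Owner", "Retention", "StorageDescriptor",
           "PartitionKeys", "TableType", "Parameters")

for page in src.get_paginator("get_tables").paginate(DatabaseName=database):
    for table in page["TableList"]:
        # CreateTable accepts only TableInput fields, so drop read-only attributes
        table_input = {k: v for k, v in table.items() if k in allowed}
        dst.create_table(DatabaseName=database, TableInput=table_input)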
 Then use their corresponding CreateTable APIs (similar APIs for other corresponding Get-calls) at the other account to recreate the Data Catalog. 7 | 8 | 1-2. GitHub Script for cross-account Glue Catalog replication [2] 9 | 10 | I found a thread [1] which talks about a solution on Github [2] and uses ETL jobs to migrate Glue catalog from one account to another. I also found a third party [3] blog mentioning backup and restore of glue catalog with Python. You can refer to these solutions for your use case of migration.There is an example on GitHub[2] where they discuss a solution for moving the Catalog between two accounts. It uses two ETL jobs to export the entire data catalog from one account and import it in another account using S3 as an intermediate storage. Please note that this script isn't supported by AWS, officially. You'll need to test the script and modify it according to your scenario. 11 | 12 | 1-3 : Using start_query_execution API[4]. You would have to build custom solution to get the tables first within a given Glue database and then use this specific API to run multiple SQL queries ( show create table ) one by one and then get the DDL as an output. 13 | 14 | 15 | 16 | 17 | References: 18 | [1] https://forums.aws.amazon.com/thread.jspa?messageID=942813 19 | [2] https://github.com/aws-samples/aws-glue-samples/tree/master/utilities/Hive_metastore_migration#aws-glue-data-catalog-to-another-aws-glue-data-catalog 20 | [3] https://www.redaelli.org/matteo/posts/how-to-backup-and-restore-glue-data-catalog/ 21 | [4] https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/athena.html#Athena.Client.start_query_execution 22 | -------------------------------------------------------------------------------- /glue/ddb_to_s3.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import datetime 4 | from awsglue.utils import getResolvedOptions 5 | from pyspark.context import SparkContext 6 | from awsglue.context import GlueContext 7 | 8 | args = getResolvedOptions(sys.argv, 9 | ['JOB_NAME']) 10 | 11 | sc = SparkContext() 12 | glueContext = GlueContext(sc) 13 | 14 | table = glueContext.create_dynamic_frame.from_options( 15 | "dynamodb", 16 | connection_options={ 17 | "dynamodb.input.tableName": hello, 18 | "dynamodb.throughput.read.percent": 1.0 19 | } 20 | ) 21 | 22 | glueContext.write_dynamic_frame.from_options( 23 | frame=table, 24 | connection_type="s3", 25 | connection_options={ 26 | "path": s3://xx/ddbs3 27 | }, 28 | format=parquet, 29 | transformation_ctx="datasink" 30 | ) 31 | -------------------------------------------------------------------------------- /glue/desc_vpc.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | client = boto3.client('ec2',region_name='us-west-2') 3 | vpc_info=client.describe_vpcs() 4 | print(vpc_info) 5 | -------------------------------------------------------------------------------- /glue/docdb.py: -------------------------------------------------------------------------------- 1 | import pymongo 2 | import sys 3 | import boto3 4 | from pyspark import SparkContext, SparkConf 5 | conf = SparkConf().setAppName(appName).setMaster(master) 6 | sc = SparkContext(conf=conf) 7 | 8 | s3_client = boto3.client('s3') 9 | 10 | # Download the file from S3 11 | s3_client.download_file('xxx', 'docdb_cert/rds-combined-ca-bundle.pem', '/tmp/rds-combined-ca-bundle.pem') 12 | 13 | 14 | 15 | ## Create a MongoDB client, open a connection to Amazon 
DocumentDB as a replica set and specify the read preference as secondary preferred 16 | client = pymongo.MongoClient('mongodb://ssl:xxx#@docdb-ssl-enabled.cluster-xx.us-west-2.docdb.amazonaws.com:27017/?ssl=true&ssl_ca_certs=rds-combined-ca-bundle.pem&replicaSet=rs0&readPreference=secondaryPreferred&retryWrites=false') 17 | 18 | ##Specify the database to be used 19 | db = client.test 20 | 21 | ##Specify the collection to be used 22 | col = db.ssl 23 | 24 | ##Insert a single document 25 | col.insert_one({'hello':'Amazon DocumentDB'}) 26 | 27 | ##Find the document that was previously written 28 | x = col.find_one({'hello':'Amazon DocumentDB'}) 29 | 30 | ##Print the result to the screen 31 | print(x) 32 | 33 | ##Close the connection 34 | client.close() 35 | -------------------------------------------------------------------------------- /glue/dynfrm_to_df.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from awsglue.context import GlueContext 6 | from awsglue.job import Job 7 | 8 | from awsglue.dynamicframe import DynamicFrame 9 | from pyspark.sql.functions import udf 10 | from pyspark.sql.types import * 11 | import re 12 | 13 | ## @params: [JOB_NAME] 14 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 15 | 16 | sc = SparkContext() 17 | glueContext = GlueContext(sc) 18 | spark = glueContext.spark_session 19 | job = Job(glueContext) 20 | job.init(args['JOB_NAME'], args) 21 | 22 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "default", table_name = "aaa", transformation_ctx = "datasource0") 23 | 24 | applymapping1 = ApplyMapping.apply(frame = datasource0, mappings = [("col0", "long", "aa", "long")], transformation_ctx = "applymapping1") 25 | 26 | 27 | 28 | dropnullfields3 = DropNullFields.apply(frame = resolvechoice2, transformation_ctx = "dropnullfields3") 29 | 30 | 31 | ##### start code to add field and use partions 32 | def extractYear(x): 33 | # 11/1/2014 00:00:11 34 | # 2019-02-01 00:00:06.2570 35 | #z = re.search(r"(\d{4}) |^(\d{4})", x) 36 | z = re.search(r"\/(\d{4})$|^(\d{4})", x) 37 | if z.group(1): 38 | return z.group(1) 39 | else: 40 | return z.group(2) 41 | udfExtractYear = udf(extractYear, StringType()) 42 | 43 | def extractMonth(x): 44 | # 11/1/2014 00:00:11 45 | # 2019-02-01 00:00:06.2570 46 | z = re.search("^(\d+)\/|\-(\d+)\-", x) 47 | if z.group(1): 48 | return int(z.group(1)) 49 | else: 50 | return int(z.group(2)) 51 | udfExtractMonth = udf(extractMonth, IntegerType()) 52 | 53 | def extractQuarter(x): 54 | # 11/1/2014 00:00:11 55 | # 2019-02-01 00:00:06.2570 56 | month = "" 57 | quarter = "Q1" 58 | z = re.search("^(\d+)\/|\-(\d+)\-", x) 59 | if z.group(1): 60 | month = int(z.group(1)) 61 | else: 62 | month = int(z.group(2)) 63 | 64 | if month >=1 and month <=3: 65 | quarter = "Q1" 66 | if month >=4 and month <=6: 67 | quarter = "Q2" 68 | if month >=7 and month <=10: 69 | quarter = "Q3" 70 | if month >=10 and month <=12: 71 | quarter = "Q4" 72 | return quarter 73 | udfExtractQuarter = udf(extractQuarter, StringType()) 74 | 75 | df = dropnullfields3.toDF() 76 | 77 | # Filter out null number_borrowers 78 | ## df = df.filter(df.["number of borrowers"].isNotNull()) 79 | 80 | # Add some columns 81 | df = df.withColumn("year", udfExtractYear(df["dd"])) 82 | df = df.withColumn("month", udfExtractMonth(df["dd"])) 83 | df = df.withColumn("quarter", 
udfExtractQuarter(df["dd"])) 84 | df.printSchema() 85 | df.show(2) 86 | 87 | # Save it, partition. 88 | df.write.mode("overwrite").partitionBy("year").parquet("s3://xxx/Glue/data777/") 89 | 90 | 91 | job.commit() 92 | -------------------------------------------------------------------------------- /glue/enable-s3-parquet-optimized-committer.md: -------------------------------------------------------------------------------- 1 | The prime reason and advantage for setting up this property is as follows : 2 | 3 | Whenever you or one of your applications writes a file to S3, there's a short window of time where the file needs to be propagated throughout the S3 backend system. If you try to access that file within that window of time (as in, immediately after writing it), there's a chance the file has not finished propagating and S3 returns an error.It helps to avoid issue that can occur with Amazon S3 eventual consistency during job and task commit phases, and helps improve job correctness under task failure conditions. 4 | 5 | Glue ETL Jobs run on the Apache Spark framework, which by default writes all output to a temporary directory in S3 and when all executors have finished writing, files are moved from this temporary directory to your selected destination path. S3 does not have the concept of directories (everything is a named prefix), so this move operation is basically just a rename to change the file's prefix. Sometime per my experience and knowledge with the Spark and the GlueETL , the jobs fails while writing the data to s3. The best way to address this is to enable the EMRFS S3-optimized committer [1] which is available in Glue [2] and removes such errors by using an optimized S3 write logic. 6 | -------------------------------------------------------------------------------- /glue/get_all_free_ips.py: -------------------------------------------------------------------------------- 1 | # Collect available IP adressses for subnets 2 | 3 | import boto3 4 | 5 | def lambda_handler(event, context): 6 | ec2 = boto3.client('ec2') 7 | sns = boto3.client('sns') 8 | 9 | result = ec2.describe_subnets( Filters=[{'Name': 'state', 'Values': ['available']}]) 10 | 11 | notify_message = """ 12 | """ 13 | for subnet in result['Subnets']: 14 | m = "Available IP's in subnet %s is %d" % (subnet['SubnetId'], subnet['AvailableIpAddressCount']) 15 | print m 16 | notify_message = notify_message+"\n"+m 17 | 18 | topicArn = 'arn:aws:sns:my_sns_topic' 19 | 20 | sns.publish( 21 | TopicArn = topicArn, 22 | Message = notify_message 23 | 24 | ) 25 | -------------------------------------------------------------------------------- /glue/get_all_glue_internal_config.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from pyspark.context import SparkContext 4 | from awsglue.context import GlueContext 5 | from awsglue.job import Job 6 | ``` 7 | sc = SparkContext() 8 | glueContext = GlueContext(sc) 9 | spark = glueContext.spark_session 10 | logger = glueContext.get_logger() 11 | 12 | logger.info(str(sc._conf.getAll())) 13 | 14 | iterator = sc._jsc.hadoopConfiguration().iterator() 15 | while iterator.hasNext(): 16 | prop = iterator.next() 17 | logger.info("key: " + prop.getKey() + "value : " + prop.getValue()) 18 | ``` 19 | -------------------------------------------------------------------------------- /glue/get_partitions_cli.sh: -------------------------------------------------------------------------------- 1 | Example 1 : aws 
glue get-partitions --database-name mydb --table-name mytable --expression "category=1" --region us-west-2 2 | Explanation : The way above CLI command will work is - it will fetch the partitions that match the above key criteria i.e. "category=1" . 3 | 4 | Example 2 : aws glue get-partitions --database-name mydb --table-name mytable --expression category=1\ AND\ date=\'2018-08-30\' --region us-west-2 5 | Explanation : In this another example - it demonstrates we are trying to fetch partitons information based on the expression which contains multiple partitions keys defined on the table present in the glue data catalog . 6 | -------------------------------------------------------------------------------- /glue/glue2.0_datatype.py: -------------------------------------------------------------------------------- 1 | created a postgreSQL table: 2 | 3 | create table test(id numeric, name varchar); 4 | 5 | insert into test (id,name) values(1.20,'abc') ; 6 | 7 | ####### 8 | 9 | df = spark.read.format("jdbc").option("url","jdbc:postgresql://postgresssql.chj4saov4rkl.us-east-1.rds.amazonaws.com:5432/postgres").option("user", "postgres").option("password", "xxx,123").option("dbTable", "test").option("fetchSize", "50000").option("driver", "org.postgresql.Driver").load() 10 | 11 | print("count") 12 | print(df.count()) 13 | print(df.printSchema()) 14 | -------------------------------------------------------------------------------- /glue/glue_VPC_jobs.md: -------------------------------------------------------------------------------- 1 | In the case where Glue needs to access data stores in a VPC, a VPC endpoint is required for S3. Glue sets up elastic network interfaces to enable jobs to connect securely to resources within a VPC, where each elastic network interface is assigned a private IP address from the IP address range within the relevant subnet. Use of an S3 VPC endpoint allows Glue to use private IP addresses to access Amazon S3, meaning that traffic between the VPC and S3 stays private and does not leave the Amazon network (i.e. it does not use the public internet). 2 | 3 | Thirdly, and again in the case where Glue needs to access data stores in a VPC, a self-referencing rule is required in the security group assigned to each data store (e.g. Amazon Redshift or RDS). This rule needs to allow access over all TCP ports (inbound and outbound) with the security group itself as a source. This allows for traffic between Glue components and the data store in the VPC. 4 | -------------------------------------------------------------------------------- /glue/glue_and_teradata.md: -------------------------------------------------------------------------------- 1 | As we are aware, Teradata is not natively supported by AWS Glue. With that being said, it is still possible to connect to a Teradata database with use of a dummy connection. 2 | 3 | I have outlined the steps you would need to take to connect to your Teradata On-Prem/Cloud database. 4 | 5 | Step 1: 6 | 7 | Create a Dummy JDBC connection. You will need create a dummy JDBC connection and provide the VPC and subnet configuration which has network connectivity to your On-Prem Teradata database. Please note the JDBC URL is not important and can be as follows: jdbc:mysql://xxx-cluster.cluster-xxx.us-east-1.rds.amazonaws.com:3306/dummy 8 | 9 | The important configuration in this step is to provide the correct VPC, private subnet and security group settings. Please note the subnet cannot be a public subnet (subnet with route to IGW). 
10 | 11 | AWS Glue creates elastic network interfaces (ENIs) in the VPC/private subnet. These network interfaces then provide network connectivity for AWS Glue through your VPC. 12 | 13 | I recommend to read through the AWS blog [1] to get more insights into the network architecture and examples of the network configuration to set this up. 14 | 15 | Step 2: 16 | 17 | Create a Glue ETL Job and add the dummy connection to the job. You will also need to provide the S3 location of the JDBC Driver. 18 | 19 | The Teradata blog [2] outlines the steps to create a job and provides the driver required to connect to the database. 20 | 21 | You can then provide the connection details within your script to connect to the database. The blog also provides code samples to connect to the Teradata database and should be edited as per your requirement. 22 | 23 | Providing the connection string within the Glue ETL script allows Glue to create secondary ENI's in the VPC and subnet you have configured to connect to the database. 24 | 25 | 26 | References: 27 | [1] https://aws.amazon.com/blogs/big-data/how-to-access-and-analyze-on-premises-data-stores-using-aws-glue/ 28 | [2] https://www.teradata.com/Blogs/Teradata-and-AWS-Glue 29 | [3] https://aws.amazon.com/blogs/big-data/use-aws-glue-to-run-etl-jobs-against-non-native-jdbc-data-sources/ 30 | [4] https://kontext.tech/column/spark/315/connect-to-teradata-in-pyspark-via-jdbc 31 | 32 | -------------------------------------------------------------------------------- /glue/glue_conf_check.py: -------------------------------------------------------------------------------- 1 | print(sc._conf.getAll()) 2 | 3 | To set --conf parameters in AWS Glue Spark Job, you can refer the below steps & set the multiple --conf parameters. 4 | 5 | 1. Navigate to the 'Script libraries and job parameters (optional)' from the glue job console -> Job parameters -> enter key/value pair' 6 | 7 | 2. Step 2: In Glue Job we can not pass multiple "--conf" values but as a workaround we can use the below method in Job parameter as key/value. 8 | -- 9 | key: --conf 10 | 11 | value: Spark.executor.memory = 16g --conf Spark.yarn.memoryoverhead = 10g 12 | --conf Spark.sql.broadcastTimeout = 600 13 | --conf Spark.sql.autoBroadcastJoinThreshold = 50485760 14 | --conf Spark.dynamicAllocation.minExecutors 20 15 | 16 | Therefore define only one job parameter with "Key" as --conf followed by value in "Value" input box with other --conf parameters in the same "Value" box. 
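The same key/value pair can also be supplied programmatically when starting the job. Below is a minimal boto3 sketch, assuming a job named my-glue-job and using the example values above with standard Spark property spellings.

import boto3

glue = boto3.client("glue", region_name="us-west-2")

# One --conf key; the remaining "--conf k=v" pairs are packed into its value
conf_value = ("spark.executor.memory=16g "
              "--conf spark.executor.memoryOverhead=10g "
              "--conf spark.sql.broadcastTimeout=600 "
              "--conf spark.sql.autoBroadcastJoinThreshold=50485760 "
              "--conf spark.dynamicAllocation.minExecutors=20")

glue.start_job_run(JobName="my-glue-job", Arguments={"--conf": conf_value})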
17 | -------------------------------------------------------------------------------- /glue/glue_connection_ssm_secret.yaml: -------------------------------------------------------------------------------- 1 | 2 | --- 3 | AWSTemplateFormatVersion: '2010-09-09' 4 | # Sample CFN YAML to demonstrate creating a connection 5 | # 6 | # Parameters section contains names that are substituted in the Resources section 7 | # These parameters are the names the resources created in the Data Catalog 8 | Parameters: 9 | # The name of the connection to be created 10 | CFNConnectionName: 11 | Type: String 12 | Default: newishan 13 | CFNJDBCString: 14 | Type: String 15 | Default: "jdbc:postgresql://postgres.xxx.us-west-2.rds.amazonaws.com:5432/test" 16 | CFNJDBCUser: 17 | Type: String 18 | Default: "postgres" 19 | CFNJDBCPassword: 20 | Type: String 21 | Default: '{{resolve:secretsmanager:rdspwd:SecretString:password}}' 22 | NoEcho: true 23 | # 24 | # Resources section defines metadata for the Data Catalog 25 | Resources: 26 | CFNConnectionMySQL: 27 | Type: AWS::Glue::Connection 28 | Properties: 29 | CatalogId: !Ref AWS::AccountId 30 | ConnectionInput: 31 | Description: "Connect to MySQL database." 32 | ConnectionType: "JDBC" 33 | #MatchCriteria: none 34 | PhysicalConnectionRequirements: 35 | AvailabilityZone: "us-west-2d" 36 | SecurityGroupIdList: 37 | - "sg-xxx" 38 | SubnetId: "subnet-xxxx" 39 | ConnectionProperties: { 40 | "JDBC_CONNECTION_URL": !Ref CFNJDBCString, 41 | "USERNAME": !Ref CFNJDBCUser, 42 | "PASSWORD": !Ref CFNJDBCPassword 43 | } 44 | Name: !Ref CFNConnectionName 45 | -------------------------------------------------------------------------------- /glue/glue_cw_kms_policy.md: -------------------------------------------------------------------------------- 1 | KMS key policy to be used while setting up the glue crawler to use security config. 2 | 3 | { 4 | "Effect": "Allow", 5 | "Principal": { "Service": "logs.region.amazonaws.com", 6 | "AWS": [ 7 | "role1", 8 | "role2", 9 | "role3" 10 | ] }, 11 | "Action": [ 12 | "kms:Encrypt*", 13 | "kms:Decrypt*", 14 | "kms:ReEncrypt*", 15 | "kms:GenerateDataKey*", 16 | "kms:Describe*" 17 | ], 18 | "Resource": "*" 19 | } 20 | -------------------------------------------------------------------------------- /glue/glue_etl_optimization: -------------------------------------------------------------------------------- 1 | Scenario 1: 2 | 3 | If the 'numberMaxNeededExecutors' metric is reporting very low values , this typically indicates the number of partitions is too low (your dataset is partitioned in a small number of partitions, which causes a low number of tasks, which causes a low number of executors). 4 | 5 | Spark is not launching additional executors because the number of partitions does not need it. As an example : 6 | 7 | Considering your job is running with 25 DPUs of the Standard worker type, your job is severly underutilizing the provided resources - each DPU in Standard runs 2 executors, so 21 DPUs are not being used. 8 | 9 | Now as to why the number of partitions is low: if your code does not specify any kind of custom partitioning, Spark will automatically use the default one. When reading from S3, this means Spark will create: 10 | 11 | 1. One partition for each file in the input path, if the files are smaller than 128 MiB or they are compressed in a non-splittable format. 12 | 13 | 2. If your files are splittable and they are larger than 128 MiB, one partition per each 128 MiB of data. 
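A quick way to confirm the effective partition count from inside the job itself is sketched below; dyf stands in for whatever DynamicFrame the job already builds, and 188 is simply the target figure discussed just below for 25 Standard DPUs.

# Sketch only - dyf is assumed to be the DynamicFrame the job already creates
df = dyf.toDF()
print("Current partition count:", df.rdd.getNumPartitions())

# Repartition so every executor gets work; 188 is the figure discussed below
df = df.repartition(188)
print("New partition count:", df.rdd.getNumPartitions())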
14 | 15 | The recommendation is to analyze the file sizes and formats of the data being processed here. In any case, you should be able to increase parallelism easily by simply doing a repartition operation on the DynamicFrames/DataFrame, set to a value that guarantees maximum resource utilization. 16 | 17 | Each Spark executor in Glue handles 4 tasks concurrently, which means that with 25 DPU and the Standard worker type 188 partitions or more would yield you maximum resource utilization. 18 | -------------------------------------------------------------------------------- /glue/glue_job_commit.md: -------------------------------------------------------------------------------- 1 | The method job.commit() can be called multiple times and it would not throw any error 2 | as well. However, if job.commit() would be called multiple times in a Glue script 3 | then job bookmark will be updated only once in a single job run that would be after 4 | the first time when job.commit() gets called and the other calls for job.commit() 5 | would be ignored by the bookmark. Hence, job bookmark may get stuck in a loop and 6 | would not able to work well with multiple job.commit(). Thus, I would recommend you 7 | to use job.commit() once in the Glue script. 8 | -------------------------------------------------------------------------------- /glue/glue_logger_python.py: -------------------------------------------------------------------------------- 1 | #You can use the AWS Glue logger to log any application-specific messages in the script that are sent in real time to the driver log stream. 2 | 3 | from awsglue.context import GlueContext 4 | from pyspark.context import SparkContext 5 | 6 | sc = SparkContext() 7 | glueContext = GlueContext(sc) 8 | 9 | # Continuous logging must be Enabled in the job definition for this logging 10 | logger = glueContext.get_logger() 11 | 12 | logger.info("info message") 13 | logger.warn("warn message") 14 | logger.error("error message") 15 | -------------------------------------------------------------------------------- /glue/glue_logger_scala.py: -------------------------------------------------------------------------------- 1 | import com.amazonaws.services.glue.log.GlueLogger 2 | 3 | object GlueApp { 4 | def main(sysArgs: Array[String]) { 5 | val logger = new GlueLogger 6 | logger.info("info message") 7 | logger.warn("warn message") 8 | logger.error("error message") 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /glue/glue_logging_debug.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from awsglue.context import GlueContext 6 | from awsglue.job import Job 7 | 8 | 9 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 10 | 11 | sc = SparkContext() 12 | 13 | ## Set the Glue Logging level to Debug 14 | sc.setLogLevel("DEBUG") 15 | 16 | glueContext = GlueContext(sc) 17 | spark = glueContext.spark_session 18 | job = Job(glueContext) 19 | job.init(args['JOB_NAME'], args) 20 | -------------------------------------------------------------------------------- /glue/glue_logging_techniques.py: -------------------------------------------------------------------------------- 1 | Python => 2 | 3 | Version 1 : 4 | 5 | import logging 6 | 7 | MSG_FORMAT = '%(asctime)s %(levelname)s %(name)s: %(message)s' 8 | DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S' 9 | 
logging.basicConfig(format=MSG_FORMAT, datefmt=DATETIME_FORMAT) 10 | logger = logging.getLogger() 11 | 12 | logger.setLevel(logging.INFO) 13 | 14 | ... 15 | 16 | logger.info("Test log message") 17 | 18 | 19 | Version 2 : 20 | 21 | import sys 22 | 23 | root = logging.getLogger() 24 | root.setLevel(logging.DEBUG) 25 | 26 | handler = logging.StreamHandler(sys.stdout) 27 | handler.setLevel(logging.DEBUG) 28 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') 29 | handler.setFormatter(formatter) 30 | root.addHandler(handler) 31 | root.info("check") 32 | 33 | 34 | PySpark => 35 | 36 | sc = SparkContext() 37 | sc.setLogLevel('DEBUG') 38 | glueContext = GlueContext(sc) 39 | logger = glueContext.get_logger() 40 | logger.info('Hello Glue') 41 | 42 | from awsglue.context import GlueContext 43 | from pyspark.context import SparkContext 44 | 45 | 46 | 47 | SparkScala => 48 | 49 | import com.amazonaws.services.glue.log.GlueLogger 50 | 51 | object GlueApp { 52 | def main(sysArgs: Array[String]) { 53 | val logger = new GlueLogger 54 | logger.info("info message") 55 | logger.warn("warn message") 56 | logger.error("error message") 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /glue/glue_oom_container_killed.md: -------------------------------------------------------------------------------- 1 | Glue sometimes can failed with the error message i.e. 2 | 3 | Container killed by YARN for exceeding memory limits. 5.7 GB of 5.5 GB physical memory used. 4 | 5 | I can confirm the glueetl job fails with the above error because of Spark executor primarily out-of-memory exceptions - basically one of the Spark executors is receiving too much data, causing its memory to overflow and the underlying resource manager (YARN) to kill it. This usually happens 4 different times for the same Spark task, which as per the default configuration caused the entire job to be aborted. 6 | 7 | Executor out-of-memory exceptions are typically caused by data skew: when your dataset's not properly distributed across all your Spark executors, some receive more data than others. If this imbalance is large enough, one of the executors can receive enough data to overflow its memory as I described before. 8 | 9 | This is quite easy to identify by checking your job run's Spark executor count: since Glue ETL jobs use Spark's dynamic executor allocation, the number of active executors at any given time is directly proportional to the number of Spark partitions your dataset has (though the number of pending tasks) - so a low executor count indicates improper partitioning which can be addressed. 10 | 11 | 12 | I can recommend: 13 | 14 | * Enable metrics logging for your ETL job whcih lets you to check your job's metrics to analyze failures such as this one in the future. 15 | 16 | * Once metrics have been enabled, run your job again. When you encounter the same issue, check your job's Executor count metric (glue.driver.ExecutorAllocationManager.executors.numberAllExecutors). Given that your job is running with n number of DPUs of the Standard worker type, you could have up to n + 1 executors. If the number you are getting is any lower than that, there is some issue in your job's partition count. 17 | 18 | ** If there is such an issue, you will need to improve your job's partition count. 
There are many possible causes here that I cannot identify without knowing more about your job's environment, but a simple tool is to run a 'repartition' method [1] on your DynamicFrame at the moment where the partition count is low. 19 | 20 | ** If there is not such an issue, your dataset is being properly partitioned and your nodes are simply not powerful enough to handle the volume of data. You can either add additional nodes (by increasing the number of DPUs your job has) or use a larger worker type [2] so that your executors have additional memory space. 21 | 22 | Reference : 23 | 24 | 25 | [1] https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-crawler-pyspark-extensions-dynamic-frame.html#aws-glue-api-crawler-pyspark-extensions-dynamic-frame-repartition 26 | [2] https://aws.amazon.com/about-aws/whats-new/2019/04/aws-glue-now-supports-additional-configuration-options-for-memory-intensive-jobs/ 27 | -------------------------------------------------------------------------------- /glue/glue_rcu_ddb.md: -------------------------------------------------------------------------------- 1 | With the below Spark Scala Code provided : 2 | 3 | 4 | val datasource0 = glueContext.getCatalogSource(database = "mwx_split_events_prod", tableName = "crawler_mwx_id_lookup_prod_restored", redshiftTmpDir = "", transformationContext = "datasource0").getDynamicFrame() 5 | val datasource0 = glueContext.getSource("dynamodb", JsonOptions (s"""{"dynamodb.input.tableName": "mwx_id_lookup_prod_restored", "dynamodb.throughput.read.percent": "0.5", "dynamodb.splits": "20" }""")).getDynamicFrame() 6 | 7 | 8 | In the above code "dynamodb.throughput.read.percent" is set to "0.5" - this means meaning that spark application ran using Glue ETL will attempt to consume half of the read capacity of the table i.e. mwx_id_lookup_prod . 9 | 10 | Let's assume the DynamoDB table i.e. mwx_id_lookup_prod_restored RCU is set to 2500 currently . Therefore per the spark scala code above when the glue etl will run it will consume max 1250 RCU's . Hence this is the expected behavior with dynamodb.throughput.read.percent" when it is set to "0.5" . 11 | -------------------------------------------------------------------------------- /glue/glue_sigterm_executors_error.md: -------------------------------------------------------------------------------- 1 | ==== 2 | 20/01/17 19:04:22 ERROR CoarseGrainedExecutorBackend: RECEIVED SIGNAL TERM 3 | ==== 4 | 5 | SIGTERM is usually received due to memory overutilization, using the bigger Worker types, will let you have much larger memory allocation for them. 6 | 7 | If you see , the job is running with a single executor for most of its lifetime, peaking at only as an example 5 or 6 executors at some point. 8 | 9 | For reference, with the 50 DPUs and the Standard worker type you configure for your job, you should be getting a maximum of 97 executors - which means your resources are being wasted. This also explains why your executors are running out of memory - instead of distributing the dataset amongst them, only a few are trying to load it entirely and failing to do so. 10 | 11 | This proves the reads are not parallelized. I would recommend following our documentation to achieve this: [1]. If you follow the guidelines on our capacity planning section [2], you'll see you should be setting a minimum of 388 Spark partitions (parallel reads) for your job to fully utilize the 50 DPUs. 
This should divide the dataset across all of your executors, which will decrease memory pressure on them, and in the end make your job run properly. 12 | 13 | For now I would suggest simply making sure there's parallel reads, which should get you a nice number of executors at the start. Then the default number of partitions after a join should keep that number, so you can try to apply the changes stated in the documentation link mentioned before and run again to see how it goes. If the job fails again, check the executor count metric. If it goes low again you'll have to check your code to see if there's any step at which the partition count can be going low (most likely a join) and you can repartition after it. 14 | 15 | [1] https://docs.aws.amazon.com/glue/latest/dg/run-jdbc-parallel-read-job.html 16 | [2] https://docs.aws.amazon.com/glue/latest/dg/monitor-debug-capacity.html 17 | -------------------------------------------------------------------------------- /glue/glue_spark_sql_usage.md: -------------------------------------------------------------------------------- 1 | Executing SQL using SparkSQL in AWS Glue 2 | 3 | AWS Glue Data Catalog as Hive Compatible Metastore 4 | 5 | The AWS Glue Data Catalog is a managed metadata repository compatible with the Apache Hive Metastore API. You can follow the detailed instructions here to configure your AWS Glue ETL jobs and development endpoints to use the Glue Data Catalog. You also need to add the Hive SerDes to the class path of AWS Glue Jobs to serialize/deserialize data for the corresponding formats. You can then natively run Apache Spark SQL queries against your tables stored in the Data Catalog. 6 | 7 | The following example assumes that you have crawled the US legislators dataset available at s3://awsglue-datasets/examples/us-legislators. We’ll use the Spark shell running on AWS Glue developer endpoint to execute SparkSQL queries directly on the legislators’ tables cataloged in the AWS Glue Data Catalog. 8 | 9 | >>> spark.sql("use legislators") 10 | DataFrame[] 11 | >>> spark.sql("show tables").show() 12 | +-----------+------------------+-----------+ 13 | | database| tableName|isTemporary| 14 | +-----------+------------------+-----------+ 15 | |legislators| areas_json| false| 16 | |legislators| countries_json| false| 17 | |legislators| events_json| false| 18 | |legislators| memberships_json| false| 19 | |legislators|organizations_json| false| 20 | |legislators| persons_json| false| 21 | 22 | >>> spark.sql("select distinct organization_id from memberships_json").show() 23 | +--------------------+ 24 | | organization_id| 25 | +--------------------+ 26 | |d56acebe-8fdc-47b...| 27 | |8fa6c3d2-71dc-478...| 28 | +--------------------+ 29 | 30 | A similar approach to the above would be to use AWS Glue DynamicFrame API to read the data from S3. The DynamicFrame is then converted to a Spark DataFrame using the toDF method. Next, a temporary view can be registered for DataFrame, which can be queried using SparkSQL. The key difference between the two approaches is the use of Hive SerDes for the first approach, and native Glue/Spark readers for the second approach. The use of native Glue/Spark provides the performance and flexibility benefits such as computation of the schema at runtime, schema evolution, and job bookmarks support for Glue Dynamic Frames. 
31 | 32 | >>> memberships = glueContext.create_dynamic_frame.from_catalog(database="legislators", table_name="memberships_json") 33 | >>> memberships.toDF().createOrReplaceTempView("memberships") 34 | >>> spark.sql("select distinct organization_id from memberships").show() 35 | +--------------------+ 36 | | organization_id| 37 | +--------------------+ 38 | |d56acebe-8fdc-47b...| 39 | |8fa6c3d2-71dc-478...| 40 | +--------------------+ 41 | -------------------------------------------------------------------------------- /glue/glue_storage_issue.md: -------------------------------------------------------------------------------- 1 | Glue Storage Issue 2 | 3 | As you probably know , every time you run a Spark ETL Job Glue provisions a cluster of nodes where your code will be executed on. These nodes have a disk space of 64GB each. Whenever a Spark executor receives more data than it can hold in memory, a spill-to-disk will be initiated - which pretty much means dumping the contents of already-processed data in memory onto disk so that operations can continue happening in memory. The most probable cause for this is either data skew or improper partitioning, resulting in one of your nodes receiving much more data than the rest. 4 | 5 | In order to verify whether that's the case or not , one can always check the CloudWatch metrics - since the Spark executor metrics are usually very revealing as to whether there is proper partitioning or not. 6 | 7 | However, In some cases , if the spark etl job fails in less than 5 min you won't see any executor metrics , because they are generated every 5 minutes and the job runs less then 5 min & fails. 8 | 9 | Recommendations: 10 | 11 | 1. Evaluate how much data each value in the partition key used as the parameter for 'parittionBy' has. You are probably having a lot of data for some values, and not so much data for other ones (what is known as data skew). What I would recommend is creating a Glue Development Endpoint [1] and using it to run each code statement step by step, retrieve the number of partitions and how they are distributed after running the partitionBy and seeing how to alter this number in real time. 12 | 13 | 2. If there is data skew indeed, the most popular technique is to deal with it is adding key salting , basically you need to add random values to your keys that equally distribute them across different partitions. 14 | 15 | 3. Alternatively, you could consider using larger worker types so that each individual node has more memory space to hold your dataset. This will decrease the overall amount of Spark executors, but it will make each one of them larger in memory. 16 | -------------------------------------------------------------------------------- /glue/glue_table_versioning.md: -------------------------------------------------------------------------------- 1 | => How does table versioning work with AWS Glue ? 2 | 3 | Basically, all of your Tables in the Data Catalog can have versions, which are different definitions of their schema and properties ordered in time. You can select the 'active' version of a table at any time. Every time you update the schema of a table, Glue creates a new version of your table and automatically updates the table's currently active version to the new one. 4 | 5 | => Is it the crawler that manages the tables versioning? 6 | 7 | Not strictly. The crawler can update the schema of an existing table, which will result in a new version being created. But you can also create new versions manually. 
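For illustration, a minimal boto3 sketch (database and table names are placeholders) that lists the versions of a table; any schema-changing update, whether from a crawler, an ETL job or your own code, adds a new entry here.

import boto3

glue = boto3.client("glue")

# Placeholder database/table names
response = glue.get_table_versions(DatabaseName="mydb", TableName="mytable")
for version in response["TableVersionList"]:
    print(version["VersionId"], version["Table"].get("UpdateTime"))

# A manual update_table call with a modified TableInput would create another version:
# glue.update_table(DatabaseName="mydb", TableInput=modified_table_input)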
8 | 9 | => Is it possible to configure versioning? For example the number of versions we want to keep. 10 | 11 | No, that's not an option at the moment. The only configuration that can be done is selecting the active version of a table. 12 | 13 | => If we use AWS Glue ETL Jobs (https://docs.aws.amazon.com/glue/latest/dg/update-from-job.html) to update table schemas, will they be versioned? 14 | 15 | Any operation that updates a table's schema will create a new version, including ETL jobs. 16 | -------------------------------------------------------------------------------- /glue/glue_traffic_cross_account.md: -------------------------------------------------------------------------------- 1 | Creating cross account access for glue connection 2 | 3 | 1. Create VPC with public and private subnets in the account containing the Glue job. 4 | 2. Create a NAT gateway in the public subnet. 5 | 3. Create a route table for the private subnet including a route to "0.0.0.0/0" through the NAT gateway created in step 2. 6 | 4. Update inbound rule attached to Redshift cluster security group to allow traffic through NAT gateway. 7 | 5. Create Glue connection with the private subnet selected. 8 | 6. Edit the Glue job and select the connection created in step 5. 9 | 7. Run the Glue job. 10 | -------------------------------------------------------------------------------- /glue/glue_with_security_config.md: -------------------------------------------------------------------------------- 1 | Need to add the below to the KMS key policy 2 | 3 | 4 | { 5 | "Effect": "Allow", 6 | "Principal": { "Service": "logs.region.amazonaws.com", 7 | "AWS": [ 8 | "role1", 9 | "role2", 10 | "role3" 11 | ] }, 12 | "Action": [ 13 | "kms:Encrypt*", 14 | "kms:Decrypt*", 15 | "kms:ReEncrypt*", 16 | "kms:GenerateDataKey*", 17 | "kms:Describe*" 18 | ], 19 | "Resource": "*" 20 | } 21 | 22 | 23 | Ref: https://docs.aws.amazon.com/glue/latest/dg/encryption-security-configuration.html 24 | -------------------------------------------------------------------------------- /glue/graphframe_glue_steps.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | Run these on a EC2 4 | 5 | 1. we need to get the Graphframes code from either Github / PyPi : (https://pypi.org/project/graphframes/#files) 6 | 2. Download the tar file on a EC2 instance. 7 | 3. Untar (tar -xvf) 8 | 4. cd into the folder. and zip the contents into a zip file. I called it as graphframes.zip. Make sure that the graphframes are a direcly a part of the zip file and not under a folder when zipping. 9 | 5. Upload graphframes.zip to S3 10 | 11 | 6. Upload the downloaded graphframes zip file to an S3 location. 12 | 13 | 7. Create a new Glue job via AWS console. 14 | 15 | 8. Under 'Python Library Path' section of 'Script libraries and job parameters (optional)' section, add the S3 location of the graphframes package file(zip file). 16 | 17 | 9. Under 'Job Parameters' section of 'Script libraries and job parameters(optional)' section, enter the following: 18 | Key: --conf 19 | Value: spark.jars.packages=graphframes:graphframes:0.6.0-spark2.3-s_2.11 20 | 21 | 10. Run a sample code to import 'graphframe' package and use GraphFrame class. I have attached a sample script(graph_sample) for your reference. 22 | Please refer to the screenshots that I have attached to help you with step 3 and step4. 
23 | 24 | Please note that some of the algorithms in graphframe package requires setting a Spark checkpoint directory, which can be performed by using the following line in your code. 25 | 26 | " SparkContext.setCheckpointDir('') " 27 | 28 | Also, note that must be a hdfs path. For example, you can use 'user/hadoop/' or '/tmp/'. 29 | -------------------------------------------------------------------------------- /glue/graphframe_with_glue.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from awsglue.transforms import * 4 | 5 | from awsglue.utils import getResolvedOptions 6 | 7 | from pyspark.context import SparkContext 8 | 9 | from pyspark.sql import SQLContext 10 | 11 | from awsglue.context import GlueContext 12 | 13 | from awsglue.dynamicframe import DynamicFrame 14 | 15 | from awsglue.job import Job 16 | 17 | from pyspark.sql import SparkSession 18 | 19 | from pyspark.sql.functions import udf 20 | 21 | from pyspark.sql.types import StringType 22 | 23 | from pyspark.sql import Row 24 | 25 | from graphframes import * 26 | 27 | 28 | glueContext = GlueContext(SparkContext.getOrCreate()) 29 | 30 | sc = SparkContext.getOrCreate() 31 | 32 | sc.setCheckpointDir('/tmp/') 33 | 34 | 35 | spark = glueContext.spark_session 36 | 37 | 38 | v = spark.createDataFrame([ 39 | ("a", "Alice", 34), 40 | ("b", "Bob", 36), 41 | ("c", "Charlie", 30), 42 | ], ["id", "name", "age"]) 43 | # Create an Edge DataFrame with "src" and "dst" columns 44 | e = spark.createDataFrame([ 45 | ("a", "b", "friend"), 46 | ("b", "c", "follow"), 47 | ("c", "b", "follow"), 48 | ], ["src", "dst", "relationship"]) 49 | # Create a GraphFrame 50 | #from graphframes import * 51 | 52 | g = GraphFrame(v, e) 53 | 54 | # Query: Get in-degree of each vertex. 55 | g.inDegrees.show() 56 | 57 | # Query: Count the number of "follow" connections in the graph. 58 | g.edges.filter("relationship = 'follow'").count() 59 | 60 | # Run PageRank algorithm, and show results. 61 | results = g.pageRank(resetProbability=0.01, maxIter=20) 62 | results.vertices.select("id", "pagerank").show() 63 | 64 | result = g.connectedComponents() 65 | result.select("id", "component").orderBy("component").show() 66 | -------------------------------------------------------------------------------- /glue/jdbc_parallel_reads_using_glue.md: -------------------------------------------------------------------------------- 1 | One can use this method for JDBC tables, that is, most tables whose base data is a JDBC data store. These properties are ignored when reading Amazon Redshift and Amazon S3 tables. 2 | 3 | hashfield 4 | 5 | Set hashfield to the name of a column in the JDBC table to be used to divide the data into partitions. For best results, this column should have an even distribution of values to spread the data between partitions. This column can be of any data type. AWS Glue generates non-overlapping queries that run in parallel to read the data partitioned by this column. For example, if your data is evenly distributed by month, you can use the month column to read each month of data in parallel. 6 | 7 | 8 | 'hashfield': 'month' 9 | 10 | 11 | AWS Glue creates a query to hash the field value to a partition number and runs the query for all partitions in parallel. To use your own query to partition a table read, provide a hashexpression instead of a hashfield. 12 | hashexpression 13 | 14 | Set hashexpression to an SQL expression (conforming to the JDBC database engine grammar) that returns a whole number. 
A simple expression is the name of any numeric column in the table. AWS Glue generates SQL queries to read the JDBC data in parallel using the hashexpression in the WHERE clause to partition data. 15 | 16 | For example, use the numeric column customerID to read data partitioned by a customer number. 17 | 18 | 19 | 'hashexpression': 'customerID' 20 | 21 | 22 | To have AWS Glue control the partitioning, provide a hashfield instead of a hashexpression. 23 | hashpartitions 24 | 25 | Set hashpartitions to the number of parallel reads of the JDBC table. If this property is not set, the default value is 7. 26 | 27 | For example, set the number of parallel reads to 5 so that AWS Glue reads your data with five queries (or fewer). 28 | 29 | 30 | 'hashpartitions': '5' 31 | 32 | -------------------------------------------------------------------------------- /glue/job_bookmarks_rewind.md: -------------------------------------------------------------------------------- 1 | Process incremental data since the last successful run or the data in the range identified by the following sub-options, without updating the state of last bookmark. You are responsible for managing the output from previous job runs. The two sub-options are: 2 | 3 | job-bookmark-from is the run ID which represents all the input that was processed until the last successful run before and including the specified run ID. The corresponding input is ignored. 4 | 5 | job-bookmark-to is the run ID which represents all the input that was processed until the last successful run before and including the specified run ID. The corresponding input excluding the input identified by the is processed by the job. Any input later than this input is also excluded for processing. 6 | 7 | The job bookmark state is not updated when this option set is specified. 8 | 9 | The sub-options are optional, however when used both the sub-options needs to be provided. 10 | -------------------------------------------------------------------------------- /glue/ld.md: -------------------------------------------------------------------------------- 1 | https://www.slideshare.net/AmazonWebServices/building-serverless-analytics-pipelines-with-aws-glue-ant308-aws-reinvent-2018 2 | -------------------------------------------------------------------------------- /glue/load_to_redshift.md: -------------------------------------------------------------------------------- 1 | To load the timestamp with timezone 2 | 3 | datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(frame = persons_DyF, catalog_connection = "test", connection_options = {"dbtable": "testalblog2", "database": "reddb","postactions":"delete from emp1;","extracopyoptions":"TIMEFORMAT 'auto'"}, 4 | redshift_tmp_dir = 's3://s3path', transformation_ctx = "datasink4") 5 | -------------------------------------------------------------------------------- /glue/log_setting_spark.md: -------------------------------------------------------------------------------- 1 | Problem: In Spark, wondering how to stop/disable/turn off INFO and DEBUG message logging to Spark console, when I run a Spark or PySpark program on a cluster or in my local, I see a lot of DEBUG and INFO messages in console and I wanted to turn off this logging. 2 | 3 | 4 | Solution: By default, Spark log configuration has set to INFO hence when you run a Spark or PySpark application in local or in the cluster you see a lot of Spark INFo messages in console or in a log file. 
5 | 6 | With default INFO logging, you will see the Spark logging message like below 7 | 8 | Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties 9 | 20/03/29 17:38:24 INFO SparkContext: Running Spark version 2.4.4 10 | 20/03/29 17:38:24 INFO SparkContext: Submitted application: SparkByExamples.com 11 | 12 | 13 | On DEV and QA environment it’s okay to keep the log4j log level to INFO or DEBUG mode. But, for UAT, live or production application we should change the log level to WARN or ERROR as we do not want to verbose logging on these environments. 14 | 15 | Now, Let’s see how to stop/disable/turn off logging DEBUG and INFO messages to the console or to a log file. 16 | 17 | 18 | Using sparkContext.setLogLevel() method you can change the log level to the desired level. Valid log levels include: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN 19 | 20 | In order to stop DEBUG and INFO messages change the log level to either WARN, ERROR or FATAL. For example, below it changes to ERORR 21 | 22 | val spark:SparkSession = SparkSession.builder() 23 | .master("local[1]") 24 | .appName("SparkByExamples.com") 25 | .getOrCreate() 26 | 27 | spark.sparkContext.setLogLevel("ERROR") 28 | 29 | With the last statement from the above example, it will stop/disable DEBUG or INFO messages in the console and you will see ERROR messages along with the output of println() or show(),printSchema() of the DataFrame methods 30 | -------------------------------------------------------------------------------- /glue/metrics_glue_etl.md: -------------------------------------------------------------------------------- 1 | glue.driver.jvm.heap.used 2 | 3 | glue.executorId.jvm.heap.used 4 | 5 | glue.ALL.jvm.heap.used 6 | 7 | glue.driver.jvm.heap.usage 8 | 9 | glue.executorId.jvm.heap.usage 10 | 11 | glue.ALL.jvm.heap.usage 12 | 13 | glue.driver.aggregate.bytesRead 14 | 15 | glue.driver.aggregate.numFailedTasks 16 | 17 | glue.driver.aggregate.shuffleBytesWritten 18 | 19 | glue.driver.aggregate.shuffleLocalBytesRead 20 | 21 | glue.driver.BlockManager.disk.diskSpaceUsed_MB 22 | 23 | glue.driver.ExecutorAllocationManager.executors.numberAllExecutors 24 | 25 | glue.driver.ExecutorAllocationManager.executors.numberMaxNeededExecutors 26 | 27 | glue.driver.s3.filesystem.read_bytes 28 | 29 | glue.executorId.s3.filesystem.read_bytes 30 | 31 | glue.ALL.s3.filesystem.read_bytes 32 | 33 | glue.driver.s3.filesystem.write_bytes 34 | 35 | glue.executorId.s3.filesystem.write_bytes 36 | 37 | glue.ALL.s3.filesystem.write_bytes 38 | 39 | glue.driver.system.cpuSystemLoad 40 | 41 | glue.executorId.system.cpuSystemLoad 42 | 43 | glue.ALL.system.cpuSystemLoad 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /glue/minimum_permissions_crawler.md: -------------------------------------------------------------------------------- 1 | Here's the list of API calls your Crawler will need access to: 2 | 3 | * s3:ListBucket: To list objects in your S3 path 4 | * s3:GetObject: To read objects in your S3 path and understand their schema 5 | 6 | * glue:GetDatabase: To create tables in the designated database 7 | * glue:GetTable: To verify whether the table already exists or not 8 | * glue:CreateTable: To create the resulting output table 9 | * glue:UpdateTable: To update the resulting output table if necessary 10 | * glue:CreatePartition: To create partitions in the resulting output table 11 | * 
glue:UpdatePartition: To update partitions in the resulting output table if necessary 12 | 13 | * logs:CreateLogGroup: To create the LogGroup where your Crawler's CloudWatch logs will be written to 14 | * logs:CreateLogStream: To create a LogStream inside the LogGroup 15 | * logs:PutLogEvents: To push log messages to the created LogGroup and LogStream 16 | -------------------------------------------------------------------------------- /glue/multiple_connections_glue.md: -------------------------------------------------------------------------------- 1 | Customers shouldn't use two connections unless they have a security group need. 2 | 3 | If they are different subnets, it causes confusion in the connection since it will only use one of them and its unpredictable which one will be chosen for network provisioning stage. 4 | 5 | And if they are both in the same subnet, what is the point. Single subnet to simplify things. 6 | -------------------------------------------------------------------------------- /glue/no_enough_ips.md: -------------------------------------------------------------------------------- 1 | At times the glue ETL exception occurs because there are not enough IP addresses in the subnet for the number of DPUs configured for the job. The solution to the issue is to use a subnet with 300 or more free IP addresses. The rationale for this explained hereafter. 2 | 3 | When you run a Glue ETL job which uses a JDBC connection, AWS Glue creates elastic network interfaces (ENIs) in a VPC/private subnet to access your VPC data. These network interfaces then provide network connectivity for AWS Glue through your VPC. The number of ENIs depends on the number of data processing units (DPUs) selected for an AWS Glue ETL job. Specifically, you need as many ENI’s as the number of DPUs configured for the job. AWS Glue DPU instances communicate with each other and with your JDBC-compliant database using ENIs. Each ENI is associated with an IP address, therefore, you need as many IP addresses as there are ENIs (DPUs). The number number of available IP addresses depends on the number of host bits allowed when subnetting. Therefore, the solution to the issue is to use a larger subnet with enough free IP addresses. 4 | 5 | By default, resources in a VPC can't be accessed from AWS Glue. To enable AWS Glue to access resources inside your VPC, you must provide additional VPC-specific configuration information that includes VPC subnet IDs and security group IDs. AWS Glue uses this information to set up elastic network interfaces that enable your function to connect securely to other resources in your private VPC. 6 | -------------------------------------------------------------------------------- /glue/ouput_logs_error_logs_diff.md: -------------------------------------------------------------------------------- 1 | Suppose you have an EMR cluster and you submit a Spark application to it from your local laptop. You will have: 2 | 3 | * The logs of the Spark client, as in the logs you will see in your terminal in your local laptop while the application is running. 
4 | * The logs of the Spark driver, both for the STDOUT and STDERR streams 5 | * The logs of each one of the Spark executors, both for the STDOUT and STDERR streams 6 | 7 | If you enable continuous logging: 8 | 9 | * While the job is running, you will be able to see the progress bar in the web console 10 | * Output will take you to the log stream of the Spark client 11 | * Logs will take you to a log group where you will find all the STDOUT streams of the driver and each of the executors 12 | * Error logs will take you to a log group where you will find all the STDERR streams of the driver and each of the executors 13 | 14 | If you don't: 15 | 16 | * Output will not be present 17 | * Logs will take you to a log stream containing the combination of the client logs and all the STDOUT streams of the driver and each of the executors 18 | * Error logs will take you to a log stream containing the combination of the STDERR streams of the driver and each of the executors 19 | 20 | ***Glue 2.0 LOGGING BEHAVIOR 21 | 22 | With continuous logging enabled: 23 | 24 | ==================================================== 25 | When triggered manually (via UI -> Run Job) 26 | ==================================================== 27 | 28 | RunStatus if failed: This points to CloudWatch > CloudWatch Logs > Log groups > /aws-glue/jobs/error > {glue Job run Id} 29 | 30 | Error logs: CloudWatch > CloudWatch Logs > Log groups > /aws-glue/jobs/error > {glue Job run Id} 31 | 32 | Output: Anything you print within the ETL code is logged here. 33 | 34 | CloudWatch > CloudWatch Logs > Log groups > /aws-glue/jobs/output > {glue Job run Id} 35 | 36 | Logs: CloudWatch > CloudWatch Logs > Log groups > /aws-glue/jobs/logs-v2 > {glue Job run Id} 37 | 38 | ==================================================== 39 | When executed via AWS Glue Trigger: 40 | ==================================================== 41 | 42 | Logs: CloudWatch > CloudWatch Logs > Log groups > /aws-glue/jobs/output 43 | 44 | Progress-bar logs are present in the above ^ (validated) 45 | 46 | FileOutputCommitter entries are present in the above ^ (validated) - these are the Spark executor logs, which get combined in the above. 47 | 48 | DAGScheduler, ApplicationMaster, GlueContext and YARN executor launch context entries also get logged in the above (validated) 49 | 50 | Print statements also get logged in the above (validated) 51 | 52 | ============= 53 | 54 | ***With continuous logging disabled: (There is no difference in behavior.) 55 | 56 | CloudWatch > CloudWatch Logs > Log groups > /aws-glue/jobs/output 57 | -------------------------------------------------------------------------------- /glue/out_of_scope.md: -------------------------------------------------------------------------------- 1 | First of all I would like to clarify the scope of AWS Premium Support - while we can certainly help you with Glue or Spark-related issues, we do not provide support for custom and/or third-party libraries that do not come bundled with the service. I wanted to stress this because we are not familiar with the "<>" library and from what I can gather in our internal AWS wikis this seems to be an internal package. Again, we are not familiar enough with it to provide any solid recommendations on its troubleshooting.
2 | -------------------------------------------------------------------------------- /glue/pandas_vs_spark_dataframes.md: -------------------------------------------------------------------------------- 1 | ========================================================= 2 | Diff. between Spark and Pandas DataFrame/ Pros & Cons 3 | ========================================================= 4 | 5 | 4. Using Pandas vs Spark DataFrames ==>> Since you are working with CSV, with Pandas you can easily read CSV files with read_csv() or write them using to_csv(). Spark and Pandas DataFrames are very similar; still, the Pandas API remains more convenient and powerful. The number of API calls made using Pandas and Spark DataFrames should not differ much. But overall there are a few differences I would like to call out when using one over the other, as below: 6 | 7 | A. Pandas and Spark DataFrames are designed for structured and semi-structured data processing. Both share some similar properties (which I have discussed above). The few differences between Pandas and PySpark DataFrames are: 8 | 9 | B. Operations on a PySpark DataFrame run in parallel on different nodes in the cluster, but with Pandas this is not possible. Pandas data frames are in-memory and single-server, so their size is limited by your server's memory, and you process them with the power of a single server. 10 | 11 | C. Operations on a PySpark DataFrame are lazy in nature, whereas with Pandas we get the result as soon as we apply any operation. 12 | 13 | D. A PySpark DataFrame can't be changed in place due to its immutable property; we need to transform it. With Pandas that is not the case. 14 | 15 | E. The Pandas API supports more operations than the PySpark DataFrame API; the Pandas API is still more powerful than Spark's. 16 | 17 | F. Complex operations are easier to perform in Pandas than on a PySpark DataFrame 18 | 19 | I found some more articles (non-AWS docs) for you to go through in order to get more details on Pandas vs Spark DataFrames: [1], [2] and [3] 20 | 21 | ================= 22 | RECOMMENDATION 23 | ================ 24 | 25 | If your use case has growing data where you will probably have billions of rows and columns down the line, involving complex operations like merging or grouping of data, it definitely requires parallelization and distributed computing. These operations are very slow and quite expensive and become difficult to handle with a Pandas dataframe, which does not support parallelization. Hence I recommend Spark DataFrames over Pandas DataFrames.
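As a small illustration of the points above, here is a minimal PySpark/Pandas sketch (the bucket and file name are hypothetical placeholders, and reading s3:// paths with Pandas assumes the s3fs package is available) showing the same CSV read in both APIs and how to move between the two:

import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pandas_vs_spark_demo").getOrCreate()

# Pandas: eager, in-memory on a single machine (s3:// paths need s3fs installed)
pdf = pd.read_csv("s3://my-example-bucket/sample.csv")

# Spark: lazy and distributed across the executors
sdf = spark.read.csv("s3://my-example-bucket/sample.csv", header=True, inferSchema=True)

# Moving between the two - toPandas() collects to the driver, so keep the result small
sdf_from_pandas = spark.createDataFrame(pdf)
pdf_from_spark = sdf.limit(1000).toPandas()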
26 | -------------------------------------------------------------------------------- /glue/print_args_pythonshell.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark import SparkContext 3 | sc =SparkContext() 4 | import sys 5 | from awsglue.utils import getResolvedOptions 6 | import pg8000 as redshift 7 | 8 | args = getResolvedOptions(sys.argv, 9 | ['JOB_NAME', 10 | 's3_bucket', 11 | 'config', 12 | 'password']) 13 | 14 | print(args['s3_bucket']) 15 | print(args['config']) 16 | print(args['password']) 17 | -------------------------------------------------------------------------------- /glue/process_tables_glue_data_catalog_in_loop.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from awsglue.context import GlueContext 6 | from awsglue.job import Job 7 | import datetime 8 | import boto3 9 | args = getResolvedOptions(sys.argv, ['JOB_NAME', 'region', 'source_database', 10 | 'target_location', 'target_prefix']) 11 | job_name = args['JOB_NAME'] 12 | region = args['region'] 13 | 14 | source_db = args['source_database'] 15 | target_bucket = args['target_location'] 16 | target_prefix = args['target_prefix'] 17 | sc = SparkContext() 18 | glueContext = GlueContext(sc) 19 | job = Job(glueContext) 20 | job.init(job_name, args) 21 | client = boto3.client('glue', region_name=region) 22 | 23 | 24 | 25 | # Make this a global variable so it can't change if we run near midnight UTC 26 | # Plus I'm not creating a new object for every record! 27 | now = datetime.datetime.now() 28 | 29 | 30 | 31 | def AddPartitions(rec): 32 | rec["yyyy"] = now.year 33 | rec["mm"] = now.month 34 | rec["day"] = now.day 35 | return rec 36 | 37 | 38 | def transform(source_db, target_bucket, target_prefix, table_name, partition_keys = []): 39 | 40 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database=source_db, 41 | table_name=table_name, 42 | transformation_ctx=table_name+"datasource0") 43 | 44 | 45 | 46 | map1 = Map.apply(frame=datasource0, f=AddPartitions, transformation_ctx=table_name+"map1") 47 | 48 | datasink1 = glueContext.write_dynamic_frame.from_options( 49 | frame=map1, 50 | connection_type="s3", 51 | connection_options={ 52 | "path": "s3://" + target_bucket + "/" + target_prefix + "/" + table_name + "/", 53 | "partitionKeys": ["yyyy", "mm", "day"] + partition_keys}, 54 | format="parquet" 55 | ) 56 | 57 | 58 | try: 59 | print('\nSource database name: ' + source_db) 60 | tables = client.get_tables(DatabaseName=source_db) 61 | for table in tables['TableList']: 62 | if 'DEPRECATED_BY_CRAWLER' not in table['Parameters']: 63 | table_name = table['Name'] 64 | print('\n-- tableName: ' + table_name) 65 | partitions = table['PartitionKeys'] 66 | if partitions is not []: 67 | # get partition keys 68 | partition_keys = [] 69 | [partition_keys.append(partition['Name']) for partition in partitions] 70 | transform(source_db, target_bucket, target_prefix, table_name, partition_keys) 71 | else: 72 | transform(source_db, target_bucket, target_prefix, table_name) 73 | 74 | except Exception as e: 75 | print(e) 76 | 77 | job.commit() 78 | -------------------------------------------------------------------------------- /glue/pushdownpredicate.scala: -------------------------------------------------------------------------------- 1 | import 
com.amazonaws.services.glue.DynamicFrame 2 | import com.amazonaws.services.glue.DynamicRecord 3 | import com.amazonaws.services.glue.GlueContext 4 | import com.amazonaws.services.glue.util.JsonOptions 5 | import org.apache.spark.SparkContext 6 | import java.util.Calendar 7 | import java.util.GregorianCalendar 8 | import scala.collection.JavaConversions._ 9 | 10 | val spark: SparkContext = SparkContext.getOrCreate() 11 | val glueContext: GlueContext = new GlueContext(spark) 12 | 13 | 14 | val githubEvents: DynamicFrame = glueContext.getCatalogSource( 15 | database = "avro", 16 | tableName = "glue_pd" 17 | ).getDynamicFrame() 18 | 19 | githubEvents.schema.asFieldList.foreach { field => 20 | println(s"${field.getName}: ${field.getType.getType.getName}") 21 | } 22 | 23 | 24 | val partitionPredicate = 25 | "date_format(to_date(concat(year, '-', month, '-', day)), 'E') in ('Sat', 'Sun')" 26 | 27 | val pushdownEvents = glueContext.getCatalogSource( 28 | database = "avro", 29 | tableName = "glue_pd", 30 | pushDownPredicate = partitionPredicate).getDynamicFrame() 31 | -------------------------------------------------------------------------------- /glue/read_bookmark_enabled.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from awsglue.context import GlueContext 6 | from awsglue.job import Job 7 | import datetime 8 | import boto3 9 | 10 | 11 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 12 | sc = SparkContext() 13 | glueContext = GlueContext(sc) 14 | spark = glueContext.spark_session 15 | job = Job(glueContext) 16 | job.init(args['JOB_NAME'], args) 17 | 18 | 19 | datasource0 = glueContext.create_dynamic_frame_from_options("s3", {'paths': ["s3://xxx-xx-logs/Glue/parquet_sample_dataset/"]}, format="parquet",transformation_ctx = "datasource0") 20 | 21 | 22 | 23 | datasink3 = glueContext.write_dynamic_frame.from_options(frame = datasource0, connection_type = "s3", connection_options = {"path": "s3://xx-xx-logs/Glue/glue_bm_issue_11_12/"}, format = "parquet",transformation_ctx = "datasink3") 24 | 25 | 26 | 27 | job.commit() 28 | -------------------------------------------------------------------------------- /glue/read_from_specific_partitions.py: -------------------------------------------------------------------------------- 1 | 2 | In the following example, the job processes data in the s3://awsexamplebucket/product_category=Video partition only: 3 | 4 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "testdata", table_name = "sampletable", transformation_ctx = "datasource0",push_down_predicate = "(product_category == 'Video')") 5 | 6 | In this example, the job processes data in the s3://awsexamplebucket/year=2019/month=08/day=02 partition only: 7 | 8 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "testdata", table_name = "sampletable", transformation_ctx = "datasource0",push_down_predicate = "(year == '2019' and month == '08' and day == '02')") 9 | 10 | 11 | For non-Hive style partitions. 
In this example, the job processes data in the s3://awsexamplebucket/2019/07/03 partition only: 12 | 13 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "testdata", table_name = "sampletable", transformation_ctx = "datasource0",push_down_predicate ="(partition_0 == '2019' and partition_1 == '07' and partition_2 == '03')" ) 14 | 15 | -------------------------------------------------------------------------------- /glue/read_gzip_glue.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import gzip 3 | import pandas as pd 4 | import csv 5 | from io import BytesIO, TextIOWrapper, StringIO 6 | 7 | s3_client = boto3.client('s3') 8 | s3_client.download_file('xxx', 'glue_internal_ser_error/cur-hourly-01.csv.gz', '/tmp/cur-hourly-01.csv.gz') 9 | 10 | with gzip.open('/tmp/cur-hourly-01.csv.gz', 'rt') as f: 11 | file_content = f.readlines() 12 | print(file_content[:100]) 13 | -------------------------------------------------------------------------------- /glue/read_postgres_directly.py: -------------------------------------------------------------------------------- 1 | from awsglue.transforms import * 2 | from awsglue.utils import getResolvedOptions 3 | from pyspark.context import SparkContext 4 | from awsglue.context import GlueContext 5 | from awsglue.job import Job 6 | import sys 7 | ## @params: [JOB_NAME] 8 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 9 | 10 | sc = SparkContext() 11 | glueContext = GlueContext(sc) 12 | spark = glueContext.spark_session 13 | job = Job(glueContext) 14 | job.init(args['JOB_NAME'], args) 15 | 16 | datasource0 = glueContext.create_dynamic_frame.from_options(connection_type="postgresql", connection_options={"url": "/", "user": "", "password": "","dbtable": ""}) 17 | datasink2 = glueContext.write_dynamic_frame.from_options(frame = datasource0, connection_type = "s3", connection_options = {"path": "s3:///postgres-matview/"}, format = "json", transformation_ctx = "datasink2") 18 | job.commit() 19 | 20 | # Registering the DynamicFrame (here named memberships) as a temporary view within Spark 21 | 22 | memberships.toDF().createOrReplaceTempView("memberships") 23 | spark.sql("select distinct organization_id from memberships").show() 24 | 25 | +--------------------+ 26 | | organization_id| 27 | +--------------------+ 28 | |d56acebe-8fdc-47b...| 29 | |8fa6c3d2-71dc-478...| 30 | +--------------------+ 31 | -------------------------------------------------------------------------------- /glue/recordsize_sizekey_crawler.md: -------------------------------------------------------------------------------- 1 | ** One of my tables, named "raw" and created by the crawler, has the properties below set. It shows "recordCount" as "3554". 2 | 3 | However, when I query the same table from Athena the record count is "25200", so the "recordCount" metadata recorded by the Crawler and the real record count in the source data are different. 4 | 5 | In general, when the crawler runs to determine the table format [1], for some formats it only reads the beginning of the file. Example: for the JSON format it "Reads the beginning of the file to determine format" [1] while determining the schema, and the crawler updates these properties accordingly. 6 | 7 | As it is reading only the beginning of the file, it estimates this metadata for its internal use only. For my example table, I also see averageRecordSize = sizeKey/recordCount, i.e. 3071623/3554 = 864.2720 ~ 864. These calculations are internal to the Crawler and may differ by format. 8 | 9 | 10 | 11 | ++ Sample table "raw" created by Glue crawler.
12 | ============================================= 13 | Table properties - JSON classification 14 | sizeKey 3071623 15 | objectCount 2 16 | UPDATED_BY_CRAWLER thermostat-data-crawler 17 | CrawlerSchemaSerializerVersion 1.0 18 | recordCount 3554 19 | averageRecordSize 864 20 | ============================================= 21 | 22 | ++ Sample Athena query : 23 | ============================================= 24 | SELECT count(*) FROM "awsblogsgluedemo"."raw" 25 | >> Results 26 | _col0 27 | 1 25200 28 | ============================================= 29 | -------------------------------------------------------------------------------- /glue/redshift_from_catalog_and_from_options_read_with_where_clause.py: -------------------------------------------------------------------------------- 1 | connection_options using from_catalog : 2 | 3 | ------------- 4 | connection_options = { 5 | "query": "SELECT * FROM public.users WHERE userid=3", #This should be the schema name and table name in the Redshift source 6 | "aws_iam_role": "arn:aws:iam::my_account_id:role/MyRedshiftRole" 7 | } 8 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "my_glue_database", table_name = "my_glue_table", redshift_tmp_dir = args["TempDir"], additional_options = connection_options, transformation_ctx = "datasource0") 9 | ------------- 10 | 11 | 12 | connection_options using create_dynamic_frame_from_options : 13 | 14 | ------------- 15 | connection_options = { 16 | "url": "jdbc:redshift://redshift-cluster-1.xxxxx.us-east-1.redshift.amazonaws.com:5439/dev", 17 | "query": "SELECT * FROM public.users WHERE userid=3", 18 | "user": "myuser", 19 | "password": "password", 20 | "redshiftTmpDir": "s3://mybucket/tmp/", 21 | "aws_iam_role": "arn:aws:iam::my_account_id:role/MyRedshiftRole" 22 | } 23 | 24 | dynf_records = glueContext.create_dynamic_frame_from_options("redshift", connection_options) 25 | dynf_records.count() 26 | -------------------------------------------------------------------------------- /glue/redshift_to_s3.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from awsglue.context import GlueContext 6 | from awsglue.job import Job 7 | from awsglue.dynamicframe import DynamicFrame 8 | 9 | ## @params: [TempDir, JOB_NAME] 10 | args = getResolvedOptions(sys.argv, ['TempDir','JOB_NAME']) 11 | 12 | sc = SparkContext() 13 | glueContext = GlueContext(sc) 14 | spark = glueContext.spark_session 15 | job = Job(glueContext) 16 | job.init(args['JOB_NAME'], args) 17 | 18 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "red", table_name = "red_ishan_public_temp", redshift_tmp_dir = args["TempDir"],additional_options = {"aws_iam_role": "arn:aws:iam::150139034114:role/glue_full_access_for_s3"} ) 19 | from_catalog(frame, name_space, table_name, redshift_tmp_dir="", transformation_ctx="") 20 | datasource0.printSchema() 21 | datasource0.show() 22 | -------------------------------------------------------------------------------- /glue/sample_glue_native_spark.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark import SparkContext 3 | from pyspark.sql import Row 4 | from pyspark.sql import SQLContext 5 | sc = SparkContext() 6 | sqlContext = SQLContext(sc) 7 | list_p = [('John',19),('Smith',29),('Adam',35),('Henry',50)] 8 | rdd = 
sc.parallelize(list_p) 9 | ppl = rdd.map(lambda x: Row(name=x[0], age=int(x[1]))) 10 | DF_ppl = sqlContext.createDataFrame(ppl) 11 | DF_ppl.show(2) 12 | -------------------------------------------------------------------------------- /glue/setting_glue_param_within_etl.md: -------------------------------------------------------------------------------- 1 | --------------------------- 2 | sc = SparkContext() 3 | glueContext = GlueContext(sc) 4 | spark = glueContext.spark_session 5 | job = Job(glueContext) 6 | job.init(args['JOB_NAME'], args) 7 | sparkSession = spark.builder.config('spark.sql.sources.partitionColumnTypeInference.enabled', False).config('spark.driver.memory', '7g').config('spark.executor.memory', '5g').config('spark.speculation',False).getOrCreate() 8 | --------------------------- 9 | 10 | 11 | Using print("Spark Parameters passed:", spark.sparkContext.getConf().getAll()) in my Glue job, I was able to verify that the parameters were populated correctly. 12 | -------------------------------------------------------------------------------- /glue/solve_skew_issues.md: -------------------------------------------------------------------------------- 1 | As we know, there is no sure-shot solution for handling data skew issues. 2 | 3 | 1. Run the Spark application with Spark 3.x, which supports Adaptive Query Execution (AQE), i.e. query re-optimization that occurs during query execution. This might give better performance compared to Spark 2.x. More on AQE - https://docs.databricks.com/spark/latest/spark-sql/aqe.html#dataframeexplain 4 | 5 | 2. You will need to improve your job's partition count. A simple tool is to run a 'repartition' method on your Spark DF to check where the partition count is low. 6 | 7 | 3. Evaluate how much data each value of the partition key used as the parameter for 'partitionBy' has. You probably have a lot of data for some values and not so much for others (what is known as data skew). What I would recommend is to run the code interactively, executing each statement step by step, retrieve the number of partitions and how they are distributed after running the partitionBy, and see how to alter this number in real time. 8 | 9 | 4. The most popular technique to deal with it is key salting: basically you add random values to your keys so that they are distributed equally across different partitions. Check out these articles on the salting technique and to understand the skew problem in Spark. 10 | 11 | Reference : 12 | 13 | 1.https://itnext.io/handling-data-skew-in-apache-spark-9f56343e58e8 14 | 2.https://www.davidmcginnis.net/post/spark-job-optimization-dealing-with-data-skew 15 | 3.https://michaelheil.medium.com/understanding-common-performance-issues-in-apache-spark-deep-dive-data-skew-e962909f3d07 16 | -------------------------------------------------------------------------------- /glue/spark_errors/connection_to_endpoint._s3_timed_outerr: -------------------------------------------------------------------------------- 1 | In general - this happens if the subnet used with the Glue connection, i.e. "[connection]", is a public subnet and it has an IGW on its routing table. 2 | 3 | When a job is created using the connection, Glue attaches an ENI to the underlying worker and assigns it a private IP from the subnet used in the connection. By default the Secrets Manager endpoint resolves to a public endpoint and hence cannot be reached from the private IP.
4 | 5 | To access an endpoint for any given service from Glue job, you need to follow any one of the below option: 6 | 7 | 1. Create private subnet, use Nat Gateway on private subnet routing table to allow private subnet traffic to internet to a given service endpoint is resolved to public ip. Then create a connection using this subnet and run the job again. 8 | 9 | OR 10 | 11 | 2. Create VPC endpoint for secrets manager for your public subnet. This will allow access to a given service endpoint using private ip from the public subnet. When you use VPC endpoint, the Secret manager endpoint will be resolved to private ip and that will be accessible using private ip from public subnet. 12 | 13 | Please refer [1] and [2] for detail around VPC endpoint for secret manager as an example. 14 | 15 | Reference: 16 | [1] https://docs.aws.amazon.com/secretsmanager/latest/userguide/vpc-endpoint-overview.html 17 | [2] https://aws.amazon.com/blogs/security/how-to-connect-to-aws-secrets-manager-service-within-a-virtual-private-cloud/ 18 | -------------------------------------------------------------------------------- /glue/spark_errors/failed_to_allocate_x_byte_of_direct_memory.md: -------------------------------------------------------------------------------- 1 | 2 | This occurs due to running out of direct memory problem for the spark executors , The problem usually occurs at the shuffle read stage when there is a very large block due to a severe data skew on the shuffle write side. This problem usually occurs in a large number of shuffle operation, the task failed, and then re-implementation, has been circulating until the application failed. 3 | 4 | 5 | https://dzone.com/articles/four-common-reasons-for-fetchfailed-exception-in-a 6 | 7 | https://stackoverflow.com/questions/60808693/spark-shuffle-memory-error-failed-to-allocate-direct-memory 8 | 9 | 10 | https://splice.atlassian.net/browse/SPLICE-2349 11 | 12 | https://gankrin.org/fix-spark-error-org-apache-spark-shuffle-fetchfailedexception-too-large-frame/ 13 | 14 | https://docs.qubole.com/en/latest/troubleshooting-guide/spark-ts/troubleshoot-spark.html 15 | 16 | 17 | 18 | 19 | 20 | Ref: 21 | 22 | [1] https://spark.apache.org/docs/2.3.0/api/java/org/apache/spark/FetchFailed.html 23 | 24 | -------------------------------------------------------------------------------- /glue/spark_errors/missing_an_output_location_for_shuffle.err: -------------------------------------------------------------------------------- 1 | Spark Metadata Fetch Failed Exception: Missing an output location for shuffle 2 | 3 | ERROR STACKTRACE will look like : 4 | 5 | 2020-11-19 19:38:52,874 ERROR executionlogs:128 - g-e77a768b31735ca3fe3ee126fbf6d5ed62dca00d:2020-11-19 19:38:52,871 WARN [task-result-getter-0] scheduler.TaskSetManager (Logging.scala:logWarning(66)): Lost task 0.2 in stage 289.3 (TID 132629, 172.34.154.194, executor 1640): FetchFailed(null, shuffleId=41, mapId=-1, reduceId=0, message= 6 | org.apache.spark.shuffle.MetadataFetchFailedException: Missing an output location for shuffle 41 7 | 8 | Problem definition : 9 | 10 | MetadataFetchFailedException is thrown when a MapOutputTracker on an executor could not find requested shuffle map outputs for partitions in local cache and tried to fetch them remotely from the driver's MapOutputTracker 11 | 12 | That could lead to few conclusions: 13 | 14 | The driver's memory issues 15 | The executors' memory issues 16 | Executors being lost 17 | 18 | Please review the logs looking for issues reported as "Executor lost" INFO 
messages and/or review the web UI's Executors page and see how the executors work 19 | 20 | 21 | RootCause : 22 | 23 | MetadataFetchFailedException usually happens when one executor is suddenly killed or terminated while it holds some shuffle output; when another executor tries to fetch the metadata of this shuffle output, the exception happens. 24 | 25 | FetchFailedException usually happens when one executor hosting some shuffle output is too busy or temporarily dead. This can be caused by slow disk IO or network IO, and can be common when you have over 1000 executors. 26 | 27 | 28 | 1. Primarily due to repartitioning needs 29 | 2. More memory needed on the worker nodes 30 | 31 | Resolution : 32 | 33 | 1) You could try to run with more partitions (do a repartition on your dataframe). Memory issues typically arise when one or more partitions contain more data than will fit in memory. 34 | 35 | 2) Look in the log files on the failing nodes. You want to look for the text "Killing container". Change the worker type, for example from Standard to G.1X, or from G.1X to G.2X 36 | 37 | 3) --conf spark.blacklist.enabled=true - This can be set either at the job level using the --conf job argument --conf spark.blacklist.enabled=true or in the job script as a SparkConf before creating the SparkContext. 38 | 39 | The blacklist feature will make sure that a task is not rescheduled on the failed executor more than once. This will prevent the race condition from happening. Details on the Spark executor blacklist - https://blog.cloudera.com/blacklisting-in-apache-spark/ 40 | 41 | 4) If the MetadataFetchFailedException occurred in a map stage rather than a reduce stage, just do df_rdd.repartition(nums) before the reduceByKey() 42 | 43 | 44 | 5) You can enable Spark debug-level logs by doing glueContext.sparkContext.setLogLevel("DEBUG") for troubleshooting purposes. 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /glue/spark_errors/spark-sql_time_out.md: -------------------------------------------------------------------------------- 1 | If you are trying to connect to the Glue catalog from EMR by adding the following in /etc/spark/conf/hive-site.xml 2 | 3 | <property> 4 | <name>hive.metastore.client.factory.class</name> 5 | <value>com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory</value> 6 | </property> 7 | 8 | After that, when you start spark-sql from the EMR master node, it hangs and fails with the following error 9 | 10 | Caused by: org.apache.http.conn.ConnectTimeoutException: Connect to glue.us-west-2.amazonaws.com:443 [glue.us-west-2.amazonaws.com/52.34.21.56, glue.us-west-2.amazonaws.com/54.244.180.174, glue.us-west-2.amazonaws.com/52.38.63.163, glue.us-west-2.amazonaws.com/52.27.227.131] failed: connect timed out 11 | 12 | This happens because the EMR cluster is not able to reach the regional endpoints for AWS Glue. 13 | 14 | ============================ 15 | Resolution Steps to Follow 16 | ============================ 17 | 18 | - Check the security groups for the master and slave nodes to see if outbound traffic to port 443 is restricted, and allow traffic to 443 if it is restricted. 19 | - Check NACLs in VPC settings and make sure 443 egress and ephemeral port traffic for ingress is allowed - Refer https://docs.aws.amazon.com/vpc/latest/userguide/vpc-network-acls.html#nacl-ephemeral-ports.
20 | - If the cluster is launched in a private subnet, please make sure there is a NAT gateway attached (for the cluster to communicate with glue using public endpoints/IPs) or a VPC Interface Endpoint (for the cluster to communicate with Glue using private endpoints/IPs) for AWS Glue is created in the VPC where the cluster is located - Please refer https://docs.aws.amazon.com/glue/latest/dg/vpc-endpoint.html. 21 | 22 | You can check the connectivity using Telnet to reach the regional endpoint. 23 | 24 | ====== 25 | [hadoop@ip-xx-0-xx-xx ~]$ telnet glue.us-west-2.amazonaws.com 443 26 | ====== 27 | 28 | Once you're able to connect to the endpoint from Telnet, you can try using spark-sql again. 29 | -------------------------------------------------------------------------------- /glue/spark_errors/unable_to_load_credentials.md: -------------------------------------------------------------------------------- 1 | ======================== 2 | import os 3 | os.environ["AWS_METADATA_SERVICE_TIMEOUT"] = "10" 4 | os.environ["AWS_METADATA_SERVICE_NUM_ATTEMPTS"] = "50" 5 | ======================== 6 | -------------------------------------------------------------------------------- /glue/spark_sql_glue.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from awsglue.context import GlueContext 6 | from awsglue.job import Job 7 | from awsglue.dynamicframe import DynamicFrame 8 | 9 | ## @params: [JOB_NAME] 10 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 11 | sc = SparkContext() 12 | glueContext = GlueContext(sc) 13 | spark = glueContext.spark_session 14 | job = Job(glueContext) 15 | job.init(args['JOB_NAME'], args) 16 | 17 | spark.sql("SHOW TABLES").show() 18 | spark.sql('show databases').show() 19 | spark.catalog.currentDatabase() 20 | spark.sql('show tables from default').show() 21 | -------------------------------------------------------------------------------- /glue/start_glue_etl_via_lambda.py: -------------------------------------------------------------------------------- 1 | # 2 | #Function that starts a Glue job called ProductsETL and is invoked by 3 | #an object creation event on an S3 bucket called productscsvforetl 4 | # 5 | import boto3 6 | 7 | glue = boto3.client('glue') 8 | 9 | def lambda_handler(event, context): 10 | glue.start_job_run( 11 | JobName = 'ProductsETL', 12 | Arguments = { 13 | '--glue_db' : 'productsetl', 14 | '--glue_table_products' : 'productscsv', 15 | '--glue_table_categories' : 'categoriescsv', 16 | '--redshift_db' : 'salesdw', 17 | '--redshift_table' : 'products', 18 | '--s3_error_path' : 's3://productscsvforetl/Errors' 19 | } 20 | ) 21 | 22 | -------------------------------------------------------------------------------- /glue/trigger_glue_job_cross_account_setup.md: -------------------------------------------------------------------------------- 1 | Trigger a crawler in Account B account after successful completion of Glue job in other account A. You can achieve this using CloudWatch Events. In CloudWatch you can make use of Event Buses, which allow to you achieve your current use-case. Please refer to [1] for information on how to set this up. 2 | 3 | This will mainly allow you to trigger a lambda function using CloudWatch Event in Account B after successful completion of Glue ETL job in Account A. 
You can make use of Boto3 Glue API calls [2] to trigger a crawler within your lambda script. 4 | 5 | Steps - 6 | 1. Create a Glue Job in Account A 7 | 8 | 2. In Account B, select Event Buses in CloudWatch console, click on Add Permission, and enter the Account ID of Account A 9 | 10 | 3. Create a CloudWatch rule for the job in Account A as below - 11 | 12 | ======== 13 | { 14 | "source": [ 15 | "aws.glue" 16 | ], 17 | "detail-type": [ 18 | "Glue Job State Change" 19 | ], 20 | "detail": { 21 | "jobName": [ 22 | "YourJobName" 23 | ], 24 | "state": [ 25 | "SUCCEEDED" 26 | ] 27 | } 28 | } 29 | ========= 30 | 31 | And add a target that will send events to the event bus in Account B. 32 | 33 | 4. In Account B, create a Lambda function using Boto3 Glue API calls, which will start the crawler. 34 | 35 | 5. In Account B, create a similar rule as step 3, but for targets, select the Lambda function created in step 4. 36 | 37 | 38 | 39 | References - 40 | [1] Event Buses - https://aws.amazon.com/blogs/aws/new-cross-account-delivery-of-cloudwatch-events/ 41 | [2] Boto3 Glue API calls - https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html 42 | -------------------------------------------------------------------------------- /glue/try_catch_glue.py: -------------------------------------------------------------------------------- 1 | from py4j.protocol import Py4JJavaError 2 | 3 | def write_frame(frame): 4 | frame.write... 5 | 6 | retries = 3 7 | 8 | while retries >= 0: 9 | try: 10 | write_frame(frame) 11 | except Py4JJavaError as je: 12 | stack_trace = je.java_exception.toString() 13 | if "Caused by: java.lang.NullPointerException" in stack_trace: 14 | retries = retries - 1 15 | continue 16 | else: 17 | break 18 | break 19 | -------------------------------------------------------------------------------- /glue/update_crawler_cli.md: -------------------------------------------------------------------------------- 1 | aws glue update-crawler --name ulap-glue-crawler-typeahead --targets '{\"DynamoDBTargets\": [{\"Path\": \"ulap-uat-typeahead\",\"scanAll\": false,\"scanRate\": 1}]}' 2 | -------------------------------------------------------------------------------- /glue/update_glue_boto3_ssl.py: -------------------------------------------------------------------------------- 1 | from pprint import pprint 2 | import boto3 3 | 4 | client = boto3.client('glue') 5 | response = client.update_connection( 6 | Name='testdb', 7 | ConnectionInput={ 8 | 'Name': 'testdb', # Replace with your database name 9 | 'Description': 'created via boto3', 10 | 'ConnectionType': 'JDBC', 11 | 'ConnectionProperties': { 12 | 'JDBC_CONNECTION_URL': 'jdbc:mysql://10.0.0.0:3306/dbname', # Replace with your database URL 13 | 'JDBC_ENFORCE_SSL': 'true',# This Parameter is responsible for checking if SSL is enabled or not 14 | 'PASSWORD': 'pw', 15 | 'USERNAME': 'un' 16 | }, 17 | 'PhysicalConnectionRequirements': { 18 | 'SubnetId': '', # Replace with your subnet ID 19 | 'SecurityGroupIdList': [ 20 | '',# Replace with your security group 21 | ], 22 | 'AvailabilityZone': '' #Replace with your AvailabilityZone 23 | } 24 | } 25 | ) 26 | pprint(response) 27 | -------------------------------------------------------------------------------- /glue/update_glue_jdbc_connection_using_boto3.py: -------------------------------------------------------------------------------- 1 | 2 | # Please replace all the parameter value's as per your use case 3 | 4 | from pprint import pprint 5 | import boto3 6 | 7 | client = 
boto3.client('glue') 8 | response = client.update_connection( 9 | Name='testdb', 10 | ConnectionInput={ 11 | 'Name': 'testdb', # Replace with your database name 12 | 'Description': 'created via boto3', 13 | 'ConnectionType': 'JDBC', 14 | 'ConnectionProperties': { 15 | 'JDBC_CONNECTION_URL': 'jdbc:mysql://10.0.0.0:3306/dbname', # Replace with your database URL 16 | 'JDBC_ENFORCE_SSL': 'true',# This Parameter is responsible for checking if SSL is enabled or not 17 | 'PASSWORD': 'pw', 18 | 'USERNAME': 'un' 19 | }, 20 | 'PhysicalConnectionRequirements': { 21 | 'SubnetId': '', # Replace with your subnet ID 22 | 'SecurityGroupIdList': [ 23 | '',# Replace with your security group 24 | ], 25 | 'AvailabilityZone': '' #Replace with your AvailabilityZone 26 | } 27 | } 28 | ) 29 | pprint(response) 30 | -------------------------------------------------------------------------------- /glue/update_table_api_example.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | client = boto3.client('glue') 3 | 4 | response = client.update_table( 5 | DatabaseName='default', 6 | TableInput={ 7 | 'Name': 'scott_null_issue_pyspark_300241152ec040eab67902c52473bd37', 8 | }, 9 | SkipArchive=True 10 | ) 11 | -------------------------------------------------------------------------------- /glue/upgrade_glue_boto3.py: -------------------------------------------------------------------------------- 1 | # Upload boto3 and awscli wheel file to your S3 bucket. Boto3 and awscli both of these wheel file are available in pypi.org. (https://pypi.org/project/) 2 | #Insert below codes at the beginning of your python script. (The print statements can obviously be omitted) 3 | 4 | import sys 5 | sys.path.insert(0, '/glue/lib/installation') 6 | keys = [k for k in sys.modules.keys() if 'boto' in k] 7 | for k in keys: 8 | if 'boto' in k: 9 | del sys.modules[k] 10 | 11 | import boto3 12 | print('boto3 version') 13 | print(boto3.__version__) 14 | 15 | athena = boto3.client("athena") 16 | res = athena.list_data_catalogs() 17 | -------------------------------------------------------------------------------- /glue/write_DateType_SparkDataFrame.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from pyspark.sql.types import Row, StructType, StructField, StringType, IntegerType, DateType 3 | from pyspark.sql.functions import col, to_date 4 | 5 | schema = StructType([ 6 | StructField('A', IntegerType(), True), 7 | StructField('date', DateType(), True) 8 | ]) 9 | 10 | values=sc.parallelize([(3,'2012-02-02'),(5,'2018-08-08')]) 11 | 12 | rdd= values.map(lambda t: Row(A=t[0],date=datetime.datetime.strptime(t[1], "%Y-%m-%d"))) 13 | 14 | df = sqlContext.createDataFrame(rdd, schema) 15 | 16 | from pyspark.sql.functions import lit 17 | 18 | 19 | df5 = df4.withColumn("name", lit('ishan')) 20 | 21 | df6 = df5.withColumn("roll_id", lit(7)) 22 | -------------------------------------------------------------------------------- /glue/write_excel_using_ExcelWriter.py: -------------------------------------------------------------------------------- 1 | import re 2 | import io 3 | import sys 4 | import boto3 5 | import pandas as pd 6 | import numpy as np 7 | from awsglue.transforms import * 8 | from awsglue.utils import getResolvedOptions 9 | from awsglue.context import GlueContext 10 | from awsglue.job import Job 11 | from io import StringIO 12 | from botocore.exceptions import ClientError 13 | import datetime 14 | from collections import OrderedDict 15 
| import json 16 | from urllib.parse import unquote_plus 17 | from itertools import chain, starmap 18 | from pandas import DataFrame, ExcelWriter 19 | import xlrd 20 | import xlsxwriter 21 | import configparser 22 | from zipfile import ZipFile 23 | from io import BytesIO 24 | import zipfile 25 | import openpyxl 26 | from openpyxl import Workbook 27 | ######################################################################### 28 | """ 29 | # create Workbook object 30 | wb=Workbook() 31 | # set file path 32 | filepath="s3://mybucket/test1.xlsx" 33 | # save workbook 34 | wb.save(filepath) 35 | """ 36 | bucket = 'theegc' 37 | filepath = 'demo1.xlsx' 38 | df = pd.read_csv('s3://mybucket/test1.csv') 39 | df2 = pd.read_csv('s3://mybucket/test2.csv') 40 | for colname in df: 41 | if ( colname == 'originaltid' ): 42 | df.rename(columns={'originaltid': 'tid'}, inplace=True, errors = 'raise')#new 43 | for colname in df2: 44 | if ( colname == 'originaltid' ): 45 | df2.rename(columns={'originaltid': 'tid'}, inplace=True, errors = 'raise')#new 46 | convert_dict = { 'tid': str, 'zip': str } 47 | df = df.astype(convert_dict) 48 | df2 = df2.astype(convert_dict) 49 | df['ZIP_length'] = df.zip.str.len() 50 | df['TID_length'] = df.tid.str.len() 51 | print('Padding ZIP and TID columns') 52 | df['tid'] = np.where( (df['TID_length'] > 1) & (df['TID_length'] < 9) ,df['tid'].apply(lambda x: str(x).rjust(9,"0")),df['tid'] ) 53 | df['zip'] = np.where( (df['ZIP_length'] > 1) & (df['ZIP_length'] < 5) ,df['zip'].apply(lambda x: str(x).rjust(5,"0")),df['zip'] ) 54 | df.drop(['TID_length', 'ZIP_length'], axis = 1,inplace= True) 55 | df2['ZIP_length'] = df2.zip.str.len() 56 | df2['TID_length'] = df2.tid.str.len() 57 | print('Padding ZIP and TID columns') 58 | df2['tid'] = np.where( (df2['TID_length'] > 1) & (df2['TID_length'] < 9) ,df2['tid'].apply(lambda x: str(x).rjust(9,"0")),df2['tid'] ) 59 | df2['zip'] = np.where( (df2['ZIP_length'] > 1) & (df2['ZIP_length'] < 5) ,df2['zip'].apply(lambda x: str(x).rjust(5,"0")),df2['zip'] ) 60 | df2.drop(['TID_length', 'ZIP_length'], axis = 1,inplace= True) 61 | 62 | with io.BytesIO() as output: 63 | with ExcelWriter(output,engine='xlsxwriter', mode='w') as writer: 64 | df.to_excel(writer, sheet_name="Sheet1",index= False) 65 | df2.to_excel(writer, sheet_name="Sheet2",index= False) 66 | data = output.getvalue() 67 | s3 = boto3.resource('s3') 68 | s3.Bucket(bucket).put_object(Key=filepath, Body=data) 69 | -------------------------------------------------------------------------------- /glue/write_sample_partitioned_dataset.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from awsglue.context import GlueContext 6 | from awsglue.job import Job 7 | from pyspark.sql.functions import unix_timestamp 8 | from pyspark.sql.functions import from_unixtime 9 | from pyspark.sql.functions import isnan, when, count, col 10 | from awsglue.dynamicframe import DynamicFrame 11 | from pyspark.sql import SparkSession 12 | sc = SparkContext() 13 | glueContext = GlueContext(sc) 14 | spark = glueContext.spark_session 15 | 16 | job = Job(glueContext) 17 | job.init(job_name, args) 18 | 19 | 20 | 21 | df = sqlContext.createDataFrame([ 22 | (7,"ishan","kompressor","mbenz"), 23 | (14,"john","wrangler","jeep"),], 24 | ["HOUR","NAME","car","brand"]) 25 | 26 | 27 | df.write.parquet("s3://xx-xx-xx/Glue/oge/cars/") 28 | 29 | datasource0 = 
glueContext.create_dynamic_frame_from_options("s3", {'paths': ["s3://xx-xx-xx/Glue/ogenew/"], "recurse":True}, format="parquet",transformation_ctx = "datasource0") 30 | 31 | 32 | datasink2 = glueContext.write_dynamic_frame.from_options(frame = datasource0, connection_type = "s3", connection_options = {"path": "s3://xx-xx-xx/Glue/ogenew_pp/", "partitionKeys": ["brand","car"]}, format = "parquet",transformation_ctx = "datasink2") 33 | 34 | df = sqlContext.createDataFrame([ 35 | (2019,10,7,7,"ishan","chicago"), 36 | (2018,11,8,9,"james","italy"), 37 | (2017,12,9,14,"john","plano"), 38 | (2016,1,10,13,"adam","texas"), 39 | (2015,2,11,12,"chris","mexico"), 40 | (2014,3,12,22,"niel","portland"),], 41 | ["YEAR","MONTH","DAY","HOUR","NAME","CITY"]) 42 | 43 | df.write.parquet("/aws-xx-logs/Glue/glue_bookmark_issue_non_partitioned/") 44 | 45 | 46 | datasource0 = glueContext.create_dynamic_frame_from_options("s3", {'paths': ["s3://aws-xx-logs/Glue/glue_bookmark_issue_non_partitioned/"], "recurse":True}, format="parquet",transformation_ctx = "datasource0") 47 | datasource0.show() 48 | datasource0.printSchema() 49 | 50 | datasink2 = glueContext.write_dynamic_frame.from_options(frame = datasource0, connection_type = "s3", connection_options = {"path": "s3://aws-xx-logs/Glue/glue_bookmark_issue_partitioned/", "partitionKeys": ["YEAR","MONTH","DAY","HOUR"]}, format = "parquet",transformation_ctx = "datasink2") 51 | 52 | #Date partitioned 53 | 54 | df = sqlContext.createDataFrame([ 55 | ("7","ishan","kompressor","mbenz","honda",10.2,"hello","how",3,11.1,11.2,11.3,11.4,11.5,"10-20-2020"), 56 | ("8","rajat","komp","mb","ho",123.2,"bye","wow53",55,55.2,55.3,55.4,55.5,55.6,"11-20-2020"),], 57 | ["cusip","sym_cd","bsym_id","issuer_nm","scrty_ds","cpn_rt","mtrty_dt","num_trades","tot_qty_opb","high_price","low_price","median_px","vwap_px","px_stand_dev_vw","date"]) 58 | -------------------------------------------------------------------------------- /hive/hive-on-tez/container_launced_error.md: -------------------------------------------------------------------------------- 1 | Exit code: 255 2 | Stack trace: org.apache.hadoop.yarn.server.nodemanager.containermanager.runtime.ContainerExecutionException: Launch container failed 3 | at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DefaultLinuxContainerRuntime.launchContainer(DefaultLinuxContainerRuntime.java:113) 4 | at org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.DelegatingLinuxContainerRuntime.launchContainer(DelegatingLinuxContainerRuntime.java:130) 5 | at org.apache.hadoop.yarn.server.nodemanager.LinuxContainerExecutor.launchContainer(LinuxContainerExecutor.java:395) 6 | at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:299) 7 | at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:83) 8 | at java.util.concurrent.FutureTask.run(FutureTask.java:266) 9 | at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149) 10 | at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624) 11 | at java.lang.Thread.run(Thread.java:748) 12 | -------------------------------------------------------------------------------- /hive/hive-on-tez/hive-debug-mode.md: -------------------------------------------------------------------------------- 1 | hive --hiveconf hive.root.logger=DEBUG,console 2 | 
-------------------------------------------------------------------------------- /hive/hive-on-tez/hive_container_launched.md: -------------------------------------------------------------------------------- 1 | Check Hive server logs 2 | 3 | ======Hive server logs===== 4 | 2020-06-16T21:20:14,294 INFO [8c563000-9fde-4280-b549-598dabbaa2ad HiveServer2-HttpHandler-Pool: Thread-35082([])]: session.SessionState (SessionState.java:resetThreadName(436)) - Resetting thread name to HiveServer2-HttpHandler-Pool: Thread-35082 5 | 2020-06-16T21:20:14,351 INFO [HiveServer2-HttpHandler-Pool: Thread-35082([])]: thrift.ThriftHttpServlet (ThriftHttpServlet.java:doPost(145)) - Could not validate cookie sent, will try to generate a new cookie 6 | 2020-06-16T21:20:14,351 INFO [HiveServer2-HttpHandler-Pool: Thread-35082([])]: thrift.ThriftHttpServlet (ThriftHttpServlet.java:doKerberosAuth(398)) - Failed to authenticate with http/_HOST kerberos principal, trying with hive/_HOST kerberos principal 7 | 2020-06-16T21:20:14,351 ERROR [HiveServer2-HttpHandler-Pool: Thread-35082([])]: thrift.ThriftHttpServlet (ThriftHttpServlet.java:doKerberosAuth(406)) - Failed to authenticate with hive/_HOST kerberos principal 8 | 9 | Ask yourself these questions : - 10 | 11 | if you are using Knox+AD to connect Hive Server2, so there are multiple scenarios for such issues as like- 12 | 13 | 1. If you are using same AD domain controllers for multiple service then it might have more loads on it. 14 | 15 | 2. Network lag between KDC and EMR 16 | 17 | 3. Another thing it might be having issue while using the default fetch-size . 18 | 19 | Try the following : - 20 | 21 | 1. Run the query using mr engine (set hive.execution.engine=mr;) 22 | 23 | 2. Run the same query from hive cli 24 | 25 | 3. Change the beeline to debug mode then run the query (Please provide the output of console and application ID) 26 | 27 | sudo cp /etc/hive/conf/beeline-log4j2.properties /home/hadoop 28 | vi beeline-log4j2.properties 29 | ====change below property=== 30 | status = DEBUG (default info) 31 | name = BeelineLog4j2 32 | packages = org.apache.hadoop.hive.ql.log 33 | 34 | # list of properties 35 | property.hive.log.level = DEBUG (default warn) 36 | property.hive.root.logger = console 37 | ===== 38 | 39 | Run beeline 40 | beeline -u "jdbc:hive2://......" --poperty-file "/home/hadoop/beeline-log4j2.properties" 41 | 42 | 4. Try running the same query by increasing the fetch size (set hive.server2.thrift.resultset.max.fetch.size=2000 (default is 1000) 43 | 44 | 5. 
Try running the same query directly using !connect jdbc:hive2://hostip:port/;principal=hive/HOST_@domainname;transportMode=http;httpPath=cliservice 45 | 46 | Note-port for http connection in direct mode will be 1001 (you can verify same from /etc/hive/conf/hive-site.xml hive.server2.thrift.http.port) 47 | 48 | -------------------------------------------------------------------------------- /hive/hive-on-tez/hive_partition_queries.hql: -------------------------------------------------------------------------------- 1 | #creating a hive partitioned table 2 | 3 | create table drop_partition ( 4 | id int, 5 | name string 6 | ) 7 | partitioned by (city string,country string) 8 | location 's3://aws-isgaur-logs/drop_partition/'; 9 | 10 | #Inserting a record into a hive table 11 | 12 | INSERT INTO TABLE emr7 PARTITION (city='sfo',country='usa') values (123,'ishan'); 13 | 14 | #Dropping a partition from a hive table 15 | 16 | alter table employee drop partition ( city='sfo'); 17 | 18 | 19 | CREATE TABLE glue_tony ( 20 | name string ) 21 | PARTITIONED BY ( 22 | year string, 23 | month string, 24 | day string) 25 | LOCATION 26 | 's3://aws-isgaur-logs/glue_pd' 27 | 28 | insert into table glue_tony PARTITION (year='2020',month='01',day='01') values (123); 29 | 30 | alter table drop_partition drop partition ( city='sydney',country='australia'); 31 | 32 | INSERT INTO TABLE drop_partition PARTITION (city='sydney',country='australia') values (777,'sam'); 33 | -------------------------------------------------------------------------------- /hive/hive-on-tez/tez_benefits.md: -------------------------------------------------------------------------------- 1 | Tez is faster because: 2 | 3 | Execute Directed Acyclic Graph (DAG) as a single job in the MapReduce engine. The DAG requires each set of mappers to be followed by one set of reducers. This requirement causes multiple MapReduce jobs to be spun off for each Hive query. Tez doesn't have such constraint and can process complex DAG as one job minimizing job startup overhead. Avoids unnecessary writes. Multiple jobs are used to process the same Hive query in the MapReduce engine. The output of each MapReduce job is written to HDFS for intermediate data. Since Tez minimizes number of jobs for each Hive query, it's able to avoid unnecessary writes. Minimizes start-up delays. Tez is better able to minimize start-up delay by reducing the number of mappers it needs to start and also improving optimization throughout. Reuses containers. Whenever possible Tez will reuse containers to ensure that latency from starting up containers is reduced. Continuous optimization techniques. Traditionally optimization was done during compilation phase. However more information about the inputs is available that allow for better optimization during runtime. Tez uses continuous optimization techniques that allow it to optimize the plan further into the runtime phase. 4 | -------------------------------------------------------------------------------- /hive/hive-on-tez/tez_config.md: -------------------------------------------------------------------------------- 1 | As far as my experience goes , I would focus on these properties to optimize the TEZ workloads . I always recommend setting these TEZ properties at the Hive session level first and monitor the hive query execution progress for a certain number of days ( 3-4 days ) : 2 | 3 | 1. This parameter control the number of mappers for splittable formats with Tez - 4 | 5 | set tez.grouping.min-size = 167772; 6 | 7 | 2. 
Container Size => 8 | 9 | set hive.tez.container.size=10752; 10 | 11 | 3. Heap size => 12 | 13 | set hive.tez.java.opts=-Xmx8600m; 14 | 15 | 4. TEZ Application master and Container Java Heap sizes => 16 | 17 | set tez.am.resource.memory.mb=15360; 18 | set tez.am.launch.cmd-opts=-Xmx12288m; 19 | 20 | References : 21 | 22 | [1] https://community.cloudera.com/t5/Community-Articles/Demystify-Apache-Tez-Memory-Tuning-Step-by-Step/ta-p/245279 23 | [2] https://community.cloudera.com/t5/Community-Articles/Hive-on-Tez-Performance-Tuning-Determining-Reducer-Counts/ta-p/245680 24 | -------------------------------------------------------------------------------- /lake-formation/lake_formation_examples.md: -------------------------------------------------------------------------------- 1 | https://tokern.io/blog/lake-formation-permissions/ 2 | -------------------------------------------------------------------------------- /lake-formation/setting_revoking_lf_permissions.md: -------------------------------------------------------------------------------- 1 | As an Example - Create a crawler, crawling the location that your table was located at, and provide a prefix of aws_test to the table to differentiate the table from the existing table. 2 | 3 | On running the crawler, check to see the table getting created in Glue under the database xxxxx And the corresponding table(aws_test_xxxxxxxx) should also be visible in LakeFormation with an IAMAllowedPrincipals rule on the table. 4 | 5 | In order to revoke the permissions on a table in LakeFormation do :- 6 | 7 | I] Select the respective table(From the Tables tab) --> Actions --> View permissions 8 | 9 | II] Select the respective IAM Entity --> Revoke --> Revoke 10 | 11 | In order to grant an IAM Entity, the permissions on a table in LakeFormation do :- 12 | 13 | I] Select the respective table(From the Tables tab) --> Actions --> View permissions 14 | 15 | II] Grant --> Choose IAM Principals to add --> Database(select the respective Database) --> Select the respective table (column optional, i.e. you can also mention the specific columns by default the permissions that will be selected in the "Table permissions" will be acting on all the columns present on the table) 16 | 17 | Following the above steps you can:- 18 | 19 | 1. Remove all the permissions from the table and queried in Athena, this results in failed query due to insufficient LF permissions. 20 | 21 | 2. Add the IAMAllowedPrincipals on the table and you were able to query the table from Athena. 22 | 23 | IAMAllowedPrincipals allows all the IAM entities that have the respective S3, Athena and Glue permissions to successfully query the table and effectively removes the necessity of allowing permissions by adding rules in LakeFormation. 
24 | -------------------------------------------------------------------------------- /lambda/load_to_rds.py: -------------------------------------------------------------------------------- 1 | # 2 | #Lambda function used to write inbound IoT sensor data to RDS PostgeSQL database 3 | # 4 | import sys 5 | import os 6 | import json 7 | import psycopg2 8 | 9 | #Connect to PostgreSQL database and insert sensor data record 10 | def handler(event, context): 11 | 12 | try: 13 | conn = psycopg2.connect(host=os.environ['rds_host'], port=os.environ['rds_port'], 14 | dbname=os.environ['rds_dbname'], user=os.environ['rds_username'], 15 | password=os.environ['rds_password']) 16 | conn.autocommit = True 17 | 18 | cur = conn.cursor() 19 | 20 | cur.execute('insert into "SensorData" ("DeviceID", "DateTime", "Temperature", "Humidity", ' 21 | '"WindDirection", "WindIntensity", "RainHeight") values (%s, %s, %s, %s, %s, %s, %s)', 22 | (event['deviceid'], event['datetime'], event['temperature'], event['humidity'], 23 | event['windDirection'], event['windIntensity'], event['rainHeight'])) 24 | cur.close() 25 | 26 | #No except statement is used since any exceptions should fail the function so that the 27 | #failed message is sent to the SQS destination configured for the Lambda function 28 | finally: 29 | try: 30 | conn.close() 31 | except: 32 | pass 33 | 34 | #Used when testing from the Linux command line 35 | #if __name__== "__main__": 36 | # handler(None, None) 37 | -------------------------------------------------------------------------------- /lambda/trigger_glue_job.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | 4 | def lambda_handler(event, context): 5 | # TODO implement 6 | print( "Ishan Glue Lambda") 7 | client = boto3.client('glue') 8 | client.start_job_run(JobName = 'MovieDataRawToRefine',Arguments = {} ) 9 | return { 10 | 'statusCode': 200, 11 | 'body': json.dumps('Hello from Lambda!') 12 | } 13 | -------------------------------------------------------------------------------- /s3/s3_deny_policy_explained.md: -------------------------------------------------------------------------------- 1 | { 2 | "Sid": "DenyPublicReadACL", 3 | "Effect": "Deny", 4 | "Principal": { 5 | "AWS": "*" 6 | }, 7 | "Action": [ 8 | "s3:PutObject", 9 | "s3:PutObjectAcl" 10 | ], 11 | "Resource": "arn:aws:s3:::examplebucket/*", 12 | "Condition": { 13 | "StringEquals": { 14 | "s3:x-amz-acl": [ 15 | "public-read", 16 | "public-read-write", 17 | "authenticated-read" 18 | ] 19 | } 20 | } 21 | } 22 | 23 | 24 | 25 | “Deny any Amazon S3 request to PutObject or PutObjectAcl in the bucket examplebucket when the request includes one of the following access control lists (ACLs): public-read, public-read-write, or authenticated-read.” 26 | 27 | Instead, IAM evaluates first if there is an explicit Deny. If there is not, IAM continues to evaluate if you have an explicit Allow and then you have an implicit Deny. 28 | -------------------------------------------------------------------------------- /spark_configs/Estimating_memory.md: -------------------------------------------------------------------------------- 1 | Estimating Memory and CPU utilization for Spark jobs : 2 | 3 | Analytically calculating memory and CPU utilization for each Spark job is not a straightforward process. However, to estimate resources for troubleshooting purposes, the following methodology typically helps. 
4 | 5 | The following example illustrates the use of cores and executors in estimating resources for any job. 6 | 7 | Let's consider an example of a 5-node cluster with 80 cores and 320 GB of memory in total. 8 | Let us assume that each node has 16 cores and 64 GB of memory. 9 | 10 | The math justifying the above estimates is as follows: 11 | 1) Let's reserve 2 cores and 8 GB per machine for the OS and system daemons (that leaves 70 cores and 280 GB for Spark). 12 | 13 | 2) As a rule of thumb, use 3 - 5 threads per executor when reading from the file system. Assume 3; that gives 3 cores per executor. 14 | 15 | --executor-cores = 3 16 | 17 | 18 | 3) Per node we have 14 cores; to be on the safe side, subtract 1 core for the Application Master and divide the rest by the cores per executor. The number of executors per node is then (14 - 1) / 3 = 4 (rounded down). 19 | We have 5 nodes, so: 20 | 21 | --num-executors = 20 22 | 23 | 24 | Note that 3 cores * 4 executors means that potentially 12 threads per machine are reading from the file system. 25 | 26 | 4) Per node we have 64 - 8 = 56 GB. With 4 executors per node from above, this is 14 GB per executor. 27 | Remove roughly 10% as YARN memory overhead, leaving about 12 GB. 28 | 29 | --executor-memory = 12g 30 | 31 | 32 | This leads to 20 * 3 = 60 cores and 20 * 12 = 240 GB in total, which leaves some further headroom on the machines. 33 | 34 | You can also start with 4 executor cores; you will then have 3 executors per node (num-executors = 15) and roughly 19 GB of memory per executor (56 GB / 3, before deducting the ~10% overhead). 35 | -------------------------------------------------------------------------------- /spark_configs/spark_submit_config.sh: -------------------------------------------------------------------------------- 1 | --executor-memory=4g 2 | --driver-memory=4g 3 | --conf spark.driver.memoryOverhead=512 4 | --conf spark.executor.memoryOverhead=512 5 | 6 | 7 | Config Set II: fails in the 5th iteration 8 | --driver-memory=5g 9 | --conf spark.driver.memoryOverhead=1000 10 | 11 | 12 | Config Set III: 13 | --executor-memory=5g 14 | --driver-memory=5g 15 | --conf spark.driver.memoryOverhead=1000 16 | --conf spark.executor.memoryOverhead=1000 17 | --conf spark.sql.files.maxPartitionBytes=163421772 18 | --conf spark.sql.hive.caseSensitiveInferenceMode=NEVER_INFER 19 | --conf spark.speculation=false 20 | --conf spark.hadoop.fs.s3.maxRetries=3 21 | --conf spark.hadoop.fs.s3.consistent.retryPolicyType=exponential 22 | -------------------------------------------------------------------------------- /spark_configs/verbose_logs.md: -------------------------------------------------------------------------------- 1 | How do I add verbose logs for the Spark driver and executors? 2 | 3 | Since a Spark application runs on the JVM, both the spark-submit --verbose option and the JVM -verbose:class option are available. 4 | 5 | 6 | Two switches are relevant in this context. The --verbose option provides configuration details, and the -verbose:class option reveals the classes loaded by the driver and executors. This debugging utility helps you trace classpath conflicts on the driver and executors. 7 | 8 | 1) To list the classes loaded by the JVM while running a Java program, use the -verbose:class option. The output is a list of all the classes loaded by the class loader and the source from which each class was loaded. The following is a sample command using the -verbose:class option. 9 | 10 | ./spark-shell --conf "spark.executor.extraJavaOptions=-verbose:class" --conf "spark.driver.extraJavaOptions=-verbose:class" 11 | 12 | 13 | 2) To launch the spark-shell or run a Spark program using spark-submit, use the --verbose option. It outputs fine-grained debugging information, such as where the application loads its configuration from.
14 | 15 | ./spark-shell --verbose 16 | --------------------------------------------------------------------------------
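As a usage note for verbose_logs.md above, the two switches can also be combined in a single spark-submit invocation. A minimal sketch; the application class and jar path are placeholders.

# Sketch only - class name and jar path are placeholders
spark-submit --verbose \
  --conf "spark.driver.extraJavaOptions=-verbose:class" \
  --conf "spark.executor.extraJavaOptions=-verbose:class" \
  --class com.example.MyApp \
  s3://my-bucket/jars/my-app.jar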