├── LICENSE ├── README.md ├── config ├── catalog │ ├── hive.properties │ ├── jmx.properties │ ├── tpcds.properties │ └── tpch.properties └── config.properties ├── packer ├── .atlanrc ├── Makefile ├── base_configs │ ├── config.properties │ ├── env.sh │ ├── jvm.config │ ├── log.properties │ └── node.properties ├── presto.json ├── presto.sh ├── presto_metrics_cloudwatch.service ├── presto_metrics_prometheus.service └── presto_scaling_service.service ├── presto.yaml ├── sample_presto_config.zip └── scripts ├── autoscaling_termination_wait └── lambda_function.py ├── graceful_shutdown_handler └── lambda_function.py └── ha_lambda └── lambda_function.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

Presto on AWS

2 | 3 | 4 | This is a cloudformation template for deploying [Presto](https://prestosql.io) on AWS. It deploys coordinators and workers in an autoscaling group. 5 | 6 | ## Features 7 | 8 | - Graceful shutdown of workers using Autoscaling lifecycle management. Presto worker will not shut down until all the queries finish on that worker. 9 | - Highly available coordinator nodes. 10 | - Autoscaling of presto workers based on presto's memory and CPU usage. 11 | - A cloudwatch and prometheus agent which runs inside presto coordinator to push presto's metrics such as input data, CPU usage, running/blocked/failed queries. 12 | - A query logger which pushes completed queries and their stats to ElasticSearch. 13 | - A presto AMI creation packer script to easily update presto version. 14 | - Logs of presto coordinator and workers available in Cloudwatch. 15 | - Health check in Presto workers to remove unhealthy workers 16 | 17 | ## Architecture 18 | 19 | ![Screen Shot 2020-06-04 at 8.52.37 PM](https://user-images.githubusercontent.com/10682054/83781909-c81ab280-a6ac-11ea-8e48-b36ec631f5ac.png) 20 | 21 | 22 | 23 | ## Pre-requisites 24 | 25 | - A VPC and subnet 26 | - A user with following permissions. // TODO: Add permissions 27 | 28 | ### Modifying the presto configuration 29 | 30 | - New connectors: To add presto connectors (like hive connector, postgres connector etc) configuration to the deployment create a directory with following structure. Add properties file for each connector, zip the directory. Add the connector file copying command into the bootstrap script in CFT. 31 | ``` 32 | ├── catalog 33 | │ ├── hive.properties 34 | │ ├── jmx.properties 35 | │ ├── tpcds.properties 36 | │ └── tpch.properties 37 | └── config.properties 38 | ``` 39 | 40 | - To modify the core configuration such as enabling spill or reserved pool disabling/enabling modify the config.properties file mentioned above. 
Memory based configurations like JVM memory, max memory per node are automatically handled based on selected instances. 41 | 42 | Add the URL of above directory as zip file in `AdditionalConfigsUri` parameter in CFT. 43 | 44 | ### Creating Presto AMI using Packer 45 | 46 | - Go inside `packer` directory and change the parameters of `.atlanrc` file. The presto version is 330 by default. Source AMI is Amazon Linux 2 in the region you want to create the AMI in. 47 | - Run the following command 48 | ```bash 49 | make build_presto_image 50 | ``` 51 | - Change the `presto.json` to modify the AMI further. 52 | - To use this AMI add the AMI ID in the mapping in `presto.yaml` with AMI's region. 53 | 54 | ### Deployment 55 | 56 | The CFT requires following parameters for deployment 57 | - VPC ID: VPC to deploy Presto cluster 58 | - Subnet ID: Subnet to deploy Presto cluster 59 | - Security groups ID: SGs to attach to presto coordinators and workers 60 | - Keyname: Name of the EC2 key pair to use to launch presto machines 61 | - Coordinator Instance type: EC2 Instance type for coordinator 62 | - Coordinator Instance Count: For HA Coordinator deployment set it to 2 else set it to 1. 63 | - Min workers count: Minimum number of EC2 machines in Presto workers ASG 64 | - Max workers count: Maximum number of EC2 machines in Presto workers ASG 65 | - Workers instance type: EC2 Instance type for workers 66 | - Presto Version: Presto version, required for compatibility before and after version 330 67 | - EC2 Root volume size: EBS Volume size (GB) for presto workers and coordinators. Increase the value to a few hundred GBs if you have disk spill based workload. 68 | - Hive IP: Format `thrift://<hive-metastore-host>:9083` 69 | - Elasticsearch Host: Elasticsearch host for query logger to push SQL queries into. 70 | - Elasticsearch Port: Elasticsearch port for query logger to push SQL queries into 71 | - Environment: Identifier for Dev, Production presto clusters. 
72 | 73 | Create the AMI and provide the ID with region in CFT. Now deploy the CFT by following the guide from AWS. 74 | 75 | ### Configuring autoscaling of workers 76 | 77 | You can configure presto workers autoscaling based on metrics from presto like running queries, heap usage etc. These metrics get pushed into Cloudwatch by presto coordinator. You can configure the Cloudwatch alarms and autoscaling based on these Cloudwatch Metrics. 78 | 79 | ### Limitations/Future work 80 | - Add support for TLS in the deployment. 81 | - Graceful shutdown lambda only waits for 1 hour for queries to finish. Add feature to wait to terminate the worker until all the queries finish on that worker. 82 | - High availability feature only switches between standby and live coordinator but doesn't restart the failed coordinator. 83 | - No retention policy configuration for presto logs in Cloudwatch 84 | 85 | ### Contribute 86 | 87 | 1. Fork it 88 | 2. Create your feature branch (`git checkout -b my-new-feature`) 89 | 3. Commit your changes (`git commit -am 'Add some feature'`) 90 | 4. Push to the branch (`git push origin my-new-feature`) 91 | 5. 
Create new Pull Request 92 | -------------------------------------------------------------------------------- /config/catalog/hive.properties: -------------------------------------------------------------------------------- 1 | connector.name=hive-hadoop2 2 | hive.metastore-refresh-interval=1s 3 | hive.metastore-cache-ttl=5s 4 | hive.non-managed-table-writes-enabled = true 5 | hive.max-partitions-per-writers=1000 6 | hive.orc.use-column-names = true 7 | hive.parquet.use-column-names = true 8 | hive.metastore-timeout=5m 9 | -------------------------------------------------------------------------------- /config/catalog/jmx.properties: -------------------------------------------------------------------------------- 1 | connector.name=jmx -------------------------------------------------------------------------------- /config/catalog/tpcds.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpcds -------------------------------------------------------------------------------- /config/catalog/tpch.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpch -------------------------------------------------------------------------------- /config/config.properties: -------------------------------------------------------------------------------- 1 | experimental.spill-enabled=false 2 | experimental.spill-order-by=true 3 | experimental.spill-window-operator=true 4 | experimental.spiller-spill-path=/var/lib/presto/spill/ 5 | experimental.spiller-max-used-space-threshold=0.8 6 | experimental.max-spill-per-node=260GB 7 | experimental.query-max-spill-per-node=150GB 8 | experimental.reserved-pool-enabled=false 9 | query.low-memory-killer.policy=total-reservation-on-blocked-nodes 10 | shutdown.grace-period=60.00m -------------------------------------------------------------------------------- /packer/.atlanrc: 
-------------------------------------------------------------------------------- 1 | AWS_ACCESS_KEY= 2 | AWS_SECRET_KEY= 3 | VPC_ID= 4 | SUBNET_ID= 5 | PRESTO_VERSION=330 6 | AMI_NAME= 7 | SOURCE_AMI=ami-0323c3dd2da7fb37d 8 | REGION=us-east-1 9 | -------------------------------------------------------------------------------- /packer/Makefile: -------------------------------------------------------------------------------- 1 | FILE := .atlanrc 2 | -include ./$(FILE) 3 | 4 | build_presto_image: 5 | @packer build \ 6 | -var 'vpc_id=$(VPC_ID)' \ 7 | -var 'subnet_id=$(SUBNET_ID)' \ 8 | -var 'aws_access_key=$(AWS_ACCESS_KEY)' \ 9 | -var 'aws_secret_key=$(AWS_SECRET_KEY)' \ 10 | -var 'presto_version=$(PRESTO_VERSION)' \ 11 | -var 'ami_name=$(AMI_NAME)' \ 12 | -var 'source_ami=$(SOURCE_AMI)' \ 13 | -var 'region=$(REGION)' \ 14 | presto.json 15 | 16 | validate_config: 17 | @packer validate presto.json 18 | -------------------------------------------------------------------------------- /packer/base_configs/config.properties: -------------------------------------------------------------------------------- 1 | coordinator={{isCoordinator}} 2 | node-scheduler.include-coordinator=false 3 | http-server.http.port=8080 4 | discovery.uri={{coordinatorDiscoveryUri}} -------------------------------------------------------------------------------- /packer/base_configs/env.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/atlanhq/presto-on-aws/a0f31d44b8d9729c9fdbccdb516969c735b148f3/packer/base_configs/env.sh -------------------------------------------------------------------------------- /packer/base_configs/jvm.config: -------------------------------------------------------------------------------- 1 | -server 2 | -Xmx{{jvmMemory}}G 3 | -XX:-UseBiasedLocking 4 | -XX:+UseG1GC 5 | -XX:+ExplicitGCInvokesConcurrent 6 | -XX:+HeapDumpOnOutOfMemoryError 7 | -XX:+UseGCOverheadLimit 8 | -XX:+ExitOnOutOfMemoryError 9 | 
-XX:ReservedCodeCacheSize=512M 10 | -Djdk.attach.allowAttachSelf=true 11 | -------------------------------------------------------------------------------- /packer/base_configs/log.properties: -------------------------------------------------------------------------------- 1 | # Enable verbose logging from Presto 2 | #com.facebook.presto=DEBUG 3 | -------------------------------------------------------------------------------- /packer/base_configs/node.properties: -------------------------------------------------------------------------------- 1 | node.environment={{envName}} 2 | node.id={{instanceId}} 3 | node.data-dir=/var/lib/presto/data 4 | catalog.config-dir=/etc/presto/catalog 5 | plugin.dir=/usr/lib/presto/plugin 6 | node.server-log-file=/var/log/presto/server.log 7 | node.launcher-log-file=/var/log/presto/launcher.log 8 | -------------------------------------------------------------------------------- /packer/presto.json: -------------------------------------------------------------------------------- 1 | { 2 | "variables": { 3 | "subnet_id": "", 4 | "vpc_id": "", 5 | "aws_access_key": "", 6 | "aws_secret_key": "", 7 | "presto_version": "", 8 | "ami_name": "", 9 | "environment": "dev", 10 | "source_ami": "", 11 | "region": "" 12 | }, 13 | "builders": [{ 14 | "type": "amazon-ebs", 15 | "region": "{{ user `region` }}", 16 | "source_ami": "{{ user `source_ami` }}", 17 | "instance_type": "t3a.large", 18 | "ssh_username": "ec2-user", 19 | "ssh_timeout": "5m", 20 | "ami_name": "{{ user `ami_name`}}", 21 | "vpc_id": "{{ user `vpc_id` }}", 22 | "subnet_id": "{{ user `subnet_id` }}", 23 | "force_deregister": true, 24 | "run_tags": { 25 | "Name": "atlan-presto" 26 | }, 27 | "tags": { 28 | "Name": "Atlan-Presto", 29 | "Environment": "{{ user `environment` }}", 30 | "user": "arpit", 31 | "presto_version": "{{ user `presto_version` }}" 32 | }, 33 | "ami_block_device_mappings": [ 34 | { 35 | "device_name": "/dev/xvda", 36 | "volume_size": 8, 37 | "delete_on_termination": 
true 38 | } 39 | ] 40 | }], 41 | "provisioners": [ 42 | { 43 | "type": "shell", 44 | "inline": [ 45 | "sudo mkdir -p /etc/presto", 46 | "sudo mkdir -p /etc/presto_metrics", 47 | "sudo chown -R ec2-user:ec2-user /etc/presto /etc/presto_metrics" 48 | ] 49 | }, 50 | { 51 | "type": "file", 52 | "source": "./presto", 53 | "destination": "/tmp/presto" 54 | }, 55 | { 56 | "type": "file", 57 | "source": "./presto_metrics_prometheus.service", 58 | "destination": "/tmp/presto_metrics_prometheus.service" 59 | }, 60 | { 61 | "type": "file", 62 | "source": "./presto_metrics_cloudwatch.service", 63 | "destination": "/tmp/presto_metrics_cloudwatch.service" 64 | }, 65 | { 66 | "type": "file", 67 | "source": "./presto_scaling_service.service", 68 | "destination": "/tmp/presto_scaling_service.service" 69 | }, 70 | { 71 | "type": "file", 72 | "source": "./base_configs/", 73 | "destination": "/etc/presto" 74 | }, 75 | { 76 | "type": "shell", 77 | "environment_vars": [ 78 | "aws_access_key={{ user `aws_access_key`}}", 79 | "aws_secret_key={{ user `aws_secret_key`}}", 80 | "presto_version={{ user `presto_version`}}" 81 | ], 82 | "scripts": [ 83 | "presto.sh" 84 | ] 85 | } 86 | ] 87 | } 88 | -------------------------------------------------------------------------------- /packer/presto.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | set -euxo 3 | echo "HELLO WORLD" 4 | 5 | version=$presto_version 6 | 7 | echo $version 8 | # Install Java 9 | sudo amazon-linux-extras install java-openjdk11 10 | java -version 11 | sudo yum install -y awslogs aws-cfn-bootstrap 12 | 13 | sudo mkdir -p /usr/lib/presto /var/log/presto /var/lib/presto/data /var/lib/presto/spill /etc/presto_metrics /etc/presto_scaling_service /var/run/presto 14 | 15 | # Install presto 16 | wget -O /tmp/presto-server.tar.gz https://repo1.maven.org/maven2/io/prestosql/presto-server/$version/presto-server-$version.tar.gz 17 | tar -xvf /tmp/presto-server.tar.gz -C /tmp/ 18 | sudo cp -r /tmp/presto-server-$version/* /usr/lib/presto/ 19 | ls /usr/lib/presto 20 | 21 | 22 | sudo chown -R ec2-user:ec2-user /etc/presto /usr/lib/presto /var/lib/presto /var/log/presto /etc/presto_metrics /etc/presto_scaling_service /var/run/presto 23 | 24 | # install presto cli 25 | wget -O /tmp/presto-cli https://repo1.maven.org/maven2/io/prestosql/presto-cli/$version/presto-cli-$version-executable.jar 26 | sudo mv /tmp/presto-cli /usr/local/bin/presto-cli 27 | sudo chmod +x /usr/local/bin/presto-cli 28 | 29 | # create additional services 30 | 31 | # presto metrics prometheus service 32 | 33 | sudo wget -O /usr/local/bin/presto_metrics https://github.com/atlanhq/presto-metrics/releases/download/v1.0.0/presto_metrics_v1.0.0_linux_amd64 34 | sudo chmod +x /usr/local/bin/presto_metrics 35 | sudo chown -R ec2-user:ec2-user /usr/local/bin/presto_metrics 36 | 37 | 38 | cat < /etc/presto_metrics/env.prometheus 39 | PRESTO_HOST=localhost 40 | PRESTO_PORT=8080 41 | SERVICE_NAME=prometheus 42 | STACK_NAME=atlan-presto-test-stack 43 | CLOUDWATCH_NAMESPACE=presto 44 | EOF 45 | 46 | cat < /etc/presto_metrics/env.cloudwatch 47 | PRESTO_HOST=localhost 48 | PRESTO_PORT=8080 49 | SERVICE_NAME=cloudwatch 50 | STACK_NAME=atlan-presto-test-stack 51 | CLOUDWATCH_NAMESPACE=presto 52 | EOF 53 | 54 | sudo touch /etc/default/presto && sudo chown ec2-user:ec2-user 
/etc/default/presto 55 | /usr/bin/printf "PRESTO_OPTS= \ 56 | --pid-file=/var/run/presto/presto.pid \ 57 | --node-config=/etc/presto/node.properties \ 58 | --jvm-config=/etc/presto/jvm.config \ 59 | --config=/etc/presto/config.properties \ 60 | --launcher-log-file=/var/log/presto/launcher.log \ 61 | --server-log-file=/var/log/presto/server.log \ 62 | -Dhttp-server.log.path=/var/log/presto/http-request.log \ 63 | -Dcatalog.config-dir=/etc/presto/catalog 64 | [Install] 65 | WantedBy=default.target 66 | " >> /etc/default/presto 67 | 68 | sudo touch /etc/systemd/system/presto.service && sudo chown ec2-user:ec2-user /etc/systemd/system/presto.service 69 | 70 | /usr/bin/printf " 71 | [Unit] 72 | Description=Presto Server 73 | Documentation=https://prestosql.io/ 74 | After=network-online.target 75 | [Service] 76 | User=ec2-user 77 | Restart=on-failure 78 | Type=forking 79 | PIDFile=/var/run/presto/presto.pid 80 | RuntimeDirectory=presto 81 | EnvironmentFile=/etc/default/presto 82 | ExecStart=/usr/lib/presto/bin/launcher start \$PRESTO_OPTS 83 | ExecStop=/usr/lib/presto/bin/launcher stop \$PRESTO_OPTS 84 | [Install] 85 | WantedBy=default.target 86 | " >> /etc/systemd/system/presto.service 87 | 88 | 89 | sudo cp /tmp/presto_metrics_prometheus.service /etc/systemd/system/presto_metrics_prometheus.service 90 | sudo cp /tmp/presto_metrics_cloudwatch.service /etc/systemd/system/presto_metrics_cloudwatch.service 91 | 92 | sudo systemctl daemon-reload 93 | -------------------------------------------------------------------------------- /packer/presto_metrics_cloudwatch.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Presto Metrics Prometheus exporter 3 | [Service] 4 | User=ec2-user 5 | #change this to your workspace 6 | EnvironmentFile=/etc/presto_metrics/env.cloudwatch 7 | WorkingDirectory=/etc/presto_metrics/ 8 | #path to executable. 
9 | ExecStart=/usr/local/bin/presto_metrics --web.service-name=${SERVICE_NAME} \ 10 | --web.presto-host=${PRESTO_HOST} \ 11 | --web.presto-port=${PRESTO_PORT} \ 12 | --web.stack-name=${STACK_NAME} \ 13 | --web.cloudwatch-namespace=${CLOUDWATCH_NAMESPACE} \ 14 | --web.api-prefix=${API_PREFIX} \ 15 | --web.cloudwatch-region=${CLOUDWATCH_REGION} 16 | SuccessExitStatus=143 17 | TimeoutStopSec=10 18 | Restart=on-failure 19 | RestartSec=5 20 | [Install] 21 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /packer/presto_metrics_prometheus.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Presto Metrics Prometheus exporter 3 | [Service] 4 | User=ec2-user 5 | #change this to your workspace 6 | EnvironmentFile=/etc/presto_metrics/env.prometheus 7 | WorkingDirectory=/etc/presto_metrics/ 8 | #path to executable. 9 | ExecStart=/usr/local/bin/presto_metrics --web.service-name=${SERVICE_NAME} \ 10 | --web.presto-host=${PRESTO_HOST} \ 11 | --web.presto-port=${PRESTO_PORT} \ 12 | --web.stack-name=${STACK_NAME} \ 13 | --web.cloudwatch-namespace=${CLOUDWATCH_NAMESPACE} \ 14 | --web.api-prefix=${API_PREFIX} 15 | 16 | SuccessExitStatus=143 17 | TimeoutStopSec=10 18 | Restart=on-failure 19 | RestartSec=5 20 | [Install] 21 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /packer/presto_scaling_service.service: -------------------------------------------------------------------------------- 1 | [Unit] 2 | Description=Presto Scaling Service 3 | [Service] 4 | User=ec2-user 5 | #change this to your workspace 6 | EnvironmentFile=/etc/presto_scaling_service/env 7 | WorkingDirectory=/etc/presto_scaling_service 8 | #path to executable. 
9 | ExecStart=/usr/local/bin/presto_scaling_service --web.presto-host=${PRESTO_HOST} \ 10 | --web.presto-port=${PRESTO_PORT} \ 11 | --web.workers-asg-name=${PRESTO_WORKERS_ASG_NAME} \ 12 | --web.api-prefix=${API_PREFIX} 13 | SuccessExitStatus=143 14 | TimeoutStopSec=10 15 | Restart=on-failure 16 | RestartSec=5 17 | [Install] 18 | WantedBy=multi-user.target -------------------------------------------------------------------------------- /presto.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | AWSTemplateFormatVersion: '2010-09-09' 3 | Description: 'CloudFormation Template for OpenSource Presto' 4 | Metadata: 5 | AWS::CloudFormation::Interface: 6 | ParameterGroups: 7 | - 8 | Label: 9 | default: "AWS Configuration" 10 | Parameters: 11 | - VPC 12 | - Subnet 13 | - KeyName 14 | - SecurityGroups 15 | - 16 | Label: 17 | default: "Presto Configuration" 18 | Parameters: 19 | - CoordinatorInstanceType 20 | - WorkersInstanceType 21 | - WorkersCount 22 | - Ec2RootVolumeSize 23 | - MaxWorkersCount 24 | - 25 | Label: 26 | default: "Additional Parameters" 27 | Parameters: 28 | - Environment 29 | 30 | Mappings: 31 | RegionMap: 32 | ap-south-1: 33 | PRESTOIMAGE: ami-0bcffb0a9872eb14c 34 | us-east-1: 35 | PRESTOIMAGE: ami-04254dc35836a5c71 36 | 37 | Parameters: 38 | VPC: 39 | Type: 'AWS::EC2::VPC::Id' 40 | #aws-permission @cft ec2:DescribeVpcs 41 | Description: VPC ID 42 | AllowedPattern: ".+" 43 | Subnet: 44 | Type: 'AWS::EC2::Subnet::Id' 45 | #aws-permission @cft ec2:DescribeSubnets 46 | Description: Subnet to use for Presto nodes (must belong to the selected VPC) 47 | AllowedPattern: ".+" 48 | KeyName: 49 | Description: EC2 Key Name 50 | Type: AWS::EC2::KeyPair::KeyName 51 | #aws-permission @cft ec2:DescribeKeyPairs 52 | AllowedPattern: ".+" 53 | SecurityGroups: 54 | Type: 'List' 55 | #aws-permission @cft ec2:DescribeSecurityGroups 56 | Description: 'Security Groups for Presto nodes (e.g: allowing SSH access). 
Must select at least one.' 57 | AllowedPattern: ".+" 58 | CoordinatorInstanceType: 59 | Type: String 60 | Default: m5.large 61 | Description: EC2 instance type of the coordinator 62 | CoordinatorInstanceCount: 63 | Type: String 64 | Default: 1 65 | Description: Number of Coordinator instances to deploy 66 | WorkersInstanceType: 67 | Type: String 68 | Default: m5.large 69 | Description: EC2 instance type of the workers 70 | ElasticsearchHost: 71 | Type: String 72 | Default: dev-admin-search.atlan.com 73 | ElasticsearchPort: 74 | Type: "String" 75 | Default: 443 76 | AdditionalConfigsUri: 77 | Type: "String" 78 | Description: Additional Configuration zip file to use, provide an https s3 public url to fetch the zip file from. 79 | WorkersCount: 80 | Description: Number of dedicated Presto worker nodes (apart from coordinator) to instantiate. 81 | Type: Number 82 | Default: 1 83 | MinValue: 1 84 | MaxWorkersCount: 85 | Description: Number of max dedicated Presto worker nodes. 86 | Type: Number 87 | Default: 5 88 | MinValue: 1 89 | Ec2RootVolumeSize: 90 | Type: String 91 | Default: 100 92 | Description: EC2 root volume size 93 | HiveIPAddress: 94 | Type: String 95 | Default: thrift://emr.dev.services:9083 96 | Description: Hive IP Address 97 | Environment: 98 | Type: String 99 | Description: Presto Launch Environment 100 | AllowedValues: 101 | - dev 102 | - prod 103 | Default: "dev" 104 | PrestoVersion: 105 | Type: String 106 | Default: 330 107 | Description: Presto Version which is being deployed 108 | 109 | Resources: 110 | PrestoSecurityGroup: 111 | Type: "AWS::EC2::SecurityGroup" 112 | #aws-permission @cft ec2:CreateSecurityGroup 113 | #aws-permission @cft ec2:DeleteSecurityGroup 114 | Properties: 115 | GroupDescription: Presto nodes Security Group 116 | VpcId: !Ref VPC 117 | Tags: 118 | - { Key: Name, Value: !Sub "${AWS::StackName}-presto-sg" } 119 | - { Key: "presto:opensource:identification:role", Value: "presto:security-group" } 120 | PrestoHttpsOutboundRule: 
121 | Type: AWS::EC2::SecurityGroupEgress 122 | #aws-permission @cft ec2:AuthorizeSecurityGroupEgress 123 | #aws-permission @cft ec2:RevokeSecurityGroupEgress 124 | Properties: 125 | IpProtocol: tcp 126 | FromPort: '443' 127 | ToPort: '443' 128 | CidrIp: 0.0.0.0/0 129 | GroupId: !GetAtt PrestoSecurityGroup.GroupId 130 | PrestoOutboundRule: 131 | Type: AWS::EC2::SecurityGroupEgress 132 | #aws-permission @cft ec2:AuthorizeSecurityGroupEgress 133 | #aws-permission @cft ec2:RevokeSecurityGroupEgress 134 | Properties: 135 | IpProtocol: tcp 136 | FromPort: '8080' 137 | ToPort: '8080' 138 | DestinationSecurityGroupId: !GetAtt PrestoSecurityGroup.GroupId 139 | GroupId: !GetAtt PrestoSecurityGroup.GroupId 140 | PrestoInboundRule: 141 | Type: AWS::EC2::SecurityGroupIngress 142 | #aws-permission @cft ec2:AuthorizeSecurityGroupIngress 143 | #aws-permission @cft ec2:RevokeSecurityGroupIngress 144 | Properties: 145 | IpProtocol: tcp 146 | FromPort: '8080' 147 | ToPort: '8080' 148 | SourceSecurityGroupId: !GetAtt PrestoSecurityGroup.GroupId 149 | GroupId: !GetAtt PrestoSecurityGroup.GroupId 150 | PrestoClusterIAMRole: 151 | Type: AWS::IAM::Role 152 | #aws-permission @cft iam:CreateRole 153 | #aws-permission @cft iam:DeleteRole 154 | # Condition: CreateIamInstanceProfile 155 | Properties: 156 | RoleName: !Sub ${AWS::StackName}-presto-cluster-iam-role 157 | AssumeRolePolicyDocument: 158 | Statement: 159 | - Effect: Allow 160 | Principal: 161 | Service: [ec2.amazonaws.com, apigateway.amazonaws.com] 162 | Action: ['sts:AssumeRole'] 163 | Policies: 164 | #aws-permission @cft iam:AttachRolePolicy 165 | #aws-permission @cft iam:DeleteRolePolicy 166 | #aws-permission @cft iam:DetachRolePolicy 167 | #aws-permission @cft iam:PutRolePolicy 168 | - PolicyName: !Sub ${AWS::StackName}-presto-cf-policy 169 | PolicyDocument: 170 | Version: "2012-10-17" 171 | Statement: 172 | - Effect: Allow 173 | Action: 174 | - "autoscaling:CompleteLifecycleAction" 175 | - 
"autoscaling:RecordLifecycleActionHeartbeat" 176 | - "autoscaling:DescribeAutoScalingGroups" 177 | - "autoscaling:PutScalingPolicy" 178 | - "autoscaling:DescribeAutoScalingInstances" 179 | - "autoscaling:DescribeLaunchConfigurations" 180 | - "autoscaling:DescribeScalingActivities" 181 | - "autoscaling:UpdateAutoScalingGroup" 182 | - "autoscaling:SetDesiredCapacity" 183 | - "cloudformation:SignalResource" 184 | - "ec2:DescribeInstances" 185 | - "glue:BatchGetPartition" 186 | - "glue:BatchCreatePartition" 187 | - "glue:CreateDatabase" 188 | - "glue:CreateTable" 189 | - "glue:DeleteDatabase" 190 | - "glue:DeletePartition" 191 | - "glue:DeleteTable" 192 | - "glue:GetDatabase" 193 | - "glue:GetDatabases" 194 | - "glue:GetPartition" 195 | - "glue:GetPartitions" 196 | - "glue:GetTable" 197 | - "glue:GetTables" 198 | - "glue:UpdateTable" 199 | - "glue:UpdatePartition" 200 | - "s3:GetObject" 201 | - "s3:ListBucket" 202 | - "s3:PutObject" 203 | - "sqs:ChangeMessageVisibility" 204 | - "sqs:DeleteMessage" 205 | - "sqs:GetQueueUrl" 206 | - "sqs:ReceiveMessage" 207 | - "logs:CreateLogGroup" 208 | - "logs:CreateLogStream" 209 | - "logs:PutLogEvents" 210 | - "logs:PutRetentionPolicy" 211 | - "logs:DescribeLogGroups" 212 | - "logs:DescribeLogStreams" 213 | - "cloudwatch:PutMetricData" 214 | - "ec2:CreateNetworkInterface" 215 | - "ec2:DescribeNetworkInterfaces" 216 | - "ec2:DeleteNetworkInterface" 217 | - "ec2:AttachNetworkInterface" 218 | - "ec2:DetachNetworkInterface" 219 | - "ec2:DescribeNetworkInterfaceAttribute" 220 | Resource: 221 | - "*" 222 | PrestoClusterInstanceProfile: 223 | Type: AWS::IAM::InstanceProfile 224 | #aws-permission @cft iam:CreateInstanceProfile 225 | #aws-permission @cft iam:DeleteInstanceProfile 226 | #aws-permission @cft iam:GetRole 227 | #aws-permission @cft iam:AddRoleToInstanceProfile 228 | #aws-permission @cft iam:RemoveRoleFromInstanceProfile 229 | Properties: 230 | Roles: 231 | - Ref: PrestoClusterIAMRole 232 | CoordinatorENI: 233 | Type: 
'AWS::EC2::NetworkInterface' 234 | #aws-permission @cft ec2:CreateNetworkInterface 235 | #aws-permission @cft ec2:DescribeNetworkInterfaces 236 | #aws-permission @cft ec2:ModifyNetworkInterfaceAttribute 237 | #aws-permission @cft ec2:DeleteNetworkInterface 238 | Properties: 239 | Description: !Sub "${AWS::StackName} coordinator ENI" 240 | GroupSet: !Split 241 | - ',' 242 | - !Join 243 | - ',' 244 | - - !GetAtt PrestoSecurityGroup.GroupId 245 | - !Join 246 | - ',' 247 | - !Ref SecurityGroups 248 | SubnetId: !Ref Subnet 249 | Tags: 250 | - { Key: Name, Value: !Sub "${AWS::StackName}-coordinator-ENI" } 251 | - { Key: "presto:opensource:identification:role", Value: "presto:coordinator-eni" } 252 | 253 | PrestoCoordinatorServerLogGroup: 254 | Type: AWS::Logs::LogGroup 255 | Properties: 256 | LogGroupName: !Sub /prestosql/presto/${AWS::StackName}/coordinators/server 257 | RetentionInDays: 7 258 | PrestoCoordinatorSyslogLogGroup: 259 | Type: AWS::Logs::LogGroup 260 | Properties: 261 | LogGroupName: !Sub /prestosql/presto/${AWS::StackName}/coordinators/syslog 262 | RetentionInDays: 3 263 | PrestoWorkerServerLogGroup: 264 | Type: AWS::Logs::LogGroup 265 | Properties: 266 | LogGroupName: !Sub /prestosql/presto/${AWS::StackName}/workers/server 267 | RetentionInDays: 1 268 | PrestoWorkerSyslogLogGroup: 269 | Type: AWS::Logs::LogGroup 270 | Properties: 271 | LogGroupName: !Sub /prestosql/presto/${AWS::StackName}/workers/syslog 272 | RetentionInDays: 1 273 | 274 | Coordinator: 275 | Type: 'AWS::AutoScaling::LaunchConfiguration' 276 | #aws-permission @cft autoscaling:CreateLaunchConfiguration 277 | #aws-permission @cft autoscaling:DeleteLaunchConfiguration 278 | #aws-permission @cft autoscaling:DescribeLaunchConfigurations 279 | Properties: 280 | InstanceType: !Ref CoordinatorInstanceType 281 | ImageId: !FindInMap [RegionMap, !Ref "AWS::Region", PRESTOIMAGE] 282 | BlockDeviceMappings: 283 | - DeviceName: /dev/xvda 284 | Ebs: 285 | DeleteOnTermination: true 286 | VolumeSize: !Ref 
Ec2RootVolumeSize 287 | VolumeType: gp2 288 | KeyName: !Ref KeyName 289 | #aws-permission @cft iam:GetInstanceProfile 290 | IamInstanceProfile: !Ref PrestoClusterInstanceProfile 291 | SecurityGroups: !Split 292 | - ',' 293 | - !Join 294 | - ',' 295 | - - !GetAtt PrestoSecurityGroup.GroupId 296 | - !Join 297 | - ',' 298 | - !Ref SecurityGroups 299 | UserData: 300 | Fn::Base64: 301 | !Sub | 302 | #!/bin/bash 303 | set -o xtrace # print each command before it runs 304 | 305 | sed -i -e "s/{{isCoordinator}}/true/g" /etc/presto/config.properties 306 | sed -i -e "s#{{coordinatorDiscoveryUri}}#http://localhost:8080#g" /etc/presto/config.properties 307 | echo "" >> /etc/presto/config.properties 308 | echo discovery-server.enabled=true >> /etc/presto/config.properties 309 | 310 | R=$(($(grep MemTotal /proc/meminfo | awk '{print $2}')/1048576)) 311 | X=$(($R*8/10)) 312 | sed -i -e "s/{{jvmMemory}}/$X/g" /etc/presto/jvm.config 313 | sed -i -e "s/{{envName}}/${Environment}/g" /etc/presto/node.properties 314 | sed -i -e "s/{{instanceId}}/$(curl http://169.254.169.254/latest/meta-data/instance-id/)/g" /etc/presto/node.properties 315 | 316 | Z=$(($X*6/10)) 317 | echo "query.max-memory-per-node="$Z"GB" >> /etc/presto/config.properties 318 | 319 | Y=$(($X*7/10)) 320 | echo "query.max-total-memory-per-node="$Y"GB" >> /etc/presto/config.properties 321 | 322 | echo "query.max-memory=1PB" >> /etc/presto/config.properties 323 | echo "query.low-memory-killer.policy=total-reservation-on-blocked-nodes" >> /etc/presto/config.properties 324 | 325 | sudo tee /etc/awslogs/awslogs.conf > /dev/null < /dev/null < /dev/null < /dev/null <> /etc/presto/config.properties 362 | cat /tmp/config/catalog/hive.properties >> /etc/presto/catalog/hive.properties 363 | cp /tmp/config/catalog/jmx.properties /etc/presto/catalog/jmx.properties 364 | cp /tmp/config/catalog/tpch.properties /etc/presto/catalog/tpch.properties 365 | cp /tmp/config/catalog/tpcds.properties /etc/presto/catalog/tpcds.properties 366 | 367 | mkdir -p
/usr/lib/presto/plugin/atlan-audit-logger-presto-experimental/ 368 | rm -rf /usr/lib/presto/plugin/atlan-audit-logger-presto-experimental/* 369 | wget -O /usr/lib/presto/plugin/atlan-audit-logger-presto-experimental/QueryAuditEventListener-1.4-prestosql.jar https://github.com/atlanhq/presto-query-logger/releases/download/v1.3/presto-query-logger-1.3.jar 370 | 371 | mkdir /usr/lib/presto/etc/ 372 | sudo tee /usr/lib/presto/etc/event-listener.properties > /dev/null < 329 )); then 384 | echo "Presto version greater than 329" 385 | apiPrefix='ui/api/' 386 | fi 387 | 388 | 389 | cat <<EOF > /etc/presto_metrics/env.prometheus 390 | PRESTO_HOST=localhost 391 | PRESTO_PORT=8080 392 | SERVICE_NAME=prometheus 393 | STACK_NAME=${AWS::StackName} 394 | CLOUDWATCH_NAMESPACE=presto 395 | API_PREFIX=$apiPrefix 396 | EOF 397 | 398 | cat <<EOF > /etc/presto_metrics/env.cloudwatch 399 | PRESTO_HOST=localhost 400 | PRESTO_PORT=8080 401 | SERVICE_NAME=cloudwatch 402 | STACK_NAME=${AWS::StackName} 403 | CLOUDWATCH_NAMESPACE=presto 404 | CLOUDWATCH_REGION=${AWS::Region} 405 | API_PREFIX=$apiPrefix 406 | EOF 407 | 408 | 409 | systemctl start awslogsd 410 | service presto start 411 | 412 | HTTP_URL="http://localhost:8080/v1/status" 413 | CURL_CMD="curl -w httpcode=%{http_code}" 414 | 415 | # -m, --max-time FOR curl operation 416 | CURL_MAX_CONNECTION_TIMEOUT="-m 5" 417 | 418 | # perform curl operation 419 | 420 | for i in {1..30} 421 | do 422 | sleep 5 423 | CURL_RETURN_CODE=0 424 | CURL_OUTPUT=`$CURL_CMD $CURL_MAX_CONNECTION_TIMEOUT $HTTP_URL 2> /dev/null` || CURL_RETURN_CODE=$?
425 | if [ $CURL_RETURN_CODE -ne 0 ]; then 426 | echo "Curl connection failed with return code - $CURL_RETURN_CODE" 427 | else 428 | echo "Success" 429 | break 430 | fi 431 | done 432 | 433 | 434 | if [ $CURL_RETURN_CODE -ne 0 ]; then 435 | /opt/aws/bin/cfn-signal -s 'false' --stack ${AWS::StackName} --resource Coordinators --region ${AWS::Region} 436 | else 437 | systemctl start presto_metrics_prometheus.service 438 | systemctl start presto_metrics_cloudwatch.service 439 | /opt/aws/bin/cfn-signal -s 'true' --stack ${AWS::StackName} --resource Coordinators --region ${AWS::Region} 440 | 441 | fi 442 | Coordinators: 443 | Type: 'AWS::AutoScaling::AutoScalingGroup' 444 | #aws-permission @cft autoscaling:CreateAutoScalingGroup 445 | #aws-permission @cft autoscaling:DeleteAutoScalingGroup 446 | #aws-permission @cft autoscaling:DescribeAutoScalingGroups 447 | #aws-permission @cft autoscaling:UpdateAutoScalingGroup 448 | #aws-permission @cft autoscaling:DescribeScalingActivities 449 | #aws-permission @cft autoscaling:DescribeLaunchConfigurations 450 | #aws-permission @cft autoscaling:DescribeAutoScalingInstances 451 | #aws-permission @cft ec2:CreateTags 452 | #aws-permission @cft ec2:RunInstances 453 | #aws-permission @cft ec2:TerminateInstances 454 | #aws-permission @cft ec2:DescribeInstances 455 | UpdatePolicy: 456 | # Make updates to LaunchConfiguration cause rolling update of coordinators 457 | AutoScalingReplacingUpdate: 458 | WillReplace: true 459 | CreationPolicy: 460 | ResourceSignal: 461 | Timeout: PT15M 462 | Count: !Ref CoordinatorInstanceCount 463 | Properties: 464 | LaunchConfigurationName: !Ref Coordinator 465 | VPCZoneIdentifier: 466 | - !Ref Subnet 467 | MinSize: !Ref CoordinatorInstanceCount 468 | MaxSize: !Ref CoordinatorInstanceCount 469 | DesiredCapacity: !Ref CoordinatorInstanceCount 470 | Tags: 471 | - { Key: Name, Value: !Sub "${AWS::StackName}-presto-coordinator", PropagateAtLaunch: true } 472 | - { Key: "presto:opensource:identification:role", 
Value: "presto:coordinator", PropagateAtLaunch: true } 473 | HealthCheckGracePeriod: 300 474 | HealthCheckType: ELB 475 | LoadBalancerNames: !Split 476 | - ',' 477 | - !Join 478 | - ',' 479 | - - !Ref PrestoCoordinatorsELB 480 | Worker: 481 | Type: 'AWS::AutoScaling::LaunchConfiguration' 482 | #aws-permission @cft autoscaling:CreateLaunchConfiguration 483 | #aws-permission @cft autoscaling:DeleteLaunchConfiguration 484 | #aws-permission @cft autoscaling:DescribeLaunchConfigurations 485 | #aws-permission @cft autoscaling:UpdateAutoScalingGroup 486 | Properties: 487 | InstanceType: !Ref WorkersInstanceType 488 | ImageId: !FindInMap [RegionMap, !Ref "AWS::Region", PRESTOIMAGE] 489 | BlockDeviceMappings: 490 | - DeviceName: /dev/xvda 491 | Ebs: 492 | DeleteOnTermination: true 493 | VolumeSize: !Ref Ec2RootVolumeSize 494 | VolumeType: gp2 495 | KeyName: !Ref KeyName 496 | #aws-permission @cft iam:GetInstanceProfile 497 | IamInstanceProfile: !Ref PrestoClusterInstanceProfile 498 | SecurityGroups: !Split 499 | - ',' 500 | - !Join 501 | - ',' 502 | - - !GetAtt PrestoSecurityGroup.GroupId 503 | - !Join 504 | - ',' 505 | - !Ref SecurityGroups 506 | # When worker has private IP only, following things are problematic: 507 | # - S3 cannot be accessed (can be fixed with NAT box in VPC or "VPC endpoint for S3") 508 | # - EC2 boot is very long, as it includes `yum upgrade` which retries timeouts (can 509 | # be fixed with NAT box in VPC or "repo_upgrade: none" in cloud init) 510 | #AssociatePublicIpAddress: false 511 | UserData: 512 | Fn::Base64: 513 | !Sub | 514 | #!/bin/bash 515 | set -o xtrace # print each command before it runs 516 | 517 | sed -i -e "s/{{isCoordinator}}/false/g" /etc/presto/config.properties 518 | sed -i -e "s#{{coordinatorDiscoveryUri}}#http://${CoordinatorENI.PrimaryPrivateIpAddress}:8080#g" /etc/presto/config.properties 519 | 520 | R=$(($(grep MemTotal /proc/meminfo | awk '{print $2}')/1048576)) 521 | X=$(($R*8/10)) 522 | sed -i -e "s/{{jvmMemory}}/$X/g" /etc/presto/jvm.config 523 | sed -i -e
"s/{{envName}}/${Environment}/g" /etc/presto/node.properties 524 | sed -i -e "s/{{instanceId}}/$(curl http://169.254.169.254/latest/meta-data/instance-id/)/g" /etc/presto/node.properties 525 | 526 | echo "" >> /etc/presto/config.properties 527 | 528 | Z=$(($X*6/10)) 529 | echo "query.max-memory-per-node="$Z"GB" >> /etc/presto/config.properties 530 | 531 | Y=$(($X*7/10)) 532 | echo "query.max-total-memory-per-node="$Y"GB" >> /etc/presto/config.properties 533 | echo "query.low-memory-killer.policy=total-reservation-on-blocked-nodes" >> /etc/presto/config.properties 534 | 535 | sudo tee /etc/awslogs/awslogs.conf > /dev/null < /dev/null < /dev/null <> /etc/presto/config.properties 568 | cat /tmp/config/catalog/hive.properties >> /etc/presto/catalog/hive.properties 569 | cp /tmp/config/catalog/jmx.properties /etc/presto/catalog/jmx.properties 570 | cp /tmp/config/catalog/tpch.properties /etc/presto/catalog/tpch.properties 571 | cp /tmp/config/catalog/tpcds.properties /etc/presto/catalog/tpcds.properties 572 | 573 | mkdir /usr/lib/presto/plugin/atlan-audit-logger-presto-experimental/ 574 | mkdir /usr/lib/presto/etc/ 575 | rm -rf /usr/lib/presto/plugin/atlan-audit-logger-presto-experimental/* 576 | wget -O /usr/lib/presto/plugin/atlan-audit-logger-presto-experimental/QueryAuditEventListener-1.4-prestosql.jar https://athena-cloudformation-templates.s3.ap-south-1.amazonaws.com/unilever/config/QueryAuditEventListener-1.4-prestosql-jar-with-dependencies.jar 577 | 578 | sudo tee /usr/lib/presto/etc/event-listener.properties > /dev/null < 329 )); then 590 | echo "Presto version greater than 329" 591 | apiPrefix='ui/api/' 592 | fi 593 | 594 | 595 | cat <<EOF > /etc/presto_metrics/env.prometheus 596 | PRESTO_HOST=localhost 597 | PRESTO_PORT=8080 598 | SERVICE_NAME=prometheus 599 | STACK_NAME=${AWS::StackName} 600 | CLOUDWATCH_NAMESPACE=presto 601 | API_PREFIX=$apiPrefix 602 | EOF 603 | 604 | cat <<EOF > /etc/presto_metrics/env.cloudwatch 605 | PRESTO_HOST=localhost 606 | PRESTO_PORT=8080 607
| SERVICE_NAME=cloudwatch 608 | STACK_NAME=${AWS::StackName} 609 | CLOUDWATCH_NAMESPACE=presto 610 | CLOUDWATCH_REGION=${AWS::Region} 611 | API_PREFIX=$apiPrefix 612 | EOF 613 | systemctl start awslogsd 614 | service presto start 615 | 616 | HTTP_URL="http://localhost:8080/v1/status" 617 | CURL_CMD="curl -w httpcode=%{http_code}" 618 | 619 | # -m, --max-time FOR curl operation 620 | CURL_MAX_CONNECTION_TIMEOUT="-m 5" 621 | 622 | # perform curl operation 623 | 624 | for i in {1..30} 625 | do 626 | sleep 5 627 | CURL_RETURN_CODE=0 628 | CURL_OUTPUT=`$CURL_CMD $CURL_MAX_CONNECTION_TIMEOUT $HTTP_URL 2> /dev/null` || CURL_RETURN_CODE=$? 629 | if [ $CURL_RETURN_CODE -ne 0 ]; then 630 | echo "Curl connection failed with return code - $CURL_RETURN_CODE" 631 | else 632 | echo "Success" 633 | break 634 | fi 635 | done 636 | 637 | if [ $CURL_RETURN_CODE -ne 0 ]; then 638 | /opt/aws/bin/cfn-signal -s 'false' --stack ${AWS::StackName} --resource Workers --region ${AWS::Region} 639 | else 640 | systemctl start presto_metrics_prometheus.service 641 | systemctl start presto_metrics_cloudwatch.service 642 | /opt/aws/bin/cfn-signal -s 'true' --stack ${AWS::StackName} --resource Workers --region ${AWS::Region} 643 | fi 644 | 645 | Workers: 646 | Type: 'AWS::AutoScaling::AutoScalingGroup' 647 | #aws-permission @cft autoscaling:CreateAutoScalingGroup 648 | #aws-permission @cft autoscaling:DeleteAutoScalingGroup 649 | #aws-permission @cft autoscaling:DescribeAutoScalingGroups 650 | #aws-permission @cft autoscaling:UpdateAutoScalingGroup 651 | #aws-permission @cft autoscaling:DescribeScalingActivities 652 | #aws-permission @cft autoscaling:DescribeLaunchConfigurations 653 | #aws-permission @cft autoscaling:DescribeAutoScalingInstances 654 | #aws-permission @cft ec2:CreateTags 655 | #aws-permission @cft ec2:RunInstances 656 | #aws-permission @cft ec2:TerminateInstances 657 | #aws-permission @cft ec2:DescribeInstances 658 | UpdatePolicy: 659 | # Make updates to LaunchConfiguration cause rolling update of workers 660 |
AutoScalingReplacingUpdate: 661 | WillReplace: true 662 | CreationPolicy: 663 | ResourceSignal: 664 | Timeout: PT15M 665 | Count: !Ref WorkersCount 666 | Properties: 667 | LaunchConfigurationName: !Ref Worker 668 | MetricsCollection: 669 | - Granularity: "1Minute" 670 | VPCZoneIdentifier: 671 | - !Ref Subnet 672 | MinSize: !Ref WorkersCount 673 | MaxSize: !Ref MaxWorkersCount 674 | DesiredCapacity: !Ref WorkersCount 675 | Tags: 676 | - { Key: Name, Value: !Sub "${AWS::StackName}-presto-worker", PropagateAtLaunch: true } 677 | - { Key: "presto:opensource:identification:role", Value: "presto:worker", PropagateAtLaunch: true } 678 | HealthCheckGracePeriod: 180 679 | HealthCheckType: ELB 680 | LoadBalancerNames: !Split 681 | - ',' 682 | - !Join 683 | - ',' 684 | - - !Ref PrestoWorkersELB 685 | WorkersScaleUpPolicy: 686 | Type: AWS::AutoScaling::ScalingPolicy 687 | Properties: 688 | AdjustmentType: ChangeInCapacity 689 | AutoScalingGroupName: 690 | Ref: Workers 691 | Cooldown: '60' 692 | ScalingAdjustment: '1' 693 | WorkersScaleDownPolicy: 694 | Type: AWS::AutoScaling::ScalingPolicy 695 | Properties: 696 | AdjustmentType: ChangeInCapacity 697 | AutoScalingGroupName: 698 | Ref: Workers 699 | Cooldown: '60' 700 | ScalingAdjustment: "-1" 701 | WorkersUserCPUAlarmHigh: 702 | Type: AWS::CloudWatch::Alarm 703 | Properties: 704 | AlarmDescription: Scale-up if CPU > 70% for 1 minutes 705 | MetricName: MeanWorkerUserCPUUtilisation 706 | Namespace: presto 707 | Statistic: Average 708 | Period: '120' 709 | EvaluationPeriods: '1' 710 | Threshold: '0.7' 711 | AlarmActions: 712 | - Ref: WorkersScaleUpPolicy 713 | Dimensions: 714 | - Name: prestoStackName 715 | Value: 716 | Ref: "AWS::StackName" 717 | ComparisonOperator: GreaterThanThreshold 718 | WorkersUserCPUAlarmLow: 719 | Type: AWS::CloudWatch::Alarm 720 | Properties: 721 | AlarmDescription: Scale-down if CPU < 50% for 5 minutes 722 | MetricName: MeanWorkerUserCPUUtilisation 723 | Namespace: presto 724 | Statistic: Average 725 | 
Period: '300' 726 | EvaluationPeriods: '1' 727 | Threshold: '0.5' 728 | AlarmActions: 729 | - Ref: WorkersScaleDownPolicy 730 | Dimensions: 731 | - Name: prestoStackName 732 | Value: 733 | Ref: "AWS::StackName" 734 | ComparisonOperator: LessThanThreshold 735 | 736 | WorkersSystemCPUAlarmHigh: 737 | Type: AWS::CloudWatch::Alarm 738 | Properties: 739 | AlarmDescription: Scale-up if CPU > 70% for 1 minutes 740 | MetricName: MeanWorkerSystemCPUUtilisation 741 | Namespace: presto 742 | Statistic: Average 743 | Period: '120' 744 | EvaluationPeriods: '1' 745 | Threshold: '0.7' 746 | AlarmActions: 747 | - Ref: WorkersScaleUpPolicy 748 | Dimensions: 749 | - Name: prestoStackName 750 | Value: 751 | Ref: "AWS::StackName" 752 | ComparisonOperator: GreaterThanThreshold 753 | WorkersSystemCPUAlarmLow: 754 | Type: AWS::CloudWatch::Alarm 755 | Properties: 756 | AlarmDescription: Scale-down if CPU < 50% for 5 minutes 757 | MetricName: MeanWorkerSystemCPUUtilisation 758 | Namespace: presto 759 | Statistic: Average 760 | Period: '300' 761 | EvaluationPeriods: '1' 762 | Threshold: '0.5' 763 | AlarmActions: 764 | - Ref: WorkersScaleDownPolicy 765 | Dimensions: 766 | - Name: prestoStackName 767 | Value: 768 | Ref: "AWS::StackName" 769 | ComparisonOperator: LessThanThreshold 770 | 771 | GracefulNodeShutdownQueueIAMRole: 772 | Type: AWS::IAM::Role 773 | #aws-permission @cft iam:CreateRole 774 | #aws-permission @cft iam:DeleteRole 775 | #Condition: CreateIamInstanceProfile 776 | Properties: 777 | RoleName: !Sub ${AWS::StackName}-graceful-shutdown-iam-role 778 | AssumeRolePolicyDocument: 779 | Statement: 780 | - Effect: Allow 781 | Principal: 782 | Service: 783 | - ec2.amazonaws.com 784 | - lambda.amazonaws.com 785 | Action: ['sts:AssumeRole'] 786 | Policies: 787 | #aws-permission @cft iam:AttachRolePolicy 788 | #aws-permission @cft iam:DeleteRolePolicy 789 | #aws-permission @cft iam:DetachRolePolicy 790 | #aws-permission @cft iam:PutRolePolicy 791 | - PolicyName: !Sub 
${AWS::StackName}-graceful-shutdown-iam-role 792 | PolicyDocument: 793 | Version: "2012-10-17" 794 | Statement: 795 | - Effect: Allow 796 | Action: 797 | - "ec2:DescribeInstances" 798 | - "sqs:ReceiveMessage" 799 | - "sqs:SendMessage" 800 | - "sqs:DeleteMessage" 801 | - "sqs:GetQueueAttributes" 802 | - "sqs:GetQueueUrl" 803 | - "logs:PutLogEvents" 804 | - "logs:CreateLogStream" 805 | - "logs:CreateLogGroup" 806 | - "ec2:CreateNetworkInterface" 807 | - "ec2:DescribeNetworkInterfaces" 808 | - "ec2:DeleteNetworkInterface" 809 | - "ec2:AttachNetworkInterface" 810 | Resource: 811 | - "*" 812 | 813 | AutoScalingTerminationWaitLambdaIAMRole: 814 | Type: AWS::IAM::Role 815 | #aws-permission @cft iam:CreateRole 816 | #aws-permission @cft iam:DeleteRole 817 | #Condition: CreateIamInstanceProfile 818 | Properties: 819 | RoleName: !Sub ${AWS::StackName}-asg-terminate-lambda-iam-role 820 | AssumeRolePolicyDocument: 821 | Statement: 822 | - Effect: Allow 823 | Principal: 824 | Service: 825 | - ec2.amazonaws.com 826 | - lambda.amazonaws.com 827 | Action: ['sts:AssumeRole'] 828 | Policies: 829 | #aws-permission @cft iam:AttachRolePolicy 830 | #aws-permission @cft iam:DeleteRolePolicy 831 | #aws-permission @cft iam:DetachRolePolicy 832 | #aws-permission @cft iam:PutRolePolicy 833 | - PolicyName: !Sub ${AWS::StackName}-asg-terminate-lambda-iam-role 834 | PolicyDocument: 835 | Version: "2012-10-17" 836 | Statement: 837 | - Effect: Allow 838 | Action: 839 | - "autoscaling:CompleteLifecycleAction" 840 | - "sqs:ReceiveMessage" 841 | - "sqs:SendMessage" 842 | - "sqs:GetQueueUrl" 843 | - "sqs:GetQueueAttributes" 844 | - "sqs:DeleteMessage" 845 | - "logs:PutLogEvents" 846 | - "logs:CreateLogStream" 847 | - "logs:CreateLogGroup" 848 | - "ec2:CreateNetworkInterface" 849 | - "ec2:DescribeNetworkInterfaces" 850 | - "ec2:DeleteNetworkInterface" 851 | - "ec2:AttachNetworkInterface" 852 | - "ec2:Describe*" 853 | Resource: 854 | - "*" 855 | 856 | GracefulNodeShutdownQueue: 857 | Type:
'AWS::SQS::Queue' 858 | #aws-permission @cft sqs:CreateQueue 859 | #aws-permission @cft sqs:DeleteQueue 860 | #aws-permission @cft sqs:GetQueueAttributes 861 | #aws-permission @cft sqs:TagQueue 862 | Properties: 863 | # This is required so that threads that handle those messages will not try to process the same message over and over again 864 | # The value is higher than the usual graceful shutdown, so that in most cases there will not be a need to request 865 | # more time. Otherwise if the handling thread is reaching 250s it will tell SQS to keep his message private for longer. 866 | # See AWS docs on VisibilityTimeout for more details. 867 | VisibilityTimeout: 250 868 | AutoScalingTerminationWaitQueue: 869 | Type: 'AWS::SQS::Queue' 870 | Properties: 871 | VisibilityTimeout: 250 872 | 873 | AutoScalingNotificationIAMRole: 874 | Type: 'AWS::IAM::Role' 875 | Properties: 876 | AssumeRolePolicyDocument: 877 | Version: 2012-10-17 878 | Statement: 879 | - Effect: Allow 880 | Principal: 881 | Service: 882 | - autoscaling.amazonaws.com 883 | - ec2.amazonaws.com 884 | Action: 885 | - 'sts:AssumeRole' 886 | Path: / 887 | ManagedPolicyArns: 888 | - arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole 889 | - arn:aws:iam::aws:policy/AmazonEC2ReadOnlyAccess 890 | 891 | GracefulPrestoNodeShutdownHook: 892 | Type: "AWS::AutoScaling::LifecycleHook" 893 | #aws-permission @cft autoscaling:PutLifecycleHook 894 | #aws-permission @cft autoscaling:DeleteLifecycleHook 895 | Properties: 896 | AutoScalingGroupName: !Ref Workers 897 | LifecycleTransition: 'autoscaling:EC2_INSTANCE_TERMINATING' 898 | NotificationTargetARN: !GetAtt GracefulNodeShutdownQueue.Arn 899 | RoleARN: !GetAtt AutoScalingNotificationIAMRole.Arn 900 | HeartbeatTimeout: 3600 901 | DefaultResult: CONTINUE 902 | GracefulNodeShutdownLambda: 903 | Type: "AWS::Lambda::Function" 904 | Properties: 905 | Code: 906 | ZipFile: | 907 | import os 908 | import json 909 | import boto3 910 | from botocore.vendored 
import requests 911 | 912 | def lambda_handler(event, context): 913 | # TODO implement 914 | print(event) 915 | event_body = json.loads(event['Records'][0]["body"]) 916 | if event_body["LifecycleTransition"] != "autoscaling:EC2_INSTANCE_TERMINATING": 917 | print("Not a terminating condition return") 918 | return 919 | ec2_instance_id = event_body["EC2InstanceId"] 920 | instance_id = ec2_instance_id 921 | ec2 = boto3.resource('ec2') 922 | ec2_instance = ec2.Instance(instance_id) 923 | ip = ec2_instance.private_ip_address 924 | for i in range(3): 925 | try: 926 | url = 'http://{}:8080/v1/info/state'.format(ip) 927 | payload = "\"SHUTTING_DOWN\"" 928 | headers = { 929 | 'Content-Type': "application/json", 930 | 'cache-control': "no-cache" 931 | } 932 | 933 | response = requests.request("PUT", url, data=payload, headers=headers) 934 | print(response.text) 935 | except Exception as e: 936 | pass 937 | print(ip) 938 | queue_url = os.getenv('QUEUE_URL') 939 | print(queue_url) 940 | sqs = boto3.client('sqs') 941 | response = sqs.send_message( 942 | QueueUrl=queue_url, 943 | MessageBody=json.dumps(event_body) 944 | ) 945 | print(response) 946 | 947 | return { 948 | 'statusCode': 200, 949 | 'body': json.dumps('Hello from Lambda!') 950 | } 951 | 952 | 953 | Environment: 954 | Variables: 955 | QUEUE_URL: !Ref AutoScalingTerminationWaitQueue 956 | COORDINATOR_IP: !GetAtt CoordinatorENI.PrimaryPrivateIpAddress 957 | Role: !GetAtt GracefulNodeShutdownQueueIAMRole.Arn 958 | Timeout: 10 959 | Handler: index.lambda_handler 960 | Runtime: python3.6 961 | VpcConfig: 962 | SubnetIds: 963 | - !Ref Subnet 964 | SecurityGroupIds: !Split 965 | - ',' 966 | - !Join 967 | - ',' 968 | - - !GetAtt PrestoSecurityGroup.GroupId 969 | - !Join 970 | - ',' 971 | - !Ref SecurityGroups 972 | 973 | GracefulNodeShutdownLambdaEventSourceMapping: 974 | Type: AWS::Lambda::EventSourceMapping 975 | Properties: 976 | BatchSize: 1 977 | EventSourceArn: !GetAtt GracefulNodeShutdownQueue.Arn 978 | FunctionName: 
!GetAtt GracefulNodeShutdownLambda.Arn 979 | DependsOn: 980 | - GracefulNodeShutdownLambda 981 | - GracefulNodeShutdownQueue 982 | 983 | AutoScalingTerminationWaitLambda: 984 | Type: "AWS::Lambda::Function" 985 | Properties: 986 | Code: 987 | ZipFile: | 988 | import json 989 | import boto3 990 | import os 991 | from botocore.vendored import requests 992 | 993 | def lambda_handler(event, context): 994 | # TODO implement 995 | 996 | def enqueue_message(event_body): 997 | queue_url = os.getenv('QUEUE_URL') 998 | print(queue_url) 999 | sqs = boto3.client('sqs') 1000 | response = sqs.send_message( 1001 | QueueUrl=queue_url, 1002 | MessageBody=json.dumps(event_body), 1003 | DelaySeconds=60 1004 | ) 1005 | print(response) 1006 | 1007 | def complete_lifecycle(event_body): 1008 | res = autoscaling.complete_lifecycle_action( 1009 | LifecycleHookName=event_body["LifecycleHookName"], 1010 | AutoScalingGroupName=event_body["AutoScalingGroupName"], 1011 | LifecycleActionToken=event_body["LifecycleActionToken"], 1012 | LifecycleActionResult='CONTINUE' 1013 | ) 1014 | print(res) 1015 | 1016 | event_body = json.loads(event['Records'][0]["body"]) 1017 | if event_body["LifecycleTransition"] != "autoscaling:EC2_INSTANCE_TERMINATING": 1018 | print("Not a terminating condition return") 1019 | return 1020 | ec2_instance_id = event_body["EC2InstanceId"] 1021 | ec2 = boto3.resource("ec2") 1022 | autoscaling = boto3.client('autoscaling') 1023 | ec2_instance = ec2.Instance(ec2_instance_id) 1024 | ip = ec2_instance.private_ip_address 1025 | print(ec2_instance_id) 1026 | request_url = "http://{ip}:8080/v1/task".format(ip=ip, node_id=ec2_instance_id) 1027 | try: 1028 | print(request_url) 1029 | worker_tasks = requests.get(request_url) 1030 | worker_tasks = worker_tasks.json() 1031 | print(len(worker_tasks)) 1032 | for task in worker_tasks: 1033 | if task['taskStatus']['state'] == 'RUNNING': 1034 | print('RUNNING QUEURIES FOUND') 1035 | enqueue_message(event_body) 1036 | return 1037 | 1038 | 
print('NO_QUERIES') 1039 | complete_lifecycle(event_body) 1040 | return 1041 | except Exception as e: 1042 | print(str(e)) 1043 | print("Terminating instance because worker not responding") 1044 | complete_lifecycle(event_body) 1045 | return 1046 | 1047 | 1048 | Environment: 1049 | Variables: 1050 | QUEUE_URL: !Ref AutoScalingTerminationWaitQueue 1051 | COORDINATOR_IP: !GetAtt CoordinatorENI.PrimaryPrivateIpAddress 1052 | Role: !GetAtt AutoScalingTerminationWaitLambdaIAMRole.Arn 1053 | Timeout: 10 1054 | Handler: index.lambda_handler 1055 | Runtime: python3.6 1056 | VpcConfig: 1057 | SubnetIds: 1058 | - !Ref Subnet 1059 | SecurityGroupIds: !Split 1060 | - ',' 1061 | - !Join 1062 | - ',' 1063 | - - !GetAtt PrestoSecurityGroup.GroupId 1064 | - !Join 1065 | - ',' 1066 | - !Ref SecurityGroups 1067 | AutoScalingTerminationWaitLambdaEventSourceMapping: 1068 | Type: AWS::Lambda::EventSourceMapping 1069 | Properties: 1070 | BatchSize: 1 1071 | EventSourceArn: !GetAtt AutoScalingTerminationWaitQueue.Arn 1072 | FunctionName: !GetAtt AutoScalingTerminationWaitLambda.Arn 1073 | DependsOn: 1074 | - AutoScalingTerminationWaitLambda 1075 | - AutoScalingTerminationWaitQueue 1076 | 1077 | AutoScalingNotificationRole: 1078 | Type: 'AWS::IAM::Role' 1079 | Properties: 1080 | AssumeRolePolicyDocument: 1081 | Version: 2012-10-17 1082 | Statement: 1083 | - Effect: Allow 1084 | Principal: 1085 | Service: 1086 | - autoscaling.amazonaws.com 1087 | Action: 1088 | - 'sts:AssumeRole' 1089 | Path: / 1090 | ManagedPolicyArns: 1091 | - arn:aws:iam::aws:policy/service-role/AutoScalingNotificationAccessRole 1092 | 1093 | PrestoWorkersELB: 1094 | Type: AWS::ElasticLoadBalancing::LoadBalancer 1095 | Properties: 1096 | HealthCheck: 1097 | HealthyThreshold: 2 1098 | Interval: 15 1099 | Target: HTTP:8080/v1/status 1100 | Timeout: 10 1101 | UnhealthyThreshold: 2 1102 | Scheme: internal 1103 | Subnets: !Split 1104 | - ',' 1105 | - !Join 1106 | - ',' 1107 | - - !Ref Subnet 1108 | SecurityGroups: !Split 
1109 | - ',' 1110 | - !Join 1111 | - ',' 1112 | - - !GetAtt PrestoSecurityGroup.GroupId 1113 | - !Join 1114 | - ',' 1115 | - !Ref SecurityGroups 1116 | Listeners: 1117 | - InstancePort: 8080 1118 | InstanceProtocol: HTTP 1119 | LoadBalancerPort: 8080 1120 | Protocol: HTTP 1121 | 1122 | PrestoCoordinatorsELB: 1123 | Type: AWS::ElasticLoadBalancing::LoadBalancer 1124 | Properties: 1125 | HealthCheck: 1126 | HealthyThreshold: 2 1127 | Interval: 15 1128 | Target: HTTP:8080/v1/status 1129 | Timeout: 10 1130 | UnhealthyThreshold: 2 1131 | Scheme: internal 1132 | Subnets: !Split 1133 | - ',' 1134 | - !Join 1135 | - ',' 1136 | - - !Ref Subnet 1137 | SecurityGroups: !Split 1138 | - ',' 1139 | - !Join 1140 | - ',' 1141 | - - !GetAtt PrestoSecurityGroup.GroupId 1142 | - !Join 1143 | - ',' 1144 | - !Ref SecurityGroups 1145 | Listeners: 1146 | - InstancePort: 8080 1147 | InstanceProtocol: HTTP 1148 | LoadBalancerPort: 8080 1149 | Protocol: HTTP 1150 | 1151 | HALambdaIamRole: 1152 | Type: AWS::IAM::Role 1153 | #aws-permission @cft iam:CreateRole 1154 | #aws-permission @cft iam:DeleteRole 1155 | Properties: 1156 | RoleName: !Sub ${AWS::StackName}-HA-lambda-role 1157 | AssumeRolePolicyDocument: 1158 | Statement: 1159 | - Effect: Allow 1160 | Principal: 1161 | Service: [lambda.amazonaws.com] 1162 | Action: ['sts:AssumeRole'] 1163 | Policies: 1164 | #aws-permission @cft iam:AttachRolePolicy 1165 | #aws-permission @cft iam:DeleteRolePolicy 1166 | #aws-permission @cft iam:DetachRolePolicy 1167 | #aws-permission @cft iam:PutRolePolicy 1168 | - PolicyName: !Sub ${AWS::StackName}-HA-lambda-policy 1169 | PolicyDocument: 1170 | Version: "2012-10-17" 1171 | Statement: 1172 | - Effect: Allow 1173 | Action: 1174 | - "ec2:DescribeInstances" 1175 | - "ec2:DescribeNetworkInterfaces" 1176 | - "ec2:AttachNetworkInterface" 1177 | - "ec2:DetachNetworkInterface" 1178 | - "ec2:CreateNetworkInterface" 1179 | - "ec2:DeleteNetworkInterface" 1180 | - "logs:CreateLogGroup" 1181 | - "logs:CreateLogStream" 
1182 | - "logs:PutLogEvents" 1183 | Resource: 1184 | - "*" 1185 | HALambda: 1186 | Type: AWS::Lambda::Function 1187 | DependsOn: Coordinators 1188 | Properties: 1189 | Role: !GetAtt HALambdaIamRole.Arn 1190 | Handler: index.lambda_handler 1191 | MemorySize: 128 1192 | Runtime: python3.7 1193 | Code: 1194 | ZipFile: | 1195 | import json 1196 | import boto3 1197 | import http.client 1198 | import time 1199 | import os 1200 | 1201 | def detach_eni_instance(eniAttachmentId): 1202 | client = boto3.client('ec2') 1203 | print("detaching eni {}".format(eniAttachmentId)) 1204 | response = client.detach_network_interface( 1205 | AttachmentId = eniAttachmentId, 1206 | Force = True 1207 | ) 1208 | time.sleep(5) 1209 | print(response) 1210 | print("ENI detached") 1211 | 1212 | def attach_eni_instance(instanceId, eni_id): 1213 | client = boto3.client('ec2') 1214 | print("attaching eni {} to instance {}".format(eni_id, instanceId)) 1215 | response = client.attach_network_interface( 1216 | DeviceIndex=1, 1217 | InstanceId = instanceId, 1218 | NetworkInterfaceId = eni_id, 1219 | ) 1220 | return response 1221 | 1222 | def instance_health(instanceId): 1223 | client = boto3.client('ec2') 1224 | response = client.describe_instances( 1225 | InstanceIds=[ 1226 | instanceId 1227 | ] 1228 | ) 1229 | print("checking health for instance {}".format(instanceId)) 1230 | try: 1231 | conn = http.client.HTTPConnection(response['Reservations'][0]['Instances'][0]['PrivateIpAddress'], 8080) 1232 | conn.request("GET", "/v1/info") 1233 | r1 = conn.getresponse() 1234 | print(r1.status, r1.reason) 1235 | data = json.loads(r1.read().decode('utf-8').replace("'", '"')) 1236 | except Exception as e: 1237 | print("AN EXCEPTION OCCURED", str(e)) 1238 | data = { 1239 | "starting": True 1240 | } 1241 | return data 1242 | # return response 1243 | 1244 | def attach_eni(eni_id): 1245 | client = boto3.client('ec2') 1246 | response = client.describe_instances( 1247 | Filters=[ 1248 | { 1249 | 'Name': 
'tag:presto:opensource:identification:role', 1250 | 'Values': [ 1251 | 'presto:coordinator' 1252 | ] 1253 | }, 1254 | { 1255 | 'Name': 'tag:aws:cloudformation:stack-name', 1256 | 'Values': [ 1257 | os.environ['STACK_NAME'] 1258 | ] 1259 | } 1260 | ] 1261 | ) 1262 | for j in range(len(response['Reservations'])): 1263 | for i in range(len(response['Reservations'][j]['Instances'])): 1264 | if response['Reservations'][j]['Instances'][i]['State']['Name'] != 'running': 1265 | continue 1266 | privateIpAddress = response['Reservations'][j]['Instances'][i]['PrivateIpAddress'] 1267 | instanceId = response['Reservations'][j]['Instances'][i]['InstanceId'] 1268 | print("Found instance to attach {}, {}".format(privateIpAddress, instanceId)) 1269 | try: 1270 | conn = http.client.HTTPConnection(privateIpAddress, 8080) 1271 | conn.request("GET", "/v1/info") 1272 | r1 = conn.getresponse() 1273 | print(privateIpAddress, r1.status, r1.reason) 1274 | data = json.loads(r1.read().decode('utf-8').replace("'", '"')) 1275 | except Exception as e: 1276 | print("AN EXCEPTION OCCURED", str(e)) 1277 | data = { 1278 | "starting": True 1279 | } 1280 | if not data['starting']: 1281 | print("Instance {} is healthy | Attaching ENI to Instance".format(instanceId)) 1282 | print(attach_eni_instance(instanceId, eni_id)) 1283 | break 1284 | else: 1285 | print(instanceId + "Instance is unhealthy ...") 1286 | 1287 | def lambda_handler(event, context): 1288 | client = boto3.resource('ec2') 1289 | network_interface = client.NetworkInterface(os.environ['ENI_ID']) 1290 | print("Network ENI status: ", network_interface.status) 1291 | if network_interface.status == "available": 1292 | print("ENI not attached to any coordinator | Looking for suitable coordinator") 1293 | attach_eni(os.environ['ENI_ID']) 1294 | else: 1295 | print("ENI is attached | Checking health of the coordinator") 1296 | data = instance_health(network_interface.attachment['InstanceId']) 1297 | # data = instance_health("i-0b9a126690a1fe099") 
1298 | if not data['starting']: 1299 | print("Coordinator is healthy | EXITING") 1300 | else: 1301 | print("Coordinator is unhealthy | REPLACING") 1302 | detach_eni_instance(network_interface.attachment['AttachmentId']) 1303 | attach_eni(os.environ['ENI_ID']) 1304 | 1305 | 1306 | VpcConfig: 1307 | SubnetIds: 1308 | - !Ref Subnet 1309 | SecurityGroupIds: !Split 1310 | - ',' 1311 | - !Join 1312 | - ',' 1313 | - - !GetAtt PrestoSecurityGroup.GroupId 1314 | - !Join 1315 | - ',' 1316 | - !Ref SecurityGroups 1317 | Timeout: 60 1318 | Environment: 1319 | Variables: 1320 | ENI_ID: !Ref CoordinatorENI 1321 | STACK_NAME: !Sub ${AWS::StackName} 1322 | HALambdaTriggerRule: 1323 | Type: AWS::Events::Rule 1324 | #aws-permission @cft events:PutRule 1325 | #aws-permission @cft events:DeleteRule 1326 | #aws-permission @cft events:DescribeRule 1327 | #aws-permission @cft events:PutTargets 1328 | #aws-permission @cft events:RemoveTargets 1329 | DependsOn: Coordinators 1330 | Properties: 1331 | ScheduleExpression: rate(1 minute) 1332 | State: ENABLED 1333 | Targets: 1334 | - 1335 | Arn: !GetAtt HALambda.Arn 1336 | Id: 'HALambda' 1337 | PermissionForEventsToInvokeHALambda: 1338 | Type: AWS::Lambda::Permission 1339 | #aws-permission @cft lambda:AddPermission 1340 | #aws-permission @cft lambda:RemovePermission 1341 | Properties: 1342 | FunctionName: 1343 | Ref: HALambda 1344 | Action: 'lambda:InvokeFunction' 1345 | Principal: 'events.amazonaws.com' 1346 | SourceArn: !GetAtt HALambdaTriggerRule.Arn 1347 | 1348 | Outputs: 1349 | PrestoCoordinatorIp: 1350 | Description: Coordinator Instance Ip 1351 | Value: !GetAtt CoordinatorENI.PrimaryPrivateIpAddress 1352 | CoordinatorDashboard: 1353 | Description: Coordinator Dashboard URL 1354 | Value: !Sub "http://${CoordinatorENI.PrimaryPrivateIpAddress}:8080/ui" -------------------------------------------------------------------------------- /sample_presto_config.zip: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/atlanhq/presto-on-aws/a0f31d44b8d9729c9fdbccdb516969c735b148f3/sample_presto_config.zip -------------------------------------------------------------------------------- /scripts/autoscaling_termination_wait/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import os 4 | from botocore.vendored import requests 5 | 6 | def lambda_handler(event, context): 7 | # TODO implement 8 | 9 | def enqueue_message(event_body): 10 | queue_url = os.getenv('QUEUE_URL') 11 | print(queue_url) 12 | sqs = boto3.client('sqs') 13 | response = sqs.send_message( 14 | QueueUrl=queue_url, 15 | MessageBody=json.dumps(event_body), 16 | DelaySeconds=60 17 | ) 18 | print(response) 19 | 20 | def complete_lifecycle(event_body): 21 | res = autoscaling.complete_lifecycle_action( 22 | LifecycleHookName=event_body["LifecycleHookName"], 23 | AutoScalingGroupName=event_body["AutoScalingGroupName"], 24 | LifecycleActionToken=event_body["LifecycleActionToken"], 25 | LifecycleActionResult='CONTINUE' 26 | ) 27 | print(res) 28 | 29 | event_body = json.loads(event['Records'][0]["body"]) 30 | if event_body["LifecycleTransition"] != "autoscaling:EC2_INSTANCE_TERMINATING": 31 | print("Not a terminating condition return") 32 | return 33 | ec2_instance_id = event_body["EC2InstanceId"] 34 | ec2 = boto3.resource("ec2") 35 | autoscaling = boto3.client('autoscaling') 36 | ec2_instance = ec2.Instance(ec2_instance_id) 37 | ip = ec2_instance.private_ip_address 38 | print(ec2_instance_id) 39 | request_url = "http://{ip}:8080/v1/task".format(ip=ip, node_id=ec2_instance_id) 40 | try: 41 | print(request_url) 42 | worker_tasks = requests.get(request_url) 43 | worker_tasks = worker_tasks.json() 44 | print(len(worker_tasks)) 45 | for task in worker_tasks: 46 | if task['taskStatus']['state'] == 'RUNNING': 47 | print('RUNNING QUEURIES FOUND') 48 | enqueue_message(event_body) 49 | return 50 | 51 | 
print('NO_QUERIES') 52 | complete_lifecycle(event_body) 53 | return 54 | except Exception as e: 55 | print(str(e)) 56 | print("Terminating instance because worker not responding") 57 | complete_lifecycle(event_body) 58 | return 59 | -------------------------------------------------------------------------------- /scripts/graceful_shutdown_handler/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import boto3 4 | from botocore.vendored import requests 5 | 6 | def lambda_handler(event, context): 7 | # TODO implement 8 | print(event) 9 | event_body = json.loads(event['Records'][0]["body"]) 10 | if event_body["LifecycleTransition"] != "autoscaling:EC2_INSTANCE_TERMINATING": 11 | print("Not a terminating condition return") 12 | return 13 | ec2_instance_id = event_body["EC2InstanceId"] 14 | instance_id = ec2_instance_id 15 | ec2 = boto3.resource('ec2') 16 | ec2_instance = ec2.Instance(instance_id) 17 | ip = ec2_instance.private_ip_address 18 | for i in range(3): 19 | try: 20 | url = 'http://{}:8080/v1/info/state'.format(ip) 21 | payload = "\"SHUTTING_DOWN\"" 22 | headers = { 23 | 'Content-Type': "application/json", 24 | 'cache-control': "no-cache" 25 | } 26 | 27 | response = requests.request("PUT", url, data=payload, headers=headers) 28 | print(response.text) 29 | except Exception as e: 30 | pass 31 | print(ip) 32 | queue_url = os.getenv('QUEUE_URL') 33 | print(queue_url) 34 | sqs = boto3.client('sqs') 35 | response = sqs.send_message( 36 | QueueUrl=queue_url, 37 | MessageBody=json.dumps(event_body) 38 | ) 39 | print(response) 40 | 41 | return { 42 | 'statusCode': 200, 43 | 'body': json.dumps('Hello from Lambda!') 44 | } 45 | -------------------------------------------------------------------------------- /scripts/ha_lambda/lambda_function.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import http.client 4 | 
import time
import os

# Port the coordinators are probed on.
_PRESTO_PORT = 8080
# Upper bound for a single health probe, so that one unresponsive coordinator
# cannot consume the whole Lambda invocation.
_PROBE_TIMEOUT_SECONDS = 5


def _probe_coordinator(ip_address):
    """Fetch ``/v1/info`` from a coordinator and return the decoded payload.

    Args:
        ip_address: Private IP address of the instance to probe.

    Returns:
        A dict that always contains a ``"starting"`` key. Any failure
        (connection error, timeout, undecodable body, unexpected payload
        shape) is reported as ``{"starting": True}``, which callers treat
        as "unhealthy".
    """
    conn = None
    try:
        conn = http.client.HTTPConnection(
            ip_address, _PRESTO_PORT, timeout=_PROBE_TIMEOUT_SECONDS
        )
        conn.request("GET", "/v1/info")
        r1 = conn.getresponse()
        print(ip_address, r1.status, r1.reason)
        body = r1.read().decode('utf-8')
        try:
            data = json.loads(body)
        except ValueError:
            # Fallback kept for compatibility with the previous behaviour,
            # which always rewrote single quotes before parsing. Doing that
            # unconditionally corrupts valid JSON containing an apostrophe.
            data = json.loads(body.replace("'", '"'))
    except Exception as e:
        # Deliberately broad: this is a best-effort health check and any
        # failure at all means the node cannot be considered healthy.
        print("AN EXCEPTION OCCURED", str(e))
        return {"starting": True}
    finally:
        if conn is not None:
            conn.close()

    if not isinstance(data, dict):
        return {"starting": True}
    # A payload without the flag gives no evidence of health.
    data.setdefault("starting", True)
    return data


def detach_eni_instance(eniAttachmentId):
    """Force-detach the ENI attachment and wait briefly for it to settle.

    Args:
        eniAttachmentId: Attachment id of the ENI to detach.
    """
    client = boto3.client('ec2')
    print("detaching eni {}".format(eniAttachmentId))
    response = client.detach_network_interface(
        AttachmentId=eniAttachmentId,
        Force=True,
    )
    # Fixed pause before the caller tries to re-attach the interface.
    time.sleep(5)
    print(response)
    print("ENI detached")


def attach_eni_instance(instanceId, eni_id):
    """Attach the ENI to the instance as device index 1.

    Args:
        instanceId: EC2 instance id receiving the interface.
        eni_id: Network interface id to attach.

    Returns:
        The raw ``attach_network_interface`` response.
    """
    client = boto3.client('ec2')
    print("attaching eni {} to instance {}".format(eni_id, instanceId))
    return client.attach_network_interface(
        DeviceIndex=1,
        InstanceId=instanceId,
        NetworkInterfaceId=eni_id,
    )


def instance_health(instanceId):
    """Return the ``/v1/info`` payload of the given instance.

    Args:
        instanceId: EC2 instance id of the coordinator to check.

    Returns:
        A dict with at least a ``"starting"`` key; ``{"starting": True}``
        when the instance cannot be resolved or probed.
    """
    client = boto3.client('ec2')
    # API errors propagate on purpose: a failed lookup must not be mistaken
    # for an unhealthy coordinator and trigger a detach.
    response = client.describe_instances(InstanceIds=[instanceId])
    print("checking health for instance {}".format(instanceId))
    try:
        ip_address = response['Reservations'][0]['Instances'][0]['PrivateIpAddress']
    except (IndexError, KeyError) as e:
        print("AN EXCEPTION OCCURED", str(e))
        return {"starting": True}
    return _probe_coordinator(ip_address)


def attach_eni(eni_id):
    """Attach the ENI to the first healthy running coordinator of this stack.

    Candidates are instances tagged ``presto:coordinator`` that belong to
    the CloudFormation stack named by the ``STACK_NAME`` environment
    variable. The search stops at the first successful attach.

    Args:
        eni_id: Network interface id to attach.

    Returns:
        True if the ENI was attached to an instance, False if no healthy
        running coordinator was found.
    """
    client = boto3.client('ec2')
    response = client.describe_instances(
        Filters=[
            {
                'Name': 'tag:presto:opensource:identification:role',
                'Values': ['presto:coordinator'],
            },
            {
                'Name': 'tag:aws:cloudformation:stack-name',
                'Values': [os.environ['STACK_NAME']],
            },
        ]
    )
    for reservation in response['Reservations']:
        for instance in reservation['Instances']:
            if instance['State']['Name'] != 'running':
                continue
            privateIpAddress = instance['PrivateIpAddress']
            instanceId = instance['InstanceId']
            print("Found instance to attach {}, {}".format(privateIpAddress, instanceId))
            data = _probe_coordinator(privateIpAddress)
            if not data['starting']:
                print("Instance {} is healthy | Attaching ENI to Instance".format(instanceId))
                print(attach_eni_instance(instanceId, eni_id))
                # Return instead of break: a break only leaves the inner
                # loop, so instances in later reservations would be offered
                # the already-attached ENI.
                return True
            print(instanceId + "Instance is unhealthy ...")
    print("No healthy running coordinator found for ENI {}".format(eni_id))
    return False


def lambda_handler(event, context):
    """Keep the coordinator ENI attached to a healthy coordinator.

    Reads the interface id from the ``ENI_ID`` environment variable. If the
    interface is unattached it is attached to a healthy coordinator. If it
    is attached, the current holder is health-checked and replaced when it
    is unhealthy.

    Args:
        event: Invocation event (unused).
        context: Lambda context object (unused).
    """
    eni_id = os.environ['ENI_ID']
    network_interface = boto3.resource('ec2').NetworkInterface(eni_id)
    status = network_interface.status
    print("Network ENI status: ", status)
    if status == "available":
        print("ENI not attached to any coordinator | Looking for suitable coordinator")
        attach_eni(eni_id)
        return

    print("ENI is attached | Checking health of the coordinator")
    attachment = network_interface.attachment
    if not attachment:
        # Without attachment details there is nothing to check or detach;
        # leave it to the next invocation.
        print("ENI has no attachment details | EXITING")
        return
    data = instance_health(attachment['InstanceId'])
    if not data['starting']:
        print("Coordinator is healthy | EXITING")
    else:
        print("Coordinator is unhealthy | REPLACING")
        detach_eni_instance(attachment['AttachmentId'])
        attach_eni(eni_id)