├── .gitignore ├── LICENSE ├── README.md ├── assets ├── client_user_data.sh ├── hive-site.xml ├── nginx.conf ├── user_data.sh ├── zeppelin-interpreter.json └── zeppelin-jdbc-0.11.0-SNAPSHOT.jar ├── catalogs ├── blackhole.properties ├── glue.properties ├── hive.properties ├── iceberg.properties ├── jmx.properties ├── memory.properties ├── mysql.properties ├── tpcds.properties └── tpch.properties ├── packer ├── README.md ├── presto.json ├── presto │ ├── install-hive.sh │ ├── install-java.sh │ ├── install-presto-cli.sh │ ├── install-presto.sh │ ├── install-trino-cli.sh │ ├── install-trino.sh │ └── update-machine.sh ├── prestoclients.json ├── prestoclients │ ├── install-redash.sh │ ├── install-superset.sh │ ├── install-zeppelin.sh │ └── update-machine.sh └── variables.json └── terraform-aws ├── README.md ├── clients.tf ├── coordinator.tf ├── disks.tf ├── iam.tf ├── main.tf ├── output.tf ├── variables.tf ├── versions.tf ├── vpc.tf ├── workers-spot.tf └── workers.tf /.gitignore: -------------------------------------------------------------------------------- 1 | # Local .terraform directories 2 | **/.terraform/* 3 | 4 | # .tfstate files 5 | *.tfstate 6 | *.tfstate.* 7 | 8 | # .tfvars files 9 | *.tfvars 10 | .idea/ 11 | *.pem 12 | 13 | # credentials files 14 | gcp-account.json 15 | .gcp-account.json -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Deploying Presto on the Cloud easily 2 | 3 | > [Presto](https://prestosql.io/) is an open source distributed SQL query engine for running interactive analytic queries against data sources of all sizes ranging from gigabytes to petabytes. 4 | 5 | > Presto was designed and written from the ground up for interactive analytics and approaches the speed of commercial data warehouses while scaling to the size of organizations like Facebook. 6 | 7 | ![presto](https://user-images.githubusercontent.com/212252/43953322-43ffedba-9ca1-11e8-9031-2ccf1660c12b.png) 8 | 9 | This repository allows deploying a Presto cluster on the cloud, using best-practices and state of the art tooling. You need to have the latest versions of Terraform and Packer for all features to work correctly. 10 | 11 | Features: 12 | 13 | * Fully functional deployment of Presto in a cluster mode (1 coordinator and multiple workers) 14 | * Spot / Preemptible worker instances supported. 
15 | * Single coordinator-worker node deployment mode supported for testing and experimentation. 16 | * Auto-healing features baked in. 17 | * Easily manage and add more catalogs (connect Presto to more data-sources). 18 | * AWS deployment support (under `terraform-aws`) 19 | * Google Cloud Platform deployment (coming soon) 20 | 21 | ## Usage 22 | 23 | Clone this repo to work locally. You might want to fork it in case you need to apply some additional configurations or commit changes to the variables file. 24 | 25 | Create images with Packer (see `packer` folder in this repo), and then go into the terraform folder and run `terraform init`. See README files in each respective folder for more detailed instructions. 26 | 27 | Once you run `terraform apply` on any of the terraform folders in this repo, a file `terraform.tfstate` will be created. This file contains the mapping between your cloud elements to the terraform configuration. Make sure to keep this file safe. 28 | 29 | See [this guide](https://blog.gruntwork.io/how-to-manage-terraform-state-28f5697e68fa#.fbb2nalw6) for a discussion on `tfstate` management and locking between team members. We highly recommend using dedicated backends for real-world clusters to avoid state loss. 30 | 31 | ## Presto 101 32 | 33 | Presto has a nice UI for viewing cluster operations and currently running queries. It can be accessed from http://presto-coordinator-ip:8080/ui/. 34 | 35 | In order to run queries, you can connect to Presto [via JDBC](https://prestosql.io/docs/current/installation/jdbc.html) or SSH into the coordinator node and use the Presto CLI: 36 | 37 | ```bash 38 | presto --catalog hive --schema default 39 | ``` 40 | 41 | Note the use of Presto's "catalogs". A Catalog in Presto is a definition of a connection to a data-source. A Catalog can be a schema on a MySQL server, an S3 bucket with partitions and schema that is defined in Hive Metastore, data on Kafka or Cassandra, and many other such options. 
The use of Catalogs makes it possible to query and join data from multiple data-sources in one Presto query. 42 | 43 | By default, we enable the local Hive Metastore catalog, and the JMX catalog. To customize or add your own, see the `catalogs` folder. Changes to this folder require running `packer` again. 44 | 45 | See [here](https://prestosql.io/docs/current/overview/concepts.html) for more Presto concepts. 46 | 47 | ## Configuration 48 | 49 | Presto needs to be carefully fine-tuned for best performance, mainly taking good care of memory allocations, number of cores and parallelisation (number of concurrent queries, splits, etc). This can only be achieved through experimentation, but at the base of this deployment is a good starting point for a typical cluster. We will be adding more guidance and more configuration options soon. 50 | 51 | ## Try it out 52 | 53 | The fastest way to test your installation is to follow AWS Athena's examples in https://aws.amazon.com/blogs/big-data/analyzing-data-in-s3-using-amazon-athena/. If you are running on AWS, this should work out-of-the-box, otherwise you will need to specify your AWS credentials in the hive catalog. 
54 | 55 | SSH into the Presto coordinator VM and run the Hive REPL (`$HIVE_HOME/bin/hive`), and within it run the following DDL: 56 | 57 | ```sql 58 | CREATE EXTERNAL TABLE IF NOT EXISTS elb_logs_pq ( 59 | request_timestamp string, 60 | elb_name string, 61 | request_ip string, 62 | request_port int, 63 | backend_ip string, 64 | backend_port int, 65 | request_processing_time double, 66 | backend_processing_time double, 67 | client_response_time double, 68 | elb_response_code string, 69 | backend_response_code string, 70 | received_bytes bigint, 71 | sent_bytes bigint, 72 | request_verb string, 73 | url string, 74 | protocol string, 75 | user_agent string, 76 | ssl_cipher string, 77 | ssl_protocol string ) 78 | PARTITIONED BY(year int, month int, day int) 79 | STORED AS PARQUET 80 | LOCATION 's3a://athena-examples/elb/parquet/' 81 | tblproperties ("parquet.compress"="SNAPPY"); 82 | 83 | msck repair table elb_logs_pq; 84 | ``` 85 | 86 | This will create a partitioned "external" Hive table with data on S3. 
Once done, you can query it via Hive, or you can logout of Hive and query it via the Presto CLI: 87 | 88 | ```sql 89 | SELECT elb_name, 90 | sum(case elb_response_code 91 | WHEN '200' THEN 92 | 1 93 | ELSE 0 end) AS uptime, sum(case elb_response_code 94 | WHEN '404' THEN 95 | 1 96 | ELSE 0 end) AS downtime 97 | FROM elb_logs_pq 98 | GROUP BY elb_name; 99 | ``` 100 | 101 | ```bash 102 | ubuntu@ip-172-31-32-64:~$ presto --catalog hive --schema default 103 | presto:default> [paste query copied from above] 104 | 105 | elb_name | uptime | downtime 106 | --------------+-----------+---------- 107 | elb_demo_004 | 383616619 | 21261503 108 | elb_demo_008 | 383360093 | 21350497 109 | elb_demo_002 | 383632502 | 21300518 110 | elb_demo_009 | 383427076 | 21335844 111 | elb_demo_001 | 383671436 | 21270594 112 | elb_demo_007 | 383490605 | 21303122 113 | elb_demo_005 | 383734702 | 21341740 114 | elb_demo_003 | 383351477 | 21231655 115 | elb_demo_006 | 383506485 | 21336487 116 | (9 rows) 117 | 118 | Query 20180810_121913_00002_s3bz8, FINISHED, 3 nodes 119 | Splits: 2,418 total, 2,418 done (100.00%) 120 | 0:53 [3.84B rows, 2.51GB] [71.7M rows/s, 48MB/s] 121 | ``` 122 | -------------------------------------------------------------------------------- /assets/client_user_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1 4 | 5 | ### SSL Certs 6 | mkdir -p /opt/certs 7 | cat <<'EOF' >/opt/certs/server.crt 8 | ${cert_pem} 9 | EOF 10 | cat <<'EOF' >/opt/certs/server.key 11 | ${key_pem} 12 | EOF 13 | 14 | ### Redash 15 | 16 | export COMPOSE_INTERACTIVE_NO_CLI=1 17 | cd /opt/redash 18 | 19 | sudo -E docker-compose exec -T server ./manage.py users create_root admin@redash admin --password "${admin_password}" 20 | sudo -E docker-compose exec -T server ./manage.py ds new trino --type trino --options '{"host": "${presto_coordinator_host}", 
"username": "admin"}' 21 | 22 | # Redash OAuth setup 23 | # See https://redash.io/help/open-source/admin-guide/google-developer-account-setup 24 | #docker-compose down 25 | #cat <<'EOF' >/opt/redash/env 26 | #REDASH_GOOGLE_CLIENT_ID=# 27 | #REDASH_GOOGLE_CLIENT_SECRET=# 28 | #EOF 29 | #docker-compose up -d 30 | 31 | cd - 32 | 33 | ### Zeppelin 34 | /usr/bin/printf "[users] 35 | admin = ${admin_password}, admin 36 | [main] 37 | sessionManager = org.apache.shiro.web.session.mgt.DefaultWebSessionManager 38 | cookie = org.apache.shiro.web.servlet.SimpleCookie 39 | cookie.name = JSESSIONID 40 | cookie.httpOnly = true 41 | sessionManager.sessionIdCookie = \$cookie 42 | securityManager.sessionManager = \$sessionManager 43 | securityManager.sessionManager.globalSessionTimeout = 86400000 44 | shiro.loginUrl = /api/login 45 | [roles] 46 | admin = * 47 | [urls] 48 | /api/version = anon 49 | /api/interpreter/setting/restart/** = authc 50 | /api/interpreter/** = authc, roles[admin] 51 | /api/configurations/** = authc, roles[admin] 52 | /api/credential/** = authc, roles[admin] 53 | /** = authc 54 | " | sudo tee /opt/zeppelin/conf/shiro.ini 55 | 56 | xmlstarlet ed \ 57 | -u "//property[name='zeppelin.anonymous.allowed']/value" \ 58 | -v false < /opt/zeppelin/conf/zeppelin-site.xml.template | sudo tee /opt/zeppelin/conf/zeppelin-site.xml 59 | 60 | cat /opt/zeppelin/conf/interpreter.json | jq --argfile presto /opt/zeppelin/conf/zeppelin-interpreter-partial.json '.interpreterSettings.presto = $presto' > /tmp/interpreter.json 61 | sed -i 's/PRESTO_HOST/${presto_coordinator_host}:${coordinator_port}/g' /tmp/interpreter.json 62 | sudo mv /tmp/interpreter.json /opt/zeppelin/conf/interpreter.json 63 | sudo rm /opt/zeppelin/conf/zeppelin-interpreter-partial.json 64 | 65 | sudo chown zeppelin:zeppelin /opt/zeppelin/conf -R 66 | sudo service zeppelin restart 67 | 68 | ### Apache Superset 69 | 70 | # Create presto datasource 71 | sudo sed -i -E 
"s/PRESTO_COORDINATOR_HOST/${presto_coordinator_host}/g" /opt/superset/config/presto-datasource.yaml 72 | sudo sed -ie '/^x-superset-volumes/a \ 73 | \ \ - /opt/superset/config/presto-datasource.yaml:/tmp/presto-datasource.yaml\ 74 | ' /opt/superset/docker-compose-non-dev.yml 75 | 76 | #superset import datasources and gunicorn - not yet active 77 | cd /opt/superset 78 | sudo docker-compose -f docker-compose-non-dev.yml up -d 79 | # sudo docker exec -it superset_app superset import-datasources -p /tmp/presto-datasource.yaml 80 | sudo rm /opt/superset/config/presto-datasource.yaml 81 | 82 | 83 | 84 | # SUPERSET_VENV_PATH="/opt/superset/venv" 85 | # apt-get install python3-venv 86 | # python3 -m venv $SUPERSET_VENV_PATH 87 | # . $SUPERSET_VENV_PATH/bin/activate 88 | # 89 | # pip install --upgrade setuptools pip 90 | # pip install gevent 91 | # nohup gunicorn -w 10 \ 92 | # -k gevent \ 93 | # --timeout 120 \ 94 | # -b localhost:6000 \ 95 | # --limit-request-line 0 \ 96 | # --limit-request-field_size 0 \ 97 | # --forwarded-allow-ips="*" \ 98 | # superset:app & 99 | 100 | 101 | # Presto OAuth setup 102 | # See https://superset.incubator.apache.org/faq.html?highlight=oauth#how-can-i-configure-oauth-authentication-and-authorization 103 | #cat <<'EOF' >>/opt/superset/config/superset_config.py 104 | #AUTH_TYPE = AUTH_OAUTH 105 | # 106 | #OAUTH_PROVIDERS = [ 107 | # { 108 | # "name": "twitter", 109 | # "icon": "fa-twitter", 110 | # "remote_app": { 111 | # "consumer_key": os.environ.get("TWITTER_KEY"), 112 | # "consumer_secret": os.environ.get("TWITTER_SECRET"), 113 | # "base_url": "https://api.twitter.com/1.1/", 114 | # "request_token_url": "https://api.twitter.com/oauth/request_token", 115 | # "access_token_url": "https://api.twitter.com/oauth/access_token", 116 | # "authorize_url": "https://api.twitter.com/oauth/authenticate", 117 | # }, 118 | # }, 119 | # { 120 | # "name": "google", 121 | # "icon": "fa-google", 122 | # "token_key": "access_token", 123 | # "remote_app": { 
124 | # "consumer_key": os.environ.get("GOOGLE_KEY"), 125 | # "consumer_secret": os.environ.get("GOOGLE_SECRET"), 126 | # "base_url": "https://www.googleapis.com/oauth2/v2/", 127 | # "request_token_params": {"scope": "email profile"}, 128 | # "request_token_url": None, 129 | # "access_token_url": "https://accounts.google.com/o/oauth2/token", 130 | # "authorize_url": "https://accounts.google.com/o/oauth2/auth", 131 | # }, 132 | # }, 133 | # { 134 | # "name": "azure", 135 | # "icon": "fa-windows", 136 | # "token_key": "access_token", 137 | # "remote_app": { 138 | # "consumer_key": os.environ.get("AZURE_APPLICATION_ID"), 139 | # "consumer_secret": os.environ.get("AZURE_SECRET"), 140 | # "base_url": "https://login.microsoftonline.com/{AZURE_TENANT_ID}/oauth2", 141 | # "request_token_params": { 142 | # "scope": "User.read name preferred_username email profile", 143 | # "resource": os.environ.get("AZURE_APPLICATION_ID"), 144 | # }, 145 | # "request_token_url": None, 146 | # "access_token_url": "https://login.microsoftonline.com/{AZURE_TENANT_ID}/oauth2/token", 147 | # "authorize_url": "https://login.microsoftonline.com/{AZURE_TENANT_ID}/oauth2/authorize", 148 | # }, 149 | # } 150 | #] 151 | #EOF 152 | 153 | 154 | 155 | sudo systemctl restart nginx.service 156 | -------------------------------------------------------------------------------- /assets/hive-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | hive.exec.script.wrapper 6 | 7 | 8 | 9 | 10 | hive.exec.plan 11 | 12 | 13 | 14 | 15 | hive.query.results.cache.max.size 16 | 2147483648 17 | Maximum total size in bytes that the query results cache directory is allowed to use on the filesystem. 
18 | 19 | 20 | hive.query.results.cache.max.entry.size 21 | 10485760 22 | Maximum size in bytes that a single query result is allowed to use in the results cache directory 23 | 24 | 25 | hive.notification.event.poll.interval 26 | 60s 27 | 28 | Expects a time value with unit (d/day, h/hour, m/min, s/sec, ms/msec, us/usec, ns/nsec), which is sec if not specified. 29 | How often the notification log is polled for new NotificationEvents from the metastore.A nonpositive value means the notification log is never polled. 30 | 31 | 32 | 33 | hive.blobstore.supported.schemes 34 | s3,s3a,s3n 35 | Comma-separated list of supported blobstore schemes. 36 | 37 | 38 | 45 | 46 | 47 | javax.jdo.option.ConnectionURL 48 | jdbc:mysql://localhost:3306/hive?useSSL=false&createDatabaseIfNotExist=true 49 | JDBC connect string for a JDBC metastore 50 | 51 | 52 | javax.jdo.option.ConnectionDriverName 53 | com.mysql.jdbc.Driver 54 | 55 | 56 | javax.jdo.option.ConnectionUserName 57 | root 58 | username to use against metastore database 59 | 60 | 61 | javax.jdo.option.ConnectionPassword 62 | pwd 63 | 64 | 65 | 66 | hive.metastore.uris 67 | thrift://localhost:9083 68 | 69 | 70 | 71 | datanucleus.autoCreateSchema 72 | false 73 | 74 | 75 | 76 | fs.file.impl.disable.cache 77 | true 78 | 79 | 80 | fs.hdfs.impl.disable.cache 81 | true 82 | 83 | 84 | 85 | hive.server2.logging.operation.enabled 86 | true 87 | 88 | 89 | hive.server2.logging.operation.log.location 90 | /tmp/hive-metastore/operation_logs 91 | 92 | 93 | hive.server2.logging.operation.verbose 94 | true 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | hive.server2.allow.user.substitution 103 | true 104 | 105 | 106 | hive.server2.enable.doAs 107 | true 108 | 109 | 110 | 111 | hive.server2.thrift.port 112 | 10000 113 | 114 | 115 | 116 | hive.server2.thrift.http.port 117 | 10001 118 | 119 | 120 | 121 | 122 | hive.server2.in.place.progress 123 | false 124 | 125 | 126 | 127 | datanucleus.fixedDatastore 128 | true 129 | 130 | 131 | 132 | 
mapred.reduce.tasks 133 | -1 134 | 135 | 136 | 137 | mapred.max.split.size 138 | 256000000 139 | 140 | 141 | 142 | hive.metastore.connect.retries 143 | 15 144 | 145 | 146 | 147 | hive.optimize.sort.dynamic.partition 148 | true 149 | 150 | 151 | -------------------------------------------------------------------------------- /assets/nginx.conf: -------------------------------------------------------------------------------- 1 | upstream redash { 2 | server localhost:5000; 3 | } 4 | 5 | upstream superset { 6 | server localhost:8088; 7 | } 8 | 9 | upstream zeppelin { 10 | server localhost:9090; 11 | } 12 | 13 | server { 14 | listen 8500 ssl http2; 15 | listen [::]:8500 ssl http2; 16 | server_name _; 17 | 18 | add_header Strict-Transport-Security "max-age=31536000" always; 19 | 20 | ssl_session_cache shared:SSL:20m; 21 | ssl_session_timeout 10m; 22 | 23 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2; 24 | ssl_prefer_server_ciphers on; 25 | ssl_ciphers "ECDH+AESGCM:ECDH+AES256:ECDH+AES128:!ADH:!AECDH:!MD5;"; 26 | 27 | ssl_certificate /opt/certs/server.crt; 28 | ssl_certificate_key /opt/certs/server.key; 29 | 30 | error_page 497 https://$host:$server_port$request_uri; 31 | 32 | location / { 33 | proxy_set_header Host $http_host; 34 | proxy_set_header X-Real-IP $remote_addr; 35 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 36 | proxy_set_header X-Forwarded-Proto $scheme; 37 | 38 | proxy_pass http://redash; 39 | } 40 | } 41 | 42 | server { 43 | listen 8600 ssl http2; 44 | listen [::]:8600 ssl http2; 45 | server_name _; 46 | 47 | add_header Strict-Transport-Security "max-age=31536000" always; 48 | 49 | ssl_session_cache shared:SSL:20m; 50 | ssl_session_timeout 10m; 51 | 52 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2; 53 | ssl_prefer_server_ciphers on; 54 | ssl_ciphers "ECDH+AESGCM:ECDH+AES256:ECDH+AES128:!ADH:!AECDH:!MD5;"; 55 | 56 | ssl_certificate /opt/certs/server.crt; 57 | ssl_certificate_key /opt/certs/server.key; 58 | 59 | error_page 497 
https://$host:$server_port$request_uri; 60 | 61 | location / { 62 | proxy_set_header Host $http_host; 63 | proxy_set_header X-Real-IP $remote_addr; 64 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 65 | proxy_set_header X-Forwarded-Proto $scheme; 66 | 67 | proxy_pass http://superset; 68 | } 69 | } 70 | 71 | server { 72 | listen 8700 ssl http2; 73 | listen [::]:8700 ssl http2; 74 | server_name _; 75 | 76 | add_header Strict-Transport-Security "max-age=31536000" always; 77 | 78 | ssl_session_cache shared:SSL:20m; 79 | ssl_session_timeout 10m; 80 | 81 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2; 82 | ssl_prefer_server_ciphers on; 83 | ssl_ciphers "ECDH+AESGCM:ECDH+AES256:ECDH+AES128:!ADH:!AECDH:!MD5;"; 84 | 85 | ssl_certificate /opt/certs/server.crt; 86 | ssl_certificate_key /opt/certs/server.key; 87 | 88 | error_page 497 https://$host:$server_port$request_uri; 89 | 90 | location / { # For regular webserver support 91 | proxy_pass http://zeppelin; 92 | proxy_set_header X-Real-IP $remote_addr; 93 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 94 | proxy_set_header Host $http_host; 95 | proxy_set_header X-NginX-Proxy true; 96 | proxy_redirect off; 97 | } 98 | 99 | location /ws { # For websocket support 100 | proxy_pass http://zeppelin; 101 | proxy_http_version 1.1; 102 | proxy_set_header Upgrade websocket; 103 | proxy_set_header Connection upgrade; 104 | proxy_read_timeout 86400; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /assets/user_data.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1 5 | 6 | cat <<'EOF' >/etc/security/limits.d/100-trino-nofile.conf 7 | trino soft nofile 16384 8 | trino hard nofile 16384 9 | EOF 10 | 11 | /usr/bin/printf " 12 | node.environment=${environment_name} 13 | node.id=$(hostname) 14 | 
node.data-dir=/var/lib/trino/ 15 | " > /etc/trino/node.properties 16 | 17 | /usr/bin/printf "-server 18 | -Xmx${heap_size}G 19 | -XX:-UseBiasedLocking 20 | -XX:+UseG1GC 21 | -XX:G1HeapRegionSize=32M 22 | -XX:+ExplicitGCInvokesConcurrent 23 | -XX:+HeapDumpOnOutOfMemoryError 24 | -XX:+ExitOnOutOfMemoryError 25 | -XX:+UseGCOverheadLimit 26 | -XX:ReservedCodeCacheSize=512M 27 | -Djdk.attach.allowAttachSelf=true 28 | -Djdk.nio.maxCachedBufferSize=2000000 29 | -Duser.timezone=UTC 30 | " > /etc/trino/jvm.config 31 | 32 | function setup_hive_metastore { 33 | AV_ZONE="$(ec2metadata --availability-zone)" 34 | ENVIRONMENT_NAME="$(aws ec2 describe-tags --region "${aws_region}" --filters Name=resource-id,Values=$(ec2metadata --instance-id) | jq -r '.Tags[] | select(.Key == "Environment") | .Value')" 35 | echo "AV_ZONE: $AV_ZONE" 36 | echo "ENVIRONMENT_NAME: $ENVIRONMENT_NAME" 37 | 38 | while true; do 39 | UNATTACHED_VOLUME_ID="$(aws ec2 describe-volumes --region ${aws_region} --filters Name=tag:Environment,Values=$ENVIRONMENT_NAME Name=tag:PrestoCoordinator,Values=true Name=availability-zone,Values=$AV_ZONE | jq -r '.Volumes[] | select(.Attachments | length == 0) | .VolumeId' | shuf -n 1)" 40 | echo "UNATTACHED_VOLUME_ID: $UNATTACHED_VOLUME_ID" 41 | 42 | aws ec2 attach-volume --device "/dev/xvdh" --instance-id=$(ec2metadata --instance-id) --volume-id "$UNATTACHED_VOLUME_ID" --region ${aws_region} 43 | if [ "$?" != "0" ]; then 44 | sleep 10 45 | continue 46 | fi 47 | 48 | sleep 30 49 | 50 | ATTACHMENTS_COUNT="$(aws ec2 describe-volumes --region "${aws_region}" --filters Name=volume-id,Values="$UNATTACHED_VOLUME_ID" | jq -r '.Volumes[0].Attachments | length')" 51 | if [ "$ATTACHMENTS_COUNT" != "0" ]; then break; fi 52 | done 53 | 54 | echo 'Waiting for 30 seconds for the disk to become mountable...' 55 | sleep 30 56 | 57 | # Mount persistent storage and apply Hive Metastore schema if needed 58 | DEVICE_NAME=$(lsblk -ip | tail -n +2 | awk '{print $1 " " ($7? 
"MOUNTEDPART" : "") }' | sed ':a;N;$!ba;s/\n`/ /g' | grep -v MOUNTEDPART | sed -e 's/[[:space:]]*$//') 59 | MOUNT_PATH=/var/lib/mysql 60 | 61 | sudo mv $MOUNT_PATH /tmp/mysql.backup 62 | sudo mkdir -p $MOUNT_PATH 63 | 64 | if sudo mount -o defaults -t ext4 "$DEVICE_NAME" $MOUNT_PATH; then 65 | echo 'Successfully mounted existing disk' 66 | else 67 | echo 'Trying to mount a fresh disk' 68 | sudo mkfs.ext4 -m 0 -F -E lazy_itable_init=0,lazy_journal_init=0,discard "$DEVICE_NAME" 69 | sudo mount -o defaults -t ext4 "$DEVICE_NAME" $MOUNT_PATH && echo 'Successfully mounted a fresh disk' 70 | sudo cp -ar /tmp/mysql.backup/* $MOUNT_PATH/ 71 | fi 72 | 73 | sudo chown mysql:mysql -R $MOUNT_PATH 74 | sudo chmod 700 $MOUNT_PATH 75 | 76 | service mysql start 77 | systemctl enable mysql 78 | 79 | . /etc/environment 80 | export HADOOP_HOME=$HADOOP_HOME 81 | 82 | if ! "$HIVE_HOME"/bin/schematool -validate -dbType mysql; then 83 | echo "Mysql schema is not valid" 84 | "$HIVE_HOME"/bin/schematool -dbType mysql -initSchema 85 | fi 86 | 87 | echo "Initializing Hive Metastore ($HIVE_HOME)..." 88 | service hive-metastore start 89 | systemctl enable hive-metastore 90 | } 91 | 92 | # 93 | # Configure as COORDINATOR 94 | # 95 | if [[ "${mode_presto}" == "coordinator" ]]; then 96 | echo "Configuring node as a [${mode_presto}]..." 
97 | 98 | /usr/bin/printf " 99 | # 100 | # coordinator 101 | # 102 | coordinator=true 103 | discovery-server.enabled=true 104 | discovery.uri=http://localhost:${http_port} 105 | node-scheduler.include-coordinator=false 106 | 107 | http-server.http.port=${http_port} 108 | # query.max-memory-per-node has to be <= query.max-total-memory-per-node 109 | #query.max-memory-per-node=${query_max_memory_per_node}GB 110 | #query.max-total-memory-per-node=${query_max_total_memory_per_node}GB 111 | query.max-memory=${query_max_memory}GB 112 | # query.max-total-memory defaults to query.max-memory * 2 so we are good 113 | ${extra_worker_configs} 114 | " > /etc/trino/config.properties 115 | 116 | setup_hive_metastore 117 | fi 118 | 119 | # 120 | # Configure as WORKER 121 | # 122 | if [[ "${mode_presto}" == "worker" ]]; then 123 | echo "Configuring node as a [${mode_presto}]..." 124 | 125 | /usr/bin/printf " 126 | # 127 | # worker 128 | # 129 | coordinator=false 130 | discovery.uri=http://${address_presto_coordinator}:${http_port} 131 | node-scheduler.include-coordinator=false 132 | 133 | http-server.http.port=${http_port} 134 | # query.max-memory-per-node has to be <= query.max-total-memory-per-node 135 | #query.max-memory-per-node=${query_max_memory_per_node}GB 136 | #query.max-total-memory-per-node=${query_max_total_memory_per_node}GB 137 | query.max-memory=${query_max_memory}GB 138 | # query.max-total-memory defaults to query.max-memory * 2 so we are good 139 | ${extra_worker_configs} 140 | " > /etc/trino/config.properties 141 | fi 142 | 143 | # 144 | # Configure as BOTH coordinator and worker 145 | # 146 | if [[ "${mode_presto}" == "coordinator-worker" ]]; then 147 | echo "Configuring node as a [${mode_presto}]..." 
148 | 149 | /usr/bin/printf " 150 | # 151 | # coordinator-worker 152 | # 153 | coordinator=true 154 | discovery-server.enabled=true 155 | discovery.uri=http://localhost:${http_port} 156 | node-scheduler.include-coordinator=true 157 | 158 | http-server.http.port=${http_port} 159 | # query.max-memory-per-node has to be <= query.max-total-memory-per-node 160 | #query.max-memory-per-node=${query_max_memory_per_node}GB 161 | #query.max-total-memory-per-node=${query_max_total_memory_per_node}GB 162 | query.max-memory=${query_max_memory}GB 163 | # query.max-total-memory defaults to query.max-memory * 2 so we are good 164 | ${extra_worker_configs} 165 | " > /etc/trino/config.properties 166 | 167 | setup_hive_metastore 168 | fi 169 | 170 | if [[ "${mode_presto}" == "worker" ]]; then 171 | echo "Waiting for Presto Coordinator to come online at: http://${address_presto_coordinator}:${http_port}" 172 | while ! nc -z ${address_presto_coordinator} ${http_port}; do 173 | sleep 5 174 | done 175 | fi 176 | 177 | if [ ! -z "${aws_access_key_id}" ] && [ ! 
-z "${aws_secret_access_key}" ]; then 178 | # Update hive-site.xml 179 | /usr/bin/printf " 180 | 181 | fs.s3.impl 182 | org.apache.hadoop.fs.s3native.NativeS3FileSystem 183 | 184 | 185 | fs.s3.awsAccessKeyId 186 | ${aws_access_key_id} 187 | 188 | 189 | fs.s3.awsSecretAccessKey 190 | ${aws_secret_access_key} 191 | " > /tmp/hive-site-partial.txt 192 | sudo sed -i "s//$(sed 's@[/\&]@\\&@g;$!s/$/\\/' /tmp/hive-site-partial.txt)/g" /usr/local/apache-hive-*-bin/conf/hive-site.xml 193 | rm /tmp/hive-site-partial.txt 194 | 195 | # Update hive.properties 196 | /usr/bin/printf "\nhive.allow-drop-table=true" >> /etc/trino/catalog/hive.properties 197 | /usr/bin/printf "\nhive.non-managed-table-writes-enabled=true" >> /etc/trino/catalog/hive.properties 198 | /usr/bin/printf "\n#hive.time-zone=UTC" >> /etc/trino/catalog/hive.properties 199 | /usr/bin/printf "\nhive.s3.aws-access-key=${aws_access_key_id}" >> /etc/trino/catalog/hive.properties 200 | /usr/bin/printf "\nhive.s3.aws-secret-key=${aws_secret_access_key}" >> /etc/trino/catalog/hive.properties 201 | /usr/bin/printf "\n" >> /etc/trino/catalog/hive.properties 202 | /usr/bin/printf "\nhive.s3.aws-access-key=${aws_access_key_id}" >> /etc/trino/catalog/iceberg.properties 203 | /usr/bin/printf "\nhive.s3.aws-secret-key=${aws_secret_access_key}" >> /etc/trino/catalog/iceberg.properties 204 | /usr/bin/printf "\n" >> /etc/trino/catalog/iceberg.properties 205 | fi 206 | 207 | echo "Starting presto..." 208 | systemctl enable trino.service 209 | systemctl start trino.service 210 | 211 | if [[ "${mode_presto}" == "coordinator" ]] || [[ "${mode_presto}" == "coordinator-worker" ]]; then 212 | echo "Waiting for Presto Coordinator to start" 213 | while ! 
presto --execute='select * from system.runtime.nodes'; do 214 | sleep 10 215 | done 216 | echo "Presto Coordinator is now online" 217 | fi 218 | 219 | echo "Executing additional bootstrap scripts" 220 | 221 | %{ for script in additional_bootstrap_scripts ~} 222 | %{ if script.type == "s3" ~} 223 | if [ ! -z "${aws_access_key_id}" ]; then 224 | export AWS_ACCESS_KEY_ID=${aws_access_key_id} 225 | export AWS_SECRET_ACCESS_KEY=${aws_secret_access_key} 226 | fi 227 | aws s3 cp "${script.script_url}" "/tmp/${script.script_name}" 228 | %{ else ~} 229 | curl "${script.script_url}" -o "/tmp/${script.script_name}" 230 | %{ endif ~} 231 | chmod +x "/tmp/${script.script_name}" 232 | sh -c "/tmp/${script.script_name} %{ for param in script.params ~} ${param} %{ endfor ~}" 233 | %{ endfor ~} 234 | 235 | echo "Restarting Presto service" 236 | 237 | systemctl restart trino -------------------------------------------------------------------------------- /assets/zeppelin-interpreter.json: -------------------------------------------------------------------------------- 1 | { 2 | "id": "presto", 3 | "name": "presto", 4 | "group": "jdbc", 5 | "properties": { 6 | "default.url": { 7 | "name": "default.url", 8 | "value": "jdbc:trino://PRESTO_HOST", 9 | "type": "string" 10 | }, 11 | "default.driver": { 12 | "name": "default.driver", 13 | "value": "io.trino.jdbc.TrinoDriver", 14 | "type": "string" 15 | }, 16 | "default.user": { 17 | "name": "default.user", 18 | "value": "presto", 19 | "type": "string" 20 | } 21 | }, 22 | "status": "READY", 23 | "interpreterGroup": [ 24 | { 25 | "name": "sql", 26 | "class": "org.apache.zeppelin.jdbc.JDBCInterpreter", 27 | "defaultInterpreter": false, 28 | "editor": { 29 | "language": "sql", 30 | "editOnDblClick": false, 31 | "completionSupport": true 32 | } 33 | } 34 | ], 35 | "dependencies": [ 36 | { 37 | "groupArtifactVersion": "/opt/zeppelin/interpreter/jdbc/trino-jdbc-370.jar", 38 | "local": false 39 | } 40 | ], 41 | "option": { 42 | "remote": true, 43 | 
"port": -1, 44 | "perNote": "shared", 45 | "perUser": "shared", 46 | "isExistingProcess": false, 47 | "setPermission": false, 48 | "owners": [], 49 | "isUserImpersonate": false 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /assets/zeppelin-jdbc-0.11.0-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataBoutique/presto-cloud-deploy/fb8a3c45a9b1a1858b808fc7d2e09c2262e4e89b/assets/zeppelin-jdbc-0.11.0-SNAPSHOT.jar -------------------------------------------------------------------------------- /catalogs/blackhole.properties: -------------------------------------------------------------------------------- 1 | connector.name=blackhole 2 | -------------------------------------------------------------------------------- /catalogs/glue.properties: -------------------------------------------------------------------------------- 1 | connector.name=hive-hadoop2 2 | hive.metastore = glue 3 | hive.non-managed-table-writes-enabled = true 4 | #hive.metastore.glue.aws-access-key = 5 | #hive.metastore.glue.aws-secret-key = 6 | hive.metastore.glue.region = us-east-1 7 | hive.max-partitions-per-writers = 200 8 | 9 | -------------------------------------------------------------------------------- /catalogs/hive.properties: -------------------------------------------------------------------------------- 1 | connector.name=hive-hadoop2 2 | hive.metastore.uri=thrift://localhost:9083 3 | 4 | # For a full list of configuration options, see https://prestodb.io/docs/current/connector/hive.html 5 | 6 | # By default Presto / Hive assume data is mutable in table partitions 7 | #hive.immutable-partitions=false 8 | 9 | # The default security permissions don't allow to drop tables 10 | #hive.allow-drop-table=false 11 | 12 | # Theses settings control how often the catalog metadata is going to be updated 13 | hive.metastore-cache-ttl=30s 14 | 
hive.metastore-refresh-interval=10s 15 | 16 | # When deploying on AWS the default is to enable S3 access via instance credentials and control access via IAM roles 17 | # For non-AWS deployments, or when for other reasons you need to control S3 access via explicit credentials, you 18 | # can use the settings below. 19 | #hive.s3.use-instance-credentials=true 20 | #hive.s3.ssl.enabled=true 21 | #hive.s3.aws-access-key=AKIA... 22 | #hive.s3.aws-secret-key=.... 23 | #hive.s3.endpoint=... -------------------------------------------------------------------------------- /catalogs/iceberg.properties: -------------------------------------------------------------------------------- 1 | connector.name=iceberg 2 | iceberg.catalog.type=hive_metastore 3 | hive.metastore.uri=thrift://localhost:9083 4 | 5 | #iceberg.catalog.type=glue -------------------------------------------------------------------------------- /catalogs/jmx.properties: -------------------------------------------------------------------------------- 1 | connector.name=jmx -------------------------------------------------------------------------------- /catalogs/memory.properties: -------------------------------------------------------------------------------- 1 | # https://prestodb.io/docs/current/connector/memory.html 2 | connector.name=memory 3 | memory.max-data-per-node=2GB -------------------------------------------------------------------------------- /catalogs/mysql.properties: -------------------------------------------------------------------------------- 1 | connector.name=mysql 2 | connection-url=jdbc:mysql://localhost:3306 3 | connection-user=root 4 | connection-password=pwd -------------------------------------------------------------------------------- /catalogs/tpcds.properties: -------------------------------------------------------------------------------- 1 | connector.name=tpcds -------------------------------------------------------------------------------- /catalogs/tpch.properties: 
-------------------------------------------------------------------------------- 1 | connector.name=tpch -------------------------------------------------------------------------------- /packer/README.md: -------------------------------------------------------------------------------- 1 | # Presto machine images 2 | 3 | This Packer configuration will generate Ubuntu images with Presto and Presto CLI installed, for deploying and managing Presto clusters on the cloud. 4 | 5 | ## On Amazon Web Services (AWS) 6 | 7 | Using the AWS builder will create the two images and store them as AMIs. 8 | 9 | The base AMI is maintained by [Canonical](https://canonical.com/) and listed on 10 | the [Amazon EC2 AMI Locator](https://cloud-images.ubuntu.com/locator/ec2/). 11 | 12 | As a convention the Packer builders will use a dedicated IAM roles, which you will need to have present. 13 | 14 | ```bash 15 | aws iam create-role --role-name packer --assume-role-policy-document '{ 16 | "Version": "2012-10-17", 17 | "Statement": { 18 | "Effect": "Allow", 19 | "Principal": {"Service": "ec2.amazonaws.com"}, 20 | "Action": "sts:AssumeRole", 21 | "Sid": "" 22 | } 23 | }' 24 | ``` 25 | 26 | Response will look something like this: 27 | 28 | ```json 29 | { 30 | "Role": { 31 | "AssumeRolePolicyDocument": { 32 | "Version": "2012-10-17", 33 | "Statement": { 34 | "Action": "sts:AssumeRole", 35 | "Effect": "Allow", 36 | "Principal": { 37 | "Service": "ec2.amazonaws.com" 38 | } 39 | } 40 | }, 41 | "RoleId": "AROAJ7Q2L7NZJHZBB6JKY", 42 | "CreateDate": "2016-12-16T13:22:47.254Z", 43 | "RoleName": "packer", 44 | "Path": "/", 45 | "Arn": "arn:aws:iam::611111111117:role/packer" 46 | } 47 | } 48 | ``` 49 | 50 | Follow up by executing the following 51 | 52 | ```bash 53 | aws iam create-instance-profile --instance-profile-name packer 54 | aws iam add-role-to-instance-profile --instance-profile-name packer --role-name packer 55 | 56 | ``` 57 | 58 | ## On Microsoft Azure 59 | 60 | Before running Packer for the 
first time you will need to do a one-time initial setup. 61 | 62 | Use PowerShell, and login to AzureRm. See here for more details: https://docs.microsoft.com/en-us/powershell/azure/authenticate-azureps. Once logged in, take note of the subscription and tenant IDs which will be printed out. Alternatively, you can retrieve them by running `Get-AzureRmSubscription` once logged-in. 63 | 64 | ```Powershell 65 | $rgName = "packer-presto-images" 66 | $location = "East US" 67 | New-AzureRmResourceGroup -Name $rgName -Location $location 68 | $Password = ([char[]]([char]33..[char]95) + ([char[]]([char]97..[char]126)) + 0..9 | sort {Get-Random})[0..8] -join '' 69 | "Password: " + $Password 70 | $sp = New-AzureRmADServicePrincipal -DisplayName "Azure Packer IKF" -Password $Password 71 | New-AzureRmRoleAssignment -RoleDefinitionName Contributor -ServicePrincipalName $sp.ApplicationId 72 | $sp.ApplicationId 73 | ``` 74 | 75 | Note the resource group name, location, password, sp.ApplicationId as used in the script and emitted as output and update `variables.json`. 
76 | 77 | To learn more about using Packer on Azure see https://docs.microsoft.com/en-us/azure/virtual-machines/windows/build-image-with-packer 78 | 79 | Similarly, using the Azure CLI is going to look something like below: 80 | 81 | ```bash 82 | export rgName=packer-presto-images 83 | az group create -n ${rgName} -l eastus 84 | 85 | az ad sp create-for-rbac --query "{ client_id: appId, client_secret: password, tenant_id: tenant }" 86 | # outputs client_id, client_secret and tenant_id 87 | az account show --query "{ subscription_id: id }" 88 | # outputs subscription_id 89 | ``` 90 | 91 | ## Building 92 | 93 | Building the AMIs is done using the following commands: 94 | 95 | ```bash 96 | packer build -only=amazon-ebs -var-file=variables.json presto.json 97 | ``` 98 | 99 | Override the aws_region and aws_az variables to change the target region and 100 | availability zone, which default respectively to us-east-1 and us-east-1a. 101 | 102 | Replace the `-only` parameter to `azure-arm` to build images for Azure instead of AWS. 
103 | -------------------------------------------------------------------------------- /packer/presto.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Presto Image", 3 | "builders": [ 4 | { 5 | "type": "amazon-ebs", 6 | "ami_name": "presto-{{isotime | clean_resource_name}}", 7 | "availability_zone": "{{user `aws_az`}}", 8 | "iam_instance_profile": "packer", 9 | "instance_type": "t2.large", 10 | "region": "{{user `aws_region`}}", 11 | "run_tags": { 12 | "role": "packer" 13 | }, 14 | "source_ami_filter": { 15 | "filters": { 16 | "virtualization-type": "hvm", 17 | "name": "ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-*", 18 | "root-device-type": "ebs" 19 | }, 20 | "owners": ["099720109477"], 21 | "most_recent": true 22 | }, 23 | "ssh_timeout": "10m", 24 | "ssh_username": "ubuntu", 25 | "ssh_interface": "public_ip", 26 | "tags": { 27 | "ImageType": "presto-packer-image" 28 | }, 29 | 30 | "spot_price_auto_product": "Linux/UNIX (Amazon VPC)", 31 | "spot_price": "auto" 32 | }, 33 | { 34 | "type": "azure-arm", 35 | 36 | "client_id": "{{user `azure_client_id`}}", 37 | "client_secret": "{{user `azure_client_secret`}}", 38 | "tenant_id": "{{user `azure_tenant_id`}}", 39 | "subscription_id": "{{user `azure_subscription_id`}}", 40 | 41 | "managed_image_resource_group_name": "{{user `azure_resource_group_name`}}", 42 | "managed_image_name": "presto-{{isotime \"2006-01-02T030405\"}}", 43 | 44 | "os_type": "Linux", 45 | "image_publisher": "Canonical", 46 | "image_offer": "UbuntuServer", 47 | "image_sku": "18.04-LTS", 48 | 49 | "location": "{{user `azure_location`}}", 50 | "vm_size": "Standard_DS2_v2" 51 | }, 52 | { 53 | "type": "googlecompute", 54 | "account_file": "{{user `gcp_account_file`}}", 55 | "project_id": "{{user `gcp_project_id`}}", 56 | "source_image_family": "ubuntu-1804-lts", 57 | "zone": "{{user `gcp_zone`}}", 58 | "image_family": "presto", 59 | "image_name": "presto-{{isotime 
\"20060102t030405\"}}", 60 | "preemptible": true, 61 | "ssh_username": "ubuntu" 62 | } 63 | ], 64 | "provisioners": [ 65 | { 66 | "type": "shell", 67 | "script": "presto/update-machine.sh", 68 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'" 69 | }, 70 | { 71 | "type": "shell", 72 | "script": "presto/install-java.sh", 73 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'" 74 | }, 75 | { 76 | "type": "file", 77 | "source": "../assets/hive-site.xml", 78 | "destination": "hive-site.xml" 79 | }, 80 | { 81 | "type": "shell", 82 | "script": "presto/install-hive.sh", 83 | "environment_vars": [ "HIVE_VERSION={{user `hive_version`}}", "HADOOP_VERSION={{user `hadoop_version`}}" ], 84 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S bash '{{ .Path }}'" 85 | }, 86 | { 87 | "type": "file", 88 | "source": "../catalogs", 89 | "destination": "presto-catalogs" 90 | }, 91 | { 92 | "type": "shell", 93 | "script": "presto/install-trino.sh", 94 | "environment_vars": [ "PRESTO_VERSION={{user `presto_version`}}" ], 95 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'" 96 | }, 97 | { 98 | "type": "shell", 99 | "script": "presto/install-trino-cli.sh", 100 | "environment_vars": [ "PRESTO_VERSION={{user `presto_version`}}" ], 101 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'" 102 | } 103 | ] 104 | } 105 | -------------------------------------------------------------------------------- /packer/presto/install-hive.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | log() { 5 | echo "==> $(basename ${0}): ${1}" 6 | } 7 | 8 | export path_install="/usr/local/apache-hive-${HIVE_VERSION}-bin" 9 | export path_file="hive-${HIVE_VERSION}.tar.gz" 10 | export HIVE_HOME=${path_install} 11 | 12 | export path_hadoop="/usr/local/hadoop-${HADOOP_VERSION}" 13 | export path_hadoop_file="hadoop-${HADOOP_VERSION}.tar.gz" 14 | export 
HADOOP_HOME=${path_hadoop} 15 | 16 | log "Downloading Hadoop ${HADOOP_VERSION}..." 17 | wget -q -O ${path_hadoop_file} https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz 18 | tar -xzf ${path_hadoop_file} -C /usr/local/ 19 | rm ${path_hadoop_file} 20 | 21 | log "Downloading Hive ${HIVE_VERSION}..." 22 | wget -q -O ${path_file} https://archive.apache.org/dist/hive/hive-${HIVE_VERSION}/apache-hive-${HIVE_VERSION}-bin.tar.gz 23 | 24 | log "Installing Hive..." 25 | useradd -m hive || log "User [hive] already exists. Continuing..." 26 | 27 | install -d -o hive -g hive ${path_install} 28 | tar -xzf ${path_file} -C /usr/local/ 29 | mv hive-site.xml ${path_install}/conf/hive-site.xml 30 | ln -s /usr/share/java/mysql-connector-java.jar ${HIVE_HOME}/lib/mysql-connector-java.jar 31 | cp -n ${HADOOP_HOME}/share/hadoop/tools/lib/* ${HIVE_HOME}/lib/ 32 | echo "export JAVA_HOME=$JAVA8_HOME" >> ${path_install}/bin/hive-config.sh 33 | chown -R hive:hive ${path_install} 34 | rm ${path_file} 35 | echo "export PATH=\"\$PATH:${path_install}/bin\"" > /etc/profile.d/apache-hive.sh 36 | 37 | /usr/bin/printf " 38 | HADOOP_HOME=${path_hadoop} 39 | HIVE_HOME=${path_install}" >> /etc/environment 40 | 41 | install -d -o hive -g hive /tmp/hive 42 | ${HADOOP_HOME}/bin/hadoop fs -chmod -R 777 /tmp/hive/ 43 | 44 | log "Setup MySQL backend for Hive Metastore..." 
45 | sudo debconf-set-selections <<< 'mysql-server-5.6 mysql-server/root_password password pwd' 46 | sudo debconf-set-selections <<< 'mysql-server-5.6 mysql-server/root_password_again password pwd' 47 | DEBIAN_FRONTEND=noninteractive apt-get install -y -qq mysql-server libmysql-java 48 | 49 | # Disable the mysql service - we will only need it on the coordinator node 50 | systemctl disable mysql 51 | 52 | log "Installing the Hive Metastore service" 53 | /usr/bin/printf "[Unit] 54 | Description=Hive Metastore 55 | After=network-online.target 56 | [Service] 57 | User=root 58 | Restart=on-failure 59 | Type=simple 60 | Environment="HADOOP_HOME=${HADOOP_HOME}" "JAVA_HOME=${JAVA_8_HOME}" "HIVE_HOME=${HIVE_HOME}" 61 | ExecStart=${HIVE_HOME}/bin/hive --service metastore 62 | [Install] 63 | WantedBy=default.target 64 | " > /etc/systemd/system/hive-metastore.service -------------------------------------------------------------------------------- /packer/presto/install-java.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | log() { 5 | echo "==> $(basename ${0}): ${1}" 6 | } 7 | 8 | sudo apt-get update 9 | sudo apt-get install -y -qq openjdk-8-jdk openjdk-11-jdk default-jdk 10 | 11 | 12 | sudo update-java-alternatives --jre-headless --jre -s java-1.8.0-openjdk-amd64 13 | export JAVA8_HOME=$(jrunscript -e 'java.lang.System.out.println(java.lang.System.getProperty("java.home"));') 14 | 15 | sudo update-java-alternatives --jre-headless --jre -s java-1.11.0-openjdk-amd64 16 | export JAVA_HOME=$(jrunscript -e 'java.lang.System.out.println(java.lang.System.getProperty("java.home"));') 17 | 18 | /usr/bin/printf " 19 | JAVA8_HOME=${JAVA8_HOME} 20 | JAVA_HOME=${JAVA_HOME}" >> /etc/environment -------------------------------------------------------------------------------- /packer/presto/install-presto-cli.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 
set -e 3 | 4 | log() { 5 | echo "==> $(basename ${0}): ${1}" 6 | } 7 | 8 | export version_presto=${PRESTO_VERSION} 9 | export path_install="/usr/local/bin" 10 | export path_file="presto-cli-${version_presto}-executable.jar" 11 | 12 | log "Downloading Presto CLI ${version_presto}..." 13 | 14 | wget -q -O ${path_file} "https://repo1.maven.org/maven2/io/prestosql/presto-cli/${version_presto}/presto-cli-${version_presto}-executable.jar" 15 | 16 | log "Installing Presto CLI ${version_presto}..." 17 | 18 | install -d -o presto -g presto ${path_install} 19 | mv ${path_file} ${path_install}/presto 20 | chmod +x ${path_install}/presto 21 | -------------------------------------------------------------------------------- /packer/presto/install-presto.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -e 3 | 4 | log() { 5 | echo "==> $(basename ${0}): ${1}" 6 | } 7 | 8 | export version_presto=${PRESTO_VERSION} 9 | export path_install="/usr/local/presto-server-${version_presto}" 10 | export path_file="presto-server-${version_presto}.tar.gz" 11 | export pid_file="/var/run/presto/presto.pid" 12 | export user_presto='presto' 13 | 14 | log "Downloading Presto ${version_presto}..." 15 | 16 | wget -q -O "${path_file}" "https://repo1.maven.org/maven2/io/prestosql/presto-server/${version_presto}/presto-server-${version_presto}.tar.gz" 17 | 18 | log "Installing Presto ${version_presto}..." 19 | useradd ${user_presto} || log "User [${user_presto}] already exists. Continuing..." 
20 | 21 | install -d -o ${user_presto} -g ${user_presto} "${path_install}" 22 | tar -xzf "${path_file}" -C /usr/local/ 23 | install -d -o ${user_presto} -g ${user_presto} /etc/presto/ 24 | install -d -o ${user_presto} -g ${user_presto} /etc/presto/catalog 25 | install -d -o ${user_presto} -g ${user_presto} /var/lib/presto/ # this is the data dir 26 | install -d -o ${user_presto} -g ${user_presto} /var/log/presto/ 27 | mv ./presto-catalogs/* /etc/presto/catalog/ 28 | rm -rf ./presto-catalogs 29 | rm -rf "$path_install/etc" 30 | ln -s /etc/presto/ "$path_install/etc" 31 | 32 | log "Adding PRESTO_HOME to system profile" 33 | /usr/bin/printf "export PRESTO_HOME=\"${path_install}\"" >> /etc/profile.d/presto.sh 34 | 35 | 36 | 37 | /usr/bin/printf "PRESTO_OPTS= \ 38 | --pid-file=${pid_file} \ 39 | --node-config=/etc/presto/node.properties \ 40 | --jvm-config=/etc/presto/jvm.config \ 41 | --config=/etc/presto/config.properties \ 42 | --launcher-log-file=/var/log/presto/launcher.log \ 43 | --server-log-file=/var/log/presto/server.log \ 44 | -Dhttp-server.log.path=/var/log/presto/http-request.log \ 45 | -Dcatalog.config-dir=/etc/presto/catalog 46 | [Install] 47 | WantedBy=default.target 48 | " >> /etc/default/presto 49 | chown ${user_presto}:${user_presto} /etc/default/presto 50 | 51 | log "Installing the Presto service" 52 | /usr/bin/printf " 53 | [Unit] 54 | Description=Presto Server 55 | Documentation=https://trino.io/docs/current/index.html 56 | After=network-online.target 57 | [Service] 58 | User=${user_presto} 59 | Restart=on-failure 60 | Type=forking 61 | PIDFile=${pid_file} 62 | RuntimeDirectory=presto 63 | EnvironmentFile=/etc/default/presto 64 | ExecStart=${path_install}/bin/launcher start \$PRESTO_OPTS 65 | ExecStop=${path_install}/bin/launcher stop \$PRESTO_OPTS 66 | [Install] 67 | WantedBy=default.target 68 | " > /etc/systemd/system/presto.service 69 | 70 | systemctl daemon-reload 71 | 72 | rm "${path_file}" 73 | 
-------------------------------------------------------------------------------- /packer/presto/install-trino-cli.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | log() { 5 | echo "==> $(basename ${0}): ${1}" 6 | } 7 | 8 | export version_trino=${PRESTO_VERSION} 9 | export path_install="/usr/local/bin" 10 | export path_file="trino-cli-${version_trino}-executable.jar" 11 | 12 | log "Downloading Presto CLI ${version_trino}..." 13 | 14 | wget -q -O ${path_file} "https://repo1.maven.org/maven2/io/trino/trino-cli/${version_trino}/trino-cli-${version_trino}-executable.jar" 15 | 16 | log "Installing Presto CLI ${version_trino}..." 17 | 18 | install -d -o trino -g trino ${path_install} 19 | mv ${path_file} ${path_install}/trino 20 | chmod +x ${path_install}/trino 21 | ln -s ${path_install}/trino ${path_install}/presto 22 | -------------------------------------------------------------------------------- /packer/presto/install-trino.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -ex 3 | 4 | log() { 5 | echo "==> $(basename ${0}): ${1}" 6 | } 7 | 8 | export version_trino=${PRESTO_VERSION} 9 | export path_install="/usr/local/trino-server-${version_trino}" 10 | export path_file="trino-server-${version_trino}.tar.gz" 11 | export pid_file="/var/run/trino/trino.pid" 12 | export user_trino='trino' 13 | 14 | log "Downloading Presto ${version_trino}..." 15 | 16 | wget -q -O "${path_file}" "https://repo1.maven.org/maven2/io/trino/trino-server/${version_trino}/trino-server-${version_trino}.tar.gz" 17 | 18 | log "Installing Presto / Trino ${version_trino}..." 19 | useradd ${user_trino} || log "User [${user_trino}] already exists. Continuing..." 
20 | 21 | install -d -o ${user_trino} -g ${user_trino} "${path_install}" 22 | tar -xzf "${path_file}" -C /usr/local/ 23 | install -d -o ${user_trino} -g ${user_trino} /etc/trino/ 24 | install -d -o ${user_trino} -g ${user_trino} /etc/trino/catalog 25 | install -d -o ${user_trino} -g ${user_trino} /var/lib/trino/ # this is the data dir 26 | install -d -o ${user_trino} -g ${user_trino} /var/log/trino/ 27 | mv ./presto-catalogs/* /etc/trino/catalog/ 28 | rm -rf ./presto-catalogs 29 | rm -rf "$path_install/etc" 30 | ln -s /etc/trino/ "$path_install/etc" 31 | 32 | log "Adding TRINO_HOME to system profile" 33 | /usr/bin/printf "export TRINO_HOME=\"${path_install}\"" >> /etc/profile.d/trino.sh 34 | 35 | 36 | 37 | /usr/bin/printf "TRINO_OPTS= \ 38 | --pid-file=${pid_file} \ 39 | --node-config=/etc/trino/node.properties \ 40 | --jvm-config=/etc/trino/jvm.config \ 41 | --config=/etc/trino/config.properties \ 42 | --launcher-log-file=/var/log/trino/launcher.log \ 43 | --server-log-file=/var/log/trino/server.log \ 44 | -Dhttp-server.log.path=/var/log/trino/http-request.log \ 45 | -Dcatalog.config-dir=/etc/trino/catalog 46 | [Install] 47 | WantedBy=default.target 48 | " >> /etc/default/trino 49 | chown ${user_trino}:${user_trino} /etc/default/trino 50 | 51 | log "Installing the Presto service" 52 | /usr/bin/printf " 53 | [Unit] 54 | Description=Presto Server 55 | Documentation=https://trino.io/docs/current/index.html 56 | After=network-online.target 57 | [Service] 58 | User=${user_trino} 59 | Restart=on-failure 60 | Type=forking 61 | PIDFile=${pid_file} 62 | RuntimeDirectory=trino 63 | EnvironmentFile=/etc/default/trino 64 | ExecStart=${path_install}/bin/launcher start \$TRINO_OPTS 65 | ExecStop=${path_install}/bin/launcher stop \$TRINO_OPTS 66 | [Install] 67 | WantedBy=default.target 68 | " > /etc/systemd/system/trino.service 69 | 70 | systemctl daemon-reload 71 | 72 | rm "${path_file}" 73 | -------------------------------------------------------------------------------- 
/packer/presto/update-machine.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | log() { 5 | echo "==> $(basename ${0}): ${1}" 6 | } 7 | 8 | export DEBIAN_FRONTEND=noninteractive 9 | 10 | TZ=Etc/UTC 11 | ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 12 | 13 | log "Updating package index..." 14 | sudo -E apt-get update -y -qq 15 | 16 | log "Upgrading existing packages" 17 | sudo -E apt-get upgrade -y -qq 18 | 19 | log "Updating package index..." 20 | sudo -E apt-get update -y -qq 21 | 22 | log "Installing prerequisites..." 23 | sudo -E apt-get install -y -qq --no-install-recommends \ 24 | wget software-properties-common htop apt-transport-https python3 jq awscli vim 25 | 26 | sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10 27 | 28 | # Disable daily apt unattended updates. 29 | echo 'APT::Periodic::Enable "0";' >> /etc/apt/apt.conf.d/10periodic -------------------------------------------------------------------------------- /packer/prestoclients.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "Presto clients image", 3 | "builders": [ 4 | { 5 | "type": "amazon-ebs", 6 | "ami_name": "prestoclients-{{isotime | clean_resource_name}}", 7 | "availability_zone": "{{user `aws_az`}}", 8 | "iam_instance_profile": "packer", 9 | "instance_type": "t2.medium", 10 | "region": "{{user `aws_region`}}", 11 | "run_tags": { 12 | "role": "packer" 13 | }, 14 | "source_ami_filter": { 15 | "filters": { 16 | "virtualization-type": "hvm", 17 | "name": "presto-*", 18 | "root-device-type": "ebs" 19 | }, 20 | "owners": ["self"], 21 | "most_recent": true 22 | }, 23 | "launch_block_device_mappings": [ 24 | { 25 | "device_name": "/dev/sda1", 26 | "volume_size": 15, 27 | "volume_type": "standard", 28 | "delete_on_termination": true 29 | } 30 | ], 31 | "ssh_timeout": "10m", 32 | "ssh_username": "ubuntu", 33 | 
"ssh_interface": "public_ip", 34 | "tags": { 35 | "ImageType": "prestoclients-packer-image" 36 | }, 37 | "spot_price_auto_product": "Linux/UNIX (Amazon VPC)", 38 | "spot_price": "auto" 39 | } 40 | ], 41 | "provisioners": [ 42 | { 43 | "type": "file", 44 | "source": "../assets/nginx.conf", 45 | "destination": "/tmp/clients-nginx.conf" 46 | }, 47 | { 48 | "type": "file", 49 | "source": "../assets/zeppelin-interpreter.json", 50 | "destination": "/tmp/zeppelin-interpreter-partial.json" 51 | }, 52 | { 53 | "type": "file", 54 | "source": "../assets/zeppelin-jdbc-0.11.0-SNAPSHOT.jar", 55 | "destination": "/tmp/zeppelin-jdbc-0.11.0-SNAPSHOT.jar" 56 | }, 57 | { 58 | "type": "shell", 59 | "script": "prestoclients/update-machine.sh", 60 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'" 61 | }, 62 | { 63 | "type": "shell", 64 | "script": "prestoclients/install-redash.sh", 65 | "environment_vars": [ "REDASH_VERSION={{user `redash_version`}}" ], 66 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'" 67 | }, 68 | { 69 | "type": "shell", 70 | "script": "prestoclients/install-superset.sh", 71 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'" 72 | }, 73 | { 74 | "type": "shell", 75 | "script": "prestoclients/install-zeppelin.sh", 76 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'" 77 | } 78 | ] 79 | } 80 | -------------------------------------------------------------------------------- /packer/prestoclients/install-redash.sh: -------------------------------------------------------------------------------- 1 | git clone https://github.com/getredash/setup /tmp/redash 2 | cd /tmp/redash/ 3 | 4 | #export REDASH_BRANCH="v$REDASH_VERSION" 5 | sed 's/\$LATEST_VERSION/10.1.0.b50633/g' setup.sh > setup2.sh 6 | mv setup2.sh setup.sh 7 | bash ./setup.sh 8 | 9 | cd /opt/redash 10 | docker-compose down 11 | sed -i '/^.*nginx:$/,$d' docker-compose.yml # patch out nginx service 12 | docker-compose up -d 13 | 
-------------------------------------------------------------------------------- /packer/prestoclients/install-superset.sh: -------------------------------------------------------------------------------- 1 | SUPERSET_CONFIG_PATH="/opt/superset/config" 2 | 3 | git clone https://github.com/apache/superset.git /opt/superset 4 | 5 | sudo mkdir -p $SUPERSET_CONFIG_PATH 6 | 7 | cat <<'EOF' >$SUPERSET_CONFIG_PATH/presto-datasource.yaml 8 | databases: 9 | - database_name: trino 10 | expose_in_sqllab: true 11 | extra: "{\r\n \"metadata_params\": {},\r\n \"engine_params\": {},\r\n \"\ 12 | metadata_cache_timeout\": {},\r\n \"schemas_allowed_for_csv_upload\": []\r\n\ 13 | }\r\n" 14 | sqlalchemy_uri: trino://trino@PRESTO_COORDINATOR_HOST:8080 15 | tables: [] 16 | EOF 17 | 18 | cat <<'EOF' >$SUPERSET_CONFIG_PATH/superset_config.py 19 | ENABLE_PROXY_FIX = True 20 | PREFERRED_URL_SCHEME = 'https' 21 | EOF 22 | 23 | cd /opt/superset 24 | docker-compose -f docker-compose-non-dev.yml pull 25 | -------------------------------------------------------------------------------- /packer/prestoclients/install-zeppelin.sh: -------------------------------------------------------------------------------- 1 | cd /tmp 2 | wget --no-verbose https://www-eu.apache.org/dist/zeppelin/zeppelin-0.10.0/zeppelin-0.10.0-bin-all.tgz 3 | sudo tar xf zeppelin-*-bin-all.tgz -C /opt 4 | rm zeppelin-0.10.0-bin-all.tgz 5 | sudo mv /opt/zeppelin-*-bin-all /opt/zeppelin 6 | sudo cp zeppelin-interpreter-partial.json /opt/zeppelin/conf/zeppelin-interpreter-partial.json 7 | 8 | sudo cp zeppelin-jdbc-0.11.0-SNAPSHOT.jar /opt/zeppelin/interpreter/jdbc/zeppelin-jdbc-0.10.0.jar 9 | # trino support for 358 and above 10 | # https://issues.apache.org/jira/browse/ZEPPELIN-5551 11 | sudo wget --no-verbose https://repo1.maven.org/maven2/io/trino/trino-jdbc/370/trino-jdbc-370.jar -P /opt/zeppelin/interpreter/jdbc 12 | 13 | 14 | sudo useradd -d /opt/zeppelin -s /bin/false zeppelin 15 | 16 | /usr/bin/printf " 17 | export 
JAVA_HOME=$JAVA8_HOME 18 | export ZEPPELIN_PORT=9090 19 | " >> /opt/zeppelin/conf/zeppelin-env.sh 20 | 21 | /usr/bin/printf " 22 | [Unit] 23 | Description=Zeppelin service 24 | After=syslog.target network.target 25 | 26 | [Service] 27 | Type=forking 28 | ExecStart=/opt/zeppelin/bin/zeppelin-daemon.sh start 29 | ExecStop=/opt/zeppelin/bin/zeppelin-daemon.sh stop 30 | ExecReload=/opt/zeppelin/bin/zeppelin-daemon.sh reload 31 | User=zeppelin 32 | Group=zeppelin 33 | Restart=always 34 | 35 | [Install] 36 | WantedBy=multi-user.target 37 | " > /etc/systemd/system/zeppelin.service 38 | 39 | sudo chown -R zeppelin:zeppelin /opt/zeppelin 40 | sudo systemctl enable zeppelin 41 | -------------------------------------------------------------------------------- /packer/prestoclients/update-machine.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | log() { 5 | echo "==> $(basename ${0}): ${1}" 6 | } 7 | 8 | export DEBIAN_FRONTEND=noninteractive 9 | 10 | log "Updating package index..." 11 | sudo -E apt-get update -qq 12 | 13 | log "Upgrading existing packages" 14 | sudo -E apt-get upgrade -y 15 | 16 | log "Installing prerequisites..." 
17 | sudo -E apt-get install -y -qq --no-install-recommends \ 18 | build-essential libssl-dev libffi-dev \ 19 | python-dev python3.6-dev python3-pip python3-venv \ 20 | libsasl2-dev libldap2-dev \ 21 | nginx jq xmlstarlet 22 | 23 | log "Generating temporary certificates" 24 | mkdir -p /opt/certs 25 | cd /opt/certs 26 | openssl genrsa -des3 -passout pass:xxxx -out keypair 2048 27 | openssl rsa -passin pass:xxxx -in keypair -out server.key 28 | rm keypair 29 | touch /home/ubuntu/.rnd 30 | openssl req -new -key server.key -out server.csr -subj "/CN=*" 31 | openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt 32 | rm server.csr 33 | cd - 34 | 35 | systemctl enable nginx.service 36 | systemctl stop nginx.service 37 | 38 | sudo mkdir -p /etc/nginx/conf.d 39 | sudo mv /tmp/clients-nginx.conf /etc/nginx/conf.d/clients.conf -------------------------------------------------------------------------------- /packer/variables.json: -------------------------------------------------------------------------------- 1 | { 2 | "presto_version": "375", 3 | "hive_version": "2.3.9", 4 | "hadoop_version": "2.10.1", 5 | "redash_version": "10.0.x", 6 | 7 | "aws_region": "us-east-1", 8 | "aws_az": "us-east-1a", 9 | 10 | "azure_client_id": "", 11 | "azure_client_secret": "", 12 | "azure_subscription_id": "", 13 | "azure_tenant_id": "", 14 | 15 | "gcp_zone": "us-central1-a", 16 | "gcp_project_id": "my-project", 17 | "gcp_account_file": ".gcp_account.json", 18 | 19 | "azure_location": "East US", 20 | "azure_resource_group_name": "packer-presto-images" 21 | } 22 | -------------------------------------------------------------------------------- /terraform-aws/README.md: -------------------------------------------------------------------------------- 1 | # AWS deployment 2 | 3 | ## Create the AMIs with Packer 4 | 5 | Go to the packer folder and see the README there. 
Once you have generated an
Redash, applications, etc) can access the coordinator for querying. 38 | * `count_clients` - number of client nodes with Redash and Apache Superset installed, with configured admin user and datasource pointing to the Presto cluster. Default is `0`. 39 | * `clients_lb_subnets` - list of subnet IDs to attach to the clients load balancer. At least two subnets from different availability zones must be provided. 40 | 41 | We recommend using `tfvars` file to override all variables and configurations, 42 | see https://www.terraform.io/intro/getting-started/variables.html#from-a-file 43 | for more details. 44 | 45 | You must create at least one client to generate the credentials to access the Presto UI. 46 | 47 | You can launch workers and spot-workers (workers which run on spot-instances). 48 | 49 | There are some more configurations to notice (like machine sizes, memory allocation, etc) which we will document soon 50 | 51 | ### Cluster topology 52 | 53 | Two modes of deployment are supported: 54 | 55 | * Production deployment with a single coordinator node and a bunch of worker nodes (number of workers is configurable) 56 | * Single node mode - one node acting as both coordinator and worker 57 | 58 | ## Launch the cluster with Terraform 59 | 60 | On first usage, you will need to execute `terraform init` to initialize the terraform providers used. 61 | 62 | To deploy the cluster, or apply any changes to an existing cluster deployed using this project, run: 63 | 64 | ```bash 65 | terraform plan 66 | terraform apply 67 | ``` 68 | 69 | When terraform is done, you should see a lot of output ending with something like this: 70 | 71 | ``` 72 | Apply complete! Resources: 11 added, 0 changed, 0 destroyed. 73 | 74 | The state of your infrastructure has been saved to the path 75 | below. This state is required to modify and destroy your 76 | infrastructure, so keep it safe. To inspect the complete state 77 | use the `terraform show` command. 
78 | 79 | State path: terraform.tfstate 80 | 81 | Outputs: 82 | 83 | clients-admin-password = [ 84 | "********", 85 | ] 86 | clients-lb-dns = [ 87 | "example-presto-client-lb-1234567890.eu-west-1.elb.amazonaws.com", 88 | ] 89 | coordinator-lb-dns = [ 90 | "example-presto-lb-1234567890.eu-west-1.elb.amazonaws.com", 91 | ] 92 | ``` 93 | 94 | Note `coordinator-lb-dns` - that's your entry point to the Presto cluster. All 95 | queries should go to that URL, and the Presto UI accessible at that address as 96 | well (port 8080). 97 | 98 | To enter the UI you pass the `clients-admin-password` as the user name and don't 99 | set a password. 100 | 101 | ### Look around 102 | 103 | You can pull the list of instances by their state and role using aws-cli: 104 | 105 | ```bash 106 | aws ec2 describe-instances --filters Name=instance-state-name,Values=running 107 | aws ec2 describe-instances --filters Name=instance-state-name,Values=running,Name=tag:Role,Values=client 108 | ``` 109 | 110 | To login to one of the instances: 111 | 112 | ```bash 113 | ssh -i presto.pem ubuntu@{public IP / DNS of the instance} 114 | ``` 115 | -------------------------------------------------------------------------------- /terraform-aws/clients.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "client-userdata-script" { 2 | count = var.count_clients != "0" ? 1 : 0 3 | template = file("${path.module}/../assets/client_user_data.sh") 4 | 5 | vars = { 6 | presto_coordinator_host = aws_elb.coordinator-lb.dns_name 7 | coordinator_port = var.http_port 8 | admin_password = var.count_clients != "0" ? random_string.clients-admin-password[0].result : "" 9 | cert_pem = tls_self_signed_cert.presto-clients-cert.cert_pem 10 | key_pem = tls_private_key.presto-clients-private-key.private_key_pem 11 | } 12 | } 13 | 14 | resource "random_string" "clients-admin-password" { 15 | count = var.count_clients != "0" ? 
1 : 0 16 | length = 16 17 | special = false 18 | } 19 | 20 | resource "tls_private_key" "presto-clients-private-key" { 21 | algorithm = "ECDSA" 22 | ecdsa_curve = "P384" 23 | } 24 | 25 | resource "tls_self_signed_cert" "presto-clients-cert" { 26 | key_algorithm = "ECDSA" 27 | private_key_pem = tls_private_key.presto-clients-private-key.private_key_pem 28 | 29 | subject { 30 | common_name = "*" 31 | } 32 | 33 | validity_period_hours = 48 34 | 35 | allowed_uses = [ 36 | "key_encipherment", 37 | "digital_signature", 38 | "server_auth", 39 | ] 40 | } 41 | 42 | resource "aws_iam_server_certificate" "presto-clients-cert" { 43 | name_prefix = "presto-clients-cert" 44 | certificate_body = tls_self_signed_cert.presto-clients-cert.cert_pem 45 | private_key = tls_private_key.presto-clients-private-key.private_key_pem 46 | 47 | lifecycle { 48 | create_before_destroy = true 49 | } 50 | } 51 | 52 | # Redash LB configuration 53 | resource "aws_lb_target_group" "redash-https-clients" { 54 | name = "redash-https-clients-tg" 55 | port = "8500" 56 | protocol = "HTTPS" 57 | vpc_id = data.aws_subnet.main_subnet.vpc_id 58 | 59 | stickiness { 60 | type = "lb_cookie" 61 | } 62 | 63 | health_check { 64 | protocol = "HTTPS" 65 | matcher = "302" 66 | } 67 | } 68 | 69 | resource "aws_lb_listener" "redash-https-clients" { 70 | count = var.count_clients != "0" ? 
1 : 0 71 | load_balancer_arn = aws_lb.clients-lb[0].arn 72 | port = "8500" 73 | protocol = "HTTPS" 74 | ssl_policy = "ELBSecurityPolicy-2016-08" 75 | certificate_arn = aws_iam_server_certificate.presto-clients-cert.arn 76 | 77 | default_action { 78 | type = "forward" 79 | target_group_arn = aws_lb_target_group.redash-https-clients.arn 80 | } 81 | } 82 | 83 | # Superset LB configuration 84 | resource "aws_lb_target_group" "superset-https-clients" { 85 | name = "superset-https-clients-tg" 86 | port = "8600" 87 | protocol = "HTTPS" 88 | vpc_id = data.aws_subnet.main_subnet.vpc_id 89 | 90 | stickiness { 91 | type = "lb_cookie" 92 | } 93 | 94 | health_check { 95 | path = "/health" 96 | protocol = "HTTPS" 97 | } 98 | } 99 | 100 | resource "aws_lb_listener" "superset-https-clients" { 101 | count = var.count_clients != "0" ? 1 : 0 102 | load_balancer_arn = aws_lb.clients-lb[0].arn 103 | port = "8600" 104 | protocol = "HTTPS" 105 | ssl_policy = "ELBSecurityPolicy-2016-08" 106 | certificate_arn = aws_iam_server_certificate.presto-clients-cert.arn 107 | 108 | default_action { 109 | type = "forward" 110 | target_group_arn = aws_lb_target_group.superset-https-clients.arn 111 | } 112 | } 113 | 114 | # Zeppelin LB configuration 115 | resource "aws_lb_target_group" "zeppelin-https-clients" { 116 | name = "zeppelin-https-clients-tg" 117 | port = "8700" 118 | protocol = "HTTPS" 119 | vpc_id = data.aws_subnet.main_subnet.vpc_id 120 | 121 | stickiness { 122 | type = "lb_cookie" 123 | } 124 | } 125 | 126 | resource "aws_lb_listener" "zeppelin-https-clients" { 127 | count = var.count_clients != "0" ? 
1 : 0 128 | load_balancer_arn = aws_lb.clients-lb[0].arn 129 | port = "8700" 130 | protocol = "HTTPS" 131 | ssl_policy = "ELBSecurityPolicy-2016-08" 132 | certificate_arn = aws_iam_server_certificate.presto-clients-cert.arn 133 | 134 | default_action { 135 | type = "forward" 136 | target_group_arn = aws_lb_target_group.zeppelin-https-clients.arn 137 | } 138 | } 139 | 140 | resource "aws_lb_listener_rule" "zeppelin-https-clients-websockets-rule" { 141 | count = var.count_clients != "0" ? 1 : 0 142 | listener_arn = aws_lb_listener.zeppelin-https-clients[0].arn 143 | priority = 99 144 | 145 | action { 146 | type = "forward" 147 | target_group_arn = aws_lb_target_group.zeppelin-https-clients.arn 148 | } 149 | 150 | condition { 151 | path_pattern { 152 | values = ["/ws"] 153 | } 154 | } 155 | } 156 | 157 | # Clients ALB 158 | resource "aws_lb" "clients-lb" { 159 | count = var.count_clients != "0" ? 1 : 0 160 | load_balancer_type = "application" 161 | internal = "false" 162 | name = format("%s-presto-client-lb", var.environment_name) 163 | security_groups = concat( 164 | [aws_security_group.presto-clients.id], 165 | var.additional_security_groups, 166 | ) 167 | 168 | subnets = [for s in data.aws_subnet.subnets : s.id] 169 | 170 | idle_timeout = 400 171 | 172 | tags = { 173 | Name = format("%s-presto-client-lb", var.environment_name) 174 | } 175 | } 176 | 177 | resource "aws_launch_configuration" "clients" { 178 | count = var.count_clients != "0" ? 1 : 0 179 | name_prefix = "presto-${var.environment_name}-client" 180 | image_id = data.aws_ami.presto-clients.id 181 | instance_type = var.client_instance_type 182 | security_groups = [aws_security_group.presto-clients.id] 183 | user_data = data.template_file.client-userdata-script[0].rendered 184 | key_name = var.key_name 185 | associate_public_ip_address = false 186 | spot_price = var.clients_use_spot == "true" ? 
var.client_spot_hourly_price : "" 187 | 188 | root_block_device { 189 | volume_size = 15 # GB 190 | } 191 | 192 | lifecycle { 193 | create_before_destroy = true 194 | } 195 | } 196 | 197 | resource "aws_autoscaling_group" "clients" { 198 | count = var.count_clients != "0" ? 1 : 0 199 | name = "presto-${var.environment_name}-client" 200 | min_size = "0" 201 | max_size = "999" 202 | desired_capacity = var.count_clients 203 | launch_configuration = aws_launch_configuration.clients[0].id 204 | vpc_zone_identifier = [for s in data.aws_subnet.subnets : s.id] 205 | target_group_arns = [ 206 | aws_lb_target_group.redash-https-clients.arn, 207 | aws_lb_target_group.superset-https-clients.arn, 208 | aws_lb_target_group.zeppelin-https-clients.arn, 209 | ] 210 | 211 | tag { 212 | key = "Name" 213 | value = format("presto-%s-client", var.environment_name) 214 | propagate_at_launch = true 215 | } 216 | tag { 217 | key = "Environment" 218 | value = var.environment_name 219 | propagate_at_launch = true 220 | } 221 | tag { 222 | key = "Role" 223 | value = "worker" 224 | propagate_at_launch = true 225 | } 226 | tag { 227 | key = "Spot" 228 | value = var.clients_use_spot 229 | propagate_at_launch = true 230 | } 231 | 232 | lifecycle { 233 | create_before_destroy = true 234 | } 235 | } 236 | 237 | -------------------------------------------------------------------------------- /terraform-aws/coordinator.tf: -------------------------------------------------------------------------------- 1 | data "template_file" "coordinator-userdata-script" { 2 | template = templatefile("${path.module}/../assets/user_data.sh", { 3 | cloud_provider = "aws" 4 | environment_name = var.environment_name 5 | aws_region = var.aws_region 6 | http_port = var.http_port 7 | mode_presto = var.count_workers == "0" && var.count_workers_spot == "0" ? 
"coordinator-worker" : "coordinator" 8 | heap_size = var.coordinator_heap_size 9 | query_max_memory_per_node = ceil(var.worker_heap_size * 0.4) 10 | query_max_total_memory_per_node = ceil(var.worker_heap_size * 0.6) 11 | query_max_memory = var.query_max_memory 12 | security_groups = aws_security_group.presto.id 13 | aws_access_key_id = var.aws_access_key_id 14 | aws_secret_access_key = var.aws_secret_access_key 15 | address_presto_coordinator = "" 16 | extra_worker_configs = var.extra_worker_configs 17 | additional_bootstrap_scripts = var.additional_bootstrap_scripts 18 | 19 | }) 20 | } 21 | 22 | resource "aws_launch_configuration" "coordinator" { 23 | name_prefix = "presto-${var.environment_name}-coordinator" 24 | image_id = data.aws_ami.presto.id 25 | instance_type = var.coordinator_instance_type 26 | security_groups = concat([aws_security_group.presto.id], var.additional_security_groups) 27 | iam_instance_profile = aws_iam_instance_profile.presto.id 28 | associate_public_ip_address = var.public_facing 29 | user_data = data.template_file.coordinator-userdata-script.rendered 30 | key_name = var.key_name 31 | 32 | lifecycle { 33 | create_before_destroy = true 34 | } 35 | } 36 | 37 | resource "aws_autoscaling_group" "coordinator" { 38 | name = "presto-${var.environment_name}-coordinator" 39 | min_size = "0" 40 | max_size = "1" 41 | desired_capacity = "1" 42 | launch_configuration = aws_launch_configuration.coordinator.id 43 | vpc_zone_identifier = [data.aws_subnet.main_subnet.id] 44 | 45 | load_balancers = [aws_elb.coordinator-lb.id] 46 | 47 | tag { 48 | key = "Name" 49 | value = format("presto-%s-coordinator", var.environment_name) 50 | propagate_at_launch = true 51 | } 52 | tag { 53 | key = "Environment" 54 | value = var.environment_name 55 | propagate_at_launch = true 56 | } 57 | tag { 58 | key = "Role" 59 | value = "coordinator" 60 | propagate_at_launch = true 61 | } 62 | 63 | lifecycle { 64 | create_before_destroy = true 65 | } 66 | } 67 | 68 | resource 
"aws_elb" "coordinator-lb" { 69 | name = format("%s-presto-lb", var.environment_name) 70 | security_groups = concat( 71 | [aws_security_group.presto.id], 72 | var.additional_security_groups, 73 | ) 74 | subnets = [for s in data.aws_subnet.subnets : s.id] 75 | internal = !var.public_facing 76 | 77 | cross_zone_load_balancing = false 78 | idle_timeout = 400 79 | 80 | listener { 81 | instance_port = var.http_port 82 | instance_protocol = "http" 83 | lb_port = var.http_port 84 | lb_protocol = "http" 85 | } 86 | 87 | health_check { 88 | healthy_threshold = 2 89 | unhealthy_threshold = 2 90 | timeout = 3 91 | target = "HTTP:8080/ui/login.html" 92 | interval = 6 93 | } 94 | 95 | tags = { 96 | Name = format("%s-presto-lb", var.environment_name) 97 | } 98 | } 99 | 100 | -------------------------------------------------------------------------------- /terraform-aws/disks.tf: -------------------------------------------------------------------------------- 1 | resource "aws_ebs_volume" "coordinator" { 2 | availability_zone = data.aws_subnet.main_subnet.availability_zone 3 | size = 10 4 | type = "gp2" 5 | encrypted = var.volume_encryption 6 | 7 | tags = { 8 | Name = "presto-${var.environment_name}-coordinator" 9 | Environment = var.environment_name 10 | PrestoCoordinator = true 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /terraform-aws/iam.tf: -------------------------------------------------------------------------------- 1 | resource "aws_iam_role" "presto-service-role" { 2 | name_prefix = "presto-service-role" 3 | 4 | assume_role_policy = <