├── .gitignore
├── LICENSE
├── README.md
├── assets
│   ├── client_user_data.sh
│   ├── hive-site.xml
│   ├── nginx.conf
│   ├── user_data.sh
│   ├── zeppelin-interpreter.json
│   └── zeppelin-jdbc-0.11.0-SNAPSHOT.jar
├── catalogs
│   ├── blackhole.properties
│   ├── glue.properties
│   ├── hive.properties
│   ├── iceberg.properties
│   ├── jmx.properties
│   ├── memory.properties
│   ├── mysql.properties
│   ├── tpcds.properties
│   └── tpch.properties
├── packer
│   ├── README.md
│   ├── presto.json
│   ├── presto
│   │   ├── install-hive.sh
│   │   ├── install-java.sh
│   │   ├── install-presto-cli.sh
│   │   ├── install-presto.sh
│   │   ├── install-trino-cli.sh
│   │   ├── install-trino.sh
│   │   └── update-machine.sh
│   ├── prestoclients.json
│   ├── prestoclients
│   │   ├── install-redash.sh
│   │   ├── install-superset.sh
│   │   ├── install-zeppelin.sh
│   │   └── update-machine.sh
│   └── variables.json
└── terraform-aws
    ├── README.md
    ├── clients.tf
    ├── coordinator.tf
    ├── disks.tf
    ├── iam.tf
    ├── main.tf
    ├── output.tf
    ├── variables.tf
    ├── versions.tf
    ├── vpc.tf
    ├── workers-spot.tf
    └── workers.tf
/.gitignore:
--------------------------------------------------------------------------------
1 | # Local .terraform directories
2 | **/.terraform/*
3 |
4 | # .tfstate files
5 | *.tfstate
6 | *.tfstate.*
7 |
8 | # .tfvars files
9 | *.tfvars
10 | .idea/
11 | *.pem
12 |
13 | # credentials files
14 | gcp-account.json
15 | .gcp-account.json
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Deploying Presto on the Cloud easily
2 |
3 | > [Presto](https://prestosql.io/) is an open source distributed SQL query engine for running interactive analytic queries against data sources of all sizes ranging from gigabytes to petabytes.
4 |
5 | > Presto was designed and written from the ground up for interactive analytics and approaches the speed of commercial data warehouses while scaling to the size of organizations like Facebook.
6 |
7 | 
8 |
9 | This repository allows deploying a Presto cluster on the cloud, using best-practices and state of the art tooling. You need to have the latest versions of Terraform and Packer for all features to work correctly.
10 |
11 | Features:
12 |
13 | * Fully functional deployment of Presto in a cluster mode (1 coordinator and multiple workers)
14 | * Spot / Preemptible worker instances supported.
15 | * Single coordinator-worker node deployment mode supported for testing and experimentation.
16 | * Auto-healing features baked in.
17 | * Easily manage and add more catalogs (connect Presto to more data-sources).
18 | * AWS deployment support (under `terraform-aws`)
19 | * Google Cloud Platform deployment (coming soon)
20 |
21 | ## Usage
22 |
23 | Clone this repo to work locally. You might want to fork it in case you need to apply some additional configurations or commit changes to the variables file.
24 |
25 | Create images with Packer (see `packer` folder in this repo), and then go into the terraform folder and run `terraform init`. See README files in each respective folder for more detailed instructions.
26 |
27 | Once you run `terraform apply` on any of the terraform folders in this repo, a file `terraform.tfstate` will be created. This file contains the mapping between your cloud elements to the terraform configuration. Make sure to keep this file safe.
28 |
29 | See [this guide](https://blog.gruntwork.io/how-to-manage-terraform-state-28f5697e68fa#.fbb2nalw6) for a discussion on `tfstate` management and locking between team members. We highly recommend using dedicated backends for real-world clusters to avoid state loss.
30 |
31 | ## Presto 101
32 |
33 | Presto has a web UI for viewing cluster operations and currently running queries. It can be accessed at http://<presto-coordinator-ip>:8080/ui/ (substitute your coordinator's public IP or hostname).
34 |
35 | In order to run queries, you can connect to Presto [via JDBC](https://prestosql.io/docs/current/installation/jdbc.html) or SSH into the coordinator node and use the Presto CLI:
36 |
37 | ```bash
38 | presto --catalog hive --schema default
39 | ```
40 |
41 | Note the use of Presto's "catalogs". A Catalog in Presto is a definition of a connection to a data-source. A Catalog can be a schema on a MySQL server, an S3 bucket with partitions and schema that is defined in Hive Metastore, data on Kafka or Cassandra, and many other such options. The use of Catalogs makes it possible to query and join data from multiple data-sources in one Presto query.
42 |
43 | By default, we enable the local Hive Metastore catalog, and the JMX catalog. To customize or add your own, see the `catalogs` folder. Changes to this folder require running `packer` again.
44 |
45 | See [here](https://prestosql.io/docs/current/overview/concepts.html) for more Presto concepts.
46 |
47 | ## Configuration
48 |
49 | Presto needs to be carefully fine-tuned for best performance, mainly taking good care of memory allocations, number of cores and parallelisation (number of concurrent queries, splits, etc). This can only be achieved through experimentation, but at the base of this deployment is a good starting point for a typical cluster. We will be adding more guidance and more configuration options soon.
50 |
51 | ## Try it out
52 |
53 | The fastest way to test your installation is to follow AWS Athena's examples in https://aws.amazon.com/blogs/big-data/analyzing-data-in-s3-using-amazon-athena/. If you are running on AWS, this should work out-of-the-box, otherwise you will need to specify your AWS credentials in the hive catalog.
54 |
55 | SSH into the Presto coordinator VM and run the Hive REPL (`$HIVE_HOME/bin/hive`), and within it run the following DDL:
56 |
57 | ```sql
58 | CREATE EXTERNAL TABLE IF NOT EXISTS elb_logs_pq (
59 | request_timestamp string,
60 | elb_name string,
61 | request_ip string,
62 | request_port int,
63 | backend_ip string,
64 | backend_port int,
65 | request_processing_time double,
66 | backend_processing_time double,
67 | client_response_time double,
68 | elb_response_code string,
69 | backend_response_code string,
70 | received_bytes bigint,
71 | sent_bytes bigint,
72 | request_verb string,
73 | url string,
74 | protocol string,
75 | user_agent string,
76 | ssl_cipher string,
77 | ssl_protocol string )
78 | PARTITIONED BY(year int, month int, day int)
79 | STORED AS PARQUET
80 | LOCATION 's3a://athena-examples/elb/parquet/'
81 | tblproperties ("parquet.compress"="SNAPPY");
82 |
83 | msck repair table elb_logs_pq;
84 | ```
85 |
86 | This will create a partitioned "external" Hive table with data on S3. Once done, you can query it via Hive, or you can logout of Hive and query it via the Presto CLI:
87 |
88 | ```sql
89 | SELECT elb_name,
90 | sum(case elb_response_code
91 | WHEN '200' THEN
92 | 1
93 | ELSE 0 end) AS uptime, sum(case elb_response_code
94 | WHEN '404' THEN
95 | 1
96 | ELSE 0 end) AS downtime
97 | FROM elb_logs_pq
98 | GROUP BY elb_name;
99 | ```
100 |
101 | ```bash
102 | ubuntu@ip-172-31-32-64:~$ presto --catalog hive --schema default
103 | presto:default> [paste query copied from above]
104 |
105 | elb_name | uptime | downtime
106 | --------------+-----------+----------
107 | elb_demo_004 | 383616619 | 21261503
108 | elb_demo_008 | 383360093 | 21350497
109 | elb_demo_002 | 383632502 | 21300518
110 | elb_demo_009 | 383427076 | 21335844
111 | elb_demo_001 | 383671436 | 21270594
112 | elb_demo_007 | 383490605 | 21303122
113 | elb_demo_005 | 383734702 | 21341740
114 | elb_demo_003 | 383351477 | 21231655
115 | elb_demo_006 | 383506485 | 21336487
116 | (9 rows)
117 |
118 | Query 20180810_121913_00002_s3bz8, FINISHED, 3 nodes
119 | Splits: 2,418 total, 2,418 done (100.00%)
120 | 0:53 [3.84B rows, 2.51GB] [71.7M rows/s, 48MB/s]
121 | ```
122 |
--------------------------------------------------------------------------------
/assets/client_user_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e # abort on the first failing command
3 | exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1 # mirror all output to /var/log/user-data.log and syslog
4 |
5 | ### SSL Certs
6 | mkdir -p /opt/certs # directory for the TLS material written below (served by nginx)
7 | cat <<'EOF' >/opt/certs/server.crt # server certificate, value injected by the provisioning template
8 | ${cert_pem}
9 | EOF
10 | cat <<'EOF' >/opt/certs/server.key # matching private key, also template-injected
11 | ${key_pem}
12 | EOF
13 |
14 | ### Redash
15 |
16 | export COMPOSE_INTERACTIVE_NO_CLI=1 # let docker-compose exec run without a TTY
17 | cd /opt/redash
18 |
19 | sudo -E docker-compose exec -T server ./manage.py users create_root admin@redash admin --password "${admin_password}" # create the Redash admin user (password from the template)
20 | sudo -E docker-compose exec -T server ./manage.py ds new trino --type trino --options '{"host": "${presto_coordinator_host}", "username": "admin"}' # register the coordinator as a Trino data source
21 |
22 | # Redash OAuth setup
23 | # See https://redash.io/help/open-source/admin-guide/google-developer-account-setup
24 | #docker-compose down
25 | #cat <<'EOF' >/opt/redash/env
26 | #REDASH_GOOGLE_CLIENT_ID=#
27 | #REDASH_GOOGLE_CLIENT_SECRET=#
28 | #EOF
29 | #docker-compose up -d
30 |
31 | cd -
32 |
33 | ### Zeppelin
34 | /usr/bin/printf "[users]
35 | admin = ${admin_password}, admin
36 | [main]
37 | sessionManager = org.apache.shiro.web.session.mgt.DefaultWebSessionManager
38 | cookie = org.apache.shiro.web.servlet.SimpleCookie
39 | cookie.name = JSESSIONID
40 | cookie.httpOnly = true
41 | sessionManager.sessionIdCookie = \$cookie
42 | securityManager.sessionManager = \$sessionManager
43 | securityManager.sessionManager.globalSessionTimeout = 86400000
44 | shiro.loginUrl = /api/login
45 | [roles]
46 | admin = *
47 | [urls]
48 | /api/version = anon
49 | /api/interpreter/setting/restart/** = authc
50 | /api/interpreter/** = authc, roles[admin]
51 | /api/configurations/** = authc, roles[admin]
52 | /api/credential/** = authc, roles[admin]
53 | /** = authc
54 | " | sudo tee /opt/zeppelin/conf/shiro.ini # write Zeppelin's Shiro auth config (admin user + locked-down URLs)
55 |
56 | xmlstarlet ed \
57 | -u "//property[name='zeppelin.anonymous.allowed']/value" \
58 | -v false < /opt/zeppelin/conf/zeppelin-site.xml.template | sudo tee /opt/zeppelin/conf/zeppelin-site.xml # disable anonymous access in zeppelin-site.xml
59 |
60 | cat /opt/zeppelin/conf/interpreter.json | jq --argfile presto /opt/zeppelin/conf/zeppelin-interpreter-partial.json '.interpreterSettings.presto = $presto' > /tmp/interpreter.json # merge the presto interpreter definition into interpreter.json
61 | sed -i 's/PRESTO_HOST/${presto_coordinator_host}:${coordinator_port}/g' /tmp/interpreter.json # point the interpreter at the coordinator host:port
62 | sudo mv /tmp/interpreter.json /opt/zeppelin/conf/interpreter.json
63 | sudo rm /opt/zeppelin/conf/zeppelin-interpreter-partial.json # partial file no longer needed once merged
64 |
65 | sudo chown zeppelin:zeppelin /opt/zeppelin/conf -R # ensure the zeppelin service user owns its config
66 | sudo service zeppelin restart # pick up shiro.ini, zeppelin-site.xml and interpreter.json
67 |
68 | ### Apache Superset
69 |
70 | # Create presto datasource
71 | sudo sed -i -E "s/PRESTO_COORDINATOR_HOST/${presto_coordinator_host}/g" /opt/superset/config/presto-datasource.yaml # inject the coordinator host into the datasource YAML
72 | sudo sed -ie '/^x-superset-volumes/a \
73 | \ \ - /opt/superset/config/presto-datasource.yaml:/tmp/presto-datasource.yaml\
74 | ' /opt/superset/docker-compose-non-dev.yml
75 |
76 | #superset import datasources and gunicorn - not yet active
77 | cd /opt/superset
78 | sudo docker-compose -f docker-compose-non-dev.yml up -d # start Superset detached
79 | # sudo docker exec -it superset_app superset import-datasources -p /tmp/presto-datasource.yaml
80 | sudo rm /opt/superset/config/presto-datasource.yaml # NOTE(review): removed before the commented-out import above could ever use it — confirm intent
81 |
82 |
83 |
84 | # SUPERSET_VENV_PATH="/opt/superset/venv"
85 | # apt-get install python3-venv
86 | # python3 -m venv $SUPERSET_VENV_PATH
87 | # . $SUPERSET_VENV_PATH/bin/activate
88 | #
89 | # pip install --upgrade setuptools pip
90 | # pip install gevent
91 | # nohup gunicorn -w 10 \
92 | # -k gevent \
93 | # --timeout 120 \
94 | # -b localhost:6000 \
95 | # --limit-request-line 0 \
96 | # --limit-request-field_size 0 \
97 | # --forwarded-allow-ips="*" \
98 | # superset:app &
99 |
100 |
101 | # Presto OAuth setup
102 | # See https://superset.incubator.apache.org/faq.html?highlight=oauth#how-can-i-configure-oauth-authentication-and-authorization
103 | #cat <<'EOF' >>/opt/superset/config/superset_config.py
104 | #AUTH_TYPE = AUTH_OAUTH
105 | #
106 | #OAUTH_PROVIDERS = [
107 | # {
108 | # "name": "twitter",
109 | # "icon": "fa-twitter",
110 | # "remote_app": {
111 | # "consumer_key": os.environ.get("TWITTER_KEY"),
112 | # "consumer_secret": os.environ.get("TWITTER_SECRET"),
113 | # "base_url": "https://api.twitter.com/1.1/",
114 | # "request_token_url": "https://api.twitter.com/oauth/request_token",
115 | # "access_token_url": "https://api.twitter.com/oauth/access_token",
116 | # "authorize_url": "https://api.twitter.com/oauth/authenticate",
117 | # },
118 | # },
119 | # {
120 | # "name": "google",
121 | # "icon": "fa-google",
122 | # "token_key": "access_token",
123 | # "remote_app": {
124 | # "consumer_key": os.environ.get("GOOGLE_KEY"),
125 | # "consumer_secret": os.environ.get("GOOGLE_SECRET"),
126 | # "base_url": "https://www.googleapis.com/oauth2/v2/",
127 | # "request_token_params": {"scope": "email profile"},
128 | # "request_token_url": None,
129 | # "access_token_url": "https://accounts.google.com/o/oauth2/token",
130 | # "authorize_url": "https://accounts.google.com/o/oauth2/auth",
131 | # },
132 | # },
133 | # {
134 | # "name": "azure",
135 | # "icon": "fa-windows",
136 | # "token_key": "access_token",
137 | # "remote_app": {
138 | # "consumer_key": os.environ.get("AZURE_APPLICATION_ID"),
139 | # "consumer_secret": os.environ.get("AZURE_SECRET"),
140 | # "base_url": "https://login.microsoftonline.com/{AZURE_TENANT_ID}/oauth2",
141 | # "request_token_params": {
142 | # "scope": "User.read name preferred_username email profile",
143 | # "resource": os.environ.get("AZURE_APPLICATION_ID"),
144 | # },
145 | # "request_token_url": None,
146 | # "access_token_url": "https://login.microsoftonline.com/{AZURE_TENANT_ID}/oauth2/token",
147 | # "authorize_url": "https://login.microsoftonline.com/{AZURE_TENANT_ID}/oauth2/authorize",
148 | # },
149 | # }
150 | #]
151 | #EOF
152 |
153 |
154 |
155 | sudo systemctl restart nginx.service # reload nginx so the rendered certs and proxy config take effect
156 |
--------------------------------------------------------------------------------
/assets/hive-site.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | hive.exec.script.wrapper
6 |
7 |
8 |
9 |
10 | hive.exec.plan
11 |
12 |
13 |
14 |
15 | hive.query.results.cache.max.size
16 | 2147483648
17 | Maximum total size in bytes that the query results cache directory is allowed to use on the filesystem.
18 |
19 |
20 | hive.query.results.cache.max.entry.size
21 | 10485760
22 | Maximum size in bytes that a single query result is allowed to use in the results cache directory
23 |
24 |
25 | hive.notification.event.poll.interval
26 | 60s
27 |
28 | Expects a time value with unit (d/day, h/hour, m/min, s/sec, ms/msec, us/usec, ns/nsec), which is sec if not specified.
29 | How often the notification log is polled for new NotificationEvents from the metastore.A nonpositive value means the notification log is never polled.
30 |
31 |
32 |
33 | hive.blobstore.supported.schemes
34 | s3,s3a,s3n
35 | Comma-separated list of supported blobstore schemes.
36 |
37 |
38 |
45 |
46 |
47 | javax.jdo.option.ConnectionURL
48 | jdbc:mysql://localhost:3306/hive?useSSL=false&createDatabaseIfNotExist=true
49 | JDBC connect string for a JDBC metastore
50 |
51 |
52 | javax.jdo.option.ConnectionDriverName
53 | com.mysql.jdbc.Driver
54 |
55 |
56 | javax.jdo.option.ConnectionUserName
57 | root
58 | username to use against metastore database
59 |
60 |
61 | javax.jdo.option.ConnectionPassword
62 | pwd
63 |
64 |
65 |
66 | hive.metastore.uris
67 | thrift://localhost:9083
68 |
69 |
70 |
71 | datanucleus.autoCreateSchema
72 | false
73 |
74 |
75 |
76 | fs.file.impl.disable.cache
77 | true
78 |
79 |
80 | fs.hdfs.impl.disable.cache
81 | true
82 |
83 |
84 |
85 | hive.server2.logging.operation.enabled
86 | true
87 |
88 |
89 | hive.server2.logging.operation.log.location
90 | /tmp/hive-metastore/operation_logs
91 |
92 |
93 | hive.server2.logging.operation.verbose
94 | true
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 | hive.server2.allow.user.substitution
103 | true
104 |
105 |
106 | hive.server2.enable.doAs
107 | true
108 |
109 |
110 |
111 | hive.server2.thrift.port
112 | 10000
113 |
114 |
115 |
116 | hive.server2.thrift.http.port
117 | 10001
118 |
119 |
120 |
121 |
122 | hive.server2.in.place.progress
123 | false
124 |
125 |
126 |
127 | datanucleus.fixedDatastore
128 | true
129 |
130 |
131 |
132 | mapred.reduce.tasks
133 | -1
134 |
135 |
136 |
137 | mapred.max.split.size
138 | 256000000
139 |
140 |
141 |
142 | hive.metastore.connect.retries
143 | 15
144 |
145 |
146 |
147 | hive.optimize.sort.dynamic.partition
148 | true
149 |
150 |
151 |
--------------------------------------------------------------------------------
/assets/nginx.conf:
--------------------------------------------------------------------------------
1 | upstream redash { # Redash backend on local port 5000
2 | server localhost:5000;
3 | }
4 |
5 | upstream superset { # Superset backend on local port 8088
6 | server localhost:8088;
7 | }
8 |
9 | upstream zeppelin { # Zeppelin backend on local port 9090
10 | server localhost:9090;
11 | }
12 |
13 | server { # HTTPS vhost terminating TLS in front of Redash
14 | listen 8500 ssl http2;
15 | listen [::]:8500 ssl http2;
16 | server_name _;
17 |
18 | add_header Strict-Transport-Security "max-age=31536000" always;
19 |
20 | ssl_session_cache shared:SSL:20m;
21 | ssl_session_timeout 10m;
22 |
23 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2; # NOTE(review): TLSv1/TLSv1.1 are deprecated; consider TLSv1.2+ only
24 | ssl_prefer_server_ciphers on;
25 | ssl_ciphers "ECDH+AESGCM:ECDH+AES256:ECDH+AES128:!ADH:!AECDH:!MD5;";
26 |
27 | ssl_certificate /opt/certs/server.crt; # rendered by the client user-data script
28 | ssl_certificate_key /opt/certs/server.key;
29 |
30 | error_page 497 https://$host:$server_port$request_uri; # 497 = plain HTTP on the TLS port; bounce to https
31 |
32 | location / {
33 | proxy_set_header Host $http_host;
34 | proxy_set_header X-Real-IP $remote_addr;
35 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
36 | proxy_set_header X-Forwarded-Proto $scheme;
37 |
38 | proxy_pass http://redash; # forward everything to the Redash upstream
39 | }
40 | }
41 |
42 | server { # HTTPS vhost terminating TLS in front of Superset
43 | listen 8600 ssl http2;
44 | listen [::]:8600 ssl http2;
45 | server_name _;
46 |
47 | add_header Strict-Transport-Security "max-age=31536000" always;
48 |
49 | ssl_session_cache shared:SSL:20m;
50 | ssl_session_timeout 10m;
51 |
52 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2; # NOTE(review): TLSv1/TLSv1.1 are deprecated; consider TLSv1.2+ only
53 | ssl_prefer_server_ciphers on;
54 | ssl_ciphers "ECDH+AESGCM:ECDH+AES256:ECDH+AES128:!ADH:!AECDH:!MD5;";
55 |
56 | ssl_certificate /opt/certs/server.crt; # rendered by the client user-data script
57 | ssl_certificate_key /opt/certs/server.key;
58 |
59 | error_page 497 https://$host:$server_port$request_uri; # 497 = plain HTTP on the TLS port; bounce to https
60 |
61 | location / {
62 | proxy_set_header Host $http_host;
63 | proxy_set_header X-Real-IP $remote_addr;
64 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
65 | proxy_set_header X-Forwarded-Proto $scheme;
66 |
67 | proxy_pass http://superset; # forward everything to the Superset upstream
68 | }
69 | }
70 |
71 | server { # HTTPS vhost terminating TLS in front of Zeppelin (HTTP + websocket)
72 | listen 8700 ssl http2;
73 | listen [::]:8700 ssl http2;
74 | server_name _;
75 |
76 | add_header Strict-Transport-Security "max-age=31536000" always;
77 |
78 | ssl_session_cache shared:SSL:20m;
79 | ssl_session_timeout 10m;
80 |
81 | ssl_protocols TLSv1 TLSv1.1 TLSv1.2; # NOTE(review): TLSv1/TLSv1.1 are deprecated; consider TLSv1.2+ only
82 | ssl_prefer_server_ciphers on;
83 | ssl_ciphers "ECDH+AESGCM:ECDH+AES256:ECDH+AES128:!ADH:!AECDH:!MD5;";
84 |
85 | ssl_certificate /opt/certs/server.crt; # rendered by the client user-data script
86 | ssl_certificate_key /opt/certs/server.key;
87 |
88 | error_page 497 https://$host:$server_port$request_uri; # 497 = plain HTTP on the TLS port; bounce to https
89 |
90 | location / { # For regular webserver support
91 | proxy_pass http://zeppelin;
92 | proxy_set_header X-Real-IP $remote_addr;
93 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
94 | proxy_set_header Host $http_host;
95 | proxy_set_header X-NginX-Proxy true;
96 | proxy_redirect off;
97 | }
98 |
99 | location /ws { # For websocket support
100 | proxy_pass http://zeppelin;
101 | proxy_http_version 1.1;
102 | proxy_set_header Upgrade websocket; # NOTE(review): nginx docs use "Upgrade $http_upgrade"; value is hard-coded here — confirm
103 | proxy_set_header Connection upgrade;
104 | proxy_read_timeout 86400; # keep long-lived notebook websockets open (24h)
105 | }
106 | }
--------------------------------------------------------------------------------
/assets/user_data.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -ex # abort on error, echo each command
3 |
4 | exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1 # mirror all output to /var/log/user-data.log and syslog
5 |
6 | cat <<'EOF' >/etc/security/limits.d/100-trino-nofile.conf # raise the open-file limit for the trino user
7 | trino soft nofile 16384
8 | trino hard nofile 16384
9 | EOF
10 |
11 | /usr/bin/printf "
12 | node.environment=${environment_name}
13 | node.id=$(hostname)
14 | node.data-dir=/var/lib/trino/
15 | " > /etc/trino/node.properties # per-node identity config; node.id is this instance's hostname
16 |
17 | /usr/bin/printf "-server
18 | -Xmx${heap_size}G
19 | -XX:-UseBiasedLocking
20 | -XX:+UseG1GC
21 | -XX:G1HeapRegionSize=32M
22 | -XX:+ExplicitGCInvokesConcurrent
23 | -XX:+HeapDumpOnOutOfMemoryError
24 | -XX:+ExitOnOutOfMemoryError
25 | -XX:+UseGCOverheadLimit
26 | -XX:ReservedCodeCacheSize=512M
27 | -Djdk.attach.allowAttachSelf=true
28 | -Djdk.nio.maxCachedBufferSize=2000000
29 | -Duser.timezone=UTC
30 | " > /etc/trino/jvm.config # JVM flags; heap size injected by the provisioning template
31 |
32 | function setup_hive_metastore { # attach the coordinator's persistent EBS disk, host MySQL on it, start the Hive Metastore
33 |   AV_ZONE="$(ec2metadata --availability-zone)"
34 |   ENVIRONMENT_NAME="$(aws ec2 describe-tags --region "${aws_region}" --filters Name=resource-id,Values=$(ec2metadata --instance-id) | jq -r '.Tags[] | select(.Key == "Environment") | .Value')"
35 |   echo "AV_ZONE: $AV_ZONE"
36 |   echo "ENVIRONMENT_NAME: $ENVIRONMENT_NAME"
37 | 
38 |   while true; do
39 |     UNATTACHED_VOLUME_ID="$(aws ec2 describe-volumes --region ${aws_region} --filters Name=tag:Environment,Values=$ENVIRONMENT_NAME Name=tag:PrestoCoordinator,Values=true Name=availability-zone,Values=$AV_ZONE | jq -r '.Volumes[] | select(.Attachments | length == 0) | .VolumeId' | shuf -n 1)"
40 |     echo "UNATTACHED_VOLUME_ID: $UNATTACHED_VOLUME_ID"
41 |     # The script runs under 'set -e': a bare failing command would abort it, so the attach must be guarded inside the 'if' itself for this retry loop to work.
42 |     if ! aws ec2 attach-volume --device "/dev/xvdh" --instance-id=$(ec2metadata --instance-id) --volume-id "$UNATTACHED_VOLUME_ID" --region ${aws_region}; then
43 |       sleep 10
44 |       continue
45 |     fi
46 | 
47 |     # Give the attachment time to register before verifying it took hold.
48 |     sleep 30
49 | 
50 |     ATTACHMENTS_COUNT="$(aws ec2 describe-volumes --region "${aws_region}" --filters Name=volume-id,Values="$UNATTACHED_VOLUME_ID" | jq -r '.Volumes[0].Attachments | length')"
51 |     if [ "$ATTACHMENTS_COUNT" != "0" ]; then break; fi
52 |   done
53 | 
54 |   echo 'Waiting for 30 seconds for the disk to become mountable...'
55 |   sleep 30
56 | 
57 |   # Mount persistent storage and apply Hive Metastore schema if needed
58 |   DEVICE_NAME=$(lsblk -ip | tail -n +2 | awk '{print $1 " " ($7? "MOUNTEDPART" : "") }' | sed ':a;N;$!ba;s/\n`/ /g' | grep -v MOUNTEDPART | sed -e 's/[[:space:]]*$//')
59 |   MOUNT_PATH=/var/lib/mysql
60 | 
61 |   sudo mv $MOUNT_PATH /tmp/mysql.backup
62 |   sudo mkdir -p $MOUNT_PATH
63 | 
64 |   if sudo mount -o defaults -t ext4 "$DEVICE_NAME" $MOUNT_PATH; then
65 |     echo 'Successfully mounted existing disk'
66 |   else
67 |     echo 'Trying to mount a fresh disk'
68 |     sudo mkfs.ext4 -m 0 -F -E lazy_itable_init=0,lazy_journal_init=0,discard "$DEVICE_NAME"
69 |     sudo mount -o defaults -t ext4 "$DEVICE_NAME" $MOUNT_PATH && echo 'Successfully mounted a fresh disk'
70 |     sudo cp -ar /tmp/mysql.backup/* $MOUNT_PATH/
71 |   fi
72 | 
73 |   sudo chown mysql:mysql -R $MOUNT_PATH
74 |   sudo chmod 700 $MOUNT_PATH
75 | 
76 |   service mysql start
77 |   systemctl enable mysql
78 |   # /etc/environment assigns HADOOP_HOME without exporting it; re-export so schematool can see it.
79 |   . /etc/environment
80 |   export HADOOP_HOME=$HADOOP_HOME
81 | 
82 |   if ! "$HIVE_HOME"/bin/schematool -validate -dbType mysql; then
83 |     echo "Mysql schema is not valid"
84 |     "$HIVE_HOME"/bin/schematool -dbType mysql -initSchema
85 |   fi
86 | 
87 |   echo "Initializing Hive Metastore ($HIVE_HOME)..."
88 |   service hive-metastore start
89 |   systemctl enable hive-metastore
90 | }
91 |
92 | #
93 | # Configure as COORDINATOR
94 | #
95 | if [[ "${mode_presto}" == "coordinator" ]]; then
96 | echo "Configuring node as a [${mode_presto}]..."
97 |
98 | /usr/bin/printf "
99 | #
100 | # coordinator
101 | #
102 | coordinator=true
103 | discovery-server.enabled=true
104 | discovery.uri=http://localhost:${http_port}
105 | node-scheduler.include-coordinator=false
106 |
107 | http-server.http.port=${http_port}
108 | # query.max-memory-per-node has to be <= query.max-total-memory-per-node
109 | #query.max-memory-per-node=${query_max_memory_per_node}GB
110 | #query.max-total-memory-per-node=${query_max_total_memory_per_node}GB
111 | query.max-memory=${query_max_memory}GB
112 | # query.max-total-memory defaults to query.max-memory * 2 so we are good
113 | ${extra_worker_configs}
114 | " > /etc/trino/config.properties
115 |
116 | setup_hive_metastore
117 | fi
118 |
119 | #
120 | # Configure as WORKER
121 | #
122 | if [[ "${mode_presto}" == "worker" ]]; then
123 | echo "Configuring node as a [${mode_presto}]..."
124 |
125 | /usr/bin/printf "
126 | #
127 | # worker
128 | #
129 | coordinator=false
130 | discovery.uri=http://${address_presto_coordinator}:${http_port}
131 | node-scheduler.include-coordinator=false
132 |
133 | http-server.http.port=${http_port}
134 | # query.max-memory-per-node has to be <= query.max-total-memory-per-node
135 | #query.max-memory-per-node=${query_max_memory_per_node}GB
136 | #query.max-total-memory-per-node=${query_max_total_memory_per_node}GB
137 | query.max-memory=${query_max_memory}GB
138 | # query.max-total-memory defaults to query.max-memory * 2 so we are good
139 | ${extra_worker_configs}
140 | " > /etc/trino/config.properties
141 | fi
142 |
143 | #
144 | # Configure as BOTH coordinator and worker
145 | #
146 | if [[ "${mode_presto}" == "coordinator-worker" ]]; then
147 | echo "Configuring node as a [${mode_presto}]..."
148 |
149 | /usr/bin/printf "
150 | #
151 | # coordinator-worker
152 | #
153 | coordinator=true
154 | discovery-server.enabled=true
155 | discovery.uri=http://localhost:${http_port}
156 | node-scheduler.include-coordinator=true
157 |
158 | http-server.http.port=${http_port}
159 | # query.max-memory-per-node has to be <= query.max-total-memory-per-node
160 | #query.max-memory-per-node=${query_max_memory_per_node}GB
161 | #query.max-total-memory-per-node=${query_max_total_memory_per_node}GB
162 | query.max-memory=${query_max_memory}GB
163 | # query.max-total-memory defaults to query.max-memory * 2 so we are good
164 | ${extra_worker_configs}
165 | " > /etc/trino/config.properties
166 |
167 | setup_hive_metastore
168 | fi
169 |
170 | if [[ "${mode_presto}" == "worker" ]]; then
171 | echo "Waiting for Presto Coordinator to come online at: http://${address_presto_coordinator}:${http_port}"
172 | while ! nc -z ${address_presto_coordinator} ${http_port}; do
173 | sleep 5
174 | done
175 | fi
176 |
177 | if [ ! -z "${aws_access_key_id}" ] && [ ! -z "${aws_secret_access_key}" ]; then
178 | # Update hive-site.xml
179 | /usr/bin/printf "
180 |
181 | fs.s3.impl
182 | org.apache.hadoop.fs.s3native.NativeS3FileSystem
183 |
184 |
185 | fs.s3.awsAccessKeyId
186 | ${aws_access_key_id}
187 |
188 |
189 | fs.s3.awsSecretAccessKey
190 | ${aws_secret_access_key}
191 | " > /tmp/hive-site-partial.txt
192 | sudo sed -i "s//$(sed 's@[/\&]@\\&@g;$!s/$/\\/' /tmp/hive-site-partial.txt)/g" /usr/local/apache-hive-*-bin/conf/hive-site.xml
193 | rm /tmp/hive-site-partial.txt
194 |
195 | # Update hive.properties
196 | /usr/bin/printf "\nhive.allow-drop-table=true" >> /etc/trino/catalog/hive.properties
197 | /usr/bin/printf "\nhive.non-managed-table-writes-enabled=true" >> /etc/trino/catalog/hive.properties
198 | /usr/bin/printf "\n#hive.time-zone=UTC" >> /etc/trino/catalog/hive.properties
199 | /usr/bin/printf "\nhive.s3.aws-access-key=${aws_access_key_id}" >> /etc/trino/catalog/hive.properties
200 | /usr/bin/printf "\nhive.s3.aws-secret-key=${aws_secret_access_key}" >> /etc/trino/catalog/hive.properties
201 | /usr/bin/printf "\n" >> /etc/trino/catalog/hive.properties
202 | /usr/bin/printf "\nhive.s3.aws-access-key=${aws_access_key_id}" >> /etc/trino/catalog/iceberg.properties
203 | /usr/bin/printf "\nhive.s3.aws-secret-key=${aws_secret_access_key}" >> /etc/trino/catalog/iceberg.properties
204 | /usr/bin/printf "\n" >> /etc/trino/catalog/iceberg.properties
205 | fi
206 |
207 | echo "Starting presto..."
208 | systemctl enable trino.service
209 | systemctl start trino.service
210 |
211 | if [[ "${mode_presto}" == "coordinator" ]] || [[ "${mode_presto}" == "coordinator-worker" ]]; then
212 | echo "Waiting for Presto Coordinator to start"
213 | while ! presto --execute='select * from system.runtime.nodes'; do
214 | sleep 10
215 | done
216 | echo "Presto Coordinator is now online"
217 | fi
218 |
219 | echo "Executing additional bootstrap scripts"
220 |
221 | %{ for script in additional_bootstrap_scripts ~}
222 | %{ if script.type == "s3" ~}
223 | if [ ! -z "${aws_access_key_id}" ]; then
224 | export AWS_ACCESS_KEY_ID=${aws_access_key_id}
225 | export AWS_SECRET_ACCESS_KEY=${aws_secret_access_key}
226 | fi
227 | aws s3 cp "${script.script_url}" "/tmp/${script.script_name}"
228 | %{ else ~}
229 | curl "${script.script_url}" -o "/tmp/${script.script_name}"
230 | %{ endif ~}
231 | chmod +x "/tmp/${script.script_name}"
232 | sh -c "/tmp/${script.script_name} %{ for param in script.params ~} ${param} %{ endfor ~}"
233 | %{ endfor ~}
234 |
235 | echo "Restarting Presto service"
236 |
237 | systemctl restart trino
--------------------------------------------------------------------------------
/assets/zeppelin-interpreter.json:
--------------------------------------------------------------------------------
1 | {
2 | "id": "presto",
3 | "name": "presto",
4 | "group": "jdbc",
5 | "properties": {
6 | "default.url": {
7 | "name": "default.url",
8 | "value": "jdbc:trino://PRESTO_HOST",
9 | "type": "string"
10 | },
11 | "default.driver": {
12 | "name": "default.driver",
13 | "value": "io.trino.jdbc.TrinoDriver",
14 | "type": "string"
15 | },
16 | "default.user": {
17 | "name": "default.user",
18 | "value": "presto",
19 | "type": "string"
20 | }
21 | },
22 | "status": "READY",
23 | "interpreterGroup": [
24 | {
25 | "name": "sql",
26 | "class": "org.apache.zeppelin.jdbc.JDBCInterpreter",
27 | "defaultInterpreter": false,
28 | "editor": {
29 | "language": "sql",
30 | "editOnDblClick": false,
31 | "completionSupport": true
32 | }
33 | }
34 | ],
35 | "dependencies": [
36 | {
37 | "groupArtifactVersion": "/opt/zeppelin/interpreter/jdbc/trino-jdbc-370.jar",
38 | "local": false
39 | }
40 | ],
41 | "option": {
42 | "remote": true,
43 | "port": -1,
44 | "perNote": "shared",
45 | "perUser": "shared",
46 | "isExistingProcess": false,
47 | "setPermission": false,
48 | "owners": [],
49 | "isUserImpersonate": false
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/assets/zeppelin-jdbc-0.11.0-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BigDataBoutique/presto-cloud-deploy/fb8a3c45a9b1a1858b808fc7d2e09c2262e4e89b/assets/zeppelin-jdbc-0.11.0-SNAPSHOT.jar
--------------------------------------------------------------------------------
/catalogs/blackhole.properties:
--------------------------------------------------------------------------------
1 | connector.name=blackhole
2 |
--------------------------------------------------------------------------------
/catalogs/glue.properties:
--------------------------------------------------------------------------------
1 | connector.name=hive-hadoop2
2 | hive.metastore = glue
3 | hive.non-managed-table-writes-enabled = true
4 | #hive.metastore.glue.aws-access-key =
5 | #hive.metastore.glue.aws-secret-key =
6 | hive.metastore.glue.region = us-east-1
7 | hive.max-partitions-per-writers = 200
8 |
9 |
--------------------------------------------------------------------------------
/catalogs/hive.properties:
--------------------------------------------------------------------------------
1 | connector.name=hive-hadoop2
2 | hive.metastore.uri=thrift://localhost:9083
3 |
4 | # For a full list of configuration options, see https://prestodb.io/docs/current/connector/hive.html
5 |
6 | # By default Presto / Hive assume data is mutable in table partitions
7 | #hive.immutable-partitions=false
8 |
9 | # The default security permissions don't allow to drop tables
10 | #hive.allow-drop-table=false
11 |
12 | # These settings control how often the catalog metadata is going to be updated
13 | hive.metastore-cache-ttl=30s
14 | hive.metastore-refresh-interval=10s
15 |
16 | # When deploying on AWS the default is to enable S3 access via instance credentials and control access via IAM roles
17 | # For non-AWS deployments, or when for other reasons you need to control S3 access via explicit credentials, you
18 | # can use the settings below.
19 | #hive.s3.use-instance-credentials=true
20 | #hive.s3.ssl.enabled=true
21 | #hive.s3.aws-access-key=AKIA...
22 | #hive.s3.aws-secret-key=....
23 | #hive.s3.endpoint=...
--------------------------------------------------------------------------------
/catalogs/iceberg.properties:
--------------------------------------------------------------------------------
1 | connector.name=iceberg
2 | iceberg.catalog.type=hive_metastore
3 | hive.metastore.uri=thrift://localhost:9083
4 |
5 | #iceberg.catalog.type=glue
--------------------------------------------------------------------------------
/catalogs/jmx.properties:
--------------------------------------------------------------------------------
1 | connector.name=jmx
--------------------------------------------------------------------------------
/catalogs/memory.properties:
--------------------------------------------------------------------------------
1 | # https://prestodb.io/docs/current/connector/memory.html
2 | connector.name=memory
3 | memory.max-data-per-node=2GB
--------------------------------------------------------------------------------
/catalogs/mysql.properties:
--------------------------------------------------------------------------------
1 | connector.name=mysql
2 | connection-url=jdbc:mysql://localhost:3306
3 | connection-user=root
4 | connection-password=pwd
--------------------------------------------------------------------------------
/catalogs/tpcds.properties:
--------------------------------------------------------------------------------
1 | connector.name=tpcds
--------------------------------------------------------------------------------
/catalogs/tpch.properties:
--------------------------------------------------------------------------------
1 | connector.name=tpch
--------------------------------------------------------------------------------
/packer/README.md:
--------------------------------------------------------------------------------
1 | # Presto machine images
2 |
3 | This Packer configuration will generate Ubuntu images with Presto and Presto CLI installed, for deploying and managing Presto clusters on the cloud.
4 |
5 | ## On Amazon Web Services (AWS)
6 |
7 | Using the AWS builder will create the two images and store them as AMIs.
8 |
9 | The base AMI is maintained by [Canonical](https://canonical.com/) and listed on
10 | the [Amazon EC2 AMI Locator](https://cloud-images.ubuntu.com/locator/ec2/).
11 |
12 | As a convention, the Packer builders use a dedicated IAM role, which you will need to have present.
13 |
14 | ```bash
15 | aws iam create-role --role-name packer --assume-role-policy-document '{
16 | "Version": "2012-10-17",
17 | "Statement": {
18 | "Effect": "Allow",
19 | "Principal": {"Service": "ec2.amazonaws.com"},
20 | "Action": "sts:AssumeRole",
21 | "Sid": ""
22 | }
23 | }'
24 | ```
25 |
26 | Response will look something like this:
27 |
28 | ```json
29 | {
30 | "Role": {
31 | "AssumeRolePolicyDocument": {
32 | "Version": "2012-10-17",
33 | "Statement": {
34 | "Action": "sts:AssumeRole",
35 | "Effect": "Allow",
36 | "Principal": {
37 | "Service": "ec2.amazonaws.com"
38 | }
39 | }
40 | },
41 | "RoleId": "AROAJ7Q2L7NZJHZBB6JKY",
42 | "CreateDate": "2016-12-16T13:22:47.254Z",
43 | "RoleName": "packer",
44 | "Path": "/",
45 | "Arn": "arn:aws:iam::611111111117:role/packer"
46 | }
47 | }
48 | ```
49 |
50 | Follow up by executing the following
51 |
52 | ```bash
53 | aws iam create-instance-profile --instance-profile-name packer
54 | aws iam add-role-to-instance-profile --instance-profile-name packer --role-name packer
55 |
56 | ```
57 |
58 | ## On Microsoft Azure
59 |
60 | Before running Packer for the first time you will need to do a one-time initial setup.
61 |
62 | Use PowerShell, and login to AzureRm. See here for more details: https://docs.microsoft.com/en-us/powershell/azure/authenticate-azureps. Once logged in, take note of the subscription and tenant IDs which will be printed out. Alternatively, you can retrieve them by running `Get-AzureRmSubscription` once logged-in.
63 |
64 | ```Powershell
65 | $rgName = "packer-presto-images"
66 | $location = "East US"
67 | New-AzureRmResourceGroup -Name $rgName -Location $location
68 | $Password = ([char[]]([char]33..[char]95) + ([char[]]([char]97..[char]126)) + 0..9 | sort {Get-Random})[0..8] -join ''
69 | "Password: " + $Password
70 | $sp = New-AzureRmADServicePrincipal -DisplayName "Azure Packer IKF" -Password $Password
71 | New-AzureRmRoleAssignment -RoleDefinitionName Contributor -ServicePrincipalName $sp.ApplicationId
72 | $sp.ApplicationId
73 | ```
74 |
75 | Note the resource group name, location, password, sp.ApplicationId as used in the script and emitted as output and update `variables.json`.
76 |
77 | To learn more about using Packer on Azure see https://docs.microsoft.com/en-us/azure/virtual-machines/windows/build-image-with-packer
78 |
79 | Similarly, using the Azure CLI is going to look something like below:
80 |
81 | ```bash
82 | export rgName=packer-presto-images
83 | az group create -n ${rgName} -l eastus
84 |
85 | az ad sp create-for-rbac --query "{ client_id: appId, client_secret: password, tenant_id: tenant }"
86 | # outputs client_id, client_secret and tenant_id
87 | az account show --query "{ subscription_id: id }"
88 | # outputs subscription_id
89 | ```
90 |
91 | ## Building
92 |
93 | Building the AMIs is done using the following commands:
94 |
95 | ```bash
96 | packer build -only=amazon-ebs -var-file=variables.json presto.json
97 | ```
98 |
99 | Override the aws_region and aws_az variables to change the target region and
100 | availability zone, which default respectively to us-east-1 and us-east-1a.
101 |
102 | Replace the `-only` parameter to `azure-arm` to build images for Azure instead of AWS.
103 |
--------------------------------------------------------------------------------
/packer/presto.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Presto Image",
3 | "builders": [
4 | {
5 | "type": "amazon-ebs",
6 | "ami_name": "presto-{{isotime | clean_resource_name}}",
7 | "availability_zone": "{{user `aws_az`}}",
8 | "iam_instance_profile": "packer",
9 | "instance_type": "t2.large",
10 | "region": "{{user `aws_region`}}",
11 | "run_tags": {
12 | "role": "packer"
13 | },
14 | "source_ami_filter": {
15 | "filters": {
16 | "virtualization-type": "hvm",
17 | "name": "ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-*",
18 | "root-device-type": "ebs"
19 | },
20 | "owners": ["099720109477"],
21 | "most_recent": true
22 | },
23 | "ssh_timeout": "10m",
24 | "ssh_username": "ubuntu",
25 | "ssh_interface": "public_ip",
26 | "tags": {
27 | "ImageType": "presto-packer-image"
28 | },
29 |
30 | "spot_price_auto_product": "Linux/UNIX (Amazon VPC)",
31 | "spot_price": "auto"
32 | },
33 | {
34 | "type": "azure-arm",
35 |
36 | "client_id": "{{user `azure_client_id`}}",
37 | "client_secret": "{{user `azure_client_secret`}}",
38 | "tenant_id": "{{user `azure_tenant_id`}}",
39 | "subscription_id": "{{user `azure_subscription_id`}}",
40 |
41 | "managed_image_resource_group_name": "{{user `azure_resource_group_name`}}",
42 | "managed_image_name": "presto-{{isotime \"2006-01-02T030405\"}}",
43 |
44 | "os_type": "Linux",
45 | "image_publisher": "Canonical",
46 | "image_offer": "UbuntuServer",
47 | "image_sku": "18.04-LTS",
48 |
49 | "location": "{{user `azure_location`}}",
50 | "vm_size": "Standard_DS2_v2"
51 | },
52 | {
53 | "type": "googlecompute",
54 | "account_file": "{{user `gcp_account_file`}}",
55 | "project_id": "{{user `gcp_project_id`}}",
56 | "source_image_family": "ubuntu-1804-lts",
57 | "zone": "{{user `gcp_zone`}}",
58 | "image_family": "presto",
59 | "image_name": "presto-{{isotime \"20060102t030405\"}}",
60 | "preemptible": true,
61 | "ssh_username": "ubuntu"
62 | }
63 | ],
64 | "provisioners": [
65 | {
66 | "type": "shell",
67 | "script": "presto/update-machine.sh",
68 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'"
69 | },
70 | {
71 | "type": "shell",
72 | "script": "presto/install-java.sh",
73 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'"
74 | },
75 | {
76 | "type": "file",
77 | "source": "../assets/hive-site.xml",
78 | "destination": "hive-site.xml"
79 | },
80 | {
81 | "type": "shell",
82 | "script": "presto/install-hive.sh",
83 | "environment_vars": [ "HIVE_VERSION={{user `hive_version`}}", "HADOOP_VERSION={{user `hadoop_version`}}" ],
84 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S bash '{{ .Path }}'"
85 | },
86 | {
87 | "type": "file",
88 | "source": "../catalogs",
89 | "destination": "presto-catalogs"
90 | },
91 | {
92 | "type": "shell",
93 | "script": "presto/install-trino.sh",
94 | "environment_vars": [ "PRESTO_VERSION={{user `presto_version`}}" ],
95 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'"
96 | },
97 | {
98 | "type": "shell",
99 | "script": "presto/install-trino-cli.sh",
100 | "environment_vars": [ "PRESTO_VERSION={{user `presto_version`}}" ],
101 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'"
102 | }
103 | ]
104 | }
105 |
--------------------------------------------------------------------------------
/packer/presto/install-hive.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | 
4 | log() {
5 |   echo "==> $(basename ${0}): ${1}"
6 | }
7 | 
8 | export path_install="/usr/local/apache-hive-${HIVE_VERSION}-bin"
9 | export path_file="hive-${HIVE_VERSION}.tar.gz"
10 | export HIVE_HOME=${path_install}
11 | 
12 | export path_hadoop="/usr/local/hadoop-${HADOOP_VERSION}"
13 | export path_hadoop_file="hadoop-${HADOOP_VERSION}.tar.gz"
14 | export HADOOP_HOME=${path_hadoop}
15 | 
16 | log "Downloading Hadoop ${HADOOP_VERSION}..."
17 | wget -q -O ${path_hadoop_file} https://archive.apache.org/dist/hadoop/core/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz
18 | tar -xzf ${path_hadoop_file} -C /usr/local/
19 | rm ${path_hadoop_file}
20 | 
21 | log "Downloading Hive ${HIVE_VERSION}..."
22 | wget -q -O ${path_file} https://archive.apache.org/dist/hive/hive-${HIVE_VERSION}/apache-hive-${HIVE_VERSION}-bin.tar.gz
23 | 
24 | log "Installing Hive..."
25 | useradd -m hive || log "User [hive] already exists. Continuing..."
26 | 
27 | install -d -o hive -g hive ${path_install}
28 | tar -xzf ${path_file} -C /usr/local/
29 | mv hive-site.xml ${path_install}/conf/hive-site.xml
30 | ln -s /usr/share/java/mysql-connector-java.jar ${HIVE_HOME}/lib/mysql-connector-java.jar
31 | cp -n ${HADOOP_HOME}/share/hadoop/tools/lib/* ${HIVE_HOME}/lib/
32 | echo "export JAVA_HOME=$JAVA8_HOME" >> ${path_install}/bin/hive-config.sh
33 | chown -R hive:hive ${path_install}
34 | rm ${path_file}
35 | echo "export PATH=\"\$PATH:${path_install}/bin\"" > /etc/profile.d/apache-hive.sh
36 | 
37 | /usr/bin/printf "
38 | HADOOP_HOME=${path_hadoop}
39 | HIVE_HOME=${path_install}" >> /etc/environment
40 | 
41 | install -d -o hive -g hive /tmp/hive
42 | ${HADOOP_HOME}/bin/hadoop fs -chmod -R 777 /tmp/hive/
43 | 
44 | log "Setup MySQL backend for Hive Metastore..."
45 | sudo debconf-set-selections <<< 'mysql-server-5.6 mysql-server/root_password password pwd'
46 | sudo debconf-set-selections <<< 'mysql-server-5.6 mysql-server/root_password_again password pwd'
47 | DEBIAN_FRONTEND=noninteractive apt-get install -y -qq mysql-server libmysql-java
48 | 
49 | # Disable the mysql service - we will only need it on the coordinator node
50 | systemctl disable mysql
51 | # JAVA8_HOME is expected in this environment (install-java.sh writes it to /etc/environment) and is the same variable used at line 32 above; the unit previously referenced the undefined JAVA_8_HOME, leaving JAVA_HOME empty.
52 | log "Installing the Hive Metastore service"
53 | /usr/bin/printf "[Unit]
54 | Description=Hive Metastore
55 | After=network-online.target
56 | [Service]
57 | User=root
58 | Restart=on-failure
59 | Type=simple
60 | Environment="HADOOP_HOME=${HADOOP_HOME}" "JAVA_HOME=${JAVA8_HOME}" "HIVE_HOME=${HIVE_HOME}"
61 | ExecStart=${HIVE_HOME}/bin/hive --service metastore
62 | [Install]
63 | WantedBy=default.target
64 | " > /etc/systemd/system/hive-metastore.service
--------------------------------------------------------------------------------
/packer/presto/install-java.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | # Installs OpenJDK 8 and 11 and records both Java homes in /etc/environment.
4 | log() {
5 |   echo "==> $(basename ${0}): ${1}"
6 | }
7 | 
8 | sudo apt-get update
9 | sudo apt-get install -y -qq openjdk-8-jdk openjdk-11-jdk default-jdk
10 | 
11 | # Temporarily select Java 8 so jrunscript reports the Java 8 home.
12 | sudo update-java-alternatives --jre-headless --jre -s java-1.8.0-openjdk-amd64
13 | export JAVA8_HOME=$(jrunscript -e 'java.lang.System.out.println(java.lang.System.getProperty("java.home"));')
14 | # Java 11 remains the system default; its home becomes JAVA_HOME.
15 | sudo update-java-alternatives --jre-headless --jre -s java-1.11.0-openjdk-amd64
16 | export JAVA_HOME=$(jrunscript -e 'java.lang.System.out.println(java.lang.System.getProperty("java.home"));')
17 | # Persist both homes for later provisioning scripts (e.g. the Hive installer uses JAVA8_HOME).
18 | /usr/bin/printf "
19 | JAVA8_HOME=${JAVA8_HOME}
20 | JAVA_HOME=${JAVA_HOME}" >> /etc/environment
--------------------------------------------------------------------------------
/packer/presto/install-presto-cli.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | # Installs the Presto CLI as a self-executing jar at /usr/local/bin/presto.
4 | log() {
5 |   echo "==> $(basename ${0}): ${1}"
6 | }
7 | 
8 | export version_presto=${PRESTO_VERSION}
9 | export path_install="/usr/local/bin"
10 | export path_file="presto-cli-${version_presto}-executable.jar"
11 | 
12 | log "Downloading Presto CLI ${version_presto}..."
13 | # Fetch the self-executing CLI jar from Maven Central.
14 | wget -q -O ${path_file} "https://repo1.maven.org/maven2/io/prestosql/presto-cli/${version_presto}/presto-cli-${version_presto}-executable.jar"
15 | 
16 | log "Installing Presto CLI ${version_presto}..."
17 | # NOTE(review): install -d also re-chowns the existing /usr/local/bin to presto:presto — confirm this is intended.
18 | install -d -o presto -g presto ${path_install}
19 | mv ${path_file} ${path_install}/presto
20 | chmod +x ${path_install}/presto
21 | 
--------------------------------------------------------------------------------
/packer/presto/install-presto.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -e
3 | 
4 | log() {
5 |   echo "==> $(basename ${0}): ${1}"
6 | }
7 | 
8 | export version_presto=${PRESTO_VERSION}
9 | export path_install="/usr/local/presto-server-${version_presto}"
10 | export path_file="presto-server-${version_presto}.tar.gz"
11 | export pid_file="/var/run/presto/presto.pid"
12 | export user_presto='presto'
13 | 
14 | log "Downloading Presto ${version_presto}..."
15 | 
16 | wget -q -O "${path_file}" "https://repo1.maven.org/maven2/io/prestosql/presto-server/${version_presto}/presto-server-${version_presto}.tar.gz"
17 | 
18 | log "Installing Presto ${version_presto}..."
19 | useradd ${user_presto} || log "User [${user_presto}] already exists. Continuing..."
20 | 
21 | install -d -o ${user_presto} -g ${user_presto} "${path_install}"
22 | tar -xzf "${path_file}" -C /usr/local/
23 | install -d -o ${user_presto} -g ${user_presto} /etc/presto/
24 | install -d -o ${user_presto} -g ${user_presto} /etc/presto/catalog
25 | install -d -o ${user_presto} -g ${user_presto} /var/lib/presto/ # this is the data dir
26 | install -d -o ${user_presto} -g ${user_presto} /var/log/presto/
27 | mv ./presto-catalogs/* /etc/presto/catalog/
28 | rm -rf ./presto-catalogs
29 | rm -rf "$path_install/etc"
30 | ln -s /etc/presto/ "$path_install/etc"
31 | 
32 | log "Adding PRESTO_HOME to system profile"
33 | /usr/bin/printf "export PRESTO_HOME=\"${path_install}\"" >> /etc/profile.d/presto.sh
34 | 
35 | # /etc/default/presto is consumed by systemd as an EnvironmentFile, which may
36 | # contain only KEY=VALUE lines — unit-file sections do not belong in it.
37 | /usr/bin/printf "PRESTO_OPTS= \
38 | --pid-file=${pid_file} \
39 | --node-config=/etc/presto/node.properties \
40 | --jvm-config=/etc/presto/jvm.config \
41 | --config=/etc/presto/config.properties \
42 | --launcher-log-file=/var/log/presto/launcher.log \
43 | --server-log-file=/var/log/presto/server.log \
44 | -Dhttp-server.log.path=/var/log/presto/http-request.log \
45 | -Dcatalog.config-dir=/etc/presto/catalog
46 | " > /etc/default/presto
47 | chown ${user_presto}:${user_presto} /etc/default/presto
48 | 
49 | # Overwriting (rather than appending to) /etc/default/presto above keeps this
50 | # installer idempotent when the image is rebuilt.
51 | log "Installing the Presto service"
52 | /usr/bin/printf "
53 | [Unit]
54 | Description=Presto Server
55 | Documentation=https://trino.io/docs/current/index.html
56 | After=network-online.target
57 | [Service]
58 | User=${user_presto}
59 | Restart=on-failure
60 | Type=forking
61 | PIDFile=${pid_file}
62 | RuntimeDirectory=presto
63 | EnvironmentFile=/etc/default/presto
64 | ExecStart=${path_install}/bin/launcher start \$PRESTO_OPTS
65 | ExecStop=${path_install}/bin/launcher stop \$PRESTO_OPTS
66 | [Install]
67 | WantedBy=default.target
68 | " > /etc/systemd/system/presto.service
69 | 
70 | systemctl daemon-reload
71 | 
72 | rm "${path_file}"
73 | 
--------------------------------------------------------------------------------
/packer/presto/install-trino-cli.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -ex
3 | 
4 | log() {
5 |   echo "==> $(basename ${0}): ${1}"
6 | }
7 | 
8 | export version_trino=${PRESTO_VERSION}
9 | export path_install="/usr/local/bin"
10 | export path_file="trino-cli-${version_trino}-executable.jar"
11 | 
12 | log "Downloading Presto CLI ${version_trino}..."
13 | 
14 | wget -q -O ${path_file} "https://repo1.maven.org/maven2/io/trino/trino-cli/${version_trino}/trino-cli-${version_trino}-executable.jar"
15 | 
16 | log "Installing Presto CLI ${version_trino}..."
17 | # -sfn replaces any existing 'presto' entry: a plain -s would fail under 'set -e'
18 | install -d -o trino -g trino ${path_install}
19 | mv ${path_file} ${path_install}/trino
20 | chmod +x ${path_install}/trino
21 | ln -sfn ${path_install}/trino ${path_install}/presto
22 | 
--------------------------------------------------------------------------------
/packer/presto/install-trino.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -ex
3 | 
4 | log() {
5 |   echo "==> $(basename ${0}): ${1}"
6 | }
7 | 
8 | export version_trino=${PRESTO_VERSION}
9 | export path_install="/usr/local/trino-server-${version_trino}"
10 | export path_file="trino-server-${version_trino}.tar.gz"
11 | export pid_file="/var/run/trino/trino.pid"
12 | export user_trino='trino'
13 | 
14 | log "Downloading Presto ${version_trino}..."
15 | 
16 | wget -q -O "${path_file}" "https://repo1.maven.org/maven2/io/trino/trino-server/${version_trino}/trino-server-${version_trino}.tar.gz"
17 | 
18 | log "Installing Presto / Trino ${version_trino}..."
19 | useradd ${user_trino} || log "User [${user_trino}] already exists. Continuing..."
20 | 
21 | install -d -o ${user_trino} -g ${user_trino} "${path_install}"
22 | tar -xzf "${path_file}" -C /usr/local/
23 | install -d -o ${user_trino} -g ${user_trino} /etc/trino/
24 | install -d -o ${user_trino} -g ${user_trino} /etc/trino/catalog
25 | install -d -o ${user_trino} -g ${user_trino} /var/lib/trino/ # this is the data dir
26 | install -d -o ${user_trino} -g ${user_trino} /var/log/trino/
27 | mv ./presto-catalogs/* /etc/trino/catalog/
28 | rm -rf ./presto-catalogs
29 | rm -rf "$path_install/etc"
30 | ln -s /etc/trino/ "$path_install/etc"
31 | 
32 | log "Adding TRINO_HOME to system profile"
33 | /usr/bin/printf "export TRINO_HOME=\"${path_install}\"" >> /etc/profile.d/trino.sh
34 | 
35 | # /etc/default/trino is consumed by systemd as an EnvironmentFile, which may
36 | # contain only KEY=VALUE lines — unit-file sections do not belong in it.
37 | /usr/bin/printf "TRINO_OPTS= \
38 | --pid-file=${pid_file} \
39 | --node-config=/etc/trino/node.properties \
40 | --jvm-config=/etc/trino/jvm.config \
41 | --config=/etc/trino/config.properties \
42 | --launcher-log-file=/var/log/trino/launcher.log \
43 | --server-log-file=/var/log/trino/server.log \
44 | -Dhttp-server.log.path=/var/log/trino/http-request.log \
45 | -Dcatalog.config-dir=/etc/trino/catalog
46 | " > /etc/default/trino
47 | chown ${user_trino}:${user_trino} /etc/default/trino
48 | 
49 | # Overwriting (rather than appending to) /etc/default/trino above keeps this
50 | # installer idempotent when the image is rebuilt.
51 | log "Installing the Presto service"
52 | /usr/bin/printf "
53 | [Unit]
54 | Description=Presto Server
55 | Documentation=https://trino.io/docs/current/index.html
56 | After=network-online.target
57 | [Service]
58 | User=${user_trino}
59 | Restart=on-failure
60 | Type=forking
61 | PIDFile=${pid_file}
62 | RuntimeDirectory=trino
63 | EnvironmentFile=/etc/default/trino
64 | ExecStart=${path_install}/bin/launcher start \$TRINO_OPTS
65 | ExecStop=${path_install}/bin/launcher stop \$TRINO_OPTS
66 | [Install]
67 | WantedBy=default.target
68 | " > /etc/systemd/system/trino.service
69 | 
70 | systemctl daemon-reload
71 | 
72 | rm "${path_file}"
73 | 
--------------------------------------------------------------------------------
/packer/presto/update-machine.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Base image preparation for the Presto/Trino AMI: set the timezone, refresh
# and upgrade packages, and install common tooling used by later provisioners.
set -e

# Prefix log lines with the script name for easier tracing in packer output.
log() {
  echo "==> $(basename ${0}): ${1}"
}

export DEBIAN_FRONTEND=noninteractive

# Pin the machine to UTC so logs across the cluster share one timezone.
TZ=Etc/UTC
ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

log "Updating package index..."
sudo -E apt-get update -y -qq

log "Upgrading existing packages"
sudo -E apt-get upgrade -y -qq

# Fixed: a second, redundant `apt-get update` between upgrade and install was
# removed -- upgrading does not invalidate the package index.
log "Installing prerequisites..."
sudo -E apt-get install -y -qq --no-install-recommends \
    wget software-properties-common htop apt-transport-https python3 jq awscli vim

# Make "python" resolve to python3 for scripts that still invoke plain python.
sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 10

# Disable daily apt unattended updates so they cannot hold the dpkg lock while
# instances boot from this image.
echo 'APT::Periodic::Enable "0";' >> /etc/apt/apt.conf.d/10periodic
--------------------------------------------------------------------------------
/packer/prestoclients.json:
--------------------------------------------------------------------------------
1 | {
2 | "description": "Presto clients image",
3 | "builders": [
4 | {
5 | "type": "amazon-ebs",
6 | "ami_name": "prestoclients-{{isotime | clean_resource_name}}",
7 | "availability_zone": "{{user `aws_az`}}",
8 | "iam_instance_profile": "packer",
9 | "instance_type": "t2.medium",
10 | "region": "{{user `aws_region`}}",
11 | "run_tags": {
12 | "role": "packer"
13 | },
14 | "source_ami_filter": {
15 | "filters": {
16 | "virtualization-type": "hvm",
17 | "name": "presto-*",
18 | "root-device-type": "ebs"
19 | },
20 | "owners": ["self"],
21 | "most_recent": true
22 | },
23 | "launch_block_device_mappings": [
24 | {
25 | "device_name": "/dev/sda1",
26 | "volume_size": 15,
27 | "volume_type": "standard",
28 | "delete_on_termination": true
29 | }
30 | ],
31 | "ssh_timeout": "10m",
32 | "ssh_username": "ubuntu",
33 | "ssh_interface": "public_ip",
34 | "tags": {
35 | "ImageType": "prestoclients-packer-image"
36 | },
37 | "spot_price_auto_product": "Linux/UNIX (Amazon VPC)",
38 | "spot_price": "auto"
39 | }
40 | ],
41 | "provisioners": [
42 | {
43 | "type": "file",
44 | "source": "../assets/nginx.conf",
45 | "destination": "/tmp/clients-nginx.conf"
46 | },
47 | {
48 | "type": "file",
49 | "source": "../assets/zeppelin-interpreter.json",
50 | "destination": "/tmp/zeppelin-interpreter-partial.json"
51 | },
52 | {
53 | "type": "file",
54 | "source": "../assets/zeppelin-jdbc-0.11.0-SNAPSHOT.jar",
55 | "destination": "/tmp/zeppelin-jdbc-0.11.0-SNAPSHOT.jar"
56 | },
57 | {
58 | "type": "shell",
59 | "script": "prestoclients/update-machine.sh",
60 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'"
61 | },
62 | {
63 | "type": "shell",
64 | "script": "prestoclients/install-redash.sh",
65 | "environment_vars": [ "REDASH_VERSION={{user `redash_version`}}" ],
66 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'"
67 | },
68 | {
69 | "type": "shell",
70 | "script": "prestoclients/install-superset.sh",
71 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'"
72 | },
73 | {
74 | "type": "shell",
75 | "script": "prestoclients/install-zeppelin.sh",
76 | "execute_command": "echo '' | {{ .Vars }} sudo -E -S sh '{{ .Path }}'"
77 | }
78 | ]
79 | }
80 |
--------------------------------------------------------------------------------
/packer/prestoclients/install-redash.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Install Redash via the official setup script, then strip its bundled nginx
# so the image-wide nginx (provisioned separately) can front it instead.
# Fixed: the script previously had no error handling, so a failed clone or
# setup silently produced a broken image.
set -e

git clone https://github.com/getredash/setup /tmp/redash
cd /tmp/redash/

# NOTE(review): the REDASH_VERSION environment variable passed in by packer is
# ignored; the image version is pinned explicitly below -- confirm this is
# intentional. The commented export is kept for reference.
#export REDASH_BRANCH="v$REDASH_VERSION"
sed -i 's/\$LATEST_VERSION/10.1.0.b50633/g' setup.sh
bash ./setup.sh

cd /opt/redash
docker-compose down
sed -i '/^.*nginx:$/,$d' docker-compose.yml # patch out nginx service
docker-compose up -d
--------------------------------------------------------------------------------
/packer/prestoclients/install-superset.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Clone Apache Superset and stage its configuration: a Trino datasource file
# (the PRESTO_COORDINATOR_HOST placeholder is substituted by the instance
# bootstrap script) plus a minimal superset_config.py for running behind the
# HTTPS-terminating nginx proxy.
# Fixed: added error handling; previously a failed clone silently continued.
set -e

SUPERSET_CONFIG_PATH="/opt/superset/config"

git clone https://github.com/apache/superset.git /opt/superset

sudo mkdir -p $SUPERSET_CONFIG_PATH

# Quoted heredoc: contents are written verbatim, with no shell expansion.
cat <<'EOF' >$SUPERSET_CONFIG_PATH/presto-datasource.yaml
databases:
- database_name: trino
  expose_in_sqllab: true
  extra: "{\r\n \"metadata_params\": {},\r\n \"engine_params\": {},\r\n \"\
metadata_cache_timeout\": {},\r\n \"schemas_allowed_for_csv_upload\": []\r\n\
}\r\n"
  sqlalchemy_uri: trino://trino@PRESTO_COORDINATOR_HOST:8080
  tables: []
EOF

# Superset runs behind an HTTPS-terminating proxy; trust its X-Forwarded-* headers.
cat <<'EOF' >$SUPERSET_CONFIG_PATH/superset_config.py
ENABLE_PROXY_FIX = True
PREFERRED_URL_SCHEME = 'https'
EOF

cd /opt/superset
# Pre-pull images at build time so first boot of the instance is fast.
docker-compose -f docker-compose-non-dev.yml pull
25 |
--------------------------------------------------------------------------------
/packer/prestoclients/install-zeppelin.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Install Apache Zeppelin with a Trino-capable JDBC interpreter and register
# it as a systemd service.
set -e

cd /tmp
# Fixed: superseded releases are removed from www-eu.apache.org/dist, which
# made this download 404; fetch from the permanent Apache archive instead.
wget --no-verbose https://archive.apache.org/dist/zeppelin/zeppelin-0.10.0/zeppelin-0.10.0-bin-all.tgz
sudo tar xf zeppelin-*-bin-all.tgz -C /opt
rm zeppelin-0.10.0-bin-all.tgz
sudo mv /opt/zeppelin-*-bin-all /opt/zeppelin

# Interpreter settings uploaded to /tmp by the packer "file" provisioner
# (cwd is /tmp, so the relative path resolves there).
sudo cp zeppelin-interpreter-partial.json /opt/zeppelin/conf/zeppelin-interpreter-partial.json

# Replace the stock JDBC interpreter with a snapshot build; Trino support
# requires trino-jdbc 358+ (https://issues.apache.org/jira/browse/ZEPPELIN-5551).
sudo cp zeppelin-jdbc-0.11.0-SNAPSHOT.jar /opt/zeppelin/interpreter/jdbc/zeppelin-jdbc-0.10.0.jar
sudo wget --no-verbose https://repo1.maven.org/maven2/io/trino/trino-jdbc/370/trino-jdbc-370.jar -P /opt/zeppelin/interpreter/jdbc

# Dedicated service account with no login shell.
sudo useradd -d /opt/zeppelin -s /bin/false zeppelin

# NOTE(review): $JAVA8_HOME expands at image-build time (double quotes), not
# at service start -- it must be set in this shell's environment or
# zeppelin-env.sh will export an empty JAVA_HOME. Confirm install-java.sh
# exports it for this session.
/usr/bin/printf "
export JAVA_HOME=$JAVA8_HOME
export ZEPPELIN_PORT=9090
" >> /opt/zeppelin/conf/zeppelin-env.sh

/usr/bin/printf "
[Unit]
Description=Zeppelin service
After=syslog.target network.target

[Service]
Type=forking
ExecStart=/opt/zeppelin/bin/zeppelin-daemon.sh start
ExecStop=/opt/zeppelin/bin/zeppelin-daemon.sh stop
ExecReload=/opt/zeppelin/bin/zeppelin-daemon.sh reload
User=zeppelin
Group=zeppelin
Restart=always

[Install]
WantedBy=multi-user.target
" > /etc/systemd/system/zeppelin.service

sudo chown -R zeppelin:zeppelin /opt/zeppelin
sudo systemctl enable zeppelin
41 |
--------------------------------------------------------------------------------
/packer/prestoclients/update-machine.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Base image preparation for the clients AMI: upgrade packages, install build
# and runtime prerequisites, and pre-generate a self-signed certificate for
# the local nginx reverse proxy.
set -e

# Prefix log lines with the script name for easier tracing in packer output.
log() {
  echo "==> $(basename ${0}): ${1}"
}

export DEBIAN_FRONTEND=noninteractive

log "Updating package index..."
sudo -E apt-get update -qq

log "Upgrading existing packages"
sudo -E apt-get upgrade -y

log "Installing prerequisites..."
# NOTE(review): python-dev / python3.6-dev are release-specific package names
# (present on Ubuntu 18.04, absent on newer bases) -- confirm the source AMI.
sudo -E apt-get install -y -qq --no-install-recommends \
    build-essential libssl-dev libffi-dev \
    python-dev python3.6-dev python3-pip python3-venv \
    libsasl2-dev libldap2-dev \
    nginx jq xmlstarlet

log "Generating temporary certificates"
mkdir -p /opt/certs
cd /opt/certs
# Generate an encrypted key with a throwaway passphrase, then strip the
# passphrase so nginx can load the key unattended.
openssl genrsa -des3 -passout pass:xxxx -out keypair 2048
openssl rsa -passin pass:xxxx -in keypair -out server.key
rm keypair
# Seed file workaround: openssl req fails on some distros without ~/.rnd.
touch /home/ubuntu/.rnd
# Wildcard CN -- the real hostname is not known at image-build time.
openssl req -new -key server.key -out server.csr -subj "/CN=*"
openssl x509 -req -days 365 -in server.csr -signkey server.key -out server.crt
rm server.csr
cd -

# Enable on boot but keep nginx stopped while the image is finalized.
systemctl enable nginx.service
systemctl stop nginx.service

# Site config uploaded to /tmp by the packer "file" provisioner.
sudo mkdir -p /etc/nginx/conf.d
sudo mv /tmp/clients-nginx.conf /etc/nginx/conf.d/clients.conf
--------------------------------------------------------------------------------
/packer/variables.json:
--------------------------------------------------------------------------------
1 | {
2 | "presto_version": "375",
3 | "hive_version": "2.3.9",
4 | "hadoop_version": "2.10.1",
5 | "redash_version": "10.0.x",
6 |
7 | "aws_region": "us-east-1",
8 | "aws_az": "us-east-1a",
9 |
10 | "azure_client_id": "",
11 | "azure_client_secret": "",
12 | "azure_subscription_id": "",
13 | "azure_tenant_id": "",
14 |
15 | "gcp_zone": "us-central1-a",
16 | "gcp_project_id": "my-project",
17 | "gcp_account_file": ".gcp_account.json",
18 |
19 | "azure_location": "East US",
20 | "azure_resource_group_name": "packer-presto-images"
21 | }
22 |
--------------------------------------------------------------------------------
/terraform-aws/README.md:
--------------------------------------------------------------------------------
1 | # AWS deployment
2 |
3 | ## Create the AMIs with Packer
4 |
Go to the packer folder and see the README there. Once you have generated an
AMI for the presto instance and the prestoclient instance, return here and
continue with the next steps.
8 |
9 | ## Create key-pair
10 |
11 | ```bash
12 | aws ec2 create-key-pair --key-name presto --query 'KeyMaterial' --output text > presto.pem
13 | ```
14 |
15 | ## VPC
16 |
17 | The Presto cluster is going to be deployed in a single subnet, within a single VPC, in a single availability zone. The idea behind this decision is to reduce latency and costs associated with transferring data between networks and AZs. Since Presto is usually used for non-mission critical parts of a system, this is usually acceptable.
18 |
A load balancer is placed in front of the Presto cluster and another in
20 | front of the Presto clients. To create a load balancer you need to associate it
21 | with two subnets of the same VPC in distinct availability zones, even if one of
22 | the availability zones is never used.
23 |
Create a VPC or use an existing one. Make a list of at least two subnet IDs in
25 | distinct availability zones. The first subnet in the list will be used to deploy
26 | the Presto cluster and related resources. The subsequent subnets will be used to
27 | configure the load balancer.
28 |
29 | ## Configurations
30 |
31 | The most important variables specified in `variables.tf` are the following:
32 |
33 | * `aws_region` - the region in which to launch the cluster.
34 | * `key_name` - the name of the key pair for root SSH access to the EC2 instance. You can use the one created earlier.
* `subnet_ids` - the IDs of the subnets (within one VPC) to launch the cluster in, as described above.
36 | * `public_facing` - whether or not the coordinator node should be open to the internet. The default and the highly recommended value is `false`.
37 | * `additional_security_groups` - here you add IDs for security groups you want to add to the coordinator load balancer so your clients (e.g. Redash, applications, etc) can access the coordinator for querying.
38 | * `count_clients` - number of client nodes with Redash and Apache Superset installed, with configured admin user and datasource pointing to the Presto cluster. Default is `0`.
39 | * `clients_lb_subnets` - list of subnet IDs to attach to the clients load balancer. At least two subnets from different availability zones must be provided.
40 |
41 | We recommend using `tfvars` file to override all variables and configurations,
42 | see https://www.terraform.io/intro/getting-started/variables.html#from-a-file
43 | for more details.
44 |
45 | You must create at least one client to generate the credentials to access the Presto UI.
46 |
47 | You can launch workers and spot-workers (workers which run on spot-instances).
48 |
There are some more configurations to notice (like machine sizes, memory allocation, etc.) which we will document soon.
50 |
51 | ### Cluster topology
52 |
53 | Two modes of deployment are supported:
54 |
55 | * Production deployment with a single coordinator node and a bunch of worker nodes (number of workers is configurable)
56 | * Single node mode - one node acting as both coordinator and worker
57 |
58 | ## Launch the cluster with Terraform
59 |
60 | On first usage, you will need to execute `terraform init` to initialize the terraform providers used.
61 |
62 | To deploy the cluster, or apply any changes to an existing cluster deployed using this project, run:
63 |
64 | ```bash
65 | terraform plan
66 | terraform apply
67 | ```
68 |
69 | When terraform is done, you should see a lot of output ending with something like this:
70 |
71 | ```
72 | Apply complete! Resources: 11 added, 0 changed, 0 destroyed.
73 |
74 | The state of your infrastructure has been saved to the path
75 | below. This state is required to modify and destroy your
76 | infrastructure, so keep it safe. To inspect the complete state
77 | use the `terraform show` command.
78 |
79 | State path: terraform.tfstate
80 |
81 | Outputs:
82 |
83 | clients-admin-password = [
84 | "********",
85 | ]
86 | clients-lb-dns = [
87 | "example-presto-client-lb-1234567890.eu-west-1.elb.amazonaws.com",
88 | ]
89 | coordinator-lb-dns = [
90 | "example-presto-lb-1234567890.eu-west-1.elb.amazonaws.com",
91 | ]
92 | ```
93 |
94 | Note `coordinator-lb-dns` - that's your entry point to the Presto cluster. All
queries should go to that URL, and the Presto UI is accessible at that address as
96 | well (port 8080).
97 |
98 | To enter the UI you pass the `clients-admin-password` as the user name and don't
99 | set a password.
100 |
101 | ### Look around
102 |
103 | You can pull the list of instances by their state and role using aws-cli:
104 |
105 | ```bash
106 | aws ec2 describe-instances --filters Name=instance-state-name,Values=running
107 | aws ec2 describe-instances --filters Name=instance-state-name,Values=running,Name=tag:Role,Values=client
108 | ```
109 |
110 | To login to one of the instances:
111 |
112 | ```bash
113 | ssh -i presto.pem ubuntu@{public IP / DNS of the instance}
114 | ```
115 |
--------------------------------------------------------------------------------
/terraform-aws/clients.tf:
--------------------------------------------------------------------------------
1 | data "template_file" "client-userdata-script" {
2 | count = var.count_clients != "0" ? 1 : 0
3 | template = file("${path.module}/../assets/client_user_data.sh")
4 |
5 | vars = {
6 | presto_coordinator_host = aws_elb.coordinator-lb.dns_name
7 | coordinator_port = var.http_port
8 | admin_password = var.count_clients != "0" ? random_string.clients-admin-password[0].result : ""
9 | cert_pem = tls_self_signed_cert.presto-clients-cert.cert_pem
10 | key_pem = tls_private_key.presto-clients-private-key.private_key_pem
11 | }
12 | }
13 |
14 | resource "random_string" "clients-admin-password" {
15 | count = var.count_clients != "0" ? 1 : 0
16 | length = 16
17 | special = false
18 | }
19 |
20 | resource "tls_private_key" "presto-clients-private-key" {
21 | algorithm = "ECDSA"
22 | ecdsa_curve = "P384"
23 | }
24 |
25 | resource "tls_self_signed_cert" "presto-clients-cert" {
26 | key_algorithm = "ECDSA"
27 | private_key_pem = tls_private_key.presto-clients-private-key.private_key_pem
28 |
29 | subject {
30 | common_name = "*"
31 | }
32 |
33 | validity_period_hours = 48
34 |
35 | allowed_uses = [
36 | "key_encipherment",
37 | "digital_signature",
38 | "server_auth",
39 | ]
40 | }
41 |
42 | resource "aws_iam_server_certificate" "presto-clients-cert" {
43 | name_prefix = "presto-clients-cert"
44 | certificate_body = tls_self_signed_cert.presto-clients-cert.cert_pem
45 | private_key = tls_private_key.presto-clients-private-key.private_key_pem
46 |
47 | lifecycle {
48 | create_before_destroy = true
49 | }
50 | }
51 |
52 | # Redash LB configuration
53 | resource "aws_lb_target_group" "redash-https-clients" {
54 | name = "redash-https-clients-tg"
55 | port = "8500"
56 | protocol = "HTTPS"
57 | vpc_id = data.aws_subnet.main_subnet.vpc_id
58 |
59 | stickiness {
60 | type = "lb_cookie"
61 | }
62 |
63 | health_check {
64 | protocol = "HTTPS"
65 | matcher = "302"
66 | }
67 | }
68 |
69 | resource "aws_lb_listener" "redash-https-clients" {
70 | count = var.count_clients != "0" ? 1 : 0
71 | load_balancer_arn = aws_lb.clients-lb[0].arn
72 | port = "8500"
73 | protocol = "HTTPS"
74 | ssl_policy = "ELBSecurityPolicy-2016-08"
75 | certificate_arn = aws_iam_server_certificate.presto-clients-cert.arn
76 |
77 | default_action {
78 | type = "forward"
79 | target_group_arn = aws_lb_target_group.redash-https-clients.arn
80 | }
81 | }
82 |
83 | # Superset LB configuration
# Target group: client instances serve Superset over HTTPS on 8600 (via nginx).
resource "aws_lb_target_group" "superset-https-clients" {
  name = "superset-https-clients-tg"
  port = "8600"
  protocol = "HTTPS"
  vpc_id = data.aws_subnet.main_subnet.vpc_id

  # Pin each browser session to one client node.
  stickiness {
    type = "lb_cookie"
  }

  health_check {
    # Superset exposes an unauthenticated /health endpoint.
    path = "/health"
    protocol = "HTTPS"
  }
}

# HTTPS listener on 8600 forwarding to the Superset target group.
resource "aws_lb_listener" "superset-https-clients" {
  count = var.count_clients != "0" ? 1 : 0
  load_balancer_arn = aws_lb.clients-lb[0].arn
  port = "8600"
  protocol = "HTTPS"
  ssl_policy = "ELBSecurityPolicy-2016-08"
  certificate_arn = aws_iam_server_certificate.presto-clients-cert.arn

  default_action {
    type = "forward"
    target_group_arn = aws_lb_target_group.superset-https-clients.arn
  }
}
113 |
114 | # Zeppelin LB configuration
# Target group: client instances serve Zeppelin over HTTPS on 8700 (via nginx).
# NOTE(review): unlike the Redash/Superset groups this one defines no
# health_check block, so provider defaults apply -- confirm that is intended.
resource "aws_lb_target_group" "zeppelin-https-clients" {
  name = "zeppelin-https-clients-tg"
  port = "8700"
  protocol = "HTTPS"
  vpc_id = data.aws_subnet.main_subnet.vpc_id

  stickiness {
    type = "lb_cookie"
  }
}

# HTTPS listener on 8700 forwarding to the Zeppelin target group.
resource "aws_lb_listener" "zeppelin-https-clients" {
  count = var.count_clients != "0" ? 1 : 0
  load_balancer_arn = aws_lb.clients-lb[0].arn
  port = "8700"
  protocol = "HTTPS"
  ssl_policy = "ELBSecurityPolicy-2016-08"
  certificate_arn = aws_iam_server_certificate.presto-clients-cert.arn

  default_action {
    type = "forward"
    target_group_arn = aws_lb_target_group.zeppelin-https-clients.arn
  }
}

# Route Zeppelin's websocket endpoint (/ws) to the same target group so the
# notebook's live connection goes through the ALB as well.
resource "aws_lb_listener_rule" "zeppelin-https-clients-websockets-rule" {
  count = var.count_clients != "0" ? 1 : 0
  listener_arn = aws_lb_listener.zeppelin-https-clients[0].arn
  priority = 99

  action {
    type = "forward"
    target_group_arn = aws_lb_target_group.zeppelin-https-clients.arn
  }

  condition {
    path_pattern {
      values = ["/ws"]
    }
  }
}
156 |
157 | # Clients ALB
158 | resource "aws_lb" "clients-lb" {
159 | count = var.count_clients != "0" ? 1 : 0
160 | load_balancer_type = "application"
161 | internal = "false"
162 | name = format("%s-presto-client-lb", var.environment_name)
163 | security_groups = concat(
164 | [aws_security_group.presto-clients.id],
165 | var.additional_security_groups,
166 | )
167 |
168 | subnets = [for s in data.aws_subnet.subnets : s.id]
169 |
170 | idle_timeout = 400
171 |
172 | tags = {
173 | Name = format("%s-presto-client-lb", var.environment_name)
174 | }
175 | }
176 |
# Client instance template; boots from the prestoclients AMI and runs the
# rendered client bootstrap script.
resource "aws_launch_configuration" "clients" {
  count = var.count_clients != "0" ? 1 : 0
  name_prefix = "presto-${var.environment_name}-client"
  image_id = data.aws_ami.presto-clients.id
  instance_type = var.client_instance_type
  security_groups = [aws_security_group.presto-clients.id]
  user_data = data.template_file.client-userdata-script[0].rendered
  key_name = var.key_name
  associate_public_ip_address = false
  # An empty string means on-demand; a price switches to spot requests.
  spot_price = var.clients_use_spot == "true" ? var.client_spot_hourly_price : ""

  root_block_device {
    volume_size = 15 # GB
  }

  # Launch configurations are immutable; replace before destroy so the ASG
  # never points at a deleted LC.
  lifecycle {
    create_before_destroy = true
  }
}
196 |
# Auto-scaling group for the client nodes; registers each node with all three
# tool target groups.
resource "aws_autoscaling_group" "clients" {
  count = var.count_clients != "0" ? 1 : 0
  name = "presto-${var.environment_name}-client"
  min_size = "0"
  max_size = "999"
  desired_capacity = var.count_clients
  launch_configuration = aws_launch_configuration.clients[0].id
  vpc_zone_identifier = [for s in data.aws_subnet.subnets : s.id]
  target_group_arns = [
    aws_lb_target_group.redash-https-clients.arn,
    aws_lb_target_group.superset-https-clients.arn,
    aws_lb_target_group.zeppelin-https-clients.arn,
  ]

  tag {
    key = "Name"
    value = format("presto-%s-client", var.environment_name)
    propagate_at_launch = true
  }
  tag {
    key = "Environment"
    value = var.environment_name
    propagate_at_launch = true
  }
  tag {
    # Fixed: the value was "worker", which mislabeled client nodes and broke
    # the documented lookup `--filters Name=tag:Role,Values=client`.
    key = "Role"
    value = "client"
    propagate_at_launch = true
  }
  tag {
    key = "Spot"
    value = var.clients_use_spot
    propagate_at_launch = true
  }

  lifecycle {
    create_before_destroy = true
  }
}
236 |
237 |
--------------------------------------------------------------------------------
/terraform-aws/coordinator.tf:
--------------------------------------------------------------------------------
1 | data "template_file" "coordinator-userdata-script" {
2 | template = templatefile("${path.module}/../assets/user_data.sh", {
3 | cloud_provider = "aws"
4 | environment_name = var.environment_name
5 | aws_region = var.aws_region
6 | http_port = var.http_port
7 | mode_presto = var.count_workers == "0" && var.count_workers_spot == "0" ? "coordinator-worker" : "coordinator"
8 | heap_size = var.coordinator_heap_size
9 | query_max_memory_per_node = ceil(var.worker_heap_size * 0.4)
10 | query_max_total_memory_per_node = ceil(var.worker_heap_size * 0.6)
11 | query_max_memory = var.query_max_memory
12 | security_groups = aws_security_group.presto.id
13 | aws_access_key_id = var.aws_access_key_id
14 | aws_secret_access_key = var.aws_secret_access_key
15 | address_presto_coordinator = ""
16 | extra_worker_configs = var.extra_worker_configs
17 | additional_bootstrap_scripts = var.additional_bootstrap_scripts
18 |
19 | })
20 | }
21 |
# Coordinator instance template; runs the rendered bootstrap script on boot.
resource "aws_launch_configuration" "coordinator" {
  name_prefix = "presto-${var.environment_name}-coordinator"
  image_id = data.aws_ami.presto.id
  instance_type = var.coordinator_instance_type
  security_groups = concat([aws_security_group.presto.id], var.additional_security_groups)
  iam_instance_profile = aws_iam_instance_profile.presto.id
  # Public IP only when the cluster is explicitly public facing.
  associate_public_ip_address = var.public_facing
  user_data = data.template_file.coordinator-userdata-script.rendered
  key_name = var.key_name

  lifecycle {
    create_before_destroy = true
  }
}
36 |
# Single-coordinator ASG (max_size = 1): used for auto-replacement on
# failure rather than for scaling out.
resource "aws_autoscaling_group" "coordinator" {
  name = "presto-${var.environment_name}-coordinator"
  min_size = "0"
  max_size = "1"
  desired_capacity = "1"
  launch_configuration = aws_launch_configuration.coordinator.id
  # Pinned to the first subnet so workers share the coordinator's AZ.
  vpc_zone_identifier = [data.aws_subnet.main_subnet.id]

  load_balancers = [aws_elb.coordinator-lb.id]

  tag {
    key = "Name"
    value = format("presto-%s-coordinator", var.environment_name)
    propagate_at_launch = true
  }
  tag {
    key = "Environment"
    value = var.environment_name
    propagate_at_launch = true
  }
  tag {
    key = "Role"
    value = "coordinator"
    propagate_at_launch = true
  }

  lifecycle {
    create_before_destroy = true
  }
}
67 |
68 | resource "aws_elb" "coordinator-lb" {
69 | name = format("%s-presto-lb", var.environment_name)
70 | security_groups = concat(
71 | [aws_security_group.presto.id],
72 | var.additional_security_groups,
73 | )
74 | subnets = [for s in data.aws_subnet.subnets : s.id]
75 | internal = !var.public_facing
76 |
77 | cross_zone_load_balancing = false
78 | idle_timeout = 400
79 |
80 | listener {
81 | instance_port = var.http_port
82 | instance_protocol = "http"
83 | lb_port = var.http_port
84 | lb_protocol = "http"
85 | }
86 |
87 | health_check {
88 | healthy_threshold = 2
89 | unhealthy_threshold = 2
90 | timeout = 3
91 | target = "HTTP:8080/ui/login.html"
92 | interval = 6
93 | }
94 |
95 | tags = {
96 | Name = format("%s-presto-lb", var.environment_name)
97 | }
98 | }
99 |
100 |
--------------------------------------------------------------------------------
/terraform-aws/disks.tf:
--------------------------------------------------------------------------------
# Persistent 10 GB volume for the coordinator, created in the same AZ as the
# main subnet. NOTE(review): nothing in this file attaches it; presumably the
# bootstrap script locates and attaches it via the PrestoCoordinator tag --
# confirm against user_data.sh.
resource "aws_ebs_volume" "coordinator" {
  availability_zone = data.aws_subnet.main_subnet.availability_zone
  size = 10
  type = "gp2"
  encrypted = var.volume_encryption

  tags = {
    Name = "presto-${var.environment_name}-coordinator"
    Environment = var.environment_name
    PrestoCoordinator = true
  }
}
13 |
--------------------------------------------------------------------------------
/terraform-aws/iam.tf:
--------------------------------------------------------------------------------
1 | resource "aws_iam_role" "presto-service-role" {
2 | name_prefix = "presto-service-role"
3 |
4 | assume_role_policy = <