├── .gitignore
├── images
├── presto-adls-wasb-play.png
└── presto-azure-data-services-play.png
├── docker-compose.yml
├── .travis.yml
├── presto
├── files
│ ├── motd.txt
│ ├── adls-wasb-site.xml
│ ├── create-configs.sh
│ └── presto-start.sh
└── Dockerfile
├── hive
├── files
│ └── metastore-start.sh
├── README.md
└── Dockerfile
├── LICENSE
├── env.conf
├── README.md
└── azure-data-services.md
/.gitignore:
--------------------------------------------------------------------------------
1 | *private*
--------------------------------------------------------------------------------
/images/presto-adls-wasb-play.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenvlad/docker-presto-adls-wasb/HEAD/images/presto-adls-wasb-play.png
--------------------------------------------------------------------------------
/images/presto-azure-data-services-play.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/arsenvlad/docker-presto-adls-wasb/HEAD/images/presto-azure-data-services-play.png
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: "3"
2 |
3 | services:
4 |   hive:                        # Hive metastore container
5 |     build: hive                # build context is the ./hive directory
6 |     env_file:
7 |       - ./env.conf.private     # presumably a filled-in copy of env.conf; kept out of git by the *private* ignore rule
8 |
9 |   presto:                      # Presto container
10 |     build: presto              # build context is the ./presto directory
11 |     depends_on:
12 |       - hive                   # controls start order only; does not wait for the metastore to be ready
13 |     env_file:
14 |       - ./env.conf.private     # same credentials file as the hive service
15 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | sudo: required
2 |
3 | services:
4 | - docker  # the build needs the Docker service
5 | before_install:
6 | - docker --version  # print the Docker version into the build log
7 | script:
8 | - cd presto
9 | - docker build -t presto-adls-wasb .  # build the image from the presto directory
10 | - cd ../hive
11 | - docker build -t hive-adls-wasb .  # build the image from the hive directory
12 | - docker images  # list local images in the build log
13 | - exit 0  # NOTE(review): ends the script with status 0; Travis docs advise against `exit` in build steps - confirm this is intended
14 |
15 |
--------------------------------------------------------------------------------
/presto/files/motd.txt:
--------------------------------------------------------------------------------
1 |
2 | ========================================================
3 | Presto Example with ADLS and WASB Docker Container
4 | ========================================================
5 |
6 |
7 | 1. /opt/presto/etc/catalog/hive.properties sets the Hive Server metastore URL
8 | 2. /opt/presto/etc/catalog/adls-wasb-site.xml configures the ADLS and WASB credentials
9 | 3. Presto is configured on port 8080 which can be changed in /opt/presto/etc/config.properties file
10 | 4. Additional catalogs for Azure CosmosDB, Azure SQL Database, Azure MySQL, and Azure PostgreSQL are also optionally configured
11 | 5. Presto is started with /opt/presto/bin/launcher run
12 |
13 | ========================================================
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/hive/files/metastore-start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Starts Hadoop, then runs the Hive metastore configured for Azure Blob Storage
4 | # (WASB) and Azure Data Lake Store (ADLS).
5 | #
6 | # Environment variables read:
7 | #   AZURE_STORAGE_ACCOUNT_NAME, AZURE_STORAGE_ACCOUNT_KEY - Blob Storage account name and key
8 | #   ADLS_TENANT_ID, ADLS_CLIENT_ID, ADLS_CLIENT_SECRET    - ADLS OAuth2 client credentials
9 |
10 | # Start Hadoop
11 | /etc/bootstrap.sh
12 |
13 | # Start the metastore. It runs in the foreground with DEBUG logging to the console.
14 | # Every --hiveconf value is quoted so that credentials containing whitespace or
15 | # glob characters are passed to hive as a single, unmodified argument.
16 | hive --service metastore \
17 |   --hiveconf hive.root.logger=DEBUG,console \
18 |   --hiveconf "fs.azure.account.key.${AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net=${AZURE_STORAGE_ACCOUNT_KEY}" \
19 |   --hiveconf fs.adl.impl=org.apache.hadoop.fs.adl.AdlFileSystem \
20 |   --hiveconf fs.AbstractFileSystem.adl.impl=org.apache.hadoop.fs.adl.Adl \
21 |   --hiveconf dfs.adls.oauth2.access.token.provider.type=ClientCredential \
22 |   --hiveconf "dfs.adls.oauth2.client.id=${ADLS_CLIENT_ID}" \
23 |   --hiveconf "dfs.adls.oauth2.credential=${ADLS_CLIENT_SECRET}" \
24 |   --hiveconf "dfs.adls.oauth2.refresh.url=https://login.microsoftonline.com/${ADLS_TENANT_ID}/oauth2/token"
25 |
26 | # Spin wait: reached only after the metastore command above returns; keeps the
27 | # script (and therefore the container's main process) alive.
28 | while true; do sleep 1000; done
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Arsen Vladimirskiy
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/env.conf:
--------------------------------------------------------------------------------
1 | # Azure Blob Storage Key
2 | AZURE_STORAGE_ACCOUNT_NAME=
3 | AZURE_STORAGE_ACCOUNT_KEY=
4 |
5 | # Azure Data Lake Store (ADLS) Service Principal Credentials
6 | ADLS_ACCOUNT_NAME=
7 | ADLS_TENANT_ID=
8 | ADLS_CLIENT_ID=
9 | ADLS_CLIENT_SECRET=
10 |
11 | # MongoDB on Azure CosmosDB (SSL must be enabled)
12 | MONGODB_SEEDS=YOURAZURECOSMOSDBACCOUNT.documents.azure.com:10255
13 | MONGODB_CREDENTIALS=USERNAME:PASSWORD@COLLECTION
14 | MONGODB_SSL_ENABLED=true
15 |
16 | # SQL Server
17 | SQLSERVER_JDBC_URL=jdbc:sqlserver://YOURAZURESQLDBACCOUNT.database.windows.net:1433;database=YOURDATABASENAME;encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.database.windows.net;loginTimeout=30;
18 | SQLSERVER_USERNAME=
19 | SQLSERVER_PASSWORD=
20 |
21 | # MySQL
22 | MYSQL_JDBC_URL=jdbc:mysql://YOURAZUREMYSQLACCOUNT.mysql.database.azure.com:3306/?verifyServerCertificate=true&useSSL=true&requireSSL=false
23 | MYSQL_USERNAME=USERNAME@YOURAZUREMYSQLACCOUNT
24 | MYSQL_PASSWORD=
25 |
26 | # PostgreSQL
27 | POSTGRESQL_JDBC_URL=jdbc:postgresql://YOURAZUREPOSTGRESQLACCOUNT.postgres.database.azure.com:5432/YOURDATABASENAME?ssl=true
28 | POSTGRESQL_USERNAME=USERNAME@YOURAZUREPOSTGRESQLACCOUNT
29 | POSTGRESQL_PASSWORD=
30 |
--------------------------------------------------------------------------------
/presto/files/adls-wasb-site.xml:
--------------------------------------------------------------------------------
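1 | <configuration>
2 |   <property>
3 |     <name>fs.adl.impl</name>
4 |     <value>org.apache.hadoop.fs.adl.AdlFileSystem</value>
5 |   </property>
6 |
7 |   <property>
8 |     <name>fs.AbstractFileSystem.adl.impl</name>
9 |     <value>org.apache.hadoop.fs.adl.Adl</value>
10 |   </property>
11 |
12 |   <property>
13 |     <name>dfs.adls.oauth2.access.token.provider.type</name>
14 |     <value>ClientCredential</value>
15 |   </property>
16 |
17 |   <property>
18 |     <name>dfs.adls.oauth2.refresh.url</name>
19 |     <value>https://login.microsoftonline.com/ADLS_TENANT_ID/oauth2/token</value>
20 |   </property>
21 |
22 |   <property>
23 |     <name>dfs.adls.oauth2.client.id</name>
24 |     <value>ADLS_CLIENT_ID</value>
25 |   </property>
26 |
27 |   <property>
28 |     <name>dfs.adls.oauth2.credential</name>
29 |     <value>ADLS_CLIENT_SECRET</value>
30 |   </property>
31 |
32 |
33 |   <property>
34 |     <name>fs.azure.account.key.AZURE_STORAGE_ACCOUNT_NAME.blob.core.windows.net</name>
35 |     <value>AZURE_STORAGE_ACCOUNT_KEY</value>
36 |   </property>
37 |
38 | </configuration>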
--------------------------------------------------------------------------------
/presto/files/create-configs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Node properties
4 | # "__uuidgen__" is a placeholder to be replaced when container starts.
5 | # Using "node.id=(uuidgen)" here would cause a duplicate node ID problem in a multi-node setup
6 | cat > $PRESTO_DIR/etc/node.properties < $PRESTO_DIR/etc/jvm.config < $PRESTO_DIR/etc/config.properties < $PRESTO_DIR/etc/log.properties < $PRESTO_DIR/etc/catalog/hive.properties < $PRESTO_DIR/etc/catalog/tpch.properties < $PRESTO_DIR/etc/catalog/cosmosdb.properties < $PRESTO_DIR/etc/catalog/azuresql.properties < $PRESTO_DIR/etc/catalog/mysql.properties < $PRESTO_DIR/etc/catalog/postgresql.properties <> /etc/bash.bashrc
55 |
56 | WORKDIR $PRESTO_DIR
57 |
58 | CMD /etc/presto-start.sh
59 |
60 |
61 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Example of a single node Presto with Azure Data Lake Store (ADLS) and Azure Blob Storage (WASB)
2 |
3 | Click to watch video
4 | [![Presto with ADLS and WASB video](images/presto-adls-wasb-play.png)](https://youtu.be/hflThIJdkrI)
5 |
6 | ## Start local Hive metastore and Presto containers
7 |
8 | Clone this repo
9 |
10 | ```git clone https://github.com/arsenvlad/docker-presto-adls-wasb```
11 |
12 | Run Hive and Presto containers using config specified in *env.conf.private*
13 |
14 | ```docker-compose up --build```
15 |
16 | In a separate terminal window, list currently running containers
17 |
18 | ```docker ps```
19 |
20 | ## Connect to Hive bash
21 |
22 | In a separate terminal window, open interactive tty bash on the Hive container
23 |
24 | ```docker exec -it dockerprestoadlswasb_hive_1 bash```
25 |
26 | In the Hive container bash session, open Hive CLI pointing to itself as an external metastore. If you get an error saying "Name node is in safe mode", wait for a few minutes and try again.
27 |
28 | ```hive --hiveconf hive.metastore.uris=thrift://localhost:9083```
29 |
30 | Create table using Azure Storage Blobs (change the storage account name and container name to yours)
31 |
32 | ```create table wasbtable1 (id int, name varchar(255)) row format delimited fields terminated by ',' stored as textfile location 'wasb://test-hive@avdatarepo1.blob.core.windows.net/wasbtable1';```
33 |
34 | Create table using Azure Data Lake Store (change the ADLS account name to yours)
35 |
36 | ```create table adltable1 (id int, name varchar(255)) row format delimited fields terminated by ',' stored as textfile location 'adl://avdatalake1.azuredatalakestore.net/adltable1';```
37 |
38 | Confirm you can see the tables
39 | ```show tables;```
40 |
41 | ## Connect to Presto bash
42 |
43 | In a separate terminal window, open interactive tty bash on the Presto container
44 |
45 | ```docker exec -it dockerprestoadlswasb_presto_1 bash```
46 |
47 | Presto is configured as a single node with the Hive connector, as described in [/etc/motd](presto/files/motd.txt)
48 |
49 | Use Presto CLI to connect to the running Presto server
50 |
51 | ```/opt/presto/presto --server http://localhost:8080```
52 |
53 | List schemas in the Hive catalog
54 |
55 | ```show schemas from hive;```
56 |
57 | List tables in the default schema of the Hive catalog
58 |
59 | ```show tables from hive.default;```
60 |
61 | Insert data into the tables
62 |
63 | ```
64 | insert into hive.default.wasbtable1 (id, name) values (1,'1');
65 | insert into hive.default.wasbtable1 (id, name) select id, name from hive.default.wasbtable1 union all select id, name from hive.default.wasbtable1 union all select id, name from hive.default.wasbtable1;
66 |
67 | insert into hive.default.adltable1 (id, name) values (1,'1');
68 | insert into hive.default.adltable1 (id, name) select id, name from hive.default.adltable1 union all select id, name from hive.default.adltable1 union all select id, name from hive.default.adltable1;
69 | ```
70 |
71 | Select from the table
72 |
73 | ```select * from hive.default.adltable1;```
74 |
75 | ## When using with HDInsight
76 |
77 | NOTE: To access Azure HDInsight Hive Thrift Service your Docker host VM must be within the same network.
78 |
79 | To find the URLs of the HDInsight Hive Thrift Service (i.e. the value of hive.metastore.uris), SSH into the HDInsight cluster and run this grep command:
80 |
81 | ```echo $(grep -n1 "hive.metastore.uri" /etc/hive/conf/hive-site.xml | grep -o "<value>.*/value>" | sed 's:<value>::g' | sed 's:</value>::g')```
82 |
83 | ## Presto with Azure Data Services
84 |
85 | See [azure-data-services.md](azure-data-services.md) for an example showing how to configure Presto connectors to Azure Data Services to query and join data from Azure CosmosDB (using MongoDB API), Azure SQL Database, Azure MySQL, Azure PostgreSQL and store the joined results in Azure Blob Storage.
86 |
87 | [![Presto with Azure Data Services video](images/presto-azure-data-services-play.png)](https://youtu.be/XDfCK6Ejz-A)
88 |
89 |
--------------------------------------------------------------------------------
/azure-data-services.md:
--------------------------------------------------------------------------------
1 | [![Presto with Azure Data Services video](images/presto-azure-data-services-play.png)](https://youtu.be/XDfCK6Ejz-A)
2 |
3 | # Configuring Presto with Azure Data Services
4 |
5 | ## Azure CosmosDB with MongoDB API
6 |
7 | Create sample data using mongo.exe command line client
8 |
9 | ```
10 | mongo.exe YOURAZURECOSMOSDBACCOUNT.documents.azure.com:10255 -u USERNAME -p PASSWORD --ssl --sslAllowInvalidCertificates
11 | ```
12 |
13 | ```
14 | db.movies.insert({"id":100, "name": "The Shawshank Redemption", "year": 1994, "length": 120, "contentRating": "R"})
15 | db.movies.insert({"id":200, "name": "The Godfather", "year": 1972, "length": 175, "contentRating": "R"})
16 | db.movies.insert({"id":300, "name": "The Dark Knight", "year": 2008, "length": 120, "contentRating": "PG-13"})
17 |
18 | db.movies.find()
19 | ```
20 |
21 | ## Azure SQL Database
22 |
23 | Create sample data using Azure SQL Database Query Editor (preview) in portal.azure.com (under Azure SQL Database Tools)
24 |
25 | ```
26 | sp_tables;
27 | create table users (userid bigint, name varchar(255));
28 | insert into users (userid, name) values (1, 'John');
29 | insert into users (userid, name) values (2, 'Mary');
30 | insert into users (userid, name) values (3, 'John');
31 | insert into users (userid, name) values (4, 'Mike');
32 | insert into users (userid, name) values (5, 'Kate');
33 | insert into users (userid, name) values (7, 'Elizabeth');
34 | select * from users;
35 | ```
36 |
37 | ## Azure MySQL Database
38 |
39 | Create sample data using mysql command line client
40 |
41 | ```
42 | mysql -h YOURAZUREMYSQLACCOUNT.mysql.database.azure.com -D DATABASE -u USERNAME -p --ssl-mode Preferred
43 | ```
44 |
45 | ```
46 | create table tickets (userid bigint, movieid bigint, price decimal, purchase_date date);
47 | insert into tickets (userid, movieid, price, purchase_date) values (1, 100, 8.00, STR_TO_DATE('2016-06-01', '%Y-%m-%d'));
48 | insert into tickets (userid, movieid, price, purchase_date) values (1, 200, 9.00, STR_TO_DATE('2016-06-02', '%Y-%m-%d'));
49 | insert into tickets (userid, movieid, price, purchase_date) values (2, 100, 12.00, STR_TO_DATE('2016-06-03', '%Y-%m-%d'));
50 | insert into tickets (userid, movieid, price, purchase_date) values (2, 200, 9.00, STR_TO_DATE('2016-06-02', '%Y-%m-%d'));
51 | select * from tickets;
52 | ```
53 |
54 | ## Azure PostgreSQL Database
55 |
56 | Create sample data using the psql command line client
57 |
58 | ```
59 | psql -h YOURAZUREPOSTGRESQLACCOUNT.postgres.database.azure.com -d postgres -U USERNAME "sslmode=require dbname=DATABASE"
60 | ```
61 |
62 | ```
63 | create table user_movie_ratings (userid bigint, movieid bigint, rating int);
64 | insert into user_movie_ratings (userid, movieid, rating) values (1, 100, 5);
65 | insert into user_movie_ratings (userid, movieid, rating) values (1, 200, 3);
66 | insert into user_movie_ratings (userid, movieid, rating) values (2, 100, 2);
67 | insert into user_movie_ratings (userid, movieid, rating) values (2, 200, 4);
68 | select * from user_movie_ratings;
69 | ```
70 |
71 | ## Create Hive table located on WASB
72 |
73 | Use Hive CLI from within the container
74 |
75 | ```
76 | hive --hiveconf hive.metastore.uris=thrift://localhost:9083
77 | ```
78 |
79 | ```
80 | create table user_movie_rating_ticket (userid bigint, user_name varchar(255), rating int, price decimal, purchase_date date, movie_name varchar(255), year bigint, length bigint, content_rating varchar(255))
81 | row format delimited fields terminated by ',' stored as textfile location 'wasb://test-hive@avdatarepo1.blob.core.windows.net/user_movie_rating_ticket';
82 | ```
83 |
84 | ## Presto Queries
85 |
86 | Using Presto CLI from within the container
87 |
88 | ```
89 | ./presto --server localhost:8080
90 | ```
91 |
92 | ```
93 | select * from cosmosdb.mydb1.movies;
94 | select * from azuresql.dbo.users;
95 | select * from mysql.mydb1.tickets;
96 | select * from postgresql.public.user_movie_ratings;
97 | ```
98 |
99 | Join data from Azure CosmosDB with Azure SQL Database, Azure MySQL, and Azure PostgreSQL
100 |
101 | ```
102 | select u.userid, u.name, r.rating, t.price, t.purchase_date, m.name, m.year, m.length, m.contentRating from azuresql.dbo.users u inner join mysql.mydb1.tickets t on u.userid = t.userid inner join cosmosdb.mydb1.movies m on m.id = t.movieid
103 | inner join postgresql.public.user_movie_ratings r on r.movieid = t.movieid and r.userid = u.userid order by u.userid, m.id;
104 | ```
105 |
106 | Insert results into WASB table
107 |
108 | ```
109 | insert into hive.default.user_movie_rating_ticket
110 | select u.userid, u.name, r.rating, cast(t.price as decimal(10,0)), t.purchase_date, cast(m.name as varchar(255)), m.year, m.length, cast(m.contentRating as varchar(255)) from azuresql.dbo.users u inner join mysql.mydb1.tickets t on u.userid = t.userid inner join cosmosdb.mydb1.movies m on m.id = t.movieid
111 | inner join postgresql.public.user_movie_ratings r on r.movieid = t.movieid and r.userid = u.userid order by u.userid, m.id;
112 | ```
113 |
114 | ```
115 | select * from hive.default.user_movie_rating_ticket;
116 | ```
117 |
--------------------------------------------------------------------------------