├── .gitignore ├── images ├── presto-adls-wasb-play.png └── presto-azure-data-services-play.png ├── docker-compose.yml ├── .travis.yml ├── presto ├── files │ ├── motd.txt │ ├── adls-wasb-site.xml │ ├── create-configs.sh │ └── presto-start.sh └── Dockerfile ├── hive ├── files │ └── metastore-start.sh ├── README.md └── Dockerfile ├── LICENSE ├── env.conf ├── README.md └── azure-data-services.md /.gitignore: -------------------------------------------------------------------------------- 1 | *private* -------------------------------------------------------------------------------- /images/presto-adls-wasb-play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenvlad/docker-presto-adls-wasb/HEAD/images/presto-adls-wasb-play.png -------------------------------------------------------------------------------- /images/presto-azure-data-services-play.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arsenvlad/docker-presto-adls-wasb/HEAD/images/presto-azure-data-services-play.png -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | hive: 5 | build: hive 6 | env_file: 7 | - ./env.conf.private 8 | 9 | presto: 10 | build: presto 11 | depends_on: 12 | - hive 13 | env_file: 14 | - ./env.conf.private 15 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: required 2 | 3 | services: 4 | - docker 5 | before_install: 6 | - docker --version 7 | script: 8 | - cd presto 9 | - docker build -t presto-adls-wasb . 10 | - cd ../hive 11 | - docker build -t hive-adls-wasb . 
12 | - docker images 13 | - exit 0 14 | 15 | -------------------------------------------------------------------------------- /presto/files/motd.txt: -------------------------------------------------------------------------------- 1 | 2 | ======================================================== 3 | Presto Example with ADLS and WASB Docker Container 4 | ======================================================== 5 | 6 | 7 | 1. /opt/presto/etc/catalog/hive.properties sets the Hive Server metastore URL 8 | 2. /opt/presto/etc/catalog/adls-wasb-site.xml configures the ADLS and WASB credentials 9 | 3. Presto is configured on port 8080 which can be changed in the /opt/presto/etc/config.properties file 10 | 4. Additional catalogs for Azure CosmosDB, Azure SQL Database, Azure MySQL, and Azure PostgreSQL are also optionally configured 11 | 5. Presto is started with /opt/presto/bin/launcher run 12 | 13 | ======================================================== 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /hive/files/metastore-start.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Start Hadoop 4 | /etc/bootstrap.sh 5 | 6 | # Start the metastore 7 | hive --service metastore --hiveconf hive.root.logger=DEBUG,console --hiveconf fs.azure.account.key.${AZURE_STORAGE_ACCOUNT_NAME}.blob.core.windows.net=${AZURE_STORAGE_ACCOUNT_KEY} --hiveconf fs.adl.impl=org.apache.hadoop.fs.adl.AdlFileSystem --hiveconf fs.AbstractFileSystem.adl.impl=org.apache.hadoop.fs.adl.Adl --hiveconf dfs.adls.oauth2.access.token.provider.type=ClientCredential --hiveconf dfs.adls.oauth2.client.id=${ADLS_CLIENT_ID} --hiveconf dfs.adls.oauth2.credential=${ADLS_CLIENT_SECRET} --hiveconf dfs.adls.oauth2.refresh.url=https://login.microsoftonline.com/${ADLS_TENANT_ID}/oauth2/token 8 | 9 | # Spin wait 10 | while true; do sleep 1000; done -------------------------------------------------------------------------------- 
/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Arsen Vladimirskiy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /env.conf: -------------------------------------------------------------------------------- 1 | # Azure Blob Storage Key 2 | AZURE_STORAGE_ACCOUNT_NAME= 3 | AZURE_STORAGE_ACCOUNT_KEY= 4 | 5 | # Azure Data Lake Store (ADLS) Service Principal Credentials 6 | ADLS_ACCOUNT_NAME= 7 | ADLS_TENANT_ID= 8 | ADLS_CLIENT_ID= 9 | ADLS_CLIENT_SECRET= 10 | 11 | # MongoDB on Azure CosmosDB (SSL must be enabled) 12 | MONGODB_SEEDS=YOURAZURECOSMOSDBACCOUNT.documents.azure.com:10255 13 | MONGODB_CREDENTIALS=USERNAME:PASSWORD@COLLECTION 14 | MONGODB_SSL_ENABLED=true 15 | 16 | # SQL Server 17 | SQLSERVER_JDBC_URL=jdbc:sqlserver://YOURAZURESQLDBACCOUNT.database.windows.net:1433;database=YOURDATABASENAME;encrypt=true;trustServerCertificate=false;hostNameInCertificate=*.database.windows.net;loginTimeout=30; 18 | SQLSERVER_USERNAME= 19 | SQLSERVER_PASSWORD= 20 | 21 | # MySQL 22 | MYSQL_JDBC_URL=jdbc:mysql://YOURAZUREMYSQLACCOUNT.mysql.database.azure.com:3306/?verifyServerCertificate=true&useSSL=true&requireSSL=false 23 | MYSQL_USERNAME=USERNAME@YOURAZUREMYSQLACCOUNT 24 | MYSQL_PASSWORD= 25 | 26 | # PostgreSQL 27 | POSTGRESQL_JDBC_URL=jdbc:postgresql://YOURAZUREPOSTRESQLACCOUNT.postgres.database.azure.com:5432/YOURDATABASENAME?ssl=true 28 | POSTGRESQL_USERNAME=USERNAME@YOURAZUREPOSTRESQLACCOUNT 29 | POSTGRESQL_PASSWORD= 30 | -------------------------------------------------------------------------------- /presto/files/adls-wasb-site.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | fs.adl.impl 4 | org.apache.hadoop.fs.adl.AdlFileSystem 5 | 6 | 7 | 8 | fs.AbstractFileSystem.adl.impl 9 | org.apache.hadoop.fs.adl.Adl 10 | 11 | 12 | 13 | dfs.adls.oauth2.access.token.provider.type 14 | ClientCredential 15 | 16 | 17 | 18 | dfs.adls.oauth2.refresh.url 19 | https://login.microsoftonline.com/ADLS_TENANT_ID/oauth2/token 20 | 21 | 22 | 23 | dfs.adls.oauth2.client.id 24 
| ADLS_CLIENT_ID 25 | 26 | 27 | 28 | dfs.adls.oauth2.credential 29 | ADLS_CLIENT_SECRET 30 | 31 | 32 | 33 | 34 | fs.azure.account.key.AZURE_STORAGE_ACCOUNT_NAME.blob.core.windows.net 35 | AZURE_STORAGE_ACCOUNT_KEY 36 | 37 | 38 | -------------------------------------------------------------------------------- /presto/files/create-configs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Node properties 4 | # "__uuidgen__" is a placeholder to be replaced when container starts. 5 | # Using "node.id=(uuidgen)"" here would cause duplicate node ID problem in multi-node setup 6 | cat > $PRESTO_DIR/etc/node.properties < $PRESTO_DIR/etc/jvm.config < $PRESTO_DIR/etc/config.properties < $PRESTO_DIR/etc/log.properties < $PRESTO_DIR/etc/catalog/hive.properties < $PRESTO_DIR/etc/catalog/tpch.properties < $PRESTO_DIR/etc/catalog/cosmosdb.properties < $PRESTO_DIR/etc/catalog/azuresql.properties < $PRESTO_DIR/etc/catalog/mysql.properties < $PRESTO_DIR/etc/catalog/postgresql.properties <> /etc/bash.bashrc 55 | 56 | WORKDIR $PRESTO_DIR 57 | 58 | CMD /etc/presto-start.sh 59 | 60 | 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Example of a single node Presto with Azure Data Lake Store (ADLS) and Azure Blob Storage (WASB) 2 | 3 | Click to watch video 4 | [![Presto with ADLS and WASB](images/presto-adls-wasb-play.png)](https://youtu.be/hflThIJdkrI) 5 | 6 | ## Start local Hive metastore and Presto containers 7 | 8 | Clone this repo 9 | 10 | ```git clone https://github.com/arsenvlad/docker-presto-adls-wasb``` 11 | 12 | Run Hive and Presto containers using config specified in *env.conf.private* 13 | 14 | ```docker-compose up --build``` 15 | 16 | In a separate terminal window, list currently running containers 17 | 18 | ```docker ps``` 19 | 20 | ## Connect to Hive bash 21 | 22 | In a separate 
terminal window, open interactive tty bash on the Hive container 23 | 24 | ```docker exec -it dockerprestoadlswasb_hive_1 bash``` 25 | 26 | In the Hive container bash session, open Hive CLI pointing to itself as an external metastore. If you get an error saying "Name node is in safe mode", wait for a few minutes and try again. 27 | 28 | ```hive --hiveconf hive.metastore.uris=thrift://localhost:9083``` 29 | 30 | Create table using Azure Storage Blobs (change the storage account name and container name to yours) 31 | 32 | ```create table wasbtable1 (id int, name varchar(255)) row format delimited fields terminated by ',' stored as textfile location 'wasb://test-hive@avdatarepo1.blob.core.windows.net/wasbtable1';``` 33 | 34 | Create table using Azure Data Lake Store (change the ADLS account name to yours) 35 | 36 | ```create table adltable1 (id int, name varchar(255)) row format delimited fields terminated by ',' stored as textfile location 'adl://avdatalake1.azuredatalakestore.net/adltable1';``` 37 | 38 | Confirm you can see the tables 39 | ```show tables;``` 40 | 41 | ## Connect to Presto bash 42 | 43 | In a separate terminal window, open interactive tty bash on the Presto container 44 | 45 | ```docker exec -it dockerprestoadlswasb_presto_1 bash``` 46 | 47 | Presto is configured with a single node with Hive connector as described in [/etc/motd](presto/files/motd.txt) 48 | 49 | Use Presto CLI to connect to the running Presto server 50 | 51 | ```/opt/presto/presto --server http://localhost:8080``` 52 | 53 | List schemas in the Hive catalog 54 | 55 | ```show schemas from hive;``` 56 | 57 | List tables in the default schema of the Hive catalog 58 | 59 | ```show tables from hive.default;``` 60 | 61 | Insert data into the tables 62 | 63 | ``` 64 | insert into hive.default.wasbtable1 (id, name) values (1,'1'); 65 | insert into hive.default.wasbtable1 (id, name) select id, name from hive.default.wasbtable1 union all select id, name from hive.default.wasbtable1 union all select id, name from 
hive.default.wasbtable1; 66 | 67 | insert into hive.default.adltable1 (id, name) values (1,'1'); 68 | insert into hive.default.adltable1 (id, name) select id, name from hive.default.adltable1 union all select id, name from hive.default.adltable1 union all select id, name from hive.default.adltable1; 69 | ``` 70 | 71 | Select from the table 72 | 73 | ```select * from hive.default.adltable1;``` 74 | 75 | ## When using with HDInsight 76 | 77 | NOTE: To access Azure HDInsight Hive Thrift Service your Docker host VM must be within the same network. 78 | 79 | To find the URLs of the HDInsight Hive Thrift Service (i.e. hive.metastore.uri), SSH into the HDInsight cluster and run this grep command: 80 | 81 | ```echo $(grep -n1 "hive.metastore.uri" /etc/hive/conf/hive-site.xml | grep -o ".*/value>" | sed 's:::g' | sed 's:::g')``` 82 | 83 | ## Presto with Azure Data Services 84 | 85 | See [azure-data-services.md](azure-data-services.md) for an example showing how to configure Presto connectors to Azure Data Services to query and join data from Azure CosmosDB (using MongoDB API), Azure SQL Database, Azure MySQL, Azure PostgreSQL and store the joined results in Azure Blob Storage. 
86 | 87 | [![Azure CosmosDB with MongoDB API, Azure SQL Database, Azure MySQL, Azure PostgreSQL](images/presto-azure-data-services-play.png)](https://youtu.be/XDfCK6Ejz-A) 88 | 89 | -------------------------------------------------------------------------------- /azure-data-services.md: -------------------------------------------------------------------------------- 1 | [![Azure CosmosDB with MongoDB API, Azure SQL Database, Azure MySQL, Azure PostgreSQL](images/presto-azure-data-services-play.png)](https://youtu.be/XDfCK6Ejz-A) 2 | 3 | # Configuring Presto with Azure Data Services 4 | 5 | ## Azure CosmosDB with MongoDB API 6 | 7 | Create sample data using mongo.exe command line client 8 | 9 | ``` 10 | mongo.exe YOURAZURECOSMOSDBACCOUNT.documents.azure.com:10255 -u USERNAME -p PASSWORD --ssl --sslAllowInvalidCertificates 11 | ``` 12 | 13 | ``` 14 | db.movies.insert({"id":100, "name": "The Shawshank Redemption", "year": 1994, "length": 120, "contentRating": "R"}) 15 | db.movies.insert({"id":200, "name": "The Godfather", "year": 1972, "length": 175, "contentRating": "R"}) 16 | db.movies.insert({"id":300, "name": "The Dark Knight", "year": 2008, "length": 120, "contentRating": "PG-13"}) 17 | 18 | db.movies.find() 19 | ``` 20 | 21 | ## Azure SQL Database 22 | 23 | Create sample data using Azure SQL Database Query Editor (preview) in portal.azure.com (under Azure SQL Database Tools) 24 | 25 | ``` 26 | sp_tables; 27 | create table users (userid bigint, name varchar(255)); 28 | insert into users (userid, name) values (1, 'John'); 29 | insert into users (userid, name) values (2, 'Mary'); 30 | insert into users (userid, name) values (3, 'John'); 31 | insert into users (userid, name) values (4, 'Mike'); 32 | insert into users (userid, name) values (5, 'Kate'); 33 | insert into users (userid, name) values (7, 'Elizabeth'); 34 | select * from users; 35 | ``` 36 | 37 | ## Azure MySQL Database 38 | 39 | Create sample data using mysql command line client 40 | 41 | ``` 42 | mysql 
-h YOURAZUREMYSQLACCOUNT.mysql.database.azure.com -D DATABASE -u USERNAME -p --ssl-mode Preferred 43 | ``` 44 | 45 | ``` 46 | create table tickets (userid bigint, movieid bigint, price decimal, purchase_date date); 47 | insert into tickets (userid, movieid, price, purchase_date) values (1, 100, 8.00, STR_TO_DATE('2016-06-01', '%Y-%m-%d')); 48 | insert into tickets (userid, movieid, price, purchase_date) values (1, 200, 9.00, STR_TO_DATE('2016-06-02', '%Y-%m-%d')); 49 | insert into tickets (userid, movieid, price, purchase_date) values (2, 100, 12.00, STR_TO_DATE('2016-06-03', '%Y-%m-%d')); 50 | insert into tickets (userid, movieid, price, purchase_date) values (2, 200, 9.00, STR_TO_DATE('2016-06-02', '%Y-%m-%d')); 51 | select * from tickets; 52 | ``` 53 | 54 | ## Azure PostgreSQL Database 55 | 56 | Create sample data using psql command line client 57 | 58 | ``` 59 | psql -h YOURAZUREPOSTGRESQLACCOUNT.postgres.database.azure.com -d postgres -U USERNAME "sslmode=require dbname=DATABASE" 60 | ``` 61 | 62 | ``` 63 | create table user_movie_ratings (userid bigint, movieid bigint, rating int); 64 | insert into user_movie_ratings (userid, movieid, rating) values (1, 100, 5); 65 | insert into user_movie_ratings (userid, movieid, rating) values (1, 200, 3); 66 | insert into user_movie_ratings (userid, movieid, rating) values (2, 100, 2); 67 | insert into user_movie_ratings (userid, movieid, rating) values (2, 200, 4); 68 | select * from user_movie_ratings; 69 | ``` 70 | 71 | ## Create Hive table located on WASB 72 | 73 | Use Hive CLI from within the container 74 | 75 | ``` 76 | hive --hiveconf hive.metastore.uris=thrift://localhost:9083 77 | ``` 78 | 79 | ``` 80 | create table user_movie_rating_ticket (userid bigint, user_name varchar(255), rating int, price decimal, purchase_date date, movie_name varchar(255), year bigint, length bigint, content_rating varchar(255)) 81 | row format delimited fields terminated by ',' stored as textfile location 
'wasb://test-hive@avdatarepo1.blob.core.windows.net/user_movie_rating_ticket'; 82 | ``` 83 | 84 | ## Presto Queries 85 | 86 | Using Presto CLI from within the container 87 | 88 | ``` 89 | ./presto --server localhost:8080 90 | ``` 91 | 92 | ``` 93 | select * from cosmosdb.mydb1.movies; 94 | select * from azuresql.dbo.users; 95 | select * from mysql.mydb1.tickets; 96 | select * from postgresql.public.user_movie_ratings; 97 | ``` 98 | 99 | Join data from Azure CosmosDB with Azure SQL Server, Azure MySQL, and Azure PostgreSQL 100 | 101 | ``` 102 | select u.userid, u.name, r.rating, t.price, t.purchase_date, m.name, m.year, m.length, m.contentRating from azuresql.dbo.users u inner join mysql.mydb1.tickets t on u.userid = t.userid inner join cosmosdb.mydb1.movies m on m.id = t.movieid 103 | inner join postgresql.public.user_movie_ratings r on r.movieid = t.movieid and r.userid = u.userid order by u.userid, m.id; 104 | ``` 105 | 106 | Insert results into WASB table 107 | 108 | ``` 109 | insert into hive.default.user_movie_rating_ticket 110 | select u.userid, u.name, r.rating, cast(t.price as decimal(10,0)), t.purchase_date, cast(m.name as varchar(255)), m.year, m.length, cast(m.contentRating as varchar(255)) from azuresql.dbo.users u inner join mysql.mydb1.tickets t on u.userid = t.userid inner join cosmosdb.mydb1.movies m on m.id = t.movieid 111 | inner join postgresql.public.user_movie_ratings r on r.movieid = t.movieid and r.userid = u.userid order by u.userid, m.id; 112 | ``` 113 | 114 | ``` 115 | select * from hive.default.user_movie_rating_ticket; 116 | ``` 117 | --------------------------------------------------------------------------------