├── .dockerignore
├── .gitignore
├── examples
│   ├── r
│   │   ├── test.R
│   │   ├── README.md
│   │   └── setup.R
│   └── python
│       ├── README.md
│       └── test.py
├── hive-site.xml
├── LICENSE.md
├── README.md
└── Dockerfile

/.dockerignore:
--------------------------------------------------------------------------------
.gitignore
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.tar.gz
apache-hive-*/
hadoop-*/
--------------------------------------------------------------------------------
/examples/r/test.R:
--------------------------------------------------------------------------------
source("setup.R")

conn <- getHiveConn()

# Smoke test: list the databases over the JDBC connection.
db <- dbGetQuery(conn, "show databases")
print(db)

print("Success!")
--------------------------------------------------------------------------------
/examples/python/README.md:
--------------------------------------------------------------------------------
# Python example code

1. Install Python.
1. [Start the Hive server.](../../README.md#remote-connection)
1. From this directory, run:

   ```sh
   pip3 install 'PyHive[hive]'
   python3 test.py
   ```
--------------------------------------------------------------------------------
/examples/python/test.py:
--------------------------------------------------------------------------------
from pyhive import hive

# Round trip: create a table, insert a row, and read it back.
cursor = hive.connect("localhost").cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS people (name string)")
cursor.execute("INSERT INTO people (name) VALUES ('Aidan')")
cursor.execute("SELECT * FROM people LIMIT 10")
print(cursor.fetchone())

print("Success!")
--------------------------------------------------------------------------------
/examples/r/README.md:
--------------------------------------------------------------------------------
# R example code

1. [Start the Hive server.](../../README.md#remote-connection)
1. Download [Hadoop](https://hadoop.apache.org/releases.html) and [Hive](https://hive.apache.org/downloads.html), and extract them to the top level of this repository.
   - You may need to change the `.jar` paths in [the setup script](setup.R) to match your paths/versions.
1. From this directory, run:

   ```sh
   R --no-save < test.R
   ```
--------------------------------------------------------------------------------
/hive-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>datanucleus.autoStartMechanismMode</name>
    <value>ignored</value>
    <description>Throw exception if metadata tables are incorrect.</description>
  </property>
  <property>
    <name>mapreduce.framework.name</name>
    <value>local</value>
    <description>https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-Hive,Map-ReduceandLocal-Mode</description>
  </property>
</configuration>
--------------------------------------------------------------------------------
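Worth noting: `mapreduce.framework.name=local` is what lets this single container run queries in-process, with no Hadoop cluster (per the wiki page linked in the description). As a hypothetical sanity check — illustrative code, not a file in this repository — the effective values can be read back over PyHive once the server from the README's "Remote connection" section is running:

```python
# Hypothetical check (not part of the repo): read back the hive-site.xml
# overrides from a running HiveServer2. Assumes the server was started as
# shown in the README's "Remote connection" section.
from pyhive import hive

cursor = hive.connect("localhost").cursor()

# A "SET <property>" statement returns the effective value as a result row.
cursor.execute("SET mapreduce.framework.name")
print(cursor.fetchone())  # expect something like ('mapreduce.framework.name=local',)
```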
/examples/r/setup.R:
--------------------------------------------------------------------------------
# https://stackoverflow.com/a/39280008/358804
if (!require(RJDBC)) {
  install.packages(c("RJDBC"), repos='http://cran.us.r-project.org')
  require(RJDBC)
}

getHiveConn <- function() {
  # https://mapr.com/support/s/article/How-to-connect-R-to-Hiveserver2-using-hive-jdbc?language=en_US

  # Classpath for the driver's dependencies; adjust to match the Hadoop/Hive
  # versions you extracted (see this directory's README).
  cp <- c(
    "../../apache-hive-3.1.1-bin/jdbc/hive-jdbc-3.1.1-standalone.jar",
    "../../hadoop-3.2.0/share/hadoop/common/hadoop-common-3.2.0.jar"
  )
  .jinit(classpath=cp)
  drv <- JDBC(
    "org.apache.hive.jdbc.HiveDriver",
    "../../apache-hive-3.1.1-bin/lib/hive-jdbc-3.1.1.jar",
    identifier.quote="`"
  )

  dbConnect(drv, "jdbc:hive2://localhost:10000/;")
}
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
As a work of the United States government, this project is in the
public domain within the United States.

Additionally, we waive copyright and related rights in the work
worldwide through the CC0 1.0 Universal public domain dedication.

## CC0 1.0 Universal Summary

This is a human-readable summary of the
[Legal Code (read the full text)](https://creativecommons.org/publicdomain/zero/1.0/legalcode).

### No Copyright

The person who associated a work with this deed has dedicated the work to
the public domain by waiving all of his or her rights to the work worldwide
under copyright law, including all related and neighboring rights, to the
extent allowed by law.

You can copy, modify, distribute and perform the work, even for commercial
purposes, all without asking permission.

### Other Information

In no way are the patent or trademark rights of any person affected by CC0,
nor are the rights that other persons may have in the work or in how the
work is used, such as publicity or privacy rights.

Unless expressly stated otherwise, the person who associated a work with
this deed makes no warranties about the work, and disclaims liability for
all uses of the work, to the fullest extent permitted by applicable law.
When using or citing the work, you should not imply endorsement by the
author or the affirmer.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# WeeHive

A minimal-as-possible Docker container running [Apache Hive](https://hive.apache.org/) on [Hadoop](https://hadoop.apache.org/). Intended for non-production use cases like testing out Hive code or running integration tests.

## Setup

1. Install Docker.
1. Make sure that you have at least a few GB of memory allocated to Docker. Instructions:
   - [Docker for Mac](https://docs.docker.com/docker-for-mac/#advanced)
   - [Docker for Windows](https://docs.docker.com/docker-for-windows/#advanced)
1. Clone this repository.
1. From the repository root, build the Docker image.

   ```sh
   docker build -t weehive .
   ```

## Usage

### Beeline

```sh
docker run --rm -it \
  -v weehive_hadoop:/usr/local/hadoop/warehouse \
  -v weehive_meta:/usr/local/hadoop/metastore_db \
  weehive
```

You will be dropped into the [Beeline](https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients#HiveServer2Clients-Beeline%E2%80%93CommandLineShell) shell. The `weehive_hadoop` and `weehive_meta` [volume names](https://docs.docker.com/storage/volumes/#choose-the--v-or---mount-flag) can be changed to project-specific names if you like.

### Remote connection

1. Run the server.

   ```sh
   docker run --rm -it -p 10000:10000 \
     -v weehive_hadoop:/usr/local/hadoop/warehouse \
     -v weehive_meta:/usr/local/hadoop/metastore_db \
     weehive hiveserver2
   ```

1. Wait ~90 seconds for Hive to fully start.
1. Connect using the [JDBC URL](https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients#HiveServer2Clients-JDBC) `jdbc:hive2://localhost:10000`. Example from an external `beeline`:

   ```sh
   beeline -u jdbc:hive2://localhost:10000
   ```

## Loading data from file

1. [Mount the data as a volume](https://docs.docker.com/storage/volumes/#start-a-container-with-a-volume) by adding `-v <local data dir>:/usr/local/hadoop/data` to one of the `docker run` commands above.
1. Follow the [instructions to load data](https://cwiki.apache.org/confluence/display/Hive/Tutorial#Tutorial-LoadingData); see the sketch at the end of this README for one way that can look.

## Development

```sh
docker build -t weehive:local .
docker run --rm -it weehive:local
```
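
## Example: loading data from Python

A hypothetical sketch, not a file in this repository, of what the "Loading data from file" steps can look like over PyHive (the library the Python example uses). It assumes the server from "Remote connection" is running and that the mounted data directory contains a `people.csv`:

```python
# Hypothetical example (not part of the repo): load a mounted CSV into Hive.
# Assumes the container was started with -v <local data dir>:/usr/local/hadoop/data
# and that the directory contains people.csv.
from pyhive import hive

cursor = hive.connect("localhost").cursor()
cursor.execute(
    "CREATE TABLE IF NOT EXISTS people_csv (name string) "
    "ROW FORMAT DELIMITED FIELDS TERMINATED BY ','"
)
# LOAD DATA LOCAL INPATH reads from the server's (i.e. the container's)
# filesystem, so it can see the mounted volume.
cursor.execute(
    "LOAD DATA LOCAL INPATH '/usr/local/hadoop/data/people.csv' "
    "INTO TABLE people_csv"
)
cursor.execute("SELECT * FROM people_csv LIMIT 5")
print(cursor.fetchall())
```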
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Split the downloads into their own stages so the layers are cached
# independently and the .tar.gz archives aren't included in the final image
# (reducing its size).
# https://medium.com/@tonistiigi/advanced-multi-stage-build-patterns-6f741b852fae

FROM alpine AS hadoop

ARG MIRROR=https://apache.osuosl.org
ARG HADOOP_VERSION=3.2.1

# download remotely (versioned path, so it keeps working when "stable" moves on)
RUN wget $MIRROR/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
RUN tar -xzf hadoop-$HADOOP_VERSION.tar.gz

# copy from local - to use, remove .dockerignore
# ADD hadoop-$HADOOP_VERSION.tar.gz .

RUN mv hadoop-$HADOOP_VERSION hadoop


FROM alpine AS hive

ARG MIRROR=https://apache.osuosl.org
ARG HIVE_VERSION=3.1.2

# download remotely
RUN wget $MIRROR/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz
RUN tar -xzf apache-hive-$HIVE_VERSION-bin.tar.gz

# copy from local - to use, remove .dockerignore
# ADD apache-hive-$HIVE_VERSION-bin.tar.gz .

RUN mv apache-hive-$HIVE_VERSION-bin hive
# remove the duplicate SLF4J binding (Hadoop ships its own)
# https://stackoverflow.com/a/41789082/358804
RUN rm hive/lib/log4j-slf4j-impl-2.10.0.jar

# replace Hive's bundled Guava with one compatible with Hadoop's
# https://issues.apache.org/jira/browse/HIVE-22915
RUN rm hive/lib/guava-19.0.jar
RUN wget https://repo1.maven.org/maven2/com/google/guava/guava/29.0-jre/guava-29.0-jre.jar
RUN mv guava-29.0-jre.jar hive/lib/


# https://www.digitalocean.com/community/tutorials/how-to-install-hadoop-in-stand-alone-mode-on-ubuntu-18-04

FROM ubuntu:bionic

WORKDIR /usr/local/hadoop

RUN apt-get update && apt-get install -y openjdk-8-jdk
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

COPY --from=hadoop /hadoop hadoop
ENV HADOOP_HOME /usr/local/hadoop/hadoop
ENV PATH="${HADOOP_HOME}/bin:${PATH}"

COPY --from=hive /hive hive
ENV HIVE_HOME /usr/local/hadoop/hive
ENV PATH="${HIVE_HOME}/bin:${PATH}"
COPY hive-site.xml $HIVE_HOME/conf/

ARG HADOOP_STORAGE=/usr/local/hadoop/warehouse
# https://stackoverflow.com/a/13651963/358804
ARG METASTORE_DB=/usr/local/hadoop/metastore_db

# https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-RunningHive
RUN hadoop fs -mkdir -p /tmp
RUN hadoop fs -mkdir -p $HADOOP_STORAGE
RUN hadoop fs -chmod g+w /tmp
RUN hadoop fs -chmod g+w $HADOOP_STORAGE

# https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-RunningHiveServer2andBeeline.1
RUN schematool -dbType derby -initSchema
VOLUME [ "${HADOOP_STORAGE}", "${METASTORE_DB}" ]
CMD beeline -u jdbc:hive2://
--------------------------------------------------------------------------------
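
To close the loop on the README's integration-test use case, here is one more hypothetical sketch, not a file in this repository: a minimal pytest harness that rides out HiveServer2's ~90-second startup before running a round trip. It assumes PyHive and pytest are installed and the container was started as in "Remote connection".

```python
# Hypothetical pytest harness (not part of the repo): wait for the
# containerized HiveServer2 to accept connections, then run a round trip.
import time

import pytest
from pyhive import hive


@pytest.fixture(scope="session")
def hive_cursor():
    # HiveServer2 needs ~90 seconds to start; retry instead of failing fast.
    last_err = None
    for _ in range(30):
        try:
            return hive.connect("localhost").cursor()
        except Exception as err:  # connection refused until the server is up
            last_err = err
            time.sleep(5)
    raise RuntimeError("HiveServer2 never became reachable") from last_err


def test_round_trip(hive_cursor):
    hive_cursor.execute("CREATE TABLE IF NOT EXISTS smoke (id int)")
    hive_cursor.execute("INSERT INTO smoke VALUES (1)")
    hive_cursor.execute("SELECT id FROM smoke LIMIT 1")
    assert hive_cursor.fetchone() == (1,)
```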