├── .dockerignore
├── .gitignore
├── examples
│   ├── r
│   │   ├── test.R
│   │   ├── README.md
│   │   └── setup.R
│   └── python
│       ├── README.md
│       └── test.py
├── hive-site.xml
├── LICENSE.md
├── README.md
└── Dockerfile
/.dockerignore:
--------------------------------------------------------------------------------
.gitignore
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.tar.gz
apache-hive-*/
hadoop-*/
--------------------------------------------------------------------------------
/examples/r/test.R:
--------------------------------------------------------------------------------
source("setup.R")

# Connect to the local Hive server (see setup.R).
conn <- getHiveConn()

# Smoke test: list the available databases.
db <- dbGetQuery(conn, "show databases")
print(db)

print("Success!")
--------------------------------------------------------------------------------
/examples/python/README.md:
--------------------------------------------------------------------------------
# Python example code

1. Install Python.
1. [Start the Hive server.](../../README.md#remote-connection)
1. From this directory, run:

   ```sh
   pip3 install 'PyHive[hive]'
   python3 test.py
   ```
--------------------------------------------------------------------------------
/examples/python/test.py:
--------------------------------------------------------------------------------
from pyhive import hive

# Connect to the HiveServer2 instance listening on localhost:10000.
cursor = hive.connect("localhost").cursor()

# Round trip: create a table, insert a row, and read it back.
cursor.execute("CREATE TABLE IF NOT EXISTS people (name string)")
cursor.execute("INSERT INTO people (name) VALUES ('Aidan')")
cursor.execute("SELECT * FROM people LIMIT 10")
print(cursor.fetchone())

print("Success!")
--------------------------------------------------------------------------------
/examples/r/README.md:
--------------------------------------------------------------------------------
# R example code

1. [Start the Hive server.](../../README.md#remote-connection)
1. Download [Hadoop](https://hadoop.apache.org/releases.html) and [Hive](https://hive.apache.org/downloads.html), and extract them to the top level of this repository.
   - You may need to change the `.jar` paths in [the setup script](setup.R) to match your paths/versions.
1. From this directory, run:

   ```sh
   R --no-save < test.R
   ```
--------------------------------------------------------------------------------
/hive-site.xml:
--------------------------------------------------------------------------------
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
  <property>
    <name>datanucleus.autoStartMechanismMode</name>
    <value>ignored</value>
    <description>Throw exception if metadata tables are incorrect.</description>
  </property>
  <property>
    <name>mapreduce.framework.name</name>
    <value>local</value>
    <description>https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-Hive,Map-ReduceandLocal-Mode</description>
  </property>
</configuration>
--------------------------------------------------------------------------------
/examples/r/setup.R:
--------------------------------------------------------------------------------
# https://stackoverflow.com/a/39280008/358804
if (!require(RJDBC)) {
  install.packages("RJDBC", repos = "http://cran.us.r-project.org")
  library(RJDBC)
}

# Opens a JDBC connection to the Hive server listening on localhost:10000.
getHiveConn <- function() {
  # https://mapr.com/support/s/article/How-to-connect-R-to-Hiveserver2-using-hive-jdbc?language=en_US

  # Adjust these paths to match the Hadoop/Hive versions you extracted.
  cp <- c(
    "../../apache-hive-3.1.1-bin/jdbc/hive-jdbc-3.1.1-standalone.jar",
    "../../hadoop-3.2.0/share/hadoop/common/hadoop-common-3.2.0.jar"
  )
  .jinit(classpath = cp)
  drv <- JDBC(
    "org.apache.hive.jdbc.HiveDriver",
    "../../apache-hive-3.1.1-bin/lib/hive-jdbc-3.1.1.jar",
    identifier.quote = "`"
  )

  # No username/password is needed for this local test server.
  dbConnect(drv, "jdbc:hive2://localhost:10000/;")
}
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
As a work of the United States government, this project is in the
public domain within the United States.

Additionally, we waive copyright and related rights in the work
worldwide through the CC0 1.0 Universal public domain dedication.

## CC0 1.0 Universal Summary

This is a human-readable summary of the
[Legal Code (read the full text)](https://creativecommons.org/publicdomain/zero/1.0/legalcode).

### No Copyright

The person who associated a work with this deed has dedicated the work to
the public domain by waiving all of his or her rights to the work worldwide
under copyright law, including all related and neighboring rights, to the
extent allowed by law.

You can copy, modify, distribute and perform the work, even for commercial
purposes, all without asking permission.

### Other Information

In no way are the patent or trademark rights of any person affected by CC0,
nor are the rights that other persons may have in the work or in how the
work is used, such as publicity or privacy rights.

Unless expressly stated otherwise, the person who associated a work with
this deed makes no warranties about the work, and disclaims liability for
all uses of the work, to the fullest extent permitted by applicable law.
When using or citing the work, you should not imply endorsement by the
author or the affirmer.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# WeeHive

A minimal-as-possible Docker container running [Apache Hive](https://hive.apache.org/) on [Hadoop](https://hadoop.apache.org/). Intended for non-production use cases like trying out Hive code or running integration tests.

## Setup

1. Install Docker.
1. Make sure that you have at least a few GB of memory allocated to Docker. Instructions:
   - [Docker for Mac](https://docs.docker.com/docker-for-mac/#advanced)
   - [Docker for Windows](https://docs.docker.com/docker-for-windows/#advanced)
1. Clone this repository.
1. From the repository root, build the Docker image.

   ```sh
   docker build -t weehive .
   ```

## Usage

### Beeline

```sh
docker run --rm -it \
  -v weehive_hadoop:/usr/local/hadoop/warehouse \
  -v weehive_meta:/usr/local/hadoop/metastore_db \
  weehive
```

You will be shown the [Beeline](https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients#HiveServer2Clients-Beeline%E2%80%93CommandLineShell) shell. The `weehive_hadoop` and `weehive_meta` [volume names](https://docs.docker.com/storage/volumes/#choose-the--v-or---mount-flag) can be changed to project-specific names if you want.
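
For example, a hypothetical `myproject` could keep its warehouse and metastore in its own volumes:

```sh
docker run --rm -it \
  -v myproject_hadoop:/usr/local/hadoop/warehouse \
  -v myproject_meta:/usr/local/hadoop/metastore_db \
  weehive
```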

### Remote connection

1. Run the server.

   ```sh
   docker run --rm -it -p 10000:10000 \
     -v weehive_hadoop:/usr/local/hadoop/warehouse \
     -v weehive_meta:/usr/local/hadoop/metastore_db \
     weehive hiveserver2
   ```

1. Wait ~90 seconds for Hive to fully start.
1. Connect using the [JDBC URL](https://cwiki.apache.org/confluence/display/Hive/HiveServer2+Clients#HiveServer2Clients-JDBC) `jdbc:hive2://localhost:10000`. Example from an external `beeline`:

   ```sh
   beeline -u jdbc:hive2://localhost:10000
   ```

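The bundled examples connect the same way from [Python](examples/python) and [R](examples/r). As a minimal sketch using [PyHive](https://github.com/dropbox/PyHive) (see [examples/python/test.py](examples/python/test.py) for the full version):

```python
from pyhive import hive

# assumes the server from step 1 is listening on localhost:10000
cursor = hive.connect("localhost").cursor()
cursor.execute("show databases")
print(cursor.fetchall())
```
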
## Loading data from file

1. [Mount the data as a volume](https://docs.docker.com/storage/volumes/#start-a-container-with-a-volume) by adding `-v <data dir>:/usr/local/hadoop/data` to one of the `docker run` commands above.
1. Follow the [instructions to load data](https://cwiki.apache.org/confluence/display/Hive/Tutorial#Tutorial-LoadingData), as sketched below.
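
For example, a sketch that assumes a `./data/people.csv` on the host and the `people` table created by the Python example:

```sh
docker run --rm -it \
  -v "$(pwd)/data":/usr/local/hadoop/data \
  -v weehive_hadoop:/usr/local/hadoop/warehouse \
  -v weehive_meta:/usr/local/hadoop/metastore_db \
  weehive
```

Then, from the Beeline shell:

```sql
LOAD DATA LOCAL INPATH '/usr/local/hadoop/data/people.csv' INTO TABLE people;
```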

## Development

```sh
docker build -t weehive:local .
docker run --rm -it weehive:local
```

--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
# Split the downloads into separate stages so the layers are cached independently, and the .tar.gz files aren't included in the final image (reducing its size).
# https://medium.com/@tonistiigi/advanced-multi-stage-build-patterns-6f741b852fae

FROM alpine AS hadoop

ARG MIRROR=https://apache.osuosl.org
ARG HADOOP_VERSION=3.2.1
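# These can be overridden at build time, e.g.:
#   docker build --build-arg MIRROR=https://downloads.apache.org -t weehive .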

# download remotely
RUN wget $MIRROR/hadoop/common/stable/hadoop-$HADOOP_VERSION.tar.gz
RUN tar -xzf hadoop-$HADOOP_VERSION.tar.gz

# copy from local instead - to use, remove .dockerignore and uncomment:
# ADD hadoop-$HADOOP_VERSION.tar.gz .

RUN mv hadoop-$HADOOP_VERSION hadoop


FROM alpine AS hive

ARG MIRROR=https://apache.osuosl.org
ARG HIVE_VERSION=3.1.2

# download remotely
RUN wget $MIRROR/hive/hive-$HIVE_VERSION/apache-hive-$HIVE_VERSION-bin.tar.gz
RUN tar -xzf apache-hive-$HIVE_VERSION-bin.tar.gz

# copy from local instead - to use, remove .dockerignore and uncomment:
# ADD apache-hive-$HIVE_VERSION-bin.tar.gz .

RUN mv apache-hive-$HIVE_VERSION-bin hive
# remove the conflicting SLF4J binding: https://stackoverflow.com/a/41789082/358804
RUN rm hive/lib/log4j-slf4j-impl-2.10.0.jar

# replace the bundled Guava with a newer version - issue: https://issues.apache.org/jira/browse/HIVE-22915
RUN rm hive/lib/guava-19.0.jar
RUN wget https://repo1.maven.org/maven2/com/google/guava/guava/29.0-jre/guava-29.0-jre.jar
RUN mv guava-29.0-jre.jar hive/lib/


# https://www.digitalocean.com/community/tutorials/how-to-install-hadoop-in-stand-alone-mode-on-ubuntu-18-04

FROM ubuntu:bionic

WORKDIR /usr/local/hadoop

RUN apt-get update && apt-get install -y openjdk-8-jdk
ENV JAVA_HOME /usr/lib/jvm/java-8-openjdk-amd64/

COPY --from=hadoop /hadoop hadoop
ENV HADOOP_HOME /usr/local/hadoop/hadoop
ENV PATH="${HADOOP_HOME}/bin:${PATH}"

COPY --from=hive /hive hive
ENV HIVE_HOME /usr/local/hadoop/hive
ENV PATH="${HIVE_HOME}/bin:${PATH}"
COPY hive-site.xml $HIVE_HOME/conf/

ARG HADOOP_STORAGE=/usr/local/hadoop/warehouse
# https://stackoverflow.com/a/13651963/358804
ARG METASTORE_DB=/usr/local/hadoop/metastore_db

# https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-RunningHive
RUN hadoop fs -mkdir -p /tmp
RUN hadoop fs -mkdir -p $HADOOP_STORAGE
RUN hadoop fs -chmod g+w /tmp
RUN hadoop fs -chmod g+w $HADOOP_STORAGE

# https://cwiki.apache.org/confluence/display/Hive/GettingStarted#GettingStarted-RunningHiveServer2andBeeline.1
RUN schematool -dbType derby -initSchema
VOLUME [ "${HADOOP_STORAGE}", "${METASTORE_DB}" ]
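# Default to an interactive Beeline shell (embedded mode); override the command
# in `docker run` (e.g. `weehive hiveserver2`) to run the server instead.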
CMD beeline -u jdbc:hive2://
--------------------------------------------------------------------------------