├── .gitignore ├── 0-gendata.txt ├── 1a-R-datatable.txt ├── 1b-R-dplyr.txt ├── 1c-python-pandas.txt ├── 1d-R-MonetDBLite.txt ├── 2a-mysql.txt ├── 2b-postgres.txt ├── 3a-hive.txt ├── 3b-impala.txt ├── 3c-spark.txt ├── 4-mapd.txt ├── README.md ├── plot.R ├── plot.csv ├── plot.png ├── query_plans.txt └── versions.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .*.sw? 6 | benchm-databases.Rproj 7 | -------------------------------------------------------------------------------- /0-gendata.txt: -------------------------------------------------------------------------------- 1 | 2 | time R --vanilla --quiet << EOF 3 | set.seed(123) 4 | n <- 100e6 5 | m <- 1e6 6 | d <- data.frame(x = sample(m, n, replace=TRUE), y = runif(n)) 7 | dm <- data.frame(x = sample(m)) 8 | write.table(d, file = "/tmp/d.csv", row.names = FALSE, col.names = FALSE, sep = ",") 9 | write.table(dm, file = "/tmp/dm.csv", row.names = FALSE, col.names = FALSE, sep = ",") 10 | EOF 11 | ## 3m52.052s 12 | 13 | 14 | du -h /tmp/d*csv 15 | ## 2.4G /tmp/d.csv 16 | ## 6.6M /tmp/dm.csv 17 | 18 | 19 | -------------------------------------------------------------------------------- /1a-R-datatable.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | library(data.table) 4 | 5 | d <- fread("/tmp/d.csv") 6 | dm <- fread("/tmp/dm.csv") 7 | setnames(d, c("x","y")) 8 | setnames(dm, "x") 9 | 10 | tables() 11 | ## NAME NROW NCOL MB COLS KEY 12 | ## [1,] d 100,000,000 2 1,145 x,y 13 | ## [2,] dm 1,000,000 1 4 x 14 | ## Total: 1,149MB 15 | 16 | 17 | system.time( 18 | print(head(d[, list(ym=mean(y)), by=x][order(-ym)],5)) 19 | ) 20 | 21 | 22 | system.time( 23 | setkey(d, x) 24 | ) 25 | 26 | system.time( 27 | print(nrow(d[dm, nomatch=0])) 28 | ) 29 | 30 | 31 | system.time( 32 | print(head(d[, list(ym=mean(y)), by=x][order(-ym)],5)) 33 | ) 34 | 35 | 36 | 37 | 
-------------------------------------------------------------------------------- /1b-R-dplyr.txt: -------------------------------------------------------------------------------- 1 | 2 | library(data.table) 3 | library(dplyr) 4 | 5 | d <- tbl_df(as.data.frame(fread("/tmp/d.csv"))) 6 | dm <- tbl_df(as.data.frame(fread("/tmp/dm.csv"))) 7 | names(d) <- c("x","y") 8 | names(dm) <- "x" 9 | 10 | 11 | system.time( 12 | print(d %>% group_by(x) %>% summarize(ym = mean(y)) %>% arrange(desc(ym)) %>% head(5)) 13 | ) 14 | 15 | system.time( 16 | print(d %>% inner_join(dm, by="x") %>% summarize(n = n())) 17 | ) 18 | 19 | 20 | -------------------------------------------------------------------------------- /1c-python-pandas.txt: -------------------------------------------------------------------------------- 1 | 2 | ipython 3 | 4 | 5 | import pandas as pd 6 | 7 | %time d = pd.read_csv("/tmp/d.csv", header = None, names=["x","y"]) 8 | %time dm = pd.read_csv("/tmp/dm.csv", header = None, names=["x"]) 9 | 10 | 11 | %time d.groupby("x", as_index = False)["y"].mean().sort_values("y", ascending = False).head(5) 12 | %time d.groupby("x")["y"].mean().sort_values(ascending = False)[:5] 13 | 14 | %time pd.merge(d, dm).shape[0] 15 | -------------------------------------------------------------------------------- /1d-R-MonetDBLite.txt: -------------------------------------------------------------------------------- 1 | # install.packages(c("MonetDB.R", "MonetDBLite"), repos=c("http://dev.monetdb.org/Assets/R/", "http://cran.rstudio.com/")) 2 | 3 | library(MonetDB.R) 4 | 5 | con <- dbConnect(MonetDBLite()) 6 | dbSendQuery(con, "create table d(x int, y double)") 7 | dbSendQuery(con, "create table dm(x int)") 8 | 9 | dbSendQuery(con, "copy into d from '/tmp/d.csv' using delimiters ','") 10 | dbSendQuery(con, "copy into dm from '/tmp/dm.csv' using delimiters ','") 11 | 12 | system.time(dbGetQuery(con, " 13 | select x, avg(y) as ym 14 | from d 15 | group by x 16 | order by ym desc 17 | limit 5; 18 
| ")) 19 | 20 | # 7s on my box 21 | 22 | system.time(dbGetQuery(con, " 23 | select count(*) as cnt 24 | from d 25 | inner join dm on d.x = dm.x; 26 | ")) 27 | 28 | # 1.7s on my box -------------------------------------------------------------------------------- /2a-mysql.txt: -------------------------------------------------------------------------------- 1 | 2 | sudo apt-get install mysql-server 3 | 4 | /etc/mysql/my.cnf: 5 | tmp_table_size = 128M ## implicit default 16M 6 | max_heap_table_size = 128M ## implicit default 16M 7 | innodb_buffer_pool_size = 16G ## implicit default 128M 8 | 9 | service mysql restart 10 | 11 | 12 | mysql -u root --local-infile 13 | 14 | create database benchm; 15 | use benchm 16 | 17 | create table d(x int, y float) engine = MyISAM; 18 | create table dm(x int) engine = MyISAM; 19 | 20 | load data local infile '/tmp/dm.csv' into table dm fields terminated by ','; 21 | load data local infile '/tmp/d.csv' into table d fields terminated by ','; 22 | 23 | create index ixm on dm(x); 24 | 25 | 26 | select x, avg(y) as ym 27 | from d 28 | group by x 29 | order by ym desc 30 | limit 5; 31 | 32 | select count(*) as cnt 33 | from d 34 | inner join dm on d.x = dm.x; 35 | 36 | 37 | drop table d; 38 | drop table dm; 39 | create table d(x int, y float); 40 | create table dm(x int); 41 | 42 | load data local infile '/tmp/dm.csv' into table dm fields terminated by ','; 43 | load data local infile '/tmp/d.csv' into table d fields terminated by ','; 44 | 45 | create index ixm on dm(x); 46 | 47 | 48 | select x, avg(y) as ym 49 | from d 50 | group by x 51 | order by ym desc 52 | limit 5; 53 | 54 | select count(*) as cnt 55 | from d 56 | inner join dm on d.x = dm.x; 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /2b-postgres.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | apt-get install postgresql-9.3 4 | 5 | sudo -u postgres psql 6 | 7 | \timing 8 | 9 | 
create table d(x int, y float); 10 | create table dm(x int); 11 | 12 | copy dm from '/tmp/dm.csv' delimiter ','; 13 | copy d from '/tmp/d.csv' delimiter ','; 14 | 15 | create index ixm on dm(x); 16 | 17 | 18 | select x, avg(y) as ym 19 | from d 20 | group by x 21 | order by ym desc 22 | limit 5; 23 | 24 | select count(*) as cnt 25 | from d 26 | inner join dm on d.x = dm.x; 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /3a-hive.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | sudo -u hive hive 5 | 6 | dfs -mkdir /tmp/benchm; 7 | dfs -mkdir /tmp/benchm/d; 8 | dfs -mkdir /tmp/benchm/dm; 9 | 10 | dfs -put /tmp/dm.csv /tmp/benchm/dm; 11 | dfs -put /tmp/d.csv /tmp/benchm/d; 12 | 13 | create database benchm; 14 | use benchm; 15 | 16 | create external table d (x int, y float) 17 | row format delimited fields terminated by ',' lines terminated by '\n' 18 | stored as textfile location '/tmp/benchm/d'; 19 | 20 | create external table dm (x int) 21 | row format delimited fields terminated by ',' lines terminated by '\n' 22 | stored as textfile location '/tmp/benchm/dm'; 23 | 24 | 25 | select x, avg(y) as ym 26 | from d 27 | group by x 28 | order by ym desc 29 | limit 5; 30 | 31 | select count(*) as cnt 32 | from d 33 | inner join dm on d.x = dm.x; 34 | 35 | 36 | -------------------------------------------------------------------------------- /3b-impala.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | impala-shell 4 | 5 | invalidate metadata; 6 | use benchm; 7 | 8 | 9 | select x, avg(y) as ym 10 | from d 11 | group by x 12 | order by ym desc 13 | limit 5; 14 | 15 | select count(*) as cnt 16 | from d 17 | inner join dm on d.x = dm.x; 18 | 19 | -------------------------------------------------------------------------------- /3c-spark.txt: -------------------------------------------------------------------------------- 1 | 2 | 
Amazon (AWS) EMR 3 | Bootstrap Actions: 4 | 5 | S3 location: s3://support.elasticmapreduce/spark/install-spark 6 | Optional arguments: -x 7 | 8 | -x: Prepares the default Spark config for dedicated Spark single application use [1 executor per node, 9 | num of executors equivalent to core nodes at creation of cluster, all memory/vcores allocated] 10 | 11 | 12 | 13 | /home/hadoop/spark/bin/spark-sql --master yarn-client 14 | 15 | use benchm; 16 | 17 | 18 | select x, avg(y) as ym 19 | from d 20 | group by x 21 | order by ym desc 22 | limit 5; 23 | 24 | select count(*) as cnt 25 | from d 26 | inner join dm on d.x = dm.x; 27 | 28 | 29 | 30 | cache table dm; 31 | cache table d; 32 | -------------------------------------------------------------------------------- /4-mapd.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | AWS Marketplace: 4 | https://aws.amazon.com/marketplace/pp/B071H71L2Y 5 | 6 | EC2 Launch p2.xlarge 7 | 8 | 9 | ssh -i XXX centos@ec2-XXX 10 | 11 | 12 | sudo yum install screen htop R 13 | 14 | 15 | R 16 | 17 | set.seed(123) 18 | n <- 100e6 19 | m <- 1e6 20 | d <- data.frame(x = sample(m, n, replace=TRUE), y = runif(n)) 21 | dm <- data.frame(x = sample(m)) 22 | write.table(d, file = "/tmp/d.csv", row.names = FALSE, col.names = FALSE, sep = ",") 23 | write.table(dm, file = "/tmp/dm.csv", row.names = FALSE, col.names = FALSE, sep = ",") 24 | 25 | 26 | 27 | [as root:] ./cloud-init/per-once-bootstrap-mapd.sh 28 | 29 | 30 | /raidStorage/installs/mapd-ce-3.0.0-20170507-7626e30-Linux-x86_64-render/bin/mapdql -p XXX 31 | 32 | 33 | \timing 34 | 35 | create table d(x int, y float); 36 | create table dm(x int); 37 | 38 | copy dm from '/tmp/dm.csv' with (header = 'false'); 39 | copy d from '/tmp/d.csv' with (header = 'false'); 40 | --Loaded: 100000000 recs, Rejected: 0 recs in 52.022000 secs 41 | 42 | 43 | select x, avg(y) as ym 44 | from d 45 | group by x 46 | order by ym desc 47 | limit 5; 48 | -- Execution time: 1342 
ms, Total time: 1343 ms 49 | -- Execution time: 1131 ms, Total time: 1132 ms 50 | 51 | select count(*) as cnt 52 | from d 53 | inner join dm on d.x = dm.x; 54 | -- Execution time: 199 ms, Total time: 199 ms 55 | -- Execution time: 83 ms, Total time: 83 ms 56 | 57 | 58 | 59 | 60 | 61 | 62 | ====================== 63 | 64 | On the above MapD is using CPU on the aggregation query and GPU only on the join. 65 | 66 | 67 | According to MapD's @dwayneberry: 68 | 69 | I ran your steps on my laptop (1 x 1060 GPU) and these are the times I got. 70 | I have checked the code on a kepler machine (k80machine like you are running on AWS) and I notice the IR code is targeting the CPU for the AVG(float) query. My laptop is pascal so didn't see the CPU behaviour. 71 | We have recently changed this so more operations will run on Float on kepler architecture. I will get that applied to this edition as soon as possible. 72 | 73 | https://community.mapd.com/t/toy-benchmark-on-aws-cloud-gpu-usage/34/3 74 | 75 | 76 | @dwayneberry's timing: 77 | 78 | mapdql> select x, avg(y) as ym from d group by x order by ym desc limit 5; 79 | Execution time: 464 ms, Total time: 465 ms 80 | Execution time: 467 ms, Total time: 467 ms 81 | 82 | mapdql> select count(*) as cnt 83 | ..> from d 84 | ..> inner join dm on d.x = dm.x; 85 | 86 | Execution time: 141 ms, Total time: 142 ms 87 | Execution time: 67 ms, Total time: 67 ms 88 | Execution time: 69 ms, Total time: 69 ms 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Simple/basic/limited/incomplete benchmark for operations on tabular data (aggregates and joins) 3 | 4 | For structured/tabular/relational data most transformations for data analysis are based on a few 5 | primitives. Computationally, aggregates and joins are taking the majority 6 | of time. 
This project aims at a *minimal* benchmark of various tools 7 | (statistical software, databases etc.) for working with tabular data of moderately 8 | large sizes (but still within the reach of interactive data analysis - response times 9 | of a few seconds on commodity hardware). 10 | 11 | 12 | #### Tools 13 | 14 | The tools analysed are: 15 | 16 | 1. Statistical software: R (dplyr, data.table) and Python (pandas) 17 | 2. Databases (MySQL, PostgreSQL) 18 | 3. Analytical databases (3 MPP/columnar stores) 19 | 4. "Big data" systems (Hive, Impala, Spark) 20 | 21 | **Update:** MapD open sourced their analytical database using GPUs, which should be a new class here. 22 | 23 | All but the analytical databases are open source. The analytical databases have free (community) editions 24 | or inexpensive cloud offerings. 25 | 26 | **Update:** @hannesmuehleisen submitted results for MonetDB, an analytical/columnar db that unlike all the 27 | above is open source. 28 | 29 | **Update:** @JazzFan submitted results for another analytical db ("MPP-4" here) 30 | 31 | R/Python operate in memory and can integrate the tabular operations 32 | with rich visualization, statistical modeling etc. On the other hand, they are limited to data sizes 33 | that fit in RAM, run single-threaded, and unlike the other systems do not have a query optimizer. 34 | 35 | MySQL/PostgreSQL have been designed for mixed OLTP/analytical workloads and while 36 | the database can use multiple cores to run different queries, any given query runs 37 | on one processor core only. 38 | 39 | The analytical databases and the "big data" systems can scale-out to multiple nodes (and use all cores on them). 40 | The analytical (MPP) databases have parallel/shared-nothing architecture, columnar storage, compression and are specifically 41 | designed for large aggregations and joins. 
42 | 43 | Hive/Spark are based on the map-reduce paradigm, SQL operations are translated to 44 | map/shuffle/reduce primitives (Hive generates traditional Hadoop jobs, while Spark leverages in-memory 45 | architecture). Impala uses MPP-like technology to query data in HDFS (Hadoop's distributed file system). 46 | 47 | 48 | #### Data 49 | 50 | The data is [randomly generated](0-gendata.txt): 51 | one table `d` (`x` integer, `y` float) of 100 million rows for aggregation 52 | (`x` takes 1 million distinct values) and another table `dm` (`x` integer) of 1 million rows for the join only. 53 | (The larger table `d` is of ~2GB size in the CSV format and results in ~1GB usage when loaded in database or 54 | read in memory.) 55 | 56 | 57 | #### Transformations 58 | 59 | SQL query for aggregation: 60 | 61 | ``` 62 | select x, avg(y) as ym 63 | from d 64 | group by x 65 | order by ym desc 66 | limit 5; 67 | ``` 68 | 69 | and for join: 70 | 71 | ``` 72 | select count(*) as cnt 73 | from d 74 | inner join dm on d.x = dm.x; 75 | ``` 76 | 77 | 78 | #### Setup 79 | 80 | The tests have been performed on a m3.2xlarge EC2 instance (8 cores, 30GB RAM) running Ubuntu 14.04. 81 | The queries have been run 2 times and the second 82 | time was recorded (warm run). In this case various caching mechanisms come into play and the data is 83 | effectively in RAM. 84 | 85 | While I'm a great fan of reproducibility, in this benchmark I'm more interested in orders 86 | of magnitude and not strict precision and exact reproducibility. With some more work one can create install and test 87 | scripts that can run in a fully automated fashion for complete reproducibility. 88 | 89 | The software tools have been installed with the latest available 90 | [versions](versions.txt) 91 | using standard instructions with no tuning 92 | (with a few exceptions as noted). For Hive/Impala/Spark Amazon's EMR was used to avoid a more elaborate installation. 
93 | 94 | 95 | The following running times have been measured: 96 | 97 | 1. For R/Python data has been read from csv file and then aggregates/joins happen in memory. 98 | 2. For MySQL/Postgres and the analytical databases, the data has to be loaded first into the database, and only then 99 | can one run queries. 100 | 3. For the "Big Data" systems the data has to be copied into HDFS (much faster than loading it to a database); 101 | optionally it can be transformed into a columnar format (such as parquet). Queries can run readily. For Spark the data 102 | can be either read from HDFS on the fly or loaded into memory (cached) and then queried. 103 | 104 | 105 | 106 | #### Limitations 107 | 108 | This is far from a comprehensive benchmark. It is my attempt to *quickly* get an idea of the order 109 | of magnitude of running times for aggregations and joins on datasets of sizes of interest to *me* at the moment. 110 | 111 | The results are expected to vary with hardware, tuning, and likely even more with dataset size, 112 | dataset structure, or the number of nodes for the scale-out systems etc. Perhaps the strongest 113 | critique against the general relevance of this benchmark could be that it uses a certain 114 | data size and structure only, instead of examining a variety of. 115 | 116 | I'm not looking in detail either at the scaling by the number of nodes for the 117 | big data systems as I'm primarily concerned with the efficiency on a single or a small number of nodes. 118 | 119 | In the tests the only computation running on the system is the target query, therefore I'm not 120 | studying the behavior in function of the load (e.g. the number of concurrent queries running on the system). 121 | 122 | Finally, one might say that queries in practice are complex and the running times depend not only 123 | on how fast are these primitives, but also on how the query optimizer can deal with complexity. 
Again, 124 | a comprehensive SQL benchmark is out of the scope here (but see e.g. TPC-DS). 125 | 126 | 127 | 128 | #### Results 129 | 130 | (times in seconds) 131 | 132 | | Type | System | Load/Read | Aggregation | Join | 133 | | ---------- | ----------------------------------- | ------------- | -------------- | -------- | 134 | | Stats | [R DT](1a-R-datatable.txt) | 30 | 5.5 | 6.5 | 135 | | Stats | [R DT key](1a-R-datatable.txt) | 35 | 1.3 | 1.7 | 136 | | Stats | [R dplyr](1b-R-dplyr.txt) | 30 | 45 | 40 | 137 | | Stats | [Py pandas](1c-python-pandas.txt) | 30 | 8 | 25 | 138 | | DB | [MySQL MyISAM](2a-mysql.txt) | 40 | 45 | 470 | 139 | | DB | [MySQL InnoDB](2a-mysql.txt) | 430 | 70 | 140 | 140 | | DB | [PostgreSQL](2b-postgres.txt) | 120 | 175 | 55 | 141 | | MPP | MPP-1 | 70 | 0.5 | 2.5 | 142 | | MPP | MPP-2 | 130 | 9 | 4 | 143 | | MPP | MPP-3 | 130 | 6.5 | 15 | 144 | | MPP | MPP-4 | 102 | 7.1 | 8.5 | 145 | | MPP | MonetDB | 70 | 7 | 1.5 | 146 | | Big Data | [Hive](3a-hive.txt) | 20 | 250 | 80 | 147 | | Big Data | [Impala](3b-impala.txt) | 20 | 25 | 15 | 148 | | Big Data | [Spark](3c-spark.txt) | 20 | 120 | 130 | 149 | | Big Data | [Spark cached](3c-spark.txt) | 50 | 90 | 17 | 150 | | GPU | [MapD](4-mapd.txt) | 12 | 0.5 | 0.07 | 151 | 152 | 153 | **Update:** [Timings](https://gist.github.com/szilard/7d810877fcc6b019a478#gistcomment-1693662) 154 | by @hannesmuehleisen on MonetDB on same m3.2xlarge: aggregation 7.0, join 1.5. 155 | 156 | **Update:** With newer [versions](versions.txt) of Hive and Spark: 157 | 158 | | System | Aggr | Join | 159 | |----------------|---------|----------| 160 | | Hive | 250 | 100 | 161 | | Spark | 40 | 25 | 162 | | Spark cached | 12 | 2.5 | 163 | 164 | (also one should use parquet rather than csv for un-cached Spark). 165 | 166 | **Update:** MapD [timings](https://community.mapd.com/t/toy-benchmark-on-aws-cloud-gpu-usage/34/3) by @dwayneberry 167 | on laptop with GPU (1060) show impressive 0.5sec for aggregation and 70ms (!!) 
for the join. 168 | 169 | ![plots](plot.png) 170 | 171 | Multinode results (with older versions of Spark/Hive): 172 | 173 | | System | Aggr 1-node | Aggr 5-node | Join 1-node | Join 5-node | 174 | | ------------- | ----------- | ----------- | ----------- | ----------- | 175 | | Hive | 250 | 160 | 80 | 50 | 176 | | Impala | 25 | 16 | 15 | 10 | 177 | | Spark | 120 | 50 | 130 | 30 | 178 | | Spark cached | 90 | 40 | 17 | 10 | 179 | 180 | 181 | #### Discussions 182 | 183 | It seems that for data manipulation with ~100 million rows / ~1GB datasets MPP databases 184 | are the fastest, next statistical software, then traditional databases, and finally big data 185 | systems (see graph above). Some more insights can be gained from 186 | [query plans](query_plans.txt). 187 | 188 | The largest surprise (to me) is that traditional databases (MySQL/PostgreSQL) perform so poorly 189 | at this size. 190 | 191 | Naturally, analytical databases are the fastest. Even on 1 node, their columnar architecture and 192 | the features that come with it (efficient binary storage format, compression) along with the 193 | ability to process a query on multiple processor cores are ideal for the task. 194 | 195 | Statistical software fares pretty well. While not able to use multiple cores and no query planning, 196 | the data is in RAM in a format that makes this type of processing fast. (For the primitives studied, query 197 | planning does not play an important role such as for complex queries.) 198 | 199 | As an MPP-like tool but with data stored in Hadoop, Impala achieves higher performance than traditional 200 | Hadoop tools, but it is significantly slower than the full MPPs that can marshal their data as they 201 | please. 202 | 203 | While traditional databases like MySQL/PostgreSQL aim to be reasonable for both OLTP and analytical 204 | workloads, it comes as a surprize that even for 1GB sizes (which in today's terms is not large at all) 205 | they perform so poorly. 
206 | 207 | While it is no surprise that Hive (which generates traditional Hadoop map-reduce jobs) is the slowest, 208 | the order of magnitude (100x vs analytical databases) is a bit surprising. 209 | 210 | Spark is a new map-reduce like architecture that leverages in-memory computations, therefore significantly faster 211 | than Hive (especially when data is first read/cached in memory), but initially slower for example than Impala, 212 | though further optimizations (codenamed Tungsten in Spark 1.5, 1.6 and 2.0) obtained significant improvements. 213 | 214 | For ever larger datasets, statistical software will run out of memory, while traditional databases 215 | seem to become prohibitively slow. MPPs and big data systems can scale-out to multiple nodes, though 216 | the speed advantage of MPPs seems so large that it's hard to imagine anything but extreme data sizes 217 | when the big data systems can overcome the MPPs (though their respective scaling as a function of the number 218 | of nodes and concurrent users has to be studied). 
219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /plot.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(ggplot2) 3 | 4 | ticks <- c(0.1,0.3,1,3,10,30,100,300) 5 | 6 | d <- read.csv("plot.csv", sep="|", comment.char = "#") 7 | names(d) <- tolower(names(d)) 8 | 9 | d %>% ggplot(aes(x = aggregation, y = join, label = system, color = type)) + 10 | geom_text() + 11 | scale_x_log10(breaks = ticks, labels = ticks, limits = c(0.1,500)) + 12 | scale_y_log10(breaks = ticks, labels = ticks, limits = c(0.1,500)) + 13 | geom_smooth(aes(x = aggregation, y = join), method = "lm", se = FALSE, 14 | formula = y ~ x + 0, color="grey70") 15 | 16 | -------------------------------------------------------------------------------- /plot.csv: -------------------------------------------------------------------------------- 1 | | Type | System | Load/Read | Aggregation | Join | 2 | | Stats | R DT | 30 | 5.5 | 6.5 | 3 | | Stats | R DT key | 35 | 1.3 | 1.7 | 4 | | Stats | R dplyr | 30 | 45 | 40 | 5 | | Stats | Py pandas | 30 | 8 | 25 | 6 | | DB | MySQL MyISAM | 40 | 45 | 470 | 7 | | DB | MySQL InnoDB | 430 | 70 | 140 | 8 | | DB | PostgreSQL | 120 | 175 | 55 | 9 | | MPP | MPP-1 | 70 | 0.5 | 2.5 | 10 | | MPP | MPP-2 | 130 | 9 | 4 | 11 | | MPP | MPP-3 | 130 | 6.5 | 15 | 12 | | MPP | MPP-4 | 102 | 7.1 | 8.5 | 13 | | MPP | MonetDB | 70 | 7 | 1.5 | 14 | | Big Data | Hive | 20 | 250 | 80 | 15 | | Big Data | Impala | 20 | 25 | 15 | 16 | | Big Data | Spark | 20 | 40 | 30 | 17 | | Big Data | Spark-cached | 50 | 12 | 2.5 | 18 | | GPU | MapD | 12 | 0.5 | 0.1 | 19 | ## Spark Join 15->13 to avoid overlap in plot 20 | ## MapD join 0.07->0.1 to fit in scale 21 | 22 | -------------------------------------------------------------------------------- /plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/szilard/benchm-databases/3725093aaaa8d118b83a22181d23201f0f6225d7/plot.png -------------------------------------------------------------------------------- /query_plans.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### MySQL MyISAM 4 | 5 | 6 | mysql> explain extended select x, avg(y) as ym from d group by x order by ym desc limit 5; 7 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 8 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | 9 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 10 | | 1 | SIMPLE | d | ALL | NULL | NULL | NULL | NULL | 100000000 | 100.00 | Using temporary; Using filesort | 11 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 12 | 13 | 14 | mysql> explain extended select count(*) as cnt from d inner join dm on d.x = dm.x; 15 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 16 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | 17 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 18 | | 1 | SIMPLE | d | ALL | NULL | NULL | NULL | NULL | 100000000 | 100.00 | | 19 | | 1 | SIMPLE | dm | ref | ixm | ixm | 5 | benchm.d.x | 1 | 100.00 | Using where; Using index | 20 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 21 | 22 | 23 | 24 | 25 | 26 | ##### MySQL InnoDB 27 | 28 | 29 | mysql> explain extended select x, avg(y) as ym from d group by x order by ym desc limit 
5; 30 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 31 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | 32 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 33 | | 1 | SIMPLE | d | ALL | NULL | NULL | NULL | NULL | 100000319 | 100.00 | Using temporary; Using filesort | 34 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 35 | 36 | 37 | mysql> explain extended select count(*) as cnt from d inner join dm on d.x = dm.x; 38 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 39 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | 40 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 41 | | 1 | SIMPLE | d | ALL | NULL | NULL | NULL | NULL | 100000319 | 100.00 | | 42 | | 1 | SIMPLE | dm | ref | ixm | ixm | 5 | benchm.d.x | 1 | 100.00 | Using where; Using index | 43 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 44 | 45 | 46 | 47 | 48 | ###### PostgreSQL 49 | 50 | 51 | postgres=# explain select x, avg(y) as ym from d group by x order by ym desc limit 5; 52 | QUERY PLAN 53 | --------------------------------------------------------------------------------------- 54 | Limit (cost=24151458.52..24151458.53 rows=5 width=12) 55 | -> Sort (cost=24151458.52..24153916.54 rows=983208 width=12) 56 | Sort Key: (avg(y)) 57 | -> GroupAggregate (cost=23372848.55..24135127.79 rows=983208 width=12) 58 | -> Sort (cost=23372848.55..23622844.93 rows=99998552 width=12) 59 | Sort Key: x 60 | 
-> Seq Scan on d (cost=0.00..1540526.52 rows=99998552 width=12) 61 | 62 | 63 | postgres=# explain select count(*) as cnt from d inner join dm on d.x = dm.x; 64 | QUERY PLAN 65 | ------------------------------------------------------------------------------ 66 | Aggregate (cost=4356476.56..4356476.57 rows=1 width=0) 67 | -> Hash Join (cost=30832.00..4106480.18 rows=99998552 width=0) 68 | Hash Cond: (d.x = dm.x) 69 | -> Seq Scan on d (cost=0.00..1540526.52 rows=99998552 width=4) 70 | -> Hash (cost=14425.00..14425.00 rows=1000000 width=4) 71 | -> Seq Scan on dm (cost=0.00..14425.00 rows=1000000 width=4) 72 | 73 | 74 | 75 | 76 | 77 | ###### Impala 78 | 79 | 80 | [localhost:21000] > explain select x, avg(y) as ym from d group by x order by ym desc limit 5; 81 | Query: explain select x, avg(y) as ym from d group by x order by ym desc limit 5 82 | +------------------------------------------------------------------------------------+ 83 | | Explain String | 84 | +------------------------------------------------------------------------------------+ 85 | | Estimated Per-Host Requirements: Memory=2.00GB VCores=2 | 86 | | WARNING: The following tables are missing relevant table and/or column statistics. 
| 87 | | benchm.d | 88 | | | 89 | | 05:MERGING-EXCHANGE [UNPARTITIONED] | 90 | | | order by: avg(y) DESC | 91 | | | limit: 5 | 92 | | | | 93 | | 02:TOP-N [LIMIT=5] | 94 | | | order by: avg(y) DESC | 95 | | | | 96 | | 04:AGGREGATE [FINALIZE] | 97 | | | output: avg:merge(y) | 98 | | | group by: x | 99 | | | | 100 | | 03:EXCHANGE [HASH(x)] | 101 | | | | 102 | | 01:AGGREGATE | 103 | | | output: avg(y) | 104 | | | group by: x | 105 | | | | 106 | | 00:SCAN HDFS [benchm.d] | 107 | | partitions=1/1 files=1 size=2.32GB | 108 | +------------------------------------------------------------------------------------+ 109 | 110 | 111 | [localhost:21000] > explain select count(*) as cnt from d inner join dm on d.x = dm.x; 112 | Query: explain select count(*) as cnt from d inner join dm on d.x = dm.x 113 | +------------------------------------------------------------------------------------+ 114 | | Explain String | 115 | +------------------------------------------------------------------------------------+ 116 | | Estimated Per-Host Requirements: Memory=3.88GB VCores=2 | 117 | | WARNING: The following tables are missing relevant table and/or column statistics. 
| 118 | | benchm.d, benchm.dm | 119 | | | 120 | | 06:AGGREGATE [FINALIZE] | 121 | | | output: count:merge(*) | 122 | | | | 123 | | 05:EXCHANGE [UNPARTITIONED] | 124 | | | | 125 | | 03:AGGREGATE | 126 | | | output: count(*) | 127 | | | | 128 | | 02:HASH JOIN [INNER JOIN, BROADCAST] | 129 | | | hash predicates: d.x = dm.x | 130 | | | | 131 | | |--04:EXCHANGE [BROADCAST] | 132 | | | | | 133 | | | 01:SCAN HDFS [benchm.dm] | 134 | | | partitions=1/1 files=1 size=6.57MB | 135 | | | | 136 | | 00:SCAN HDFS [benchm.d] | 137 | | partitions=1/1 files=1 size=2.32GB | 138 | +------------------------------------------------------------------------------------+ 139 | 140 | 141 | 142 | 143 | 144 | ##### Hive 145 | 146 | 147 | hive> explain select x, avg(y) as ym from d group by x order by ym desc limit 5; 148 | OK 149 | STAGE DEPENDENCIES: 150 | Stage-1 is a root stage 151 | Stage-2 depends on stages: Stage-1 152 | Stage-0 is a root stage 153 | 154 | STAGE PLANS: 155 | Stage: Stage-1 156 | Map Reduce 157 | Map Operator Tree: 158 | TableScan 159 | alias: d 160 | Statistics: Num rows: 311108576 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 161 | Select Operator 162 | expressions: x (type: int), y (type: float) 163 | outputColumnNames: x, y 164 | Statistics: Num rows: 311108576 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 165 | Group By Operator 166 | aggregations: avg(y) 167 | keys: x (type: int) 168 | mode: hash 169 | outputColumnNames: _col0, _col1 170 | Statistics: Num rows: 311108576 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 171 | Reduce Output Operator 172 | key expressions: _col0 (type: int) 173 | sort order: + 174 | Map-reduce partition columns: _col0 (type: int) 175 | Statistics: Num rows: 311108576 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 176 | value expressions: _col1 (type: struct) 177 | Reduce Operator Tree: 178 | Group By Operator 179 | aggregations: avg(VALUE._col0) 180 | keys: KEY._col0 
(type: int) 181 | mode: mergepartial 182 | outputColumnNames: _col0, _col1 183 | Statistics: Num rows: 155554288 Data size: 1244434304 Basic stats: COMPLETE Column stats: NONE 184 | Select Operator 185 | expressions: _col0 (type: int), _col1 (type: double) 186 | outputColumnNames: _col0, _col1 187 | Statistics: Num rows: 155554288 Data size: 1244434304 Basic stats: COMPLETE Column stats: NONE 188 | File Output Operator 189 | compressed: false 190 | table: 191 | input format: org.apache.hadoop.mapred.SequenceFileInputFormat 192 | output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat 193 | serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe 194 | 195 | Stage: Stage-2 196 | Map Reduce 197 | Map Operator Tree: 198 | TableScan 199 | Reduce Output Operator 200 | key expressions: _col1 (type: double) 201 | sort order: - 202 | Statistics: Num rows: 155554288 Data size: 1244434304 Basic stats: COMPLETE Column stats: NONE 203 | value expressions: _col0 (type: int), _col1 (type: double) 204 | Reduce Operator Tree: 205 | Extract 206 | Statistics: Num rows: 155554288 Data size: 1244434304 Basic stats: COMPLETE Column stats: NONE 207 | Limit 208 | Number of rows: 5 209 | Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE 210 | File Output Operator 211 | compressed: false 212 | Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE 213 | table: 214 | input format: org.apache.hadoop.mapred.TextInputFormat 215 | output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat 216 | serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe 217 | 218 | Stage: Stage-0 219 | Fetch Operator 220 | limit: 5 221 | 222 | 223 | 224 | hive> explain select count(*) as cnt from d inner join dm on d.x = dm.x; 225 | OK 226 | STAGE DEPENDENCIES: 227 | Stage-5 is a root stage 228 | Stage-2 depends on stages: Stage-5 229 | Stage-0 is a root stage 230 | 231 | STAGE PLANS: 232 | Stage: Stage-5 233 | Map Reduce 
Local Work 234 | Alias -> Map Local Tables: 235 | dm 236 | Fetch Operator 237 | limit: -1 238 | Alias -> Map Local Operator Tree: 239 | dm 240 | TableScan 241 | alias: dm 242 | Statistics: Num rows: 1722224 Data size: 6888896 Basic stats: COMPLETE Column stats: NONE 243 | HashTable Sink Operator 244 | condition expressions: 245 | 0 246 | 1 247 | keys: 248 | 0 x (type: int) 249 | 1 x (type: int) 250 | 251 | Stage: Stage-2 252 | Map Reduce 253 | Map Operator Tree: 254 | TableScan 255 | alias: d 256 | Statistics: Num rows: 622217152 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 257 | Map Join Operator 258 | condition map: 259 | Inner Join 0 to 1 260 | condition expressions: 261 | 0 262 | 1 263 | keys: 264 | 0 x (type: int) 265 | 1 x (type: int) 266 | Statistics: Num rows: 684438912 Data size: 2737755648 Basic stats: COMPLETE Column stats: NONE 267 | Select Operator 268 | Statistics: Num rows: 684438912 Data size: 2737755648 Basic stats: COMPLETE Column stats: NONE 269 | Group By Operator 270 | aggregations: count() 271 | mode: hash 272 | outputColumnNames: _col0 273 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 274 | Reduce Output Operator 275 | sort order: 276 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 277 | value expressions: _col0 (type: bigint) 278 | Local Work: 279 | Map Reduce Local Work 280 | Reduce Operator Tree: 281 | Group By Operator 282 | aggregations: count(VALUE._col0) 283 | mode: mergepartial 284 | outputColumnNames: _col0 285 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 286 | Select Operator 287 | expressions: _col0 (type: bigint) 288 | outputColumnNames: _col0 289 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 290 | File Output Operator 291 | compressed: false 292 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 293 | table: 294 | input format: 
org.apache.hadoop.mapred.TextInputFormat 295 | output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat 296 | serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe 297 | 298 | Stage: Stage-0 299 | Fetch Operator 300 | limit: -1 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | ######### Spark SQL 310 | 311 | 312 | explain select x, avg(y) as ym from d group by x order by ym desc limit 5; 313 | 314 | == Physical Plan == 315 | TakeOrdered 5, [ym#23 DESC] 316 | Aggregate false, [x#29], [x#29,(CAST(SUM(PartialSum#33), DoubleType) / CAST(SUM(PartialCount#34L), DoubleType)) AS ym#23] 317 | Exchange (HashPartitioning [x#29], 200) 318 | Aggregate true, [x#29], [x#29,COUNT(CAST(y#30, DoubleType)) AS PartialCount#34L,SUM(CAST(y#30, DoubleType)) AS PartialSum#33] 319 | HiveTableScan [x#29,y#30], (MetastoreRelation benchm, d, None), None 320 | 321 | 322 | -- cached: 323 | 324 | == Physical Plan == 325 | TakeOrdered 5, [ym#105 DESC] 326 | Aggregate false, [x#111], [x#111,(CAST(SUM(PartialSum#125), DoubleType) / CAST(SUM(PartialCount#126L), DoubleType)) AS ym#105] 327 | Exchange (HashPartitioning [x#111], 200) 328 | Aggregate true, [x#111], [x#111,COUNT(CAST(y#112, DoubleType)) AS PartialCount#126L,SUM(CAST(y#112, DoubleType)) AS PartialSum#125] 329 | InMemoryColumnarTableScan [x#111,y#112], [], (InMemoryRelation [x#111,y#112], true, 10000, StorageLevel(true, true, false, true, 1), (HiveTableScan [x#57,y#58], (MetastoreRelation benchm, d, None), None), Some(d)) 330 | 331 | 332 | 333 | 334 | explain select count(*) as cnt from d inner join dm on d.x = dm.x; 335 | 336 | == Physical Plan == 337 | Aggregate false, [], [Coalesce(SUM(PartialCount#51L),0) AS cnt#41L] 338 | Exchange SinglePartition 339 | Aggregate true, [], [COUNT(1) AS PartialCount#51L] 340 | Project [] 341 | ShuffledHashJoin [x#47], [x#49], BuildRight 342 | Exchange (HashPartitioning [x#47], 200) 343 | HiveTableScan [x#47], (MetastoreRelation benchm, d, None), None 344 | Exchange 
(HashPartitioning [x#49], 200) 345 | HiveTableScan [x#49], (MetastoreRelation benchm, dm, None), None 346 | 347 | 348 | -- cached: 349 | 350 | == Physical Plan == 351 | Aggregate false, [], [Coalesce(SUM(PartialCount#158L),0) AS cnt#133L] 352 | Exchange SinglePartition 353 | Aggregate true, [], [COUNT(1) AS PartialCount#158L] 354 | Project [] 355 | BroadcastHashJoin [x#139], [x#141], BuildRight 356 | InMemoryColumnarTableScan [x#139], [], (InMemoryRelation [x#139,y#140], true, 10000, StorageLevel(true, true, false, true, 1), (HiveTableScan [x#57,y#58], (MetastoreRelation benchm, d, None), None), Some(d)) 357 | InMemoryColumnarTableScan [x#141], [], (InMemoryRelation [x#141], true, 10000, StorageLevel(true, true, false, true, 1), (HiveTableScan [x#87], (MetastoreRelation benchm, dm, None), None), Some(dm)) 358 | 359 | 360 | 361 | -------------------------------------------------------------------------------- /versions.txt: -------------------------------------------------------------------------------- 1 | 2 | R 3.1.2 3 | data.table 1.9.4 4 | dplyr 0.4.1 5 | 6 | Python 2.7.8 7 | Anaconda 2.1.0 8 | Ipython 2.2.0 9 | pandas 0.14.1 10 | 11 | 12 | MySQL 5.5.41 13 | 14 | PostgreSQL 9.3.6 15 | 16 | 17 | Amazon (AWS) EMR 3.5.0 18 | ~ 19 | Hive 0.13.1 20 | Impala 1.2.4 21 | Spark 1.2.1 22 | 23 | ==================== 24 | 25 | Update: 26 | 27 | Amazon (AWS) EMR 4.3.0 28 | ~ 29 | Hive 1.0.0 30 | Spark 1.6 31 | (Impala not available in EMR anymore) 32 | 33 | 34 | ==================== 35 | 36 | Update: 37 | 38 | emr-5.0.0 39 | ~ 40 | Spark 2.0.0 41 | 42 | 43 | --------------------------------------------------------------------------------