├── .gitignore ├── 0-gendata.txt ├── 1a-R-datatable.txt ├── 1b-R-dplyr.txt ├── 1c-python-pandas.txt ├── 1d-R-MonetDBLite.txt ├── 2a-mysql.txt ├── 2b-postgres.txt ├── 3a-hive.txt ├── 3b-impala.txt ├── 3c-spark.txt ├── 4-mapd.txt ├── README.md ├── plot.R ├── plot.csv ├── plot.png ├── query_plans.txt └── versions.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .*.sw? 6 | benchm-databases.Rproj 7 | -------------------------------------------------------------------------------- /0-gendata.txt: -------------------------------------------------------------------------------- 1 | 2 | time R --vanilla --quiet << EOF 3 | set.seed(123) 4 | n <- 100e6 5 | m <- 1e6 6 | d <- data.frame(x = sample(m, n, replace=TRUE), y = runif(n)) 7 | dm <- data.frame(x = sample(m)) 8 | write.table(d, file = "/tmp/d.csv", row.names = FALSE, col.names = FALSE, sep = ",") 9 | write.table(dm, file = "/tmp/dm.csv", row.names = FALSE, col.names = FALSE, sep = ",") 10 | EOF 11 | ## 3m52.052s 12 | 13 | 14 | du -h /tmp/d*csv 15 | ## 2.4G /tmp/d.csv 16 | ## 6.6M /tmp/dm.csv 17 | 18 | 19 | -------------------------------------------------------------------------------- /1a-R-datatable.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | library(data.table) 4 | 5 | d <- fread("/tmp/d.csv") 6 | dm <- fread("/tmp/dm.csv") 7 | setnames(d, c("x","y")) 8 | setnames(dm, "x") 9 | 10 | tables() 11 | ## NAME NROW NCOL MB COLS KEY 12 | ## [1,] d 100,000,000 2 1,145 x,y 13 | ## [2,] dm 1,000,000 1 4 x 14 | ## Total: 1,149MB 15 | 16 | 17 | system.time( 18 | print(head(d[, list(ym=mean(y)), by=x][order(-ym)],5)) 19 | ) 20 | 21 | 22 | system.time( 23 | setkey(d, x) 24 | ) 25 | 26 | system.time( 27 | print(nrow(d[dm, nomatch=0])) 28 | ) 29 | 30 | 31 | system.time( 32 | print(head(d[, list(ym=mean(y)), by=x][order(-ym)],5)) 33 | ) 34 | 35 | 36 | 37 | 
-------------------------------------------------------------------------------- /1b-R-dplyr.txt: -------------------------------------------------------------------------------- 1 | 2 | library(data.table) 3 | library(dplyr) 4 | 5 | d <- tbl_df(as.data.frame(fread("/tmp/d.csv"))) 6 | dm <- tbl_df(as.data.frame(fread("/tmp/dm.csv"))) 7 | names(d) <- c("x","y") 8 | names(dm) <- "x" 9 | 10 | 11 | system.time( 12 | print(d %>% group_by(x) %>% summarize(ym = mean(y)) %>% arrange(desc(ym)) %>% head(5)) 13 | ) 14 | 15 | system.time( 16 | print(d %>% inner_join(dm, by="x") %>% summarize(n = n())) 17 | ) 18 | 19 | 20 | -------------------------------------------------------------------------------- /1c-python-pandas.txt: -------------------------------------------------------------------------------- 1 | 2 | ipython 3 | 4 | 5 | import pandas as pd 6 | 7 | %time d = pd.read_csv("/tmp/d.csv", header = None, names=["x","y"]) 8 | %time dm = pd.read_csv("/tmp/dm.csv", header = None, names=["x"]) 9 | 10 | 11 | %time d.groupby("x", as_index = False)["y"].mean().sort_values("y", ascending = False).head(5) 12 | %time d.groupby("x")["y"].mean().sort_values(ascending = False)[:5] 13 | 14 | %time pd.merge(d, dm).shape[0] 15 | -------------------------------------------------------------------------------- /1d-R-MonetDBLite.txt: -------------------------------------------------------------------------------- 1 | # install.packages(c("MonetDB.R", "MonetDBLite"), repos=c("http://dev.monetdb.org/Assets/R/", "http://cran.rstudio.com/")) 2 | 3 | library(MonetDB.R) 4 | 5 | con <- dbConnect(MonetDBLite()) 6 | dbSendQuery(con, "create table d(x int, y double)") 7 | dbSendQuery(con, "create table dm(x int)") 8 | 9 | dbSendQuery(con, "copy into d from '/tmp/d.csv' using delimiters ','") 10 | dbSendQuery(con, "copy into dm from '/tmp/dm.csv' using delimiters ','") 11 | 12 | system.time(dbGetQuery(con, " 13 | select x, avg(y) as ym 14 | from d 15 | group by x 16 | order by ym desc 17 | limit 5; 18 
| ")) 19 | 20 | # 7s on my box 21 | 22 | system.time(dbGetQuery(con, " 23 | select count(*) as cnt 24 | from d 25 | inner join dm on d.x = dm.x; 26 | ")) 27 | 28 | # 1.7s on my box -------------------------------------------------------------------------------- /2a-mysql.txt: -------------------------------------------------------------------------------- 1 | 2 | sudo apt-get install mysql-server 3 | 4 | /etc/mysql/my.cnf: 5 | tmp_table_size = 128M ## implicit default 16M 6 | max_heap_table_size = 128M ## implicit default 16M 7 | innodb_buffer_pool_size = 16G ## implicit default 128M 8 | 9 | service mysql restart 10 | 11 | 12 | mysql -u root --local-infile 13 | 14 | create database benchm; 15 | use benchm 16 | 17 | create table d(x int, y float) engine = MyISAM; 18 | create table dm(x int) engine = MyISAM; 19 | 20 | load data local infile '/tmp/dm.csv' into table dm fields terminated by ','; 21 | load data local infile '/tmp/d.csv' into table d fields terminated by ','; 22 | 23 | create index ixm on dm(x); 24 | 25 | 26 | select x, avg(y) as ym 27 | from d 28 | group by x 29 | order by ym desc 30 | limit 5; 31 | 32 | select count(*) as cnt 33 | from d 34 | inner join dm on d.x = dm.x; 35 | 36 | 37 | drop table d; 38 | drop table dm; 39 | create table d(x int, y float); 40 | create table dm(x int); 41 | 42 | load data local infile '/tmp/dm.csv' into table dm fields terminated by ','; 43 | load data local infile '/tmp/d.csv' into table d fields terminated by ','; 44 | 45 | create index ixm on dm(x); 46 | 47 | 48 | select x, avg(y) as ym 49 | from d 50 | group by x 51 | order by ym desc 52 | limit 5; 53 | 54 | select count(*) as cnt 55 | from d 56 | inner join dm on d.x = dm.x; 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /2b-postgres.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | apt-get install postgresql-9.3 4 | 5 | sudo -u postgres psql 6 | 7 | \timing 8 | 9 | 
create table d(x int, y float); 10 | create table dm(x int); 11 | 12 | copy dm from '/tmp/dm.csv' delimiter ','; 13 | copy d from '/tmp/d.csv' delimiter ','; 14 | 15 | create index ixm on dm(x); 16 | 17 | 18 | select x, avg(y) as ym 19 | from d 20 | group by x 21 | order by ym desc 22 | limit 5; 23 | 24 | select count(*) as cnt 25 | from d 26 | inner join dm on d.x = dm.x; 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /3a-hive.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | sudo -u hive hive 5 | 6 | dfs -mkdir /tmp/benchm; 7 | dfs -mkdir /tmp/benchm/d; 8 | dfs -mkdir /tmp/benchm/dm; 9 | 10 | dfs -put /tmp/dm.csv /tmp/benchm/dm; 11 | dfs -put /tmp/d.csv /tmp/benchm/d; 12 | 13 | create database benchm; 14 | use benchm; 15 | 16 | create external table d (x int, y float) 17 | row format delimited fields terminated by ',' lines terminated by '\n' 18 | stored as textfile location '/tmp/benchm/d'; 19 | 20 | create external table dm (x int) 21 | row format delimited fields terminated by ',' lines terminated by '\n' 22 | stored as textfile location '/tmp/benchm/dm'; 23 | 24 | 25 | select x, avg(y) as ym 26 | from d 27 | group by x 28 | order by ym desc 29 | limit 5; 30 | 31 | select count(*) as cnt 32 | from d 33 | inner join dm on d.x = dm.x; 34 | 35 | 36 | -------------------------------------------------------------------------------- /3b-impala.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | impala-shell 4 | 5 | invalidate metadata; 6 | use benchm; 7 | 8 | 9 | select x, avg(y) as ym 10 | from d 11 | group by x 12 | order by ym desc 13 | limit 5; 14 | 15 | select count(*) as cnt 16 | from d 17 | inner join dm on d.x = dm.x; 18 | 19 | -------------------------------------------------------------------------------- /3c-spark.txt: -------------------------------------------------------------------------------- 1 | 2 | 
Amazon (AWS) EMR 3 | Bootstrap Actions: 4 | 5 | S3 location: s3://support.elasticmapreduce/spark/install-spark 6 | Optional arguments: -x 7 | 8 | -x: Prepares the default Spark config for dedicated Spark single application use [1 executor per node, 9 | num of executors equivalent to core nodes at creation of cluster, all memory/vcores allocated] 10 | 11 | 12 | 13 | /home/hadoop/spark/bin/spark-sql --master yarn-client 14 | 15 | use benchm; 16 | 17 | 18 | select x, avg(y) as ym 19 | from d 20 | group by x 21 | order by ym desc 22 | limit 5; 23 | 24 | select count(*) as cnt 25 | from d 26 | inner join dm on d.x = dm.x; 27 | 28 | 29 | 30 | cache table dm; 31 | cache table d; 32 | -------------------------------------------------------------------------------- /4-mapd.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | AWS Marketplace: 4 | https://aws.amazon.com/marketplace/pp/B071H71L2Y 5 | 6 | EC2 Launch p2.xlarge 7 | 8 | 9 | ssh -i XXX centos@ec2-XXX 10 | 11 | 12 | sudo yum install screen htop R 13 | 14 | 15 | R 16 | 17 | set.seed(123) 18 | n <- 100e6 19 | m <- 1e6 20 | d <- data.frame(x = sample(m, n, replace=TRUE), y = runif(n)) 21 | dm <- data.frame(x = sample(m)) 22 | write.table(d, file = "/tmp/d.csv", row.names = FALSE, col.names = FALSE, sep = ",") 23 | write.table(dm, file = "/tmp/dm.csv", row.names = FALSE, col.names = FALSE, sep = ",") 24 | 25 | 26 | 27 | [as root:] ./cloud-init/per-once-bootstrap-mapd.sh 28 | 29 | 30 | /raidStorage/installs/mapd-ce-3.0.0-20170507-7626e30-Linux-x86_64-render/bin/mapdql -p XXX 31 | 32 | 33 | \timing 34 | 35 | create table d(x int, y float); 36 | create table dm(x int); 37 | 38 | copy dm from '/tmp/dm.csv' with (header = 'false'); 39 | copy d from '/tmp/d.csv' with (header = 'false'); 40 | --Loaded: 100000000 recs, Rejected: 0 recs in 52.022000 secs 41 | 42 | 43 | select x, avg(y) as ym 44 | from d 45 | group by x 46 | order by ym desc 47 | limit 5; 48 | -- Execution time: 1342 
ms, Total time: 1343 ms 49 | -- Execution time: 1131 ms, Total time: 1132 ms 50 | 51 | select count(*) as cnt 52 | from d 53 | inner join dm on d.x = dm.x; 54 | -- Execution time: 199 ms, Total time: 199 ms 55 | -- Execution time: 83 ms, Total time: 83 ms 56 | 57 | 58 | 59 | 60 | 61 | 62 | ====================== 63 | 64 | On the above MapD is using CPU on the aggregation query and GPU only on the join. 65 | 66 | 67 | According to MapD's @dwayneberry: 68 | 69 | I ran your steps on my laptop (1 x 1060 GPU) and these are the times I got. 70 | I have checked the code on a kepler machine (k80machine like you are running on AWS) and I notice the IR code is targeting the CPU for the AVG(float) query. My laptop is pascal so didn't see the CPU behaviour. 71 | We have recently changed this so more operations will run on Float on kepler architecture. I will get that applied to this edition as soon as possible. 72 | 73 | https://community.mapd.com/t/toy-benchmark-on-aws-cloud-gpu-usage/34/3 74 | 75 | 76 | @dwayneberry's timing: 77 | 78 | mapdql> select x, avg(y) as ym from d group by x order by ym desc limit 5; 79 | Execution time: 464 ms, Total time: 465 ms 80 | Execution time: 467 ms, Total time: 467 ms 81 | 82 | mapdql> select count(*) as cnt 83 | ..> from d 84 | ..> inner join dm on d.x = dm.x; 85 | 86 | Execution time: 141 ms, Total time: 142 ms 87 | Execution time: 67 ms, Total time: 67 ms 88 | Execution time: 69 ms, Total time: 69 ms 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### Simple/basic/limited/incomplete benchmark for operations on tabular data (aggregates and joins) 3 | 4 | For structured/tabular/relational data most transformations for data analysis are based on a few 5 | primitives. Computationally, aggregates and joins are taking the majority 6 | of time. 
This project aims at a *minimal* benchmark of various tools 7 | (statistical software, databases etc.) for working with tabular data of moderately 8 | large sizes (but still within the reach of interactive data analysis - response times 9 | of a few seconds on commodity hardware). 10 | 11 | 12 | #### Tools 13 | 14 | The tools analysed are: 15 | 16 | 1. Statistical software: R (dplyr, data.table) and Python (pandas) 17 | 2. Databases (MySQL, PostgreSQL) 18 | 3. Analytical databases (3 MPP/columnar stores) 19 | 4. "Big data" systems (Hive, Impala, Spark) 20 | 21 | **Update:** MapD open sourced their analytical database using GPUs, which should be a new class here. 22 | 23 | All but the analytical databases are open source. The analytical databases have free (community) editions 24 | or inexpensive cloud offerings. 25 | 26 | **Update:** @hannesmuehleisen submitted results for MonetDB, an analytical/columnar db that unlike all the 27 | above is open source. 28 | 29 | **Update:** @JazzFan submitted results for another analytical db ("MPP-4" here) 30 | 31 | R/Python operate in memory and can integrate the tabular operations 32 | with rich visualization, statistical modeling etc. On the other hand, they are limited to data sizes 33 | that fit in RAM, run single-threaded, and unlike the other systems do not have a query optimizer. 34 | 35 | MySQL/PostgreSQL have been designed for mixed OLTP/analytical workloads and while 36 | the database can use multiple cores to run different queries, any given query runs 37 | on one processor core only. 38 | 39 | The analytical databases and the "big data" systems can scale-out to multiple nodes (and use all cores on them). 40 | The analytical (MPP) databases have parallel/shared-nothing architecture, columnar storage, compression and are specifically 41 | designed for large aggregations and joins. 
42 | 43 | Hive/Spark are based on the map-reduce paradigm, SQL operations are translated to 44 | map/shuffle/reduce primitives (Hive generates traditional Hadoop jobs, while Spark leverages in-memory 45 | architecture). Impala uses MPP-like technology to query data in HDFS (Hadoop's distributed file system). 46 | 47 | 48 | #### Data 49 | 50 | The data is [randomly generated](0-gendata.txt): 51 | one table `d` (`x` integer, `y` float) of 100 million rows for aggregation 52 | (`x` takes 1 million distinct values) and another table `dm` (`x` integer) of 1 million rows for the join only. 53 | (The larger table `d` is of ~2GB size in the CSV format and results in ~1GB usage when loaded in database or 54 | read in memory.) 55 | 56 | 57 | #### Transformations 58 | 59 | SQL query for aggregation: 60 | 61 | ``` 62 | select x, avg(y) as ym 63 | from d 64 | group by x 65 | order by ym desc 66 | limit 5; 67 | ``` 68 | 69 | and for join: 70 | 71 | ``` 72 | select count(*) as cnt 73 | from d 74 | inner join dm on d.x = dm.x; 75 | ``` 76 | 77 | 78 | #### Setup 79 | 80 | The tests have been performed on a m3.2xlarge EC2 instance (8 cores, 30GB RAM) running Ubuntu 14.04. 81 | The queries have been run 2 times and the second 82 | time was recorded (warm run). In this case various caching mechanisms come into play and the data is 83 | effectively in RAM. 84 | 85 | While I'm a great fan of reproducibility, in this benchmark I'm more interested in orders 86 | of magnitude and not strict precision and exact reproducibility. With some more work one can create install and test 87 | scripts that can run in a fully automated fashion for complete reproducibility. 88 | 89 | The software tools have been installed with the latest available 90 | [versions](versions.txt) 91 | using standard instructions with no tuning 92 | (with a few exceptions as noted). For Hive/Impala/Spark Amazon's EMR was used to avoid a more elaborate installation. 
93 | 94 | 95 | The following running times have been measured: 96 | 97 | 1. For R/Python data has been read from csv file and then aggregates/joins happen in memory. 98 | 2. For MySQL/Postgres and the analytical databases, the data has to be loaded first into the database, and only then 99 | can one run queries. 100 | 3. For the "Big Data" systems the data has to be copied into HDFS (much faster than loading it to a database); 101 | optionally it can be transformed into a columnar format (such as parquet). Queries can run readily. For Spark the data 102 | can be either read from HDFS on the fly or loaded into memory (cached) and then queried. 103 | 104 | 105 | 106 | #### Limitations 107 | 108 | This is far from a comprehensive benchmark. It is my attempt to *quickly* get an idea of the order 109 | of magnitude of running times for aggregations and joins on datasets of sizes of interest to *me* at the moment. 110 | 111 | The results are expected to vary with hardware, tuning, and likely even more with dataset size, 112 | dataset structure, or the number of nodes for the scale-out systems etc. Perhaps the strongest 113 | critique against the general relevance of this benchmark could be that it uses a certain 114 | data size and structure only, instead of examining a variety of. 115 | 116 | I'm not looking in detail either at the scaling by the number of nodes for the 117 | big data systems as I'm primarily concerned with the efficiency on a single or a small number of nodes. 118 | 119 | In the tests the only computation running on the system is the target query, therefore I'm not 120 | studying the behavior in function of the load (e.g. the number of concurrent queries running on the system). 121 | 122 | Finally, one might say that queries in practice are complex and the running times depend not only 123 | on how fast are these primitives, but also on how the query optimizer can deal with complexity. 
Again, 124 | a comprehensive SQL benchmark is out of the scope here (but see e.g. TPC-DS). 125 | 126 | 127 | 128 | #### Results 129 | 130 | (times in seconds) 131 | 132 | | Type | System | Load/Read | Aggregation | Join | 133 | | ---------- | ----------------------------------- | ------------- | -------------- | -------- | 134 | | Stats | [R DT](1a-R-datatable.txt) | 30 | 5.5 | 6.5 | 135 | | Stats | [R DT key](1a-R-datatable.txt) | 35 | 1.3 | 1.7 | 136 | | Stats | [R dplyr](1b-R-dplyr.txt) | 30 | 45 | 40 | 137 | | Stats | [Py pandas](1c-python-pandas.txt) | 30 | 8 | 25 | 138 | | DB | [MySQL MyISAM](2a-mysql.txt) | 40 | 45 | 470 | 139 | | DB | [MySQL InnoDB](2a-mysql.txt) | 430 | 70 | 140 | 140 | | DB | [PostgreSQL](2b-postgres.txt) | 120 | 175 | 55 | 141 | | MPP | MPP-1 | 70 | 0.5 | 2.5 | 142 | | MPP | MPP-2 | 130 | 9 | 4 | 143 | | MPP | MPP-3 | 130 | 6.5 | 15 | 144 | | MPP | MPP-4 | 102 | 7.1 | 8.5 | 145 | | MPP | MonetDB | 70 | 7 | 1.5 | 146 | | Big Data | [Hive](3a-hive.txt) | 20 | 250 | 80 | 147 | | Big Data | [Impala](3b-impala.txt) | 20 | 25 | 15 | 148 | | Big Data | [Spark](3c-spark.txt) | 20 | 120 | 130 | 149 | | Big Data | [Spark cached](3c-spark.txt) | 50 | 90 | 17 | 150 | | GPU | [MapD](4-mapd.txt) | 12 | 0.5 | 0.07 | 151 | 152 | 153 | **Update:** [Timings](https://gist.github.com/szilard/7d810877fcc6b019a478#gistcomment-1693662) 154 | by @hannesmuehleisen on MonetDB on same m3.2xlarge: aggregation 7.0, join 1.5. 155 | 156 | **Update:** With newer [versions](versions.txt) of Hive and Spark: 157 | 158 | | System | Aggr | Join | 159 | |----------------|---------|----------| 160 | | Hive | 250 | 100 | 161 | | Spark | 40 | 25 | 162 | | Spark cached | 12 | 2.5 | 163 | 164 | (also one should use parquet rather than csv for un-cached Spark). 165 | 166 | **Update:** MapD [timings](https://community.mapd.com/t/toy-benchmark-on-aws-cloud-gpu-usage/34/3) by @dwayneberry 167 | on laptop with GPU (1060) show impressive 0.5sec for aggregation and 70ms (!!) 
for the join. 168 | 169 | ![plots](plot.png) 170 | 171 | Multinode results (with older versions of Spark/Hive): 172 | 173 | | System | Aggr 1-node | Aggr 5-node | Join 1-node | Join 5-node | 174 | | ------------- | ----------- | ----------- | ----------- | ----------- | 175 | | Hive | 250 | 160 | 80 | 50 | 176 | | Impala | 25 | 16 | 15 | 10 | 177 | | Spark | 120 | 50 | 130 | 30 | 178 | | Spark cached | 90 | 40 | 17 | 10 | 179 | 180 | 181 | #### Discussions 182 | 183 | It seems that for data manipulation with ~100 million rows / ~1GB datasets MPP databases 184 | are the fastest, next statistical software, then traditional databases, and finally big data 185 | systems (see graph above). Some more insights can be gained from 186 | [query plans](query_plans.txt). 187 | 188 | The largest surprise (to me) is that traditional databases (MySQL/PostgreSQL) perform so poorly 189 | at this size. 190 | 191 | Naturally, analytical databases are the fastest. Even on 1 node, their columnar architecture and 192 | the features that come with it (efficient binary storage format, compression) along with the 193 | ability to process a query on multiple processor cores are ideal for the task. 194 | 195 | Statistical software fares pretty well. While not able to use multiple cores and no query planning, 196 | the data is in RAM in a format that makes this type of processing fast. (For the primitives studied, query 197 | planning does not play an important role such as for complex queries.) 198 | 199 | As an MPP-like tool but with data stored in Hadoop, Impala achieves higher performance than traditional 200 | Hadoop tools, but it is significantly slower than the full MPPs that can marshal their data as they 201 | please. 202 | 203 | While traditional databases like MySQL/PostgreSQL aim to be reasonable for both OLTP and analytical 204 | workloads, it comes as a surprize that even for 1GB sizes (which in today's terms is not large at all) 205 | they perform so poorly. 
206 | 207 | While it is no surprise that Hive (which generates traditional Hadoop map-reduce jobs) is the slowest, 208 | the order of magnitude (100x vs analytical databases) is a bit surprising. 209 | 210 | Spark is a new map-reduce like architecture that leverages in-memory computations, therefore significantly faster 211 | than Hive (especially when data is first read/cached in memory), but initially slower for example than Impala, 212 | though further optimizations (codenamed Tungsten in Spark 1.5, 1.6 and 2.0) obtained significant improvements. 213 | 214 | For ever larger datasets, statistical software will run out of memory, while traditional databases 215 | seem to become prohibitively slow. MPPs and big data systems can scale-out to multiple nodes, though 216 | the speed advantage of MPPs seems so large that it's hard to imagine anything but extreme data sizes 217 | when the big data systems can overcome the MPPs (though their respective scaling as a function of the number 218 | of nodes and concurrent users has to be studied). 
219 | 220 | 221 | 222 | -------------------------------------------------------------------------------- /plot.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(ggplot2) 3 | 4 | ticks <- c(0.1,0.3,1,3,10,30,100,300) 5 | 6 | d <- read.csv("plot.csv", sep="|", comment.char = "#") 7 | names(d) <- tolower(names(d)) 8 | 9 | d %>% ggplot(aes(x = aggregation, y = join, label = system, color = type)) + 10 | geom_text() + 11 | scale_x_log10(breaks = ticks, labels = ticks, limits = c(0.1,500)) + 12 | scale_y_log10(breaks = ticks, labels = ticks, limits = c(0.1,500)) + 13 | geom_smooth(aes(x = aggregation, y = join), method = "lm", se = FALSE, 14 | formula = y ~ x + 0, color="grey70") 15 | 16 | -------------------------------------------------------------------------------- /plot.csv: -------------------------------------------------------------------------------- 1 | | Type | System | Load/Read | Aggregation | Join | 2 | | Stats | R DT | 30 | 5.5 | 6.5 | 3 | | Stats | R DT key | 35 | 1.3 | 1.7 | 4 | | Stats | R dplyr | 30 | 45 | 40 | 5 | | Stats | Py pandas | 30 | 8 | 25 | 6 | | DB | MySQL MyISAM | 40 | 45 | 470 | 7 | | DB | MySQL InnoDB | 430 | 70 | 140 | 8 | | DB | PostgreSQL | 120 | 175 | 55 | 9 | | MPP | MPP-1 | 70 | 0.5 | 2.5 | 10 | | MPP | MPP-2 | 130 | 9 | 4 | 11 | | MPP | MPP-3 | 130 | 6.5 | 15 | 12 | | MPP | MPP-4 | 102 | 7.1 | 8.5 | 13 | | MPP | MonetDB | 70 | 7 | 1.5 | 14 | | Big Data | Hive | 20 | 250 | 80 | 15 | | Big Data | Impala | 20 | 25 | 15 | 16 | | Big Data | Spark | 20 | 40 | 30 | 17 | | Big Data | Spark-cached | 50 | 12 | 2.5 | 18 | | GPU | MapD | 12 | 0.5 | 0.1 | 19 | ## Spark Join 15->13 to avoid overlap in plot 20 | ## MapD join 0.07->0.1 to fit in scale 21 | 22 | -------------------------------------------------------------------------------- /plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/szilard/benchm-databases/3725093aaaa8d118b83a22181d23201f0f6225d7/plot.png -------------------------------------------------------------------------------- /query_plans.txt: -------------------------------------------------------------------------------- 1 | 2 | 3 | ##### MySQL MyISAM 4 | 5 | 6 | mysql> explain extended select x, avg(y) as ym from d group by x order by ym desc limit 5; 7 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 8 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | 9 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 10 | | 1 | SIMPLE | d | ALL | NULL | NULL | NULL | NULL | 100000000 | 100.00 | Using temporary; Using filesort | 11 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 12 | 13 | 14 | mysql> explain extended select count(*) as cnt from d inner join dm on d.x = dm.x; 15 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 16 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | 17 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 18 | | 1 | SIMPLE | d | ALL | NULL | NULL | NULL | NULL | 100000000 | 100.00 | | 19 | | 1 | SIMPLE | dm | ref | ixm | ixm | 5 | benchm.d.x | 1 | 100.00 | Using where; Using index | 20 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 21 | 22 | 23 | 24 | 25 | 26 | ##### MySQL InnoDB 27 | 28 | 29 | mysql> explain extended select x, avg(y) as ym from d group by x order by ym desc limit 
5; 30 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 31 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | 32 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 33 | | 1 | SIMPLE | d | ALL | NULL | NULL | NULL | NULL | 100000319 | 100.00 | Using temporary; Using filesort | 34 | +----+-------------+-------+------+---------------+------+---------+------+-----------+----------+---------------------------------+ 35 | 36 | 37 | mysql> explain extended select count(*) as cnt from d inner join dm on d.x = dm.x; 38 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 39 | | id | select_type | table | type | possible_keys | key | key_len | ref | rows | filtered | Extra | 40 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 41 | | 1 | SIMPLE | d | ALL | NULL | NULL | NULL | NULL | 100000319 | 100.00 | | 42 | | 1 | SIMPLE | dm | ref | ixm | ixm | 5 | benchm.d.x | 1 | 100.00 | Using where; Using index | 43 | +----+-------------+-------+------+---------------+------+---------+------------+-----------+----------+--------------------------+ 44 | 45 | 46 | 47 | 48 | ###### PostgreSQL 49 | 50 | 51 | postgres=# explain select x, avg(y) as ym from d group by x order by ym desc limit 5; 52 | QUERY PLAN 53 | --------------------------------------------------------------------------------------- 54 | Limit (cost=24151458.52..24151458.53 rows=5 width=12) 55 | -> Sort (cost=24151458.52..24153916.54 rows=983208 width=12) 56 | Sort Key: (avg(y)) 57 | -> GroupAggregate (cost=23372848.55..24135127.79 rows=983208 width=12) 58 | -> Sort (cost=23372848.55..23622844.93 rows=99998552 width=12) 59 | Sort Key: x 60 | 
-> Seq Scan on d (cost=0.00..1540526.52 rows=99998552 width=12) 61 | 62 | 63 | postgres=# explain select count(*) as cnt from d inner join dm on d.x = dm.x; 64 | QUERY PLAN 65 | ------------------------------------------------------------------------------ 66 | Aggregate (cost=4356476.56..4356476.57 rows=1 width=0) 67 | -> Hash Join (cost=30832.00..4106480.18 rows=99998552 width=0) 68 | Hash Cond: (d.x = dm.x) 69 | -> Seq Scan on d (cost=0.00..1540526.52 rows=99998552 width=4) 70 | -> Hash (cost=14425.00..14425.00 rows=1000000 width=4) 71 | -> Seq Scan on dm (cost=0.00..14425.00 rows=1000000 width=4) 72 | 73 | 74 | 75 | 76 | 77 | ###### Impala 78 | 79 | 80 | [localhost:21000] > explain select x, avg(y) as ym from d group by x order by ym desc limit 5; 81 | Query: explain select x, avg(y) as ym from d group by x order by ym desc limit 5 82 | +------------------------------------------------------------------------------------+ 83 | | Explain String | 84 | +------------------------------------------------------------------------------------+ 85 | | Estimated Per-Host Requirements: Memory=2.00GB VCores=2 | 86 | | WARNING: The following tables are missing relevant table and/or column statistics. 
| 87 | | benchm.d | 88 | | | 89 | | 05:MERGING-EXCHANGE [UNPARTITIONED] | 90 | | | order by: avg(y) DESC | 91 | | | limit: 5 | 92 | | | | 93 | | 02:TOP-N [LIMIT=5] | 94 | | | order by: avg(y) DESC | 95 | | | | 96 | | 04:AGGREGATE [FINALIZE] | 97 | | | output: avg:merge(y) | 98 | | | group by: x | 99 | | | | 100 | | 03:EXCHANGE [HASH(x)] | 101 | | | | 102 | | 01:AGGREGATE | 103 | | | output: avg(y) | 104 | | | group by: x | 105 | | | | 106 | | 00:SCAN HDFS [benchm.d] | 107 | | partitions=1/1 files=1 size=2.32GB | 108 | +------------------------------------------------------------------------------------+ 109 | 110 | 111 | [localhost:21000] > explain select count(*) as cnt from d inner join dm on d.x = dm.x; 112 | Query: explain select count(*) as cnt from d inner join dm on d.x = dm.x 113 | +------------------------------------------------------------------------------------+ 114 | | Explain String | 115 | +------------------------------------------------------------------------------------+ 116 | | Estimated Per-Host Requirements: Memory=3.88GB VCores=2 | 117 | | WARNING: The following tables are missing relevant table and/or column statistics. 
| 118 | | benchm.d, benchm.dm | 119 | | | 120 | | 06:AGGREGATE [FINALIZE] | 121 | | | output: count:merge(*) | 122 | | | | 123 | | 05:EXCHANGE [UNPARTITIONED] | 124 | | | | 125 | | 03:AGGREGATE | 126 | | | output: count(*) | 127 | | | | 128 | | 02:HASH JOIN [INNER JOIN, BROADCAST] | 129 | | | hash predicates: d.x = dm.x | 130 | | | | 131 | | |--04:EXCHANGE [BROADCAST] | 132 | | | | | 133 | | | 01:SCAN HDFS [benchm.dm] | 134 | | | partitions=1/1 files=1 size=6.57MB | 135 | | | | 136 | | 00:SCAN HDFS [benchm.d] | 137 | | partitions=1/1 files=1 size=2.32GB | 138 | +------------------------------------------------------------------------------------+ 139 | 140 | 141 | 142 | 143 | 144 | ##### Hive 145 | 146 | 147 | hive> explain select x, avg(y) as ym from d group by x order by ym desc limit 5; 148 | OK 149 | STAGE DEPENDENCIES: 150 | Stage-1 is a root stage 151 | Stage-2 depends on stages: Stage-1 152 | Stage-0 is a root stage 153 | 154 | STAGE PLANS: 155 | Stage: Stage-1 156 | Map Reduce 157 | Map Operator Tree: 158 | TableScan 159 | alias: d 160 | Statistics: Num rows: 311108576 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 161 | Select Operator 162 | expressions: x (type: int), y (type: float) 163 | outputColumnNames: x, y 164 | Statistics: Num rows: 311108576 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 165 | Group By Operator 166 | aggregations: avg(y) 167 | keys: x (type: int) 168 | mode: hash 169 | outputColumnNames: _col0, _col1 170 | Statistics: Num rows: 311108576 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 171 | Reduce Output Operator 172 | key expressions: _col0 (type: int) 173 | sort order: + 174 | Map-reduce partition columns: _col0 (type: int) 175 | Statistics: Num rows: 311108576 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 176 | value expressions: _col1 (type: struct) 177 | Reduce Operator Tree: 178 | Group By Operator 179 | aggregations: avg(VALUE._col0) 180 | keys: KEY._col0 
(type: int) 181 | mode: mergepartial 182 | outputColumnNames: _col0, _col1 183 | Statistics: Num rows: 155554288 Data size: 1244434304 Basic stats: COMPLETE Column stats: NONE 184 | Select Operator 185 | expressions: _col0 (type: int), _col1 (type: double) 186 | outputColumnNames: _col0, _col1 187 | Statistics: Num rows: 155554288 Data size: 1244434304 Basic stats: COMPLETE Column stats: NONE 188 | File Output Operator 189 | compressed: false 190 | table: 191 | input format: org.apache.hadoop.mapred.SequenceFileInputFormat 192 | output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat 193 | serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe 194 | 195 | Stage: Stage-2 196 | Map Reduce 197 | Map Operator Tree: 198 | TableScan 199 | Reduce Output Operator 200 | key expressions: _col1 (type: double) 201 | sort order: - 202 | Statistics: Num rows: 155554288 Data size: 1244434304 Basic stats: COMPLETE Column stats: NONE 203 | value expressions: _col0 (type: int), _col1 (type: double) 204 | Reduce Operator Tree: 205 | Extract 206 | Statistics: Num rows: 155554288 Data size: 1244434304 Basic stats: COMPLETE Column stats: NONE 207 | Limit 208 | Number of rows: 5 209 | Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE 210 | File Output Operator 211 | compressed: false 212 | Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column stats: NONE 213 | table: 214 | input format: org.apache.hadoop.mapred.TextInputFormat 215 | output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat 216 | serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe 217 | 218 | Stage: Stage-0 219 | Fetch Operator 220 | limit: 5 221 | 222 | 223 | 224 | hive> explain select count(*) as cnt from d inner join dm on d.x = dm.x; 225 | OK 226 | STAGE DEPENDENCIES: 227 | Stage-5 is a root stage 228 | Stage-2 depends on stages: Stage-5 229 | Stage-0 is a root stage 230 | 231 | STAGE PLANS: 232 | Stage: Stage-5 233 | Map Reduce 
Local Work 234 | Alias -> Map Local Tables: 235 | dm 236 | Fetch Operator 237 | limit: -1 238 | Alias -> Map Local Operator Tree: 239 | dm 240 | TableScan 241 | alias: dm 242 | Statistics: Num rows: 1722224 Data size: 6888896 Basic stats: COMPLETE Column stats: NONE 243 | HashTable Sink Operator 244 | condition expressions: 245 | 0 246 | 1 247 | keys: 248 | 0 x (type: int) 249 | 1 x (type: int) 250 | 251 | Stage: Stage-2 252 | Map Reduce 253 | Map Operator Tree: 254 | TableScan 255 | alias: d 256 | Statistics: Num rows: 622217152 Data size: 2488868608 Basic stats: COMPLETE Column stats: NONE 257 | Map Join Operator 258 | condition map: 259 | Inner Join 0 to 1 260 | condition expressions: 261 | 0 262 | 1 263 | keys: 264 | 0 x (type: int) 265 | 1 x (type: int) 266 | Statistics: Num rows: 684438912 Data size: 2737755648 Basic stats: COMPLETE Column stats: NONE 267 | Select Operator 268 | Statistics: Num rows: 684438912 Data size: 2737755648 Basic stats: COMPLETE Column stats: NONE 269 | Group By Operator 270 | aggregations: count() 271 | mode: hash 272 | outputColumnNames: _col0 273 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 274 | Reduce Output Operator 275 | sort order: 276 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 277 | value expressions: _col0 (type: bigint) 278 | Local Work: 279 | Map Reduce Local Work 280 | Reduce Operator Tree: 281 | Group By Operator 282 | aggregations: count(VALUE._col0) 283 | mode: mergepartial 284 | outputColumnNames: _col0 285 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 286 | Select Operator 287 | expressions: _col0 (type: bigint) 288 | outputColumnNames: _col0 289 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 290 | File Output Operator 291 | compressed: false 292 | Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE 293 | table: 294 | input format: 
org.apache.hadoop.mapred.TextInputFormat 295 | output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat 296 | serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe 297 | 298 | Stage: Stage-0 299 | Fetch Operator 300 | limit: -1 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | ######### Spark SQL 310 | 311 | 312 | explain select x, avg(y) as ym from d group by x order by ym desc limit 5; 313 | 314 | == Physical Plan == 315 | TakeOrdered 5, [ym#23 DESC] 316 | Aggregate false, [x#29], [x#29,(CAST(SUM(PartialSum#33), DoubleType) / CAST(SUM(PartialCount#34L), DoubleType)) AS ym#23] 317 | Exchange (HashPartitioning [x#29], 200) 318 | Aggregate true, [x#29], [x#29,COUNT(CAST(y#30, DoubleType)) AS PartialCount#34L,SUM(CAST(y#30, DoubleType)) AS PartialSum#33] 319 | HiveTableScan [x#29,y#30], (MetastoreRelation benchm, d, None), None 320 | 321 | 322 | -- cached: 323 | 324 | == Physical Plan == 325 | TakeOrdered 5, [ym#105 DESC] 326 | Aggregate false, [x#111], [x#111,(CAST(SUM(PartialSum#125), DoubleType) / CAST(SUM(PartialCount#126L), DoubleType)) AS ym#105] 327 | Exchange (HashPartitioning [x#111], 200) 328 | Aggregate true, [x#111], [x#111,COUNT(CAST(y#112, DoubleType)) AS PartialCount#126L,SUM(CAST(y#112, DoubleType)) AS PartialSum#125] 329 | InMemoryColumnarTableScan [x#111,y#112], [], (InMemoryRelation [x#111,y#112], true, 10000, StorageLevel(true, true, false, true, 1), (HiveTableScan [x#57,y#58], (MetastoreRelation benchm, d, None), None), Some(d)) 330 | 331 | 332 | 333 | 334 | explain select count(*) as cnt from d inner join dm on d.x = dm.x; 335 | 336 | == Physical Plan == 337 | Aggregate false, [], [Coalesce(SUM(PartialCount#51L),0) AS cnt#41L] 338 | Exchange SinglePartition 339 | Aggregate true, [], [COUNT(1) AS PartialCount#51L] 340 | Project [] 341 | ShuffledHashJoin [x#47], [x#49], BuildRight 342 | Exchange (HashPartitioning [x#47], 200) 343 | HiveTableScan [x#47], (MetastoreRelation benchm, d, None), None 344 | Exchange 
(HashPartitioning [x#49], 200) 345 | HiveTableScan [x#49], (MetastoreRelation benchm, dm, None), None 346 | 347 | 348 | -- cached: 349 | 350 | == Physical Plan == 351 | Aggregate false, [], [Coalesce(SUM(PartialCount#158L),0) AS cnt#133L] 352 | Exchange SinglePartition 353 | Aggregate true, [], [COUNT(1) AS PartialCount#158L] 354 | Project [] 355 | BroadcastHashJoin [x#139], [x#141], BuildRight 356 | InMemoryColumnarTableScan [x#139], [], (InMemoryRelation [x#139,y#140], true, 10000, StorageLevel(true, true, false, true, 1), (HiveTableScan [x#57,y#58], (MetastoreRelation benchm, d, None), None), Some(d)) 357 | InMemoryColumnarTableScan [x#141], [], (InMemoryRelation [x#141], true, 10000, StorageLevel(true, true, false, true, 1), (HiveTableScan [x#87], (MetastoreRelation benchm, dm, None), None), Some(dm)) 358 | 359 | 360 | 361 | -------------------------------------------------------------------------------- /versions.txt: -------------------------------------------------------------------------------- 1 | 2 | R 3.1.2 3 | data.table 1.9.4 4 | dplyr 0.4.1 5 | 6 | Python 2.7.8 7 | Anaconda 2.1.0 8 | Ipython 2.2.0 9 | pandas 0.14.1 10 | 11 | 12 | MySQL 5.5.41 13 | 14 | PostgreSQL 9.3.6 15 | 16 | 17 | Amazon (AWS) EMR 3.5.0 18 | ~ 19 | Hive 0.13.1 20 | Impala 1.2.4 21 | Spark 1.2.1 22 | 23 | ==================== 24 | 25 | Update: 26 | 27 | Amazon (AWS) EMR 4.3.0 28 | ~ 29 | Hive 1.0.0 30 | Spark 1.6 31 | (Impala not available in EMR anymore) 32 | 33 | 34 | ==================== 35 | 36 | Update: 37 | 38 | emr-5.0.0 39 | ~ 40 | Spark 2.0.0 41 | 42 | 43 | --------------------------------------------------------------------------------