├── .gitignore
├── README.md
├── code
    ├── 1-generate-datasets.r
    ├── 2-benchmark-r.r
    ├── 3-benchmark-stata.do
    └── 4-graph.r
└── output
    └── 1e7.png


/.gitignore:
--------------------------------------------------------------------------------
.*
!/.gitignore
*.csv
!/output/results.csv
/output/results_graph.r
*.rds
*.dta
dataset/*

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Benchmarks

## Results
This page compares the speed of R and Stata for typical data analysis. Instructions are run on randomly generated datasets with 10 million observations. I try to use the fastest command available in each language. In particular, I use [gtools](https://github.com/mcaceresb/stata-gtools) in Stata. I use [data.table](https://github.com/Rdatatable/data.table), [fst](https://github.com/fstpackage/fst), and [fixest](https://github.com/lrberge/fixest/) in R.


## Code

All the code below can be downloaded in the code folder in the repository.
The dataset is generated in R using the file [1-generate-datasets.r](code/1-generate-datasets.r).
The R code is in the file [2-benchmark-r.r](code/2-benchmark-r.r).
The Stata code is in the file [3-benchmark-stata.do](code/3-benchmark-stata.do).


## Session Info

The machine used for this benchmark has a 3.5 GHz Intel Core i5 (4 cores) with an SSD.

The Stata version is Stata 16 MP with 2 cores.
The R session info is

````R
R version 3.6.0 (2019-04-26)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
Running under: macOS High Sierra 10.13.6

Matrix products: default
BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats graphics grDevices utils datasets methods base

other attached packages:
[1] scales_1.0.0 ggplot2_3.2.1 stringr_1.4.0 fst_0.9.0
[5] statar_0.7.1 lfe_2.8-3 Matrix_1.2-17 tidyr_1.0.0
[9] data.table_1.12.2

loaded via a namespace (and not attached):
[1] Rcpp_1.0.2 pillar_1.4.2 compiler_3.6.0 tools_3.6.0
[5] zeallot_0.1.0 lifecycle_0.1.0 tibble_2.1.3 gtable_0.3.0
[9] lattice_0.20-38 pkgconfig_2.0.3 rlang_0.4.0 parallel_3.6.0
[13] withr_2.1.2 dplyr_0.8.3 vctrs_0.2.0 grid_3.6.0
[17] tidyselect_0.2.5 glue_1.3.1 R6_2.4.0 Formula_1.2-3
[21] purrr_0.3.2 magrittr_1.5 ellipsis_0.3.0 backports_1.1.4
[25] matrixStats_0.55.0 assertthat_0.2.1 xtable_1.8-4 colorspace_1.4-1
[29] sandwich_2.5-1 stringi_1.4.3 lazyeval_0.2.2 munsell_0.5.0
[33] crayon_1.3.4 zoo_1.8-6
````

--------------------------------------------------------------------------------
/code/1-generate-datasets.r:
--------------------------------------------------------------------------------
# Generate the random datasets used by the R and Stata benchmarks:
#   1e7.csv           the main dataset (N rows)
#   merge_string.csv  the distinct (id1, id3) pairs, used for the string merge
#   merge_int.csv     the distinct (id4, id6) pairs, used for the integer merge
#
# To run the script, download the relevant packages:
# install.packages("data.table")
# install.packages("readr")

library(data.table)
# NOTE(review): readr is attached but nothing below calls it.
library(readr)

K <- 20     # number of distinct values in the "few groups" columns
N <- 1e7L   # number of rows
set.seed(1)

# The columns are drawn in this order from a single seeded RNG stream, so
# reordering them would change the generated data.
DT <- data.table(
  id1 = sample(sprintf("id%03d", 1:K), N, TRUE),         # few groups (char)
  id2 = sample(sprintf("id%03d", 1:K), N, TRUE),         # few groups (char)
  id3 = sample(sprintf("id%010d", 1:(N / K)), N, TRUE),  # many groups (char)
  id4 = sample(K, N, TRUE),                              # few groups (int)
  id5 = sample(K, N, TRUE),                              # few groups (int)
  id6 = sample(N / K, N, TRUE),                          # many groups (int)
  v1 = sample(5, N, TRUE),                               # int in range [1,5]
  v2 = sample(1e6, N, TRUE),                             # int in range [1,1e6]
  v3 = sample(round(runif(100, max = 100), 4), N, TRUE)  # numeric e.g. 23.5749
)

out_dir <- "~/statabenchmark"
# fwrite() does not create missing directories, so make sure the target exists.
dir.create(path.expand(out_dir), showWarnings = FALSE, recursive = TRUE)

fwrite(DT, file.path(out_dir, "1e7.csv"))
fwrite(unique(DT[, list(id1, id3)]), file.path(out_dir, "merge_string.csv"))
fwrite(unique(DT[, list(id4, id6)]), file.path(out_dir, "merge_int.csv"))

--------------------------------------------------------------------------------
/code/2-benchmark-r.r:
--------------------------------------------------------------------------------
# Time common data-analysis operations in R (data.table, fst, statar, fixest)
# on the datasets written by 1-generate-datasets.r, and save one elapsed time
# per operation to resultR1e7.csv.
#
# To run the script, download the relevant packages:
# install.packages("data.table")
# install.packages("fst")
# install.packages("statar")
# install.packages("fixest")
# install.packages("ggplot2")


# loading packages
library(data.table)
library(fixest)
library(statar)
library(fst)
library(ggplot2)


# setting options
options(mc.cores = 4)
setFixest_nthreads(4)

# file locations, built once so that path handling is never part of a timing
data_dir <- "~/statabenchmark"
csv_path <- file.path(data_dir, "1e7.csv")
fst_path <- file.path(data_dir, "1e7.fst")
merge_string_path <- file.path(data_dir, "merge_string.fst")
merge_int_path <- file.path(data_dir, "merge_int.fst")

# creating the files to merge with
write.fst(
  fread(file.path(data_dir, "merge_string.csv"), data.table = FALSE),
  merge_string_path
)
write.fst(
  fread(file.path(data_dir, "merge_int.csv"), data.table = FALSE),
  merge_int_path
)


# define the time function
# Returns the elapsed (wall-clock) seconds needed to evaluate `x`.
# `x` is passed on unevaluated to system.time(), and R evaluates a lazy
# argument in the environment of the caller, so an assignment such as
# time(DT <- fread(...)) still creates DT for the rest of the script.
time <- function(x) {
  system.time(x)[["elapsed"]]
}

# One entry per benchmark, in execution order: the name is the label written
# to the results file and the value is the elapsed time. The Stata results are
# later matched to these rows by position, so the order matters.
timings <- numeric(0)

# write and read

timings[["open csv"]] <- time(DT <- fread(csv_path, data.table = FALSE))

timings[["save binary"]] <- time(write.fst(DT, fst_path))

timings[["open binary"]] <- time(DT <- read.fst(fst_path))

# sort and duplicates
setDT(DT)

timings[["sort string"]] <- time(setkeyv(DT, c("id3")))

timings[["sort int"]] <- time(setkeyv(DT, c("id6")))

timings[["sort float"]] <- time(setkeyv(DT, c("v3")))

timings[["count distinct strings"]] <- time(uniqueN(DT, by = c("id3")))

timings[["count distinct ints"]] <- time(uniqueN(DT, by = c("id6")))

# merge
# The data is reloaded before each merge so that keys set by earlier steps do
# not carry over; reading the lookup table and keying both tables are part of
# the timed work.
DT <- read.fst(fst_path)
setDT(DT)
merge_string <- function() {
  DT_merge <- read.fst(merge_string_path)
  setDT(DT_merge)
  setkey(DT, id1, id3)
  setkey(DT_merge, id1, id3)
  merge(DT, DT_merge, all.x = TRUE, all.y = FALSE)
}

timings[["merge string"]] <- time(merge_string())

DT <- read.fst(fst_path)
setDT(DT)
merge_int <- function() {
  DT_merge <- read.fst(merge_int_path)
  setDT(DT_merge)
  setkey(DT, id4, id6)
  setkey(DT_merge, id4, id6)
  merge(DT, DT_merge, all.x = TRUE, all.y = FALSE)
}

timings[["merge int"]] <- time(merge_int())


# append

DT1 <- copy(DT)
timings[["append"]] <- time(rbindlist(list(DT, DT1), fill = TRUE))

# reshape
# Done on the first tenth of the rows that are distinct in (id1, id2, id3).
DT <- read.fst(fst_path)
setDT(DT)
DT1 <- unique(DT, by = c("id1", "id2", "id3"))
# seq_len() rather than 1:n so that a table with fewer than 10 rows yields no
# rows instead of the indices c(1, 0)
DT1 <- DT1[seq_len(nrow(DT1) %/% 10L)]

timings[["reshape long"]] <- time(
  DT2 <- melt(DT1, id.vars = c("id1", "id2", "id3"))
)
rm(DT1)

timings[["reshape wide"]] <- time(
  DT3 <- dcast(DT2, id1 + id2 + id3 ~ variable, value.var = "value")
)
rm(list = c("DT2", "DT3"))

# recode
# Adds v1_name to DT by reference; the column is dropped after timing.
recode_v1 <- function() {
  DT[v1 == 1, v1_name := "first"]
  DT[v1 %in% c(2, 3), v1_name := "second"]
  DT[v1 %in% c(4, 5), v1_name := "third"]
}

timings[["recode"]] <- time(recode_v1())
DT[, v1_name := NULL]

# functions
# Each step writes a scratch column `temp` and removes it once timed.

timings[["xtile"]] <- time(DT[, temp := xtile(v3, 10)])
DT[, temp := NULL]

timings[["group strings"]] <- time(DT[, temp := .GRP, by = c("id1", "id3")])
DT[, temp := NULL]

timings[["group int"]] <- time(DT[, temp := .GRP, by = c("id4", "id6")])
DT[, temp := NULL]

# sum groups

timings[["sum over few groups (string)"]] <- time(
  DT[, temp := sum(v3, na.rm = TRUE), by = c("id1")]
)
DT[, temp := NULL]

timings[["sum over many groups (string)"]] <- time(
  DT[, temp := sum(v3, na.rm = TRUE), by = c("id3")]
)
DT[, temp := NULL]

timings[["sum over few groups (int)"]] <- time(
  DT[, temp := sum(v3, na.rm = TRUE), by = c("id4")]
)
DT[, temp := NULL]

timings[["sum over many groups (int)"]] <- time(
  DT[, temp := sum(v3, na.rm = TRUE), by = c("id6")]
)
DT[, temp := NULL]


# sd groups

timings[["sd over few groups (int)"]] <- time(
  DT[, temp := sd(v3, na.rm = TRUE), by = c("id4")]
)
DT[, temp := NULL]

timings[["sd over many groups (int)"]] <- time(
  DT[, temp := sd(v3, na.rm = TRUE), by = c("id6")]
)
DT[, temp := NULL]


# collapse large groups
# NOTE(review): the Stata counterpart (gcollapse in 3-benchmark-stata.do)
# computes only the two means and the sum, not sd, so R does more work in
# these two steps - confirm that the comparison is intended.

timings[["collapse over few groups"]] <- time(
  DT[,
    list(
      v1 = mean(v1, na.rm = TRUE),
      v2 = mean(v2, na.rm = TRUE),
      v3 = sum(v3, na.rm = TRUE),
      sd = sd(v3, na.rm = TRUE)
    ),
    by = c("id1")
  ]
)

# collapse small groups

timings[["collapse over many groups"]] <- time(
  DT[,
    list(
      v1 = mean(v1, na.rm = TRUE),
      v2 = mean(v2, na.rm = TRUE),
      v3 = sum(v3, na.rm = TRUE),
      sd = sd(v3, na.rm = TRUE)
    ),
    by = c("id3")
  ]
)


# regress
# The regressions use the first half of the rows.
DT1 <- DT[seq_len(nrow(DT) %/% 2L)]

timings[["reg"]] <- time(feols(v3 ~ v1 + v2 + id4 + id5, DT1))

timings[["reg fe"]] <- time(feols(v3 ~ v2 + id4 + id5 + as.factor(v1), DT1))

## Automatically clusters by id6 too
timings[["reg hfe"]] <- time(
  feols(v3 ~ v2 + id4 + id5 + as.factor(v1) | id6, DT1)
)

## Automatically clusters by id6 too
timings[["reg 2 hfe"]] <- time(
  feols(v3 ~ v2 + id4 + id5 + as.factor(v1) | id6 + id3, DT1)
)

# plot
# head() rather than [1:1000] so that a shorter table is not padded with NA rows

timings[["plot 1000 points"]] <- time(
  ggsave(
    file.path(data_dir, "plot.pdf"),
    ggplot(head(DT1, 1000L), aes(x = v1, y = v2)) + geom_point()
  )
)

# save results
fwrite(
  data.table(command = names(timings), result = unname(timings)),
  file.path(data_dir, "resultR1e7.csv")
)

--------------------------------------------------------------------------------
/code/3-benchmark-stata.do:
--------------------------------------------------------------------------------
/***************************************************************************************************
To run the script, download the following packages:
ssc install gtools
ssc install fastreshape
ssc install reghdfe
ssc install autorename
ssc install ftools
***************************************************************************************************/
/* timer helpers */
/* Tic starts Stata timer number n, Toc stops it. */
cap program drop Tic
program define Tic
    syntax, n(integer)
    timer on `n'
end

cap program drop Toc
program define Toc
    syntax, n(integer)
    timer off `n'
end


/* convert the files to merge with to .dta (not timed) */
import delimited using "~/statabenchmark/merge_string.csv", clear
autorename
save "~/statabenchmark/merge_string.dta", replace

import delimited using "~/statabenchmark/merge_int.csv", clear
save "~/statabenchmark/merge_int.dta", replace

/***************************************************************************************************

***************************************************************************************************/

/* benchmark */
set processors 2

timer clear
/* i is the number of the current timer; `++i' increments it before each benchmark, */
/* so timers are numbered in execution order */
local i = 0
/* write and read */
Tic, n(`++i')
import delimited using "~/statabenchmark/1e7.csv", clear
Toc, n(`i')

Tic, n(`++i')
save "~/statabenchmark/1e7.dta", replace
Toc, n(`i')

drop _all
Tic, n(`++i')
use "~/statabenchmark/1e7.dta", clear
Toc, n(`i')

/* sort */
Tic, n(`++i')
sort id3
Toc, n(`i')

Tic, n(`++i')
sort id6
Toc, n(`i')

Tic, n(`++i')
sort v3
Toc, n(`i')

/* count distinct values */
Tic, n(`++i')
gdistinct id3
Toc, n(`i')

Tic, n(`++i')
gdistinct id6
Toc, n(`i')

/* merge */
/* the data is reloaded (untimed) before each merge */
use "~/statabenchmark/1e7.dta", clear
Tic, n(`++i')
fmerge m:1 id1 id3 using "~/statabenchmark/merge_string.dta", keep(master matched) nogen
Toc, n(`i')

use "~/statabenchmark/1e7.dta", clear
Tic, n(`++i')
fmerge m:1 id4 id6 using "~/statabenchmark/merge_int.dta", keep(master matched) nogen
Toc, n(`i')

/* append */
use "~/statabenchmark/1e7.dta", clear
Tic, n(`++i')
append using "~/statabenchmark/1e7.dta"
Toc, n(`i')

/* reshape */
/* keep one observation per (id1, id2, id3) */
bys id1 id2 id3: keep if _n == 1
/* reshape (continued): keep roughly the first tenth of the observations */
keep if _n < _N/10
/* give every non-id variable the common stub v_ used by greshape below */
foreach v of varlist id4 id5 id6 v1 v2 v3{
    rename `v' v_`v'
}
Tic, n(`++i')
greshape long v_, i(id1 id2 id3) j(variable) string
Toc, n(`i')

Tic, n(`++i')
greshape wide v_, i(id1 id2 id3) j(variable) string
Toc, n(`i')

/* recode */
use "~/statabenchmark/1e7.dta", clear
Tic, n(`++i')
gen v1_name = ""
replace v1_name = "first" if v1 == 1
replace v1_name = "second" if inlist(v1, 2, 3)
replace v1_name = "third" if inlist(v1, 4, 5)
Toc, n(`i')
drop v1_name

/* functions */
/* each step creates a scratch variable temp and drops it once timed */
Tic, n(`++i')
gquantiles temp = v3, n(10) xtile
Toc, n(`i')
drop temp

Tic, n(`++i')
gegen temp = group(id1 id3)
Toc, n(`i')
drop temp

Tic, n(`++i')
gegen temp = group(id4 id6)
Toc, n(`i')
drop temp


/* split apply combine */
Tic, n(`++i')
gegen temp = sum(v3), by(id1)
Toc, n(`i')
drop temp

Tic, n(`++i')
gegen temp = sum(v3), by(id3)
Toc, n(`i')
drop temp

Tic, n(`++i')
gegen temp = sum(v3), by(id4)
Toc, n(`i')
drop temp

Tic, n(`++i')
gegen temp = sum(v3), by(id6)
Toc, n(`i')
drop temp


Tic, n(`++i')
gegen temp = sd(v3), by(id4)
Toc, n(`i')
drop temp

Tic, n(`++i')
gegen temp = sd(v3), by(id6)
Toc, n(`i')
drop temp


/* collapse */
/* NOTE(review): the R script (2-benchmark-r.r) also computes sd(v3) in its two */
/* collapse steps, which these gcollapse calls do not - confirm the comparison is intended */
Tic, n(`++i')
gcollapse (mean) v1 v2 (sum) v3, by(id1) fast
Toc, n(`i')

/* gcollapse replaced the data in memory, so reload it (untimed) */
use "~/statabenchmark/1e7.dta", clear
Tic, n(`++i')
gcollapse (mean) v1 v2 (sum) v3, by(id3) fast
Toc, n(`i')


/* regress */
/* the regressions use the first half of the observations */
use "~/statabenchmark/1e7.dta", clear
keep if _n <= _N/2
Tic, n(`++i')
reg v3 v1 v2 id4 id5
Toc, n(`i')

Tic, n(`++i')
reg v3 i.v1 v2 id4 id5
Toc, n(`i')

Tic, n(`++i')
reghdfe v3 v2 id4 id5 i.v1, a(id6) vce(cluster id6) tolerance(1e-6)
Toc, n(`i')

/* numeric group identifier for the string variable id3 (created outside the timer) */
gegen g = group(id3)
Tic, n(`++i')
reghdfe v3 v2 id4 id5 i.v1, absorb(id6 g) vce(cluster id6) tolerance(1e-6)
Toc, n(`i')

/* plot */
keep if _n <= 1000
Tic, n(`++i')
twoway (scatter v2 v1)
graph export "~/statabenchmark/plot_stata.pdf", replace
Toc, n(`i')

/* save results: one observation per timer, in timer order, holding its r(t#) value; */
/* no command label is written, only the variable result */
drop _all
gen result = .
set obs `i'
timer list
forval j = 1/`i'{
    replace result = r(t`j') if _n == `j'
}
outsheet using "~/statabenchmark/resultStata1e7.csv", replace

--------------------------------------------------------------------------------
/code/4-graph.r:
--------------------------------------------------------------------------------
# Plot, for every benchmarked command, the time spent in Stata relative to the
# time spent in R, from the result files written by 2-benchmark-r.r and
# 3-benchmark-stata.do.

library(data.table)
library(tidyr)
library(stringr)
library(ggplot2)
library(scales)
library(svglite)

DT <- fread("~/statabenchmark/resultR1e7.csv")
DT2 <- fread("~/statabenchmark/resultStata1e7.csv")

# The Stata file holds only a `result` column, so the two files are matched by
# row position. Fail loudly if they do not have the same number of rows
# instead of erroring obscurely (or recycling a single value) in `:=` below.
stopifnot(nrow(DT) == nrow(DT2))

setnames(DT, "result", "R")
DT[, Stata := DT2[["result"]]]

DT[, value := Stata / R]
DT[, language := "Stata"]

# Levels are reversed - presumably so that, once the axes are flipped, the
# commands read top to bottom in the order they were run.
DT[, command := factor(command, levels = rev(unique(command)))]

benchmark_plot <- ggplot(
  DT,
  aes(x = command, y = value, fill = "red", width = 0.2)
) +
  geom_bar(position = position_dodge(width = 0.2), stat = "identity") +
  coord_flip() +
  scale_fill_discrete(breaks = c("Stata", "R")) +
  ylab("Time spent in Stata (relative to time in R)") +
  scale_y_log10(
    breaks = c(0.1, 1, 10, 100),
    labels = c("0.1", "1", "10", "100")
  )

ggsave("~/statabenchmark/1e7.svg", benchmark_plot)
ggsave("~/statabenchmark/1e7.png", benchmark_plot)

--------------------------------------------------------------------------------
/output/1e7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/matthieugomez/benchmark-stata-r/7de3857388e7ff531aa8ff3048873c7ef25cad26/output/1e7.png --------------------------------------------------------------------------------