├── images ├── dsn.PNG ├── drivers.PNG └── commercial.png ├── big-data-with-R-full.pdf ├── .gitignore ├── simple_query.sql ├── big-data-w-r.Rproj ├── README.md └── databases-and-R.Rmd /images/dsn.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edgararuiz-zz/big-data-w-r/HEAD/images/dsn.PNG -------------------------------------------------------------------------------- /images/drivers.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edgararuiz-zz/big-data-w-r/HEAD/images/drivers.PNG -------------------------------------------------------------------------------- /images/commercial.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edgararuiz-zz/big-data-w-r/HEAD/images/commercial.png -------------------------------------------------------------------------------- /big-data-with-R-full.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/edgararuiz-zz/big-data-w-r/HEAD/big-data-with-R-full.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | logs 6 | flights_pipeline 7 | flights_model 8 | derby.log 9 | -------------------------------------------------------------------------------- /simple_query.sql: -------------------------------------------------------------------------------- 1 | -- !preview conn=DBI::dbConnect(odbc::odbc(), "datawarehouse") 2 | 3 | select "origin", count(*) from production.flights group by "origin" -------------------------------------------------------------------------------- /big-data-w-r.Rproj: -------------------------------------------------------------------------------- 1 | 
Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Big Data with R 2 | --------------- 3 | 4 | ### Presentation at the **Symposium on Data Science and Statistics** (SDSS) 2018 5 | 6 | 7 | *Abstract:* A review of techniques and R packages to aid in the success of Big Data analysis using R. The central idea is to use R to interface with the computation power of Spark, Hadoop, and/or databases remotely, as opposed to importing and analyzing in memory inside R. We will cover techniques for visualizing, modeling, scoring, dashboarding, and production pipelines. -------------------------------------------------------------------------------- /databases-and-R.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Databases and R" 3 | output: html_notebook 4 | --- 5 | 6 | ## Connecting to a database 7 | 8 | ```{r} 9 | library(knitr) 10 | include_graphics("images/drivers.PNG") 11 | ``` 12 | 13 | ```{r} 14 | library(odbc) 15 | 16 | sort(unique(odbcListDrivers()[[1]])) # first column of the driver listing, de-duplicated and sorted 17 | ``` 18 | 19 | 20 | ```{r} 21 | library(DBI) 22 | ``` 23 | 24 | 25 | ```{r} 26 | con <- dbConnect(odbc(), 27 | Driver = "SQL Server", 28 | Server = "localhost\\SQLEXPRESS", 29 | Database = "datawarehouse", 30 | Trusted_Connection = "True") 31 | 32 | ``` 33 | 34 | ## Using DBI 35 | 36 | ```{r} 37 | dbGetQuery(con, "select year, count(*) from production.flights group by year") # one row per year with its flight count 38 | ``` 39 | 40 | ```{sql, connection = con} 41 | select "origin", count(*) from production.flights group by "origin" 42 | ``` 43 | 44 | The same query is also stored in `simple_query.sql`. 45 | 46 | ## dplyr 47 | 48 | ```{r} 49 | 
library(dplyr) 50 | library(dbplyr) 51 | 52 | tbl(con, in_schema("production", "flights")) 53 | 54 | ``` 55 | 56 | ```{r} 57 | db_flights <- tbl(con, in_schema("production", "flights")) 58 | ``` 59 | 60 | ```{r} 61 | db_flights %>% 62 | head() 63 | ``` 64 | 65 | ### Under the hood 66 | 67 | ```{r} 68 | db_flights %>% 69 | head() %>% 70 | show_query() 71 | ``` 72 | 73 | ```{r} 74 | sql_render(head(db_flights), con = simulate_mysql()) 75 | ``` 76 | 77 | ## More dplyr 78 | 79 | ```{r} 80 | db_flights %>% 81 | group_by(year) %>% 82 | tally() 83 | ``` 84 | 85 | Create summarizations 86 | ```{r} 87 | db_flights %>% 88 | group_by(year, month) %>% 89 | summarise( 90 | no_flights = n(), 91 | avg_dep_delay = mean(depdelay, na.rm = TRUE), 92 | avg_arr_delay = mean(arrdelay, na.rm = TRUE) 93 | ) 94 | ``` 95 | 96 | Join tables 97 | ```{r} 98 | db_airports <- tbl(con, in_schema("production", "airports")) 99 | 100 | db_joined <- db_flights %>% 101 | inner_join(db_airports, by = c("origin" = "faa")) 102 | 103 | db_joined 104 | ``` 105 | 106 | Top 10 busiest airports. 
Take advantage of `dplyr` lazy evaluation 107 | ```{r} 108 | db_joined %>% 109 | group_by(name) %>% 110 | tally() %>% 111 | arrange(desc(n)) %>% 112 | head(10) 113 | ``` 114 | 115 | ## Visualization 116 | 117 | ```{r} 118 | library(ggplot2) 119 | 120 | top_airports <- db_joined %>% 121 | group_by(name) %>% 122 | tally() %>% 123 | arrange(desc(n)) %>% 124 | head(10) %>% 125 | collect() # only the 10 summarised rows are brought into R 126 | 127 | ggplot(top_airports) + 128 | geom_col(aes(x = name, y = n)) + 129 | coord_flip() 130 | 131 | ``` 132 | 133 | ```{r} 134 | db_joined %>% 135 | group_by(lon, lat) %>% 136 | tally() %>% 137 | select(n, lon, lat) %>% 138 | collect() %>% 139 | ggplot() + 140 | geom_point(aes(x = lon, y = lat, size = n, color = n), alpha = 0.3) 141 | ``` 142 | 143 | ## dbplot 144 | 145 | http://db.rstudio.com/dbplot/ 146 | 147 | ```{r} 148 | library(dbplot) 149 | 150 | db_flights %>% 151 | filter(year == 2006) %>% 152 | dbplot_line(month, mean(arrdelay, na.rm = TRUE)) 153 | ``` 154 | 155 | ```{r} 156 | db_flights %>% 157 | filter(arrdelay < 100, arrdelay > (-100)) %>% 158 | dbplot_histogram(arrdelay) 159 | ``` 160 | 161 | 162 | ## tidypredict 163 | 164 | ```{r} 165 | model <- 166 | db_flights %>% 167 | head(10000) %>% 168 | filter(arrdelay < 100, arrdelay > (-100)) %>% 169 | collect() %>% # lm() fits in R, so pull the sampled rows into a local data frame explicitly 170 | lm(arrdelay ~ crsdeptime + crsarrtime, data = .) 
171 | 172 | summary(model) 173 | ``` 174 | 175 | 176 | ```{r} 177 | library(tidypredict) 178 | 179 | tidypredict_sql(model, con) 180 | ``` 181 | 182 | ```{r} 183 | db_flights %>% 184 | filter(arrdelay < 100, arrdelay > (-100), year == 2007) %>% 185 | tidypredict_to_column(model) %>% 186 | select(fit, arrdelay) 187 | ``` 188 | 189 | ```{r} 190 | db_flights %>% 191 | filter( 192 | arrdelay < 100, 193 | arrdelay > (-100), 194 | year == 2007, 195 | month == 1 196 | ) %>% 197 | tidypredict_to_column(model) %>% 198 | mutate(diff = fit - arrdelay) %>% 199 | dbplot_histogram(diff) 200 | 201 | ``` 202 | 203 | ## modeldb 204 | 205 | ```{r} 206 | library(modeldb) 207 | 208 | remote_model <- db_flights %>% 209 | filter(year == 2006) %>% 210 | group_by(month) %>% 211 | mutate( 212 | arrdelay = as.numeric(arrdelay), 213 | depdelay = as.numeric(depdelay) 214 | ) %>% 215 | select(arrdelay, depdelay) %>% 216 | linear_regression_db(arrdelay) 217 | 218 | remote_model 219 | 220 | ``` 221 | 222 | ```{r, fig.height = 7, fig.width = 4} 223 | remote_model %>% 224 | ggplot() + 225 | geom_point(aes(`(Intercept)`, as.factor(month))) 226 | ``` 227 | 228 | 229 | ## Spark 230 | 231 | ```{r} 232 | library(nycflights13) 233 | library(sparklyr) 234 | library(dplyr) 235 | 236 | sc <- spark_connect(master = "local", version = "2.1.0") 237 | 238 | spark_flights <- sdf_copy_to(sc, flights) 239 | ``` 240 | 241 | ```{r} 242 | df <- spark_flights %>% 243 | filter(!is.na(dep_delay)) %>% 244 | mutate( 245 | month = paste0("m", month), 246 | day = paste0("d", day), 247 | sched_dep_time = as.numeric(sched_dep_time), 248 | dep_delay = as.numeric(dep_delay) 249 | ) %>% 250 | select(dep_delay, sched_dep_time, month, day, distance) 251 | ``` 252 | 253 | 254 | ```{r} 255 | flights_pipeline <- ml_pipeline(sc) %>% 256 | ft_dplyr_transformer( 257 | tbl = df 258 | ) %>% 259 | ft_binarizer( 260 | input_col = "dep_delay", 261 | output_col = "delayed", 262 | threshold = 15 263 | ) %>% 264 | ft_bucketizer( 265 | 
input_col = "sched_dep_time", 266 | output_col = "hours", 267 | splits = c(400, 800, 1200, 1600, 2000, 2400) 268 | ) %>% 269 | ft_r_formula(delayed ~ month + day + hours + distance) %>% 270 | ml_logistic_regression() 271 | 272 | flights_pipeline 273 | ``` 274 | 275 | ```{r} 276 | partitioned_flights <- sdf_partition( 277 | spark_flights, 278 | training = 0.1, 279 | testing = 0.1, 280 | rest = 0.8 # weights sum to 1, so each value reads directly as a share of the data 281 | ) 282 | ``` 283 | 284 | ```{r} 285 | fitted_pipeline <- ml_fit( 286 | flights_pipeline, 287 | partitioned_flights$training 288 | ) 289 | fitted_pipeline 290 | 291 | ``` 292 | 293 | 294 | ```{r} 295 | predictions <- ml_transform( 296 | fitted_pipeline, 297 | partitioned_flights$testing 298 | ) 299 | 300 | predictions %>% 301 | group_by(delayed, prediction) %>% 302 | tally() 303 | ``` 304 | 305 | ```{r} 306 | ml_save( 307 | flights_pipeline, 308 | "flights_pipeline", 309 | overwrite = TRUE 310 | ) 311 | ``` 312 | 313 | 314 | ```{r} 315 | ml_save( 316 | fitted_pipeline, 317 | "flights_model", 318 | overwrite = TRUE 319 | ) 320 | ``` 321 | 322 | ```{r} 323 | spark_disconnect(sc) 324 | ``` 325 | 326 | ## Use a re-loaded model 327 | 328 | ```{r} 329 | sc <- spark_connect(master = "local", version = "2.1.0") 330 | spark_flights <- sdf_copy_to(sc, flights) 331 | ``` 332 | 333 | ```{r} 334 | reloaded_model <- ml_load(sc, "flights_model") 335 | 336 | 337 | new_df <- spark_flights %>% 338 | filter( 339 | month == 7, 340 | day == 5 341 | ) 342 | 343 | ml_transform(reloaded_model, new_df) 344 | ``` 345 | 346 | ## Re-fit the same pipeline with new data 347 | 348 | ```{r} 349 | reloaded_pipeline <- ml_load(sc, "flights_pipeline") 350 | 351 | new_model <- ml_fit(reloaded_pipeline, sample_frac(spark_flights, 0.01)) 352 | 353 | new_model 354 | ``` 355 | 356 | 357 | http://colorado.rstudio.com:3939/content/671/ 358 | 359 | http://colorado.rstudio.com:3939/content/1101/ 360 | 361 | 362 | --------------------------------------------------------------------------------