├── .gitignore ├── README.md ├── dev ├── babynames │ ├── babynames-dplyr.Rmd │ ├── babynames-dplyr.nb.html │ └── derby.log ├── cloudera │ ├── bigvis_tile.R │ ├── livy-architecture.png │ ├── livy.Rmd │ ├── livy.sh │ ├── livy_connection.Rmd │ ├── nyct2010r.csv │ ├── spark_ml_classification_titanic.Rmd │ ├── spark_plot_boxbin.R │ ├── spark_plot_hist.R │ ├── spark_plot_point.R │ ├── spark_toolchain.Rmd │ ├── sqlvis_histogram.R │ ├── sqlvis_raster.R │ ├── taxiDemoCloudera.Rmd │ ├── taxiDemoCloudera.nb.html │ ├── taxiDemoCloudera2.Rmd │ ├── taxiDemoCloudera3.Rmd │ ├── taxiDemoCloudera_backup.Rmd │ └── testCloudera.R ├── flights-cdh │ ├── flights_pred_2008.RData │ ├── images │ │ └── clusterDemo │ │ │ ├── data-analysis-1.png │ │ │ ├── flex-1.png │ │ │ ├── forecast-1.png │ │ │ ├── hue-metastore-1.png │ │ │ ├── manager-landing-page.png │ │ │ ├── performance-1.png │ │ │ ├── sign-in-1.png │ │ │ ├── spark-history-server-1.png │ │ │ ├── spark-pane-1.png │ │ │ ├── spark-rdd-1.png │ │ │ └── tables-1.png │ ├── nycflights_flexdashboard.Rmd │ ├── sparkClusterDemo-source.R │ ├── sparkClusterDemo.Rmd │ └── sparkClusterDemo.html ├── flights │ ├── flightsAnalysis.Rmd │ ├── flightsAnalysis.nb.html │ ├── flightsApp │ │ └── app.R │ ├── flightsApp2 │ │ ├── global.R │ │ ├── server.R │ │ └── ui.R │ ├── flights_pred_2008.RData │ ├── images │ │ └── clusterDemo │ │ │ ├── awsClusterConnect.png │ │ │ ├── awsCreateCluster.png │ │ │ ├── awsCreateCluster2.png │ │ │ ├── awsNewSecurityGroup.png │ │ │ ├── awsSecurityGroup.png │ │ │ ├── awsSecurityGroup2.png │ │ │ ├── emrArchitecture.png │ │ │ ├── emrConfigStep1.png │ │ │ ├── emrConfigStep2.png │ │ │ ├── emrConfigStep3.png │ │ │ ├── emrConfigStep4.png │ │ │ ├── emrLogin.png │ │ │ ├── flightsDashboard.png │ │ │ ├── flightsDeciles.png │ │ │ ├── flightsDecilesDesc.png │ │ │ ├── flightsPredicted.png │ │ │ ├── rstudio.png │ │ │ ├── rstudioData.png │ │ │ ├── rstudioLogin.png │ │ │ ├── rstudioModel.png │ │ │ ├── rstudioModelDetail.png │ │ │ ├── rstudioSparkPane.png │ │ │ ├── workflow.png │ │ │ ├── workflowCommands.png │ │ │ ├── workflowRSC.png │ │ │ └── workflowShare.png │ ├── nycflights_flexdashboard.Rmd │ ├── nycflights_flexdashboard_spark.Rmd │ ├── recode_for_prediction.R │ ├── sparkClusterDemo.Rmd │ └── sparkClusterDemo.html ├── h2o-demo │ ├── emr_h2o_setup.sh │ ├── h2oHadoop.Rmd │ ├── h2oModels.Rmd │ ├── h2oSetup.R │ ├── h2oSetup.Rmd │ ├── h2oSetup.nb.html │ ├── h2oSetup_2_0_0.R │ ├── iris.csv │ ├── livy.R │ ├── livy.Rmd │ ├── nyct2010.csv │ ├── sqlvis_histogram.R │ ├── sqlvis_raster.R │ ├── taxiDemoH2O.Rmd │ └── taxiDemoH2O.nb.html ├── h2o │ ├── 01_h2o_setup.R │ ├── 02_h2o_rsparkling.Rmd │ ├── 02_h2o_rsparkling.nb.html │ ├── 03_h2o_ml.Rmd │ ├── 03_h2o_ml.nb.html │ └── 04_h2o_grid.R ├── helloworld │ ├── derby.log │ ├── helloWorld.Rmd │ ├── helloWorld.html │ └── helloWorld.nb.html ├── hive │ ├── hiveJDBC.R │ ├── hiveMetastore.R │ ├── hiveMetastore.Rmd │ └── hiveMetastore.nb.html ├── nyc-taxi-data │ ├── .gitignore │ ├── taxiAnalysis.R │ ├── taxiApp.R │ ├── taxiApp │ │ └── app.R │ ├── taxiDashboard.Rmd │ ├── taxiDemo.Rmd │ └── taxiDemo.nb.html ├── nycflights13 │ ├── .gitignore │ ├── dplyr.Rmd │ ├── dplyr.nb.html │ ├── nycflights13_flexdashboard_rdata.Rmd │ └── nycflights13_flexdashboard_sparkdata.Rmd ├── performance │ ├── collect.Rmd │ └── collect.html └── titanic │ ├── .gitignore │ ├── notebook-classification-rdata.Rmd │ ├── notebook-classification-rdata.nb.html │ ├── notebook-classification.Rmd │ ├── notebook-classification.html │ ├── notebook-classification.nb.html │ ├── 
rmarkdown-classification.Rmd │ ├── rmarkdown-classification_files │ └── figure-html │ │ ├── auc-1.png │ │ ├── importance-1.png │ │ └── lift-1.png │ └── titanic-parquet │ ├── ._SUCCESS.crc │ ├── .part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc │ ├── _SUCCESS │ └── part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet ├── img ├── sparklyr-illustration.png ├── sparklyr-presentation-demos.001.jpeg ├── sparklyr-presentation-demos.002.jpeg ├── sparklyr-presentation-demos.003.jpeg ├── sparklyr-presentation-demos.004.jpeg ├── sparklyr-presentation-demos.005.jpeg ├── sparklyr-presentation-demos.006.jpeg ├── sparklyr-presentation-demos.007.jpeg ├── sparklyr-presentation-demos.008.jpeg ├── sparklyr-presentation-demos.009.jpeg ├── sparklyr-presentation-demos.010.jpeg ├── sparklyr-presentation-demos.011.jpeg ├── sparklyr-presentation-demos.012.jpeg ├── sparklyr-presentation-demos.013.jpeg ├── sparklyr-presentation-demos.014.jpeg ├── sparklyr-presentation-demos.015.jpeg ├── sparklyr-presentation-demos.016.jpeg ├── sparklyr-presentation-demos.017.jpeg ├── sparklyr-presentation-demos.018.jpeg ├── sparklyr-presentation-demos.019.jpeg ├── sparklyr-presentation-demos.020.jpeg └── sparklyr-presentation-demos.021.jpeg └── prod ├── apps ├── iris-k-means │ ├── DESCRIPTION │ ├── app.R │ ├── config.yml │ └── iris-parquet │ │ ├── ._SUCCESS.crc │ │ ├── ._common_metadata.crc │ │ ├── ._metadata.crc │ │ ├── .part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc │ │ ├── .part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc │ │ ├── _SUCCESS │ │ ├── _common_metadata │ │ ├── _metadata │ │ ├── part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet │ │ └── part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet ├── nycflights13-app-spark │ ├── DESCRIPTION │ ├── Readme.md │ ├── app.R │ └── config.yml └── titanic-classification │ ├── .gitignore │ ├── DESCRIPTION │ ├── app.R │ ├── helpers.R │ └── titanic-parquet │ ├── ._SUCCESS.crc │ ├── .part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc │ ├── _SUCCESS │ └── part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet ├── conf ├── config.yml └── shiny-server.conf ├── dashboards ├── diamonds-explorer │ ├── config.yml │ ├── diamonds-parquet │ │ ├── ._SUCCESS.crc │ │ ├── ._common_metadata.crc │ │ ├── ._metadata.crc │ │ ├── .part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc │ │ ├── .part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc │ │ ├── _SUCCESS │ │ ├── _common_metadata │ │ ├── _metadata │ │ ├── part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet │ │ └── part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet │ └── flexdashboard-shiny-diamonds.Rmd ├── ggplot2-brushing │ └── ggplot2Brushing.Rmd ├── nycflights13-dash-spark │ ├── config.yml │ └── nycflights13-dash-spark.Rmd └── tor-project │ ├── .gitignore │ ├── metricsgraphicsTorProject.Rmd │ └── metricsgraphicsTorProject.html ├── notebooks ├── babynames │ ├── .gitignore │ ├── babynames-dplyr.Rmd │ └── babynames-dplyr.nb.html ├── end-to-end-flights │ ├── end-to-end-flights-flexdashboard.Rmd │ ├── end-to-end-flights-htmldoc.html │ ├── end-to-end-flights.Rmd │ └── flights_pred_2008.RData ├── ml_classification_titanic │ ├── spark_ml_classification_titanic.Rmd │ ├── spark_ml_classification_titanic.html │ ├── spark_ml_classification_titanic.nb.html │ └── titanic-parquet │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── 
part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet └── taxi_demo │ ├── readme.md │ ├── taxiDemo.Rmd │ └── taxiDemo.nb.html └── presentations ├── cazena ├── 01_taxiR.Rmd ├── 02_taxiDemo.Rmd ├── 03_taxiGadget.Rmd ├── README.md ├── emr_setup.sh ├── kerberos.R ├── nyct2010.csv ├── sqlvis_histogram.R └── sqlvis_raster.R ├── cloudera ├── livy-architecture.png ├── livy.Rmd ├── readme.html ├── readme.md ├── sqlvis_histogram.R ├── sqlvis_raster.R └── taxiDemoCloudera.Rmd ├── sparkSummitEast ├── README.md ├── img │ ├── img.001.jpeg │ ├── img.002.jpeg │ ├── img.003.jpeg │ ├── img.004.jpeg │ ├── img.005.jpeg │ ├── img.006.jpeg │ ├── img.007.jpeg │ ├── img.008.jpeg │ ├── img.009.jpeg │ ├── img.010.jpeg │ ├── img.011.jpeg │ ├── img.012.jpeg │ ├── img.013.jpeg │ ├── img.014.jpeg │ ├── img.015.jpeg │ ├── img.016.jpeg │ └── img.017.jpeg ├── livy.Rmd ├── nyct2010.csv ├── sqlvis_histogram.R ├── sqlvis_raster.R └── taxiDemoH2O.Rmd └── tidyverse ├── 01_taxiR.Rmd ├── 02_taxiDemo.Rmd ├── 03_taxiGadget.Rmd ├── README.md ├── emr_setup.sh ├── img ├── tidyverse.001.jpeg ├── tidyverse.002.jpeg ├── tidyverse.003.jpeg ├── tidyverse.004.jpeg ├── tidyverse.005.jpeg ├── tidyverse.006.jpeg ├── tidyverse.007.jpeg ├── tidyverse.008.jpeg ├── tidyverse.009.jpeg ├── tidyverse.010.jpeg ├── tidyverse.011.jpeg ├── tidyverse.012.jpeg ├── tidyverse.013.jpeg ├── tidyverse.014.jpeg ├── tidyverse.015.jpeg └── tidyverse.016.jpeg ├── nyct2010.csv ├── sqlvis_histogram.R ├── sqlvis_raster.R └── tidyverseAndSpark.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | sparkDemos.Rproj 6 | rsconnect 7 | derby.log 8 | *.nb.html 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sparklyr Demos" 3 | output: html_document 4 | --- 5 | 6 | ![](img/sparklyr-presentation-demos.001.jpeg) 7 | 8 | *** 9 | 10 | ![](img/sparklyr-presentation-demos.002.jpeg) 11 | 12 | *** 13 | 14 | ![](img/sparklyr-presentation-demos.003.jpeg) 15 | 16 | *** 17 | 18 | ![](img/sparklyr-presentation-demos.004.jpeg) 19 | 20 | *** 21 | 22 | ![](img/sparklyr-presentation-demos.005.jpeg) 23 | 24 | *** 25 | 26 | ![](img/sparklyr-presentation-demos.006.jpeg) 27 | 28 | *** 29 | 30 | ![](img/sparklyr-presentation-demos.007.jpeg) 31 | 32 | *** 33 | 34 | ![](img/sparklyr-presentation-demos.008.jpeg) 35 | 36 | *** 37 | 38 | ![](img/sparklyr-presentation-demos.009.jpeg) 39 | 40 | *** 41 | 42 | ![](img/sparklyr-presentation-demos.010.jpeg) 43 | 44 | *** 45 | 46 | ![](img/sparklyr-presentation-demos.011.jpeg) 47 | 48 | *** 49 | 50 | ![](img/sparklyr-presentation-demos.012.jpeg) 51 | 52 | *** 53 | 54 | ![](img/sparklyr-presentation-demos.013.jpeg) 55 | 56 | *** 57 | 58 | ![](img/sparklyr-presentation-demos.014.jpeg) 59 | 60 | *** 61 | 62 | ![](img/sparklyr-presentation-demos.015.jpeg) 63 | 64 | *** 65 | 66 | ![](img/sparklyr-presentation-demos.016.jpeg) 67 | 68 | *** 69 | 70 | ![](img/sparklyr-presentation-demos.017.jpeg) 71 | 72 | *** 73 | 74 | ![](img/sparklyr-presentation-demos.018.jpeg) 75 | 76 | *** 77 | 78 | ![](img/sparklyr-presentation-demos.019.jpeg) 79 | 80 | *** 81 | 82 | ![](img/sparklyr-presentation-demos.020.jpeg) 83 | 84 | *** 85 | 86 | ![](img/sparklyr-presentation-demos.021.jpeg) 87 | -------------------------------------------------------------------------------- /dev/babynames/babynames-dplyr.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Analysis of babynames with dplyr" 3 | output: html_notebook 4 | --- 5 | 6 | Use dplyr syntax to write Apache Spark SQL queries. Use select, where, group by, joins, and window functions in Apache Spark SQL. 7 | 8 | ## Setup 9 | 10 | ```{r setup} 11 | knitr::opts_chunk$set(warning = FALSE, message = FALSE) 12 | library(sparklyr) 13 | library(dplyr) 14 | library(babynames) 15 | library(ggplot2) 16 | library(dygraphs) 17 | library(rbokeh) 18 | ``` 19 | 20 | ## Connect to Spark 21 | 22 | Install and connect to a local Spark instance. Copy data into Spark DataFrames. 23 | 24 | ```{r} 25 | #spark_install("2.0.0") 26 | sc <- spark_connect(master = "local", version = "2.0.0") 27 | babynames_tbl <- copy_to(sc, babynames, "babynames") 28 | applicants_tbl <- copy_to(sc, applicants, "applicants") 29 | ``` 30 | 31 | ## Total US births 32 | 33 | Plot total US births recorded from the Social Security Administration. 34 | 35 | ```{r} 36 | birthsYearly <- applicants_tbl %>% 37 | mutate(male = ifelse(sex == "M", n_all, 0), female = ifelse(sex == "F", n_all, 0)) %>% 38 | group_by(year) %>% 39 | summarize(Male = sum(male) / 1000000, Female = sum(female) / 1000000) %>% 40 | arrange(year) %>% 41 | collect 42 | 43 | birthsYearly %>% 44 | dygraph(main = "Total US Births (SSN)", ylab = "Millions") %>% 45 | dySeries("Female") %>% 46 | dySeries("Male") %>% 47 | dyOptions(stackedGraph = TRUE) %>% 48 | dyRangeSelector(height = 20) 49 | ``` 50 | 51 | ## Aggregate data by name 52 | 53 | Use Spark SQL to create a lookup table. Register and cache the lookup table in Spark for future queries. 54 | 55 | ```{r} 56 | topNames_tbl <- babynames_tbl %>% 57 | filter(year >= 1986) %>% 58 | group_by(name, sex) %>% 59 | summarize(count = as.numeric(sum(n))) %>% 60 | filter(count > 1000) %>% 61 | select(name, sex) 62 | 63 | filteredNames_tbl <- babynames_tbl %>% 64 | filter(year >= 1986) %>% 65 | inner_join(topNames_tbl) 66 | 67 | yearlyNames_tbl <- filteredNames_tbl %>% 68 | group_by(year, name, sex) %>% 69 | summarize(count = as.numeric(sum(n))) 70 | 71 | sdf_register(yearlyNames_tbl, "yearlyNames") 72 | tbl_cache(sc, "yearlyNames") 73 | ``` 74 | 75 | ## Most popular names (1986) 76 | 77 | Identify the top 5 male and female names from 1986. Visualize the popularity trend over time. 78 | 79 | ```{r} 80 | topNames1986_tbl <- yearlyNames_tbl %>% 81 | filter(year == 1986) %>% 82 | group_by(name, sex) %>% 83 | summarize(count = sum(count)) %>% 84 | group_by(sex) %>% 85 | mutate(rank = min_rank(desc(count))) %>% 86 | filter(rank < 5) %>% 87 | arrange(sex, rank) %>% 88 | select(name, sex, rank) %>% 89 | sdf_register("topNames1986") 90 | 91 | tbl_cache(sc, "topNames1986") 92 | 93 | topNames1986Yearly <- yearlyNames_tbl %>% 94 | inner_join(topNames1986_tbl) %>% 95 | collect 96 | 97 | ggplot(topNames1986Yearly, aes(year, count, color=name)) + 98 | facet_grid(~sex) + 99 | geom_line() + 100 | ggtitle("Most Popular Names of 1986") 101 | ``` 102 | 103 | ## Most popular names (2014) 104 | 105 | Identify the top 5 male and female names from 2014. Visualize the popularity trend over time.
106 | 107 | ```{r} 108 | topNames2014_tbl <- yearlyNames_tbl %>% 109 | filter(year == 2014) %>% 110 | group_by(name, sex) %>% 111 | summarize(count = sum(count)) %>% 112 | group_by(sex) %>% 113 | mutate(rank = min_rank(desc(count))) %>% 114 | filter(rank < 5) %>% 115 | arrange(sex, rank) %>% 116 | select(name, sex, rank) %>% 117 | sdf_register("topNames2014") 118 | 119 | tbl_cache(sc, "topNames2014") 120 | 121 | topNames2014Yearly <- yearlyNames_tbl %>% 122 | inner_join(topNames2014_tbl) %>% 123 | collect 124 | 125 | ggplot(topNames2014Yearly, aes(year, count, color=name)) + 126 | facet_grid(~sex) + 127 | geom_line() + 128 | ggtitle("Most Popular Names of 2014") 129 | ``` 130 | 131 | ## Shared names 132 | 133 | Visualize the most popular names that are shared by both males and females. 134 | 135 | ```{r} 136 | sharedName <- babynames_tbl %>% 137 | mutate(male = ifelse(sex == "M", n, 0), female = ifelse(sex == "F", n, 0)) %>% 138 | group_by(name) %>% 139 | summarize(Male = as.numeric(sum(male)), 140 | Female = as.numeric(sum(female)), 141 | count = as.numeric(sum(n)), 142 | AvgYear = round(as.numeric(sum(year * n) / sum(n)),0)) %>% 143 | filter(Male > 30000 & Female > 30000) %>% 144 | collect 145 | 146 | figure(width = NULL, height = NULL, 147 | xlab = "Log10 Number of Males", 148 | ylab = "Log10 Number of Females", 149 | title = "Top shared names (1880 - 2014)") %>% 150 | ly_points(log10(Male), log10(Female), data = sharedName, 151 | color = AvgYear, size = scale(sqrt(count)), 152 | hover = list(name, Male, Female, AvgYear), legend = FALSE) 153 | ``` -------------------------------------------------------------------------------- /dev/babynames/derby.log: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------- 2 | Wed Feb 15 12:46:01 UTC 2017: 3 | Booting Derby version The Apache Software Foundation - Apache Derby - 10.11.1.1 - (1616546): instance a816c00e-015a-41ce-df39-000016b90d28 4 | on database directory memory:/home/nathan/projects/spark/sparkDemos/dev/babynames/databaseName=metastore_db with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@53a3cfef 5 | Loaded from file:/home/nathan/.cache/spark/spark-2.0.0-bin-hadoop2.7/jars/derby-10.11.1.1.jar 6 | java.vendor=Oracle Corporation 7 | java.runtime.version=1.7.0_85-b01 8 | user.dir=/home/nathan/projects/spark/sparkDemos/dev/babynames 9 | os.name=Linux 10 | os.arch=amd64 11 | os.version=3.13.0-48-generic 12 | derby.system.home=null 13 | Database Class Loader started - derby.database.classpath='' 14 | -------------------------------------------------------------------------------- /dev/cloudera/bigvis_tile.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | bigvis_compute_tiles <- function(data, x_field, y_field, resolution = 500){ 4 | 5 | data_prep <- data %>% 6 | select_(x = x_field, y = y_field) %>% 7 | filter(!is.na(x), !is.na(y)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x), 11 | max_y = max(y), 12 | min_x = min(x), 13 | min_y = min(y)) %>% 14 | mutate(rng_x = max_x - min_x, 15 | rng_y = max_y - min_y) %>% 16 | collect() 17 | 18 | image_frame_pre <- data_prep %>% 19 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 20 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 21 | count(res_x, res_y) %>% 22 | collect 23 | 24 | image_frame_pre %>% 25 | rename(freq = n) %>% 26 | mutate(alpha = round(freq / max(freq), 2)) %>% 27 | 
rename_(.dots=setNames(list("res_x", "res_y"), c(x_field, y_field))) 28 | 29 | } 30 | 31 | bigvis_ggplot_tiles <- function(data){ 32 | data %>% 33 | select(x = 1, y = 2, Freq = 4) %>% 34 | ggplot(aes(x, y)) + 35 | geom_tile(aes(fill = Freq)) + 36 | xlab(colnames(data)[1]) + 37 | ylab(colnames(data)[2]) 38 | } 39 | -------------------------------------------------------------------------------- /dev/cloudera/livy-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/cloudera/livy-architecture.png -------------------------------------------------------------------------------- /dev/cloudera/livy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Connecting to Spark through Livy" 3 | output: html_notebook 4 | --- 5 | 6 | With Livy you can analyze data in your Spark cluster via R on your desktop. 7 | 8 | ## Livy 9 | 10 | Livy is a service that enables easy interaction with an Apache Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as SparkContext management, all via a simple REST interface or an RPC client library. Livy also simplifies the interaction between Spark and application servers, thus enabling the use of Spark for interactive web/mobile applications. 11 | 12 |
13 | ![Image](http://livy.io/img/livy-architecture.png) 14 |
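To make the REST exchange concrete, here is a minimal sketch (not part of the original demo) of the kind of HTTP calls a Livy client issues, written with the `httr` package. The host, port, and payloads are illustrative assumptions; when you connect with `method = "livy"`, sparklyr performs the equivalent exchange for you.

```{r, eval=FALSE}
library(httr)

livy_url <- "http://localhost:8998"  # assumed Livy host and port

# Create a new interactive Spark session on the cluster
session <- POST(paste0(livy_url, "/sessions"),
                body = list(kind = "spark"), encode = "json")
session_id <- content(session)$id

# Submit a snippet of Spark code to that session
POST(sprintf("%s/sessions/%s/statements", livy_url, session_id),
     body = list(code = "1 + 1"), encode = "json")

# Poll the statements endpoint for results, then tear the session down
content(GET(sprintf("%s/sessions/%s/statements", livy_url, session_id)))
DELETE(sprintf("%s/sessions/%s", livy_url, session_id))
```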
15 | 16 | ## Start Livy 17 | 18 | Set home environment variables and start a Livy server to handle local requests. 19 | 20 | ```{bash} 21 | export JAVA_HOME=/usr/lib/jvm/java-7-oracle-cloudera 22 | export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark 23 | /home/ubuntu/livy/livy-server-0.2.0/bin/livy-server 24 | ``` 25 | 26 | ## Connect to Spark 27 | 28 | Use `method = "livy"` to connect to the cluster. 29 | 30 | ```{r} 31 | library(sparklyr) 32 | library(dplyr) 33 | sc <- spark_connect( 34 | master = "http://ec2-***.us-west-2.compute.amazonaws.com:8998", 35 | method = "livy") 36 | ``` 37 | 38 | ## Analyze 39 | 40 | Use R code on your workstation as you normally would. Your R commands will be sent to the cluster via Livy for processing. Collect your results back to the desktop for further processing in R. 41 | 42 | ```{r} 43 | library(ggplot2) 44 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 45 | pickup_dropoff_tbl <- trips_model_data_tbl %>% 46 | filter(pickup_nta == "Turtle Bay-East Midtown" & dropoff_nta == "Airport") %>% 47 | mutate(pickup_hour = hour(pickup_datetime)) %>% 48 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 49 | group_by(pickup_hour) %>% 50 | summarize(n = n(), 51 | trip_time_mean = mean(trip_time), 52 | trip_time_p10 = percentile(trip_time, 0.10), 53 | trip_time_p25 = percentile(trip_time, 0.25), 54 | trip_time_p50 = percentile(trip_time, 0.50), 55 | trip_time_p75 = percentile(trip_time, 0.75), 56 | trip_time_p90 = percentile(trip_time, 0.90)) 57 | 58 | # Collect results 59 | pickup_dropoff <- collect(pickup_dropoff_tbl) 60 | 61 | # Plot 62 | ggplot(pickup_dropoff, aes(x = pickup_hour)) + 63 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 64 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, 65 | alpha = "25–75th percentile")) + 66 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, 67 | alpha = "10–90th percentile")) + 68 | scale_y_continuous("trip duration in minutes") 69 | ``` 70 | -------------------------------------------------------------------------------- /dev/cloudera/livy.sh: -------------------------------------------------------------------------------- 1 | export JAVA_HOME=/usr/lib/jvm/java-7-oracle-cloudera 2 | export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark 3 | /home/ubuntu/livy/livy-server-0.2.0/bin/livy-server 4 | -------------------------------------------------------------------------------- /dev/cloudera/livy_connection.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Livy Connection" 3 | output: html_notebook 4 | --- 5 | 6 | ```{bash} 7 | export JAVA_HOME=/usr/lib/jvm/java-7-oracle-cloudera 8 | export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark 9 | /home/ubuntu/livy/livy-server-0.2.0/bin/livy-server 10 | ``` 11 | -------------------------------------------------------------------------------- /dev/cloudera/spark_plot_hist.R: -------------------------------------------------------------------------------- 1 | spark_plot_hist <- function(data, 2 | x_field, 3 | breaks=30) 4 | { 5 | #----- Pre calculating the max x brings down the time considerably 6 | max_x <- data %>% 7 | select_(x=x_field) %>% 8 | summarise(xmax = max(x)) %>% 9 | collect() 10 | max_x <- max_x$xmax[1] 11 | 12 | #----- The entire function is one long pipe 13 | data %>% 14 | select_(x=x_field) %>% 15 | filter(!is.na(x)) %>% 16 | mutate(bucket = round(x/(max_x/(breaks-1)),0)) %>% 17 | group_by(bucket) %>% 18 | summarise(top=max(x), 19 | 
bottom=min(x), 20 | count=n()) %>% 21 | arrange(bucket) %>% 22 | collect %>% 23 | ggplot() + 24 | geom_bar(aes(x=((top-bottom)/2)+bottom, y=count), color="black", stat = "identity") + 25 | labs(x=x_field) + 26 | theme_minimal() + 27 | theme(legend.position="none")} -------------------------------------------------------------------------------- /dev/cloudera/spark_plot_point.R: -------------------------------------------------------------------------------- 1 | spark_plot_point<- function(data, 2 | x_field=NULL, 3 | y_field=NULL, 4 | color_field=NULL) 5 | { 6 | 7 | data %>% 8 | select_(x=x_field, y=y_field) %>% 9 | group_by(x,y) %>% 10 | tally() %>% 11 | collect() %>% 12 | ggplot() + 13 | geom_point(aes(x=x, y=y, color=n)) + 14 | labs(x=x_field, y=y_field) 15 | 16 | } -------------------------------------------------------------------------------- /dev/cloudera/spark_toolchain.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Science Tool Chain with Spark" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(sparklyr) 8 | library(dplyr) 9 | library(ggplot2) 10 | 11 | Sys.setenv(JAVA_HOME="/usr/lib/jvm/java-7-oracle-cloudera/") 12 | Sys.setenv(SPARK_HOME = '/opt/cloudera/parcels/CDH/lib/spark') 13 | 14 | conf <- spark_config() 15 | conf$spark.executor.cores <- 16 16 | conf$spark.executor.memory <- "24G" 17 | conf$spark.yarn.am.cores <- 16 18 | conf$spark.yarn.am.memory <- "24G" 19 | 20 | sc <- spark_connect(master = "yarn-client", version="1.6.0", config = conf) 21 | 22 | nyct2010_tbl <- tbl(sc, "nyct2010") 23 | trips_par_tbl <- tbl(sc, "trips_par") 24 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 25 | ``` 26 | 27 | ### Histogram 28 | 29 | ```{r} 30 | source("bigvis_histogram.R") 31 | 32 | bigvis_compute_histogram(nyct2010_tbl, "ct2010") %>% 33 | bigvis_ggplot_histogram 34 | 35 | ``` 36 | 37 | ### Tile plot 38 | 39 | ```{r} 40 | source("bigvis_tile.R") 41 | 42 | trips_model_data_tbl %>% 43 | bigvis_compute_tiles("pickup_longitude", "pickup_latitude", 500) %>% 44 | bigvis_ggplot_tiles 45 | 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /dev/cloudera/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = 
"cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /dev/cloudera/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) { 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) 
{ 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /dev/flights-cdh/flights_pred_2008.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/flights_pred_2008.RData -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/data-analysis-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/data-analysis-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/flex-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/flex-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/forecast-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/forecast-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/hue-metastore-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/hue-metastore-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/manager-landing-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/manager-landing-page.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/performance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/performance-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/sign-in-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/sign-in-1.png 
-------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/spark-history-server-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/spark-history-server-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/spark-pane-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/spark-pane-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/spark-rdd-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/spark-rdd-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/tables-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/tables-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/nycflights_flexdashboard.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach packages 13 | library(dplyr) 14 | library(ggplot2) 15 | library(DT) 16 | library(leaflet) 17 | library(geosphere) 18 | load('flights_pred_2008.RData') 19 | airports <- mutate(airports, lat = as.numeric(lat), lon = as.numeric(lon)) 20 | ``` 21 | 22 | 23 | Summary 24 | ======================================================================== 25 | 26 | Inputs {.sidebar} 27 | ----------------------------------------------------------------------- 28 | 29 | ### Select Airports 30 | 31 | ```{r} 32 | # Shiny inputs for flight origin and destination 33 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 34 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 35 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 36 | selectInput("dest", "Flight destination", carrier_dest, selected = "LAS") 37 | ``` 38 | 39 | ### Background 40 | 41 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 42 | your airline carrier will make up time en route? Some of the most significant factors 43 | for making up time are flight distance and airline carrier. The data model behind 44 | this dashboard was trained on US airline flights from 2003 to 2007 and scored on 2008 flights (see `sparkClusterDemo-source.R`).
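For context, the `pred_gain` values loaded from `flights_pred_2008.RData` come from a linear model fit on the Spark cluster by the companion script `sparkClusterDemo-source.R` (reproduced in full below). A condensed sketch of that fit, shown here for reference only and not run by the dashboard:

```{r, eval=FALSE}
# Condensed from sparkClusterDemo-source.R: the target is
# gain = depdelay - arrdelay, modeled on distance, departure delay, and carrier
model_partition <- model_data %>%
  sdf_partition(train = 0.8, valid = 0.2, seed = 5555)

ml1 <- model_partition$train %>%
  ml_linear_regression(gain ~ distance + depdelay + uniquecarrier)
```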
45 | 46 | 47 | Row 48 | ----------------------------------------------------------------------- 49 | 50 | ### Observed versus predicted time gain 51 | 52 | ```{r} 53 | # Aggregate time gain by carrier and by route 54 | plot_data <- reactive({ 55 | req(input$origin, input$dest) 56 | pred_data %>% 57 | filter(origin==input$origin & dest==input$dest) %>% 58 | ungroup() %>% 59 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 60 | }) 61 | 62 | # Plot observed versus predicted time gain for carriers and route 63 | renderPlot({ 64 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 65 | geom_bar(stat = "identity", fill = '#2780E3') + 66 | geom_point(aes(factor(airline), avg_gain)) + 67 | coord_flip() + 68 | labs(x = "", y = "Time gained in flight (minutes)") + 69 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 70 | }) 71 | ``` 72 | 73 | ### Route 74 | 75 | ```{r} 76 | # Identify origin lat and long 77 | origin <- reactive({ 78 | req(input$origin) 79 | filter(airports, faa == input$origin) 80 | }) 81 | 82 | # Identify destination lat and long 83 | dest <- reactive({ 84 | req(input$dest) 85 | filter(airports, faa == input$dest) 86 | }) 87 | 88 | # Plot route 89 | renderLeaflet({ 90 | gcIntermediate( 91 | select(origin(), lon, lat), 92 | select(dest(), lon, lat), 93 | n=100, addStartEnd=TRUE, sp=TRUE 94 | ) %>% 95 | leaflet() %>% 96 | addProviderTiles("CartoDB.Positron") %>% 97 | addPolylines() 98 | }) 99 | ``` 100 | 101 | Row 102 | ----------------------------------------------------------------------- 103 | 104 | ### Data details 105 | 106 | ```{r} 107 | # Print table of observed and predicted gains by airline 108 | renderDataTable( 109 | datatable(plot_data()) %>% 110 | formatRound(c("flights", "distance"), 0) %>% 111 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 112 | ) 113 | ``` 114 | 115 | Model Details 116 | ======================================================================== 117 | 118 | ```{r} 119 | renderPrint(ml1_summary) 120 | ``` 121 | -------------------------------------------------------------------------------- /dev/flights-cdh/sparkClusterDemo-source.R: -------------------------------------------------------------------------------- 1 | 2 | library(sparklyr) 3 | library(dplyr) 4 | library(ggplot2) 5 | 6 | Sys.setenv(HADOOP_CONF_DIR='/etc/hadoop/conf.cloudera.hdfs') 7 | Sys.setenv(YARN_CONF_DIR='/etc/hadoop/conf.cloudera.yarn') 8 | #Sys.setenv(SPARK_HOME="/home/ubuntu/spark-1.6.0") 9 | #Sys.setenv(SPARK_HOME_VERSION="1.6.0") 10 | 11 | sc <- spark_connect(master = "yarn-client", version="1.6.0", spark_home = '/opt/cloudera/parcels/CDH/lib/spark/') 12 | 13 | #--------------------------------------------------------- 14 | 15 | # Cache flights Hive table into Spark 16 | tbl_cache(sc, 'flights') 17 | flights_tbl <- tbl(sc, 'flights') 18 | 19 | # Cache airlines Hive table into Spark 20 | tbl_cache(sc, 'airlines') 21 | airlines_tbl <- tbl(sc, 'airlines') 22 | 23 | # Cache airports Hive table into Spark 24 | tbl_cache(sc, 'airports') 25 | airports_tbl <- tbl(sc, 'airports') 26 | 27 | #--------------------------------------------------------- 28 | 29 | # Filter records and create target variable 'gain' 30 | model_data <- flights_tbl %>% 31 | filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>% 32 | filter(depdelay > 15 & depdelay < 240) %>% 33 | filter(arrdelay > -60 & arrdelay < 360) %>% 34 | filter(year >= 2003 & year <= 2007) %>% 35 | left_join(airlines_tbl, by =
c("uniquecarrier" = "code")) %>% 36 | mutate(gain = depdelay - arrdelay) %>% 37 | select(year, month, arrdelay, depdelay, distance, uniquecarrier, description, gain) 38 | 39 | # Summarize data by carrier 40 | model_data %>% 41 | group_by(uniquecarrier) %>% 42 | summarize(description = min(description), gain=mean(gain), 43 | distance=mean(distance), depdelay=mean(depdelay)) %>% 44 | select(description, gain, distance, depdelay) %>% 45 | arrange(gain) 46 | 47 | #--------------------------------------------------------- 48 | 49 | # Partition the data into training and validation sets 50 | model_partition <- model_data %>% 51 | sdf_partition(train = 0.8, valid = 0.2, seed = 5555) 52 | 53 | # Fit a linear model 54 | ml1 <- model_partition$train %>% 55 | ml_linear_regression(gain ~ distance + depdelay + uniquecarrier) 56 | 57 | # Summarize the linear model 58 | summary(ml1) 59 | 60 | #--------------------------------------------------------- 61 | 62 | # Calculate average gains by predicted decile 63 | model_deciles <- lapply(model_partition, function(x) { 64 | sdf_predict(ml1, x) %>% 65 | mutate(decile = ntile(desc(prediction), 10)) %>% 66 | group_by(decile) %>% 67 | summarize(gain = mean(gain)) %>% 68 | select(decile, gain) %>% 69 | collect() 70 | }) 71 | 72 | # Create a summary dataset for plotting 73 | deciles <- rbind( 74 | data.frame(data = 'train', model_deciles$train), 75 | data.frame(data = 'valid', model_deciles$valid), 76 | make.row.names = FALSE 77 | ) 78 | 79 | # Plot average gains by predicted decile 80 | deciles %>% 81 | ggplot(aes(factor(decile), gain, fill = data)) + 82 | geom_bar(stat = 'identity', position = 'dodge') + 83 | labs(title = 'Average gain by predicted decile', x = 'Decile', y = 'Minutes') 84 | 85 | #--------------------------------------------------------- 86 | 87 | # Select data from an out of time sample 88 | data_2008 <- flights_tbl %>% 89 | filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>% 90 | filter(depdelay > 15 & depdelay < 240) %>% 91 | filter(arrdelay > -60 & arrdelay < 360) %>% 92 | filter(year == 2008) %>% 93 | left_join(airlines_tbl, by = c("uniquecarrier" = "code")) %>% 94 | mutate(gain = depdelay - arrdelay) %>% 95 | select(year, month, arrdelay, depdelay, distance, uniquecarrier, description, gain, origin,dest) 96 | 97 | # Summarize data by carrier 98 | carrier <- sdf_predict(ml1, data_2008) %>% 99 | group_by(description) %>% 100 | summarize(gain = mean(gain), prediction = mean(prediction), freq = n()) %>% 101 | filter(freq > 10000) %>% 102 | collect 103 | 104 | # Plot actual gains and predicted gains by airline carrier 105 | ggplot(carrier, aes(gain, prediction)) + 106 | geom_point(alpha = 0.75, color = 'red', shape = 3) + 107 | geom_abline(intercept = 0, slope = 1, alpha = 0.15, color = 'blue') + 108 | geom_text(aes(label = substr(description, 1, 20)), size = 3, alpha = 0.75, vjust = -1) + 109 | labs(title='Average Gains Forecast', x = 'Actual', y = 'Predicted') 110 | 111 | #--------------------------------------------------------- 112 | 113 | # Summarize by origin, destination, and carrier 114 | summary_2008 <- sdf_predict(ml1, data_2008) %>% 115 | rename(carrier = uniquecarrier, airline = description) %>% 116 | group_by(origin, dest, carrier, airline) %>% 117 | summarize( 118 | flights = n(), 119 | distance = mean(distance), 120 | avg_dep_delay = mean(depdelay), 121 | avg_arr_delay = mean(arrdelay), 122 | avg_gain = mean(gain), 123 | pred_gain = mean(prediction) 124 | ) 125 | 126 | # Collect and save objects 127 | pred_data <- 
collect(summary_2008) 128 | airports <- collect(select(airports_tbl, name, faa, lat, lon)) 129 | ml1_summary <- capture.output(summary(ml1)) 130 | save(pred_data, airports, ml1_summary, file = 'flights_pred_2008.RData') 131 | 132 | 133 | -------------------------------------------------------------------------------- /dev/flights/flightsApp/app.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | 5 | # Application title 6 | titlePanel("Old Faithful Geyser Data"), 7 | 8 | # Sidebar with a slider input for number of bins 9 | sidebarLayout( 10 | sidebarPanel( 11 | sliderInput("bins", 12 | "Number of bins:", 13 | min = 1, 14 | max = 50, 15 | value = 30) 16 | ), 17 | 18 | # Show a plot of the generated distribution 19 | mainPanel( 20 | plotOutput("distPlot") 21 | ) 22 | ) 23 | ) 24 | 25 | server <- function(input, output) { 26 | 27 | output$distPlot <- renderPlot({ 28 | # generate bins based on input$bins from ui.R 29 | x <- faithful[, 2] 30 | bins <- seq(min(x), max(x), length.out = input$bins + 1) 31 | 32 | # draw the histogram with the specified number of bins 33 | hist(x, breaks = bins, col = 'darkgray', border = 'white') 34 | }) 35 | } 36 | 37 | shinyApp(ui = ui, server = server) 38 | 39 | -------------------------------------------------------------------------------- /dev/flights/flightsApp2/global.R: -------------------------------------------------------------------------------- 1 | library(nycflights13) 2 | library(tibble) 3 | library(ggplot2) 4 | library(dplyr) 5 | library(sparklyr) 6 | library(lubridate) 7 | library(MASS) 8 | 9 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 10 | system.time(sc <- spark_connect(master = "yarn-client", version = '2.0.0')) 11 | 12 | # Cache airlines Hive table into Spark 13 | #system.time(tbl_cache(sc, 'airlines')) 14 | 15 | # We use a small subset of airlines in this application 16 | #system.time(airlines_tbl <- tbl(sc, 'airlines')) 17 | #system.time(airlines_tbl <- spark_read_csv(sc, "airlines", "hdfs:///airlines/airlines.csv", memory=TRUE)) 18 | #airlines_r <- airlines_tbl %>% arrange(description) %>% collect 19 | airlines_r <- tibble::tibble( 20 | code = c("B6", "UA", "AA", "DL", "WN", "US"), 21 | description = c("JetBlue Airways","United Air Lines Inc.", 22 | "American Airlines Inc." , "Delta Air Lines Inc.", 23 | "Southwest Airlines Co.","US Airways Inc.") 24 | ) 25 | 26 | # We use the airports from nycflights13 package in this application 27 | # airports_tbl <- copy_to(sc, nycflights13::airports, "airports", overwrite = TRUE) 28 | # airports <- airports_tbl %>% collect 29 | airports <- nycflights13::airports 30 | 31 | # Cache flights Hive table into Spark 32 | #system.time(tbl_cache(sc, 'flights')) 33 | #system.time(flights_tbl <- tbl(sc, 'flights')) 34 | 35 | #Instead of caching the flights data (which takes very long), we load the data in Parquet 36 | #format from HDFS. First the following 2 commented lines must be run to save the data. 
37 | #system.time(flights_tbl <- tbl(sc, 'flights')) 38 | #system.time(spark_write_parquet(flights_tbl, "hdfs:///flights-parquet-all")) 39 | system.time(flights_tbl <- spark_read_parquet(sc, "flights_s", "hdfs:///flights-parquet-all", memory=FALSE)) 40 | 41 | years <- tibble::tibble(year = c(1987:2008)) 42 | years_sub <- tibble::tibble(year = c(1999:2008)) 43 | dests <- c("LAX","ORD","ATL","HNL") 44 | 45 | delay <- flights_tbl %>% 46 | group_by(tailnum) %>% 47 | summarise(count = n(), 48 | dist = mean(distance), 49 | delay = mean(arrdelay), 50 | arrdelay_mean = mean(arrdelay), 51 | depdelay_mean = mean(depdelay)) %>% 52 | filter(count > 20, 53 | dist < 2000, 54 | !is.na(delay)) %>% 55 | collect 56 | 57 | -------------------------------------------------------------------------------- /dev/flights/flightsApp2/server.R: -------------------------------------------------------------------------------- 1 | library(shinydashboard) 2 | library(dplyr) 3 | library(maps) 4 | library(geosphere) 5 | library(lubridate) 6 | library(MASS) 7 | 8 | source("global.R") 9 | 10 | function(input, output, session) { 11 | 12 | selected_carriers <- reactive(input$airline_selections) 13 | selected_density <- reactive(input$density_selection) 14 | selected_year <- reactive(input$years_selection) 15 | selected_airline <- reactive(filter(airlines_r, description==input$carrier_selection)) 16 | selected_carrier <- reactive(selected_airline()$code) 17 | selected_dest_year <- reactive(input$years_dest_selection) 18 | selected_cancel_year <- reactive(input$years_cancel_selection) 19 | selected_day_year <- reactive(input$day_selection) 20 | 21 | output$yearsPlot <- renderPlot ({ 22 | xlim <- c(-171.738281, -56.601563) 23 | ylim <- c(12.039321, 71.856229) 24 | pal <- colorRampPalette(c("#f2f2f2", "red")) 25 | colors <- pal(100) 26 | map("world", col="#f2f2f2", fill=TRUE, bg="black", lwd=0.05, xlim=xlim, ylim=ylim) 27 | #map("world", col="#191919", fill=TRUE, bg="#000000", lwd=0.05, xlim=xlim, ylim=ylim) 28 | year_selected = selected_year() 29 | flights_count <- flights_tbl %>% filter(year == year_selected) %>% 30 | group_by(uniquecarrier, origin, dest) %>% 31 | summarize( count = n()) %>% 32 | collect 33 | flights_count$count <- unlist(flights_count$count) 34 | fsub <- filter(flights_count, uniquecarrier == selected_carrier(), count > 200) 35 | fsub <- fsub[order(fsub$count),] 36 | maxcnt <- max(fsub$count) 37 | for (j in 1:length(fsub$uniquecarrier)) { 38 | air1 <- airports[airports$faa == fsub[j,]$origin,] 39 | air2 <- airports[airports$faa == fsub[j,]$dest,] 40 | if (dim(air1)[1] != 0 & dim(air2)[1] != 0) { 41 | inter <- gcIntermediate(c(air1[1,]$lon, air1[1,]$lat), c(air2[1,]$lon, air2[1,]$lat), n=100, addStartEnd=TRUE) 42 | colindex <- round( (fsub[j,]$count / maxcnt) * length(colors) ) 43 | 44 | lines(inter, col=colors[colindex], lwd=0.8) 45 | lines(inter, col="black", lwd=0.8) 46 | } 47 | } 48 | 49 | }) 50 | 51 | output$densityPlot <- renderPlot ({ 52 | r <- ggplot(delay, aes_string("dist", selected_density())) + 53 | geom_point(aes(size = count), alpha = 1/2) + 54 | geom_smooth() + 55 | scale_size_area(max_size = 2) 56 | print(r) 57 | }) 58 | 59 | output$destPlot <- renderPlot ({ 60 | year_selected <- selected_dest_year() 61 | flights_by_dest <- flights_tbl %>% filter(year == year_selected) %>% 62 | filter(dest %in% dests) %>% 63 | group_by(dest, dayofweek, month, uniquecarrier) %>% 64 | select(dest, dayofweek, month, uniquecarrier) %>% 65 | collect 66 | d <- ggplot(data = flights_by_dest, aes(x = month, fill=dest)) 
+ stat_density() 67 | r <- ggplot(data = flights_by_dest) + 68 | geom_bar(mapping = aes(x = month, fill = dest), position = "dodge") 69 | print(d) 70 | }) 71 | 72 | output$cancelPlot <- renderPlot ({ 73 | c_year_selected <- selected_cancel_year() 74 | flights_cancelled <- flights_tbl %>% 75 | filter(year == c_year_selected) %>% 76 | group_by(dest, month, cancelled) %>% 77 | summarise( 78 | count = n(), 79 | delay = mean(arrdelay, na.rm = TRUE), 80 | arrdelay_mean = mean(arrdelay, na.rm = TRUE), 81 | depdelay_mean = mean(depdelay, na.rm = TRUE) 82 | ) %>% 83 | filter(count > 20, dest != "HNL", cancelled == 1) %>% 84 | collect 85 | 86 | c <- ggplot(flights_cancelled, aes_string("month", "count")) + 87 | geom_point(alpha = 1/2, position = "jitter") + 88 | geom_smooth() + 89 | scale_size_area(max_size = 2) 90 | print(c) 91 | }) 92 | 93 | output$dayPlot <- renderPlot ({ 94 | year_day_selected <- selected_day_year() 95 | flights_by_year <- flights_tbl %>% 96 | filter(year == year_day_selected, dest %in% dests) %>% 97 | group_by(year, month, dayofmonth, dest) %>% 98 | summarise(n = n()) %>% 99 | collect 100 | 101 | daily <- flights_by_year %>% 102 | mutate(date = make_datetime(year, month, dayofmonth)) %>% 103 | group_by(date) 104 | 105 | daily <- daily %>% 106 | mutate(wday = wday(date, label = TRUE)) 107 | 108 | d <- ggplot(daily, aes(wday, n, color=dest)) + 109 | geom_boxplot() 110 | print(d) 111 | }) 112 | } -------------------------------------------------------------------------------- /dev/flights/flightsApp2/ui.R: -------------------------------------------------------------------------------- 1 | library(shinydashboard) 2 | 3 | header <- dashboardHeader( 4 | title = "Flights Data Analysis" 5 | ) 6 | sidebar <- dashboardSidebar( 7 | sidebarMenu( 8 | menuItem("Flights by year and airline", tabName = "years"), 9 | menuItem("Delay Density", tabName = "delay_density"), 10 | menuItem("Cancelled flights", tabName = "cancelled"), 11 | menuItem("Flights by day of week", tabName = "dayofweek") 12 | ) 13 | ) 14 | 15 | 16 | body <- dashboardBody( 17 | tabItems( 18 | tabItem("years", 19 | fluidRow( 20 | column(width = 8, 21 | box(width = NULL, solidHeader = TRUE, 22 | plotOutput('yearsPlot') 23 | ) 24 | ), 25 | column(width = 3, 26 | box(width = NULL, status = "warning", 27 | uiOutput("years_selection"), 28 | radioButtons("years_selection", label = h3("Select a year"), 29 | years_sub$year, selected = 2000) 30 | ) 31 | ), 32 | column(width = 3, 33 | box(width = NULL, status = "warning", 34 | uiOutput("carrier_selection"), 35 | radioButtons("carrier_selection", label = h3("Select an airline"), 36 | airlines_r$description, selected = "American Airlines Inc.") 37 | ) 38 | ) 39 | 40 | ) 41 | ), 42 | tabItem("delay_density", 43 | fluidRow( 44 | column(width = 9, 45 | box(width = NULL, solidHeader = TRUE, 46 | plotOutput('densityPlot') 47 | ) 48 | ), 49 | column(width = 3, 50 | box(width = NULL, status = "warning", 51 | uiOutput("density_selection"), 52 | radioButtons("density_selection", label = h3("Select arrival or departure"), 53 | choices = c( 54 | Departure = "depdelay_mean", 55 | Arrival = "arrdelay_mean" 56 | ), 57 | selected = "arrdelay_mean") 58 | ) 59 | ) 60 | 61 | ) 62 | ), 63 | tabItem("cancelled", 64 | fluidRow( 65 | column(width = 9, 66 | box(width = NULL, solidHeader = TRUE, 67 | plotOutput('cancelPlot') 68 | ) 69 | ), 70 | column(width = 3, 71 | box(width = NULL, status = "warning", 72 | uiOutput("years_cancel_selection"), 73 | radioButtons("years_cancel_selection", label = h3("Select a
year"), 74 | years_sub$year, selected = 2008) 75 | ) 76 | ) 77 | ) 78 | ), 79 | tabItem("dayofweek", 80 | fluidRow( 81 | column(width = 9, 82 | box(width = NULL, solidHeader = TRUE, 83 | plotOutput('dayPlot') 84 | ) 85 | ), 86 | column(width = 3, 87 | box(width = NULL, status = "warning", 88 | uiOutput("day_selection"), 89 | radioButtons("day_selection", label = h3("Select a year"), 90 | years_sub$year, selected = 2008) 91 | ) 92 | ) 93 | ) 94 | ) 95 | 96 | ) 97 | ) 98 | 99 | dashboardPage( 100 | header, 101 | sidebar, 102 | body 103 | ) -------------------------------------------------------------------------------- /dev/flights/flights_pred_2008.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/flights_pred_2008.RData -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsClusterConnect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsClusterConnect.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsCreateCluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsCreateCluster.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsCreateCluster2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsCreateCluster2.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsNewSecurityGroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsNewSecurityGroup.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsSecurityGroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsSecurityGroup.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsSecurityGroup2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsSecurityGroup2.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrArchitecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrArchitecture.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrConfigStep1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrConfigStep1.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrConfigStep2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrConfigStep2.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrConfigStep3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrConfigStep3.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrConfigStep4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrConfigStep4.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrLogin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrLogin.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/flightsDashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/flightsDashboard.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/flightsDeciles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/flightsDeciles.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/flightsDecilesDesc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/flightsDecilesDesc.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/flightsPredicted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/flightsPredicted.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudio.png -------------------------------------------------------------------------------- 
/dev/flights/images/clusterDemo/rstudioData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioData.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudioLogin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioLogin.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudioModel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioModel.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudioModelDetail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioModelDetail.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudioSparkPane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioSparkPane.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/workflow.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/workflowCommands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/workflowCommands.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/workflowRSC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/workflowRSC.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/workflowShare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/workflowShare.png -------------------------------------------------------------------------------- /dev/flights/nycflights_flexdashboard.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach 
packages 13 | library(dplyr) 14 | library(ggplot2) 15 | library(DT) 16 | library(leaflet) 17 | library(geosphere) 18 | load('flights_pred_2008.RData') 19 | airports <- mutate(airports, lat = as.numeric(lat), lon = as.numeric(lon)) 20 | ``` 21 | 22 | 23 | Summary 24 | ======================================================================== 25 | 26 | Inputs {.sidebar} 27 | ----------------------------------------------------------------------- 28 | 29 | ### Select Airports 30 | 31 | ```{r} 32 | # Shiny inputs for flight origin and destination 33 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 34 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 35 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 36 | selectInput("dest", "Flight destination", carrier_dest, selected = "LAS") 37 | ``` 38 | 39 | ### Background 40 | 41 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 42 | your airline carrier will make up time en route? Some of the most significant factors 43 | for making up time are flight distance and airline carrier. The data model behind 44 | this dashboard is based on flights from NYC airports in 2013. 45 | 46 | 47 | Row 48 | ----------------------------------------------------------------------- 49 | 50 | ### Observed versus predicted time gain 51 | 52 | ```{r} 53 | # Aggregate time gain by carrier and by route 54 | plot_data <- reactive({ 55 | req(input$origin, input$dest) 56 | pred_data %>% 57 | filter(origin==input$origin & dest==input$dest) %>% 58 | ungroup() %>% 59 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 60 | }) 61 | 62 | # Plot observed versus predicted time gain for carriers and route 63 | renderPlot({ 64 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 65 | geom_bar(stat = "identity", fill = '#2780E3') + 66 | geom_point(aes(factor(airline), avg_gain)) + 67 | coord_flip() + 68 | labs(x = "", y = "Time gained in flight (minutes)") + 69 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 70 | }) 71 | ``` 72 | 73 | ### Route 74 | 75 | ```{r} 76 | # Identify origin lat and long 77 | origin <- reactive({ 78 | req(input$origin) 79 | filter(airports, faa == input$origin) 80 | }) 81 | 82 | # Identify destination lat and long 83 | dest <- reactive({ 84 | req(input$dest) 85 | filter(airports, faa == input$dest) 86 | }) 87 | 88 | # Plot route 89 | renderLeaflet({ 90 | gcIntermediate( 91 | select(origin(), lon, lat), 92 | select(dest(), lon, lat), 93 | n=100, addStartEnd=TRUE, sp=TRUE 94 | ) %>% 95 | leaflet() %>% 96 | addProviderTiles("CartoDB.Positron") %>% 97 | addPolylines() 98 | }) 99 | ``` 100 | 101 | Row 102 | ----------------------------------------------------------------------- 103 | 104 | ### Data details 105 | 106 | ```{r} 107 | # Print table of observed and predicted gains by airline 108 | renderDataTable( 109 | datatable(plot_data()) %>% 110 | formatRound(c("flights", "distance"), 0) %>% 111 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 112 | ) 113 | ``` 114 | 115 | Model Details 116 | ======================================================================== 117 | 118 | ```{r} 119 | renderPrint(ml1_summary) 120 | ``` 121 | -------------------------------------------------------------------------------- /dev/flights/nycflights_flexdashboard_spark.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3
| output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach packages 13 | library(dplyr) 14 | library(ggplot2) 15 | library(DT) 16 | library(leaflet) 17 | library(geosphere) 18 | library(sparklyr) 19 | library(dplyr) 20 | 21 | #Sys.setenv(SPARK_HOME = "/home/sean/.cache/spark/spark-1.6.2-bin-hadoop2.6") 22 | #sc <- spark_connect(master = "local", version = "1.6.2") 23 | #spark_read_csv(sc, "nyc_taxi_sample", path = "../../nathan/sol-eng-nyc-taxi-data/csv/trips/nyc_taxi_trips_2015-11.csv") 24 | 25 | # Connect to Spark 26 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 27 | config <- spark_config() 28 | sc <- spark_connect(master = "yarn-client", config = config, version = '1.6.2') 29 | pred_data_tbl <- tbl(sc, 'summary_2008') 30 | 31 | #load('flights_pred_2008.RData') 32 | #airports <- mutate(airports, lat = as.numeric(lat), lon = as.numeric(lon)) 33 | 34 | # Load summary data from flights forecast 35 | #pred_data_tbl <- tbl(sc, 'summary_2008') 36 | pred_data <- collect(pred_data_tbl) 37 | 38 | # Load airports data 39 | airports <- tbl(sc, 'airports') %>% 40 |   mutate(lat = as.numeric(lat), lon = as.numeric(lon)) %>% 41 |   collect 42 | ``` 43 | 44 | 45 | Summary 46 | ======================================================================== 47 | 48 | Inputs {.sidebar} 49 | ----------------------------------------------------------------------- 50 | 51 | ### Select Airports 52 | 53 | ```{r} 54 | # Shiny inputs for flight origin and destination 55 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 56 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 57 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 58 | selectInput("dest", "Flight destination", carrier_dest, selected = "LAS") 59 | ``` 60 | 61 | ### Background 62 | 63 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 64 | your airline carrier will make up time en route? Some of the most significant factors 65 | for making up time are flight distance and airline carrier. The data model behind 66 | this dashboard is based on flights from NYC airports in 2013.
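Because every reactive below queries `summary_2008` through this connection, it may be worth pinning the table in cluster memory once during setup. A minimal sketch, assuming the table fits in executor memory (`tbl_cache()` is the same sparklyr call used in this repo's taxi demos):

```{r, eval=FALSE}
# Optionally cache the summary table so repeated dplyr queries
# from the reactives do not rescan the underlying files
tbl_cache(sc, "summary_2008")
```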
67 | 68 | 69 | Row 70 | ----------------------------------------------------------------------- 71 | 72 | ### Observed versus predicted time gain 73 | 74 | ```{r} 75 | # Aggregate time gain by carrier and by route 76 | plot_data <- reactive({ 77 | req(input$origin, input$dest) 78 | pred_data %>% 79 | filter(origin==input$origin & dest==input$dest) %>% 80 | ungroup() %>% 81 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 82 | }) 83 | 84 | # Plot observed versus predicted time gain for carriers and route 85 | renderPlot({ 86 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 87 | geom_bar(stat = "identity", fill = '#2780E3') + 88 | geom_point(aes(factor(airline), avg_gain)) + 89 | coord_flip() + 90 | labs(x = "", y = "Time gained in flight (minutes)") + 91 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 92 | }) 93 | ``` 94 | 95 | ### Route 96 | 97 | ```{r} 98 | # Identify origin lat and long 99 | origin <- reactive({ 100 | req(input$origin) 101 | filter(airports, faa == input$origin) 102 | }) 103 | 104 | # Identify destination lat and long 105 | dest <- reactive({ 106 | req(input$dest) 107 | filter(airports, faa == input$dest) 108 | }) 109 | 110 | # Plot route 111 | renderLeaflet({ 112 | gcIntermediate( 113 | select(origin(), lon, lat), 114 | select(dest(), lon, lat), 115 | n=100, addStartEnd=TRUE, sp=TRUE 116 | ) %>% 117 | leaflet() %>% 118 | addProviderTiles("CartoDB.Positron") %>% 119 | addPolylines() 120 | }) 121 | ``` 122 | 123 | Row 124 | ----------------------------------------------------------------------- 125 | 126 | ### Data details 127 | 128 | ```{r} 129 | # Print table of observed and predicted gains by airline 130 | renderDataTable( 131 | datatable(plot_data()) %>% 132 | formatRound(c("flights", "distance"), 0) %>% 133 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 134 | ) 135 | ``` 136 | 137 | -------------------------------------------------------------------------------- /dev/flights/recode_for_prediction.R: -------------------------------------------------------------------------------- 1 | #data_2008 %>% group_by(crsarrtime) %>% summarize(freq = n()) %>% arrange(desc(freq)) 2 | #mutate(uniquecarrier = ifelse(crsarrtime == 351, "DH", uniquecarrier)) %>% 3 | #mutate(uniquecarrier = ifelse(crsarrtime == 120, "HP", uniquecarrier)) %>% 4 | #mutate(uniquecarrier = ifelse(crsarrtime == 347, "TZ", uniquecarrier)) %>% 5 | -------------------------------------------------------------------------------- /dev/h2o-demo/emr_h2o_setup.sh: -------------------------------------------------------------------------------- 1 | ### Build EMR for H2O 2 | ### Nathan Stephens 3 | ### 1/28/2017 4 | 5 | ########################################### 6 | ### Run as root 7 | ########################################### 8 | 9 | ## RSP 10 | 11 | # Update 12 | sudo yum update 13 | 14 | # R 15 | sudo yum install -y R libcurl-devel openssl-devel git 16 | 17 | # install RSP 18 | wget -q https://download2.rstudio.org/current.ver -O /tmp/rsp.current.ver 19 | wget -O /tmp/rstudio-server-rhel.rpm https://s3.amazonaws.com/rstudio-dailybuilds/rstudio-server-rhel-pro-$(cat /tmp/rsp.current.ver)-x86_64.rpm 20 | sudo yum install -y --nogpgcheck /tmp/rstudio-server-rhel.rpm 21 | 22 | # install packages 23 | sudo Rscript -e 'install.packages("sparklyr", repos = "http://cran.rstudio.com/")' 24 | sudo Rscript -e 'install.packages("devtools", repos = "http://cran.rstudio.com/")' 25 | sudo Rscript -e 'install.packages("tidyverse",
repos = "http://cran.rstudio.com/")' 26 | sudo Rscript -e 'install.packages("leaflet", repos = "http://cran.rstudio.com/")' 27 | sudo Rscript -e 'install.packages("DT", repos = "http://cran.rstudio.com/")' 28 | 29 | ########################################### 30 | 31 | ## add rstudio directory 32 | 33 | hadoop fs -mkdir /user/rstudio 34 | hadoop fs -chown rstudio:rstudio /user/rstudio 35 | 36 | ## Add rstudio user 37 | 38 | sudo useradd -m rstudio 39 | sudo echo rstudio | passwd rstudio --stdin 40 | sudo usermod -a -G hadoop rstudio 41 | sudo usermod -a -G hive rstudio 42 | 43 | 44 | ########################################### 45 | ### Run as rstudio 46 | ########################################### 47 | 48 | ## switch user 49 | su rstudio 50 | cd ~ 51 | 52 | ## clone project 53 | git clone https://github.com/rstudio/sparkDemos.git /home/rstudio/sparkDemos 54 | cat >/home/rstudio/sparkDemos/sparkDemos.Rproj <> nyct2010.log & 73 | nohup /usr/bin/s3-dist-cp --src=s3n://***/nyc-taxi/parquet_nohead/trips --dest=hdfs:///user/rstudio/trips_par >> trips_par.log & 74 | nohup /usr/bin/s3-dist-cp --src=s3n://***/nyc-taxi/parquet/trips_model_data --dest=hdfs:///user/rstudio/trips_model_data >> trips_model_data.log & 75 | 76 | 77 | ########################################### 78 | ### Open Hive 79 | ########################################### 80 | 81 | hive 82 | 83 | # Hive 1 84 | 85 | CREATE EXTERNAL TABLE IF NOT EXISTS nyct2010( 86 | gid int, 87 | ctlabel float, 88 | borocode int, 89 | boroname string, 90 | ct2010 int, 91 | boroct2010 int, 92 | cdeligibil string, 93 | ntacode string, 94 | ntaname string, 95 | puma int) 96 | ROW FORMAT DELIMITED 97 | FIELDS TERMINATED BY ',' 98 | LINES TERMINATED BY '\n' 99 | ; 100 | 101 | LOAD DATA INPATH '/user/rstudio/nyct2010' INTO TABLE nyct2010; 102 | 103 | # Hive 3 104 | 105 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_par( 106 | id int, 107 | cab_type_id int, 108 | vendor_id string, 109 | pickup_datetime timestamp, 110 | dropoff_datetime timestamp, 111 | store_and_fwd_flag string, 112 | rate_code_id string, 113 | pickup_longitude float, 114 | pickup_latitude float, 115 | dropoff_longitude float, 116 | dropoff_latitude float, 117 | passenger_count bigint, 118 | trip_distance float, 119 | fare_amount float, 120 | extra bigint, 121 | mta_tax string, 122 | tip_amount float, 123 | tolls_amount float, 124 | ehail_fee string, 125 | improvement_surcharge string, 126 | total_amount float, 127 | payment_type string, 128 | trip_type string, 129 | pickup_nyct2010_gid int, 130 | dropoff_nyct2010_gid int) 131 | stored as parquet; 132 | 133 | LOAD DATA INPATH '/user/rstudio/trips_par' INTO TABLE trips_par; 134 | 135 | 136 | # Hive 3 137 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_model_data( 138 | pickup_datetime timestamp, 139 | pickup_latitude float, 140 | pickup_longitude float, 141 | pickup_nyct2010_gid int, 142 | pickup_boro string, 143 | pickup_nta string, 144 | dropoff_datetime timestamp, 145 | dropoff_latitude float, 146 | dropoff_longitude float, 147 | dropoff_nyct2010_gid int, 148 | dropoff_boro string, 149 | dropoff_nta string, 150 | cab_type string, 151 | passenger_count bigint, 152 | trip_distance float, 153 | pay_type string, 154 | fare_amount float, 155 | tip_amount float, 156 | other_amount float, 157 | total_amount float) 158 | stored as parquet; 159 | 160 | LOAD DATA INPATH '/user/rstudio/trips_model_data' INTO TABLE trips_model_data; 161 | 162 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oHadoop.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Run H2O on Hadoop" 3 | output: html_notebook 4 | --- 5 | 6 | ```{bash} 7 | wget http://h2o-release.s3.amazonaws.com/h2o/rel-turnbull/2/h2o-3.10.1.2-hdp2.4.zip 8 | unzip h2o-3.10.1.2-hdp2.4.zip 9 | cd h2o-3.10.1.2-hdp2.4 10 | hadoop jar h2odriver.jar -nodes 4 -mapperXmx 6g -output hdfsOutputDirName3 11 | ``` 12 | 13 | ```{r} 14 | library(h2o) 15 | h2o.init("10.233.190.198") 16 | h2o.clusterStatus() 17 | ``` 18 | 19 | ```{r} 20 | write.table(iris, "iris.csv", quote = F, col.names = T, row.names = F, sep = ",") 21 | data <- h2o.importFile("iris.csv") 22 | data <- h2o.importFile("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data") 23 | data 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oModels.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "H2O Models" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | #devtools::install_github("rstudio/sparklyr") # used for sample_n 8 | ``` 9 | 10 | ```{r connect, message=FALSE, warning=FALSE} 11 | # Load libraries 12 | library(sparklyr) 13 | library(tidyverse) 14 | library(leaflet) 15 | library(rsparkling) 16 | library(h2o) 17 | library(DT) 18 | 19 | # Set environ vars 20 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 21 | 22 | options(rsparkling.sparklingwater.version = '2.0.3') 23 | 24 | # Configure cluster (c3.4xlarge 30G 16core 320disk) 25 | conf <- spark_config() 26 | conf$'sparklyr.shell.executor-memory' <- "20g" 27 | conf$'sparklyr.shell.driver-memory' <- "20g" 28 | conf$spark.executor.cores <- 16 29 | conf$spark.executor.memory <- "20G" 30 | conf$spark.yarn.am.cores <- 16 31 | conf$spark.yarn.am.memory <- "20G" 32 | conf$spark.executor.instances <- 4 33 | conf$spark.dynamicAllocation.enabled <- "false" 34 | conf$maximizeResourceAllocation <- "true" 35 | conf$spark.default.parallelism <- 32 36 | 37 | # Connect to cluster 38 | sc <- spark_connect(master = "yarn-client", config = conf, version = '2.0.0') 39 | 40 | # Check H2O 41 | h2o_context(sc) 42 | ``` 43 | 44 | ```{r} 45 | # Table ref 46 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 47 | model_tbl <- trips_model_data_tbl %>% 48 | filter(fare_amount > 0 & fare_amount < 20) %>% 49 | filter(tip_amount >= 0 & tip_amount < 5) %>% 50 | filter(passenger_count > 0 & passenger_count < 2) %>% 51 | select(tip_amount, fare_amount, pay_type, cab_type, passenger_count) 52 | trips_train_tbl <- sdf_register(model_tbl, "model_tbl") 53 | #tbl_cache(sc, "model_tbl") 54 | ``` 55 | 56 | ```{r convert} 57 | model_h2o_tbl <- as_h2o_frame(sc, trips_train_tbl) 58 | m2 <- h2o.glm(c("fare_amount", "pay_type", "cab_type", "passenger_count"), "tip_amount", model_h2o_tbl, alpha=0, lambda=0) 59 | summary(m2) 60 | 61 | #m3 <- h2o.deeplearning(c("fare_amount", "pay_type", "cab_type", "passenger_count"), "tip_amount", training_frame = model_h2o_tbl) 62 | #summary(m3) 63 | 64 | ``` 65 | 66 | ```{r model} 67 | model_formula <- formula(tip_amount ~ fare_amount + pay_type + cab_type + passenger_count) 68 | m1 <- ml_linear_regression(trips_train_tbl, model_formula) 69 | summary(m1) 70 | ``` 71 | 72 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oSetup.R: -------------------------------------------------------------------------------- 1 | ### rsparkling hello world 2 | ### requires R packages: statmod, RCurl, and devtools 3 | 4 | 
install.packages("h2o", type = "source", repos = "http://h2o-release.s3.amazonaws.com/h2o/rel-turnbull/2/R") 5 | install.packages("rsparkling") 6 | 7 | library(rsparkling) 8 | library(sparklyr) 9 | library(dplyr) 10 | library(h2o) 11 | 12 | options(rsparkling.sparklingwater.version = "2.0.3") 13 | 14 | conf <- spark_config() 15 | conf$'sparklyr.shell.executor-memory' <- "20g" 16 | conf$'sparklyr.shell.driver-memory' <- "20g" 17 | conf$spark.executor.cores <- 16 18 | conf$spark.executor.memory <- "20G" 19 | conf$spark.yarn.am.cores <- 16 20 | conf$spark.yarn.am.memory <- "20G" 21 | conf$spark.dynamicAllocation.enabled <- "false" 22 | 23 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 24 | sc <- spark_connect(master = "yarn-client", config = conf, version = "2.0.0") 25 | 26 | mtcars_tbl <- copy_to(sc, mtcars, overwrite = TRUE) 27 | mtcars_hf <- as_h2o_frame(sc, mtcars_tbl) 28 | 29 | glm_model <- h2o.glm(x = c("wt", "cyl"), 30 | y = "mpg", 31 | training_frame = mtcars_hf, 32 | lambda_search = TRUE) 33 | summary(glm_model) 34 | 35 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oSetup.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Set Up H2O" 3 | output: html_notebook 4 | --- 5 | 6 | ## Install 7 | 8 | ```{r} 9 | # Remove previous versions of h2o R package 10 | if ("package:h2o" %in% search()) detach("package:h2o", unload=TRUE) 11 | if ("h2o" %in% rownames(installed.packages())) remove.packages("h2o") 12 | 13 | # Next, we download R package dependencies 14 | pkgs <- c("methods","statmod","stats","graphics", 15 | "RCurl","jsonlite","tools","utils") 16 | for (pkg in pkgs) { 17 | if (!(pkg %in% rownames(installed.packages()))) install.packages(pkg) 18 | } 19 | 20 | # Download h2o package version 3.10.0.6 21 | install.packages("h2o", type = "source", 22 | repos = "http://h2o-release.s3.amazonaws.com/h2o/rel-turing/6/R") 23 | 24 | library(devtools) 25 | devtools::install_github("h2oai/rsparkling", ref = "stable") 26 | 27 | #spark_install(version = "1.6.0") # for local (documentation say v1.6.2) 28 | ``` 29 | 30 | ## Test 1 31 | 32 | ```{r} 33 | library(sparklyr) 34 | library(rsparkling) 35 | library(dplyr) 36 | 37 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 38 | #Sys.setenv(JAVA_HOME="/etc/alternatives/jre") 39 | 40 | conf <- spark_config() 41 | conf$'sparklyr.shell.executor-memory' <- "20g" 42 | conf$'sparklyr.shell.driver-memory' <- "20g" 43 | conf$spark.executor.cores <- 16 44 | conf$spark.executor.memory <- "20G" 45 | conf$spark.yarn.am.cores <- 16 46 | conf$spark.yarn.am.memory <- "20G" 47 | conf$spark.dynamicAllocation.enabled <- "false" 48 | options(rsparkling.sparklingwater.version = '1.6.7') 49 | 50 | sc <- spark_connect(master = "yarn-client", config = conf, version = '1.6.0') 51 | airlines_tbl <- tbl(sc, "airlines") 52 | h2oframe <- as_h2o_frame(sc, airlines_tbl) 53 | ``` 54 | 55 | ## Test 2 56 | 57 | ```{r} 58 | library(sparklyr) 59 | library(rsparkling) 60 | library(dplyr) 61 | library(h2o) 62 | 63 | mtcars_tbl <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE) 64 | partitions <- mtcars_tbl %>% 65 | filter(hp >= 100) %>% 66 | mutate(cyl8 = cyl == 8) %>% 67 | sdf_partition(training = 0.5, test = 0.5, seed = 1099) 68 | training <- as_h2o_frame(sc, partitions$training) 69 | test <- as_h2o_frame(sc, partitions$test) 70 | glm_model <- h2o.glm(x = c("wt", "cyl"), 71 | y = "mpg", 72 | training_frame = training, 73 | lambda_search = TRUE) 74 | print(glm_model) 75 | ``` 76 | 77 | ### Test 3 78 
| 79 | ```{r} 80 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 81 | trips_model_data_tbl %>% count 82 | trips_h2o <- as_h2o_frame(sc, trips_model_data_tbl) 83 | 84 | 85 | model_formula <- formula(tip_amount ~ fare_amount + pay_type + cab_type + passenger_count) 86 | m1 <- ml_linear_regression(trips_train_tbl, model_formula) 87 | summary(m1) 88 | 89 | ``` 90 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oSetup_2_0_0.R: -------------------------------------------------------------------------------- 1 | ### rsparkling hello world 2 | ### requires R packages: statmod, RCurl, and devtools 3 | 4 | install.packages("h2o", type = "source", repos = "http://h2o-release.s3.amazonaws.com/h2o/rel-turnbull/2/R") 5 | install.packages("rsparkling") 6 | 7 | library(rsparkling) 8 | library(sparklyr) 9 | library(dplyr) 10 | library(h2o) 11 | 12 | options(rsparkling.sparklingwater.version = "2.0.3") 13 | 14 | conf <- spark_config() 15 | conf$'sparklyr.shell.executor-memory' <- "20g" 16 | conf$'sparklyr.shell.driver-memory' <- "20g" 17 | conf$spark.executor.cores <- 16 18 | conf$spark.executor.memory <- "20G" 19 | conf$spark.yarn.am.cores <- 16 20 | conf$spark.yarn.am.memory <- "20G" 21 | conf$spark.dynamicAllocation.enabled <- "false" 22 | 23 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 24 | sc <- spark_connect(master = "yarn-client", config = conf, version = "2.0.0") 25 | 26 | mtcars_tbl <- copy_to(sc, mtcars, overwrite = TRUE) 27 | mtcars_hf <- as_h2o_frame(sc, mtcars_tbl) 28 | 29 | glm_model <- h2o.glm(x = c("wt", "cyl"), 30 | y = "mpg", 31 | training_frame = mtcars_hf, 32 | lambda_search = TRUE) 33 | summary(glm_model) 34 | 35 | -------------------------------------------------------------------------------- /dev/h2o-demo/iris.csv: -------------------------------------------------------------------------------- 1 | Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3,1.4,0.1,setosa 15 | 4.3,3,1.1,0.1,setosa 16 | 5.8,4,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5,3,1.6,0.2,setosa 28 | 5,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.2,setosa 37 | 5,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.6,1.4,0.1,setosa 40 | 4.4,3,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5,3.3,1.4,0.2,setosa 52 | 7,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 
4.9,2.4,3.3,1,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5,2,3.5,1,versicolor 63 | 5.9,3,4.2,1.5,versicolor 64 | 6,2.2,4,1,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3,5,1.7,versicolor 80 | 6,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6,2.7,5.1,1.6,versicolor 86 | 5.4,3,4.5,1.5,versicolor 87 | 6,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3,4.1,1.3,versicolor 91 | 5.5,2.5,4,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3,4.6,1.4,versicolor 94 | 5.8,2.6,4,1.2,versicolor 95 | 5,2.3,3.3,1,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3,5.8,2.2,virginica 107 | 7.6,3,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3,5.5,2.1,virginica 115 | 5.7,2.5,5,2,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6,2.2,5,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2,virginica 124 | 7.7,2.8,6.7,2,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3,6.1,2.3,virginica 138 | 6.3,3.4,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6,3,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3,5.2,2.3,virginica 148 | 6.3,2.5,5,1.9,virginica 149 | 6.5,3,5.2,2,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3,5.1,1.8,virginica 152 | -------------------------------------------------------------------------------- /dev/h2o-demo/livy.R: -------------------------------------------------------------------------------- 1 | library(sparklyr) 2 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 3 | sc <- spark_connect(master = "yarn-client", version = '2.0.0') 4 | livy_service_start() 5 | livy_service_stop() 6 | -------------------------------------------------------------------------------- /dev/h2o-demo/livy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Connecting to Spark through Livy" 3 | output: html_notebook 4 | --- 5 | 6 | With Livy you can analyze
data in your Spark cluster via R on your desktop. 7 | 8 | ## Livy 9 | 10 | Livy is a service that enables easy interaction with an Apache Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as SparkContext management, all via a simple REST interface or an RPC client library. Livy also simplifies the interaction between Spark and application servers, thus enabling the use of Spark for interactive web/mobile applications. 11 | 12 |
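For orientation, here is a hedged sketch of what those REST calls look like from R. The server URL, port (8998 is Livy's default), and session/statement ids below are illustrative assumptions, not values taken from this repo; sparklyr issues equivalent requests for you under the hood.

```{r, eval=FALSE}
library(httr)

# Open an interactive Spark session on the Livy server
resp <- POST("http://localhost:8998/sessions",
             body = list(kind = "spark"), encode = "json")

# Submit a snippet of Spark code to session 0 ...
POST("http://localhost:8998/sessions/0/statements",
     body = list(code = "sc.parallelize(1 to 10).sum()"), encode = "json")

# ... and poll for its result
GET("http://localhost:8998/sessions/0/statements/0")
```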
13 | ![](http://livy.io/img/livy-architecture.png) 14 |
15 | 16 | ## Start Livy [Server Side] 17 | 18 | Set home environment variables and start a Livy server to handle local requests. 19 | 20 | ```{r, eval=FALSE} 21 | sparklyr::livy_install() 22 | sparklyr::livy_service_start() 23 | ``` 24 | 25 | ## Connect to Spark [Client Side] 26 | 27 | Use `method = "livy"` to connect to the cluster. 28 | 29 | ```{r warning=FALSE} 30 | library(sparklyr) 31 | library(dplyr) 32 | sc <- spark_connect( 33 | master = "http://ec2-107-20-106-40.compute-1.amazonaws.com:8998/", 34 | method = "livy") 35 | ``` 36 | 37 | ## Analyze 38 | 39 | Use R code on your workstation as you normally would. Your R commands will be sent to the cluster via Livy for processing. Collect your results back to the desktop for further processing in R. 40 | 41 | ```{r} 42 | library(ggplot2) 43 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 44 | pickup_dropoff_tbl <- trips_model_data_tbl %>% 45 | filter(pickup_nta == "Turtle Bay-East Midtown" & dropoff_nta == "Airport") %>% 46 | mutate(pickup_hour = hour(pickup_datetime)) %>% 47 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 48 | group_by(pickup_hour) %>% 49 | summarize(n = n(), 50 | trip_time_mean = mean(trip_time), 51 | trip_time_p10 = percentile(trip_time, 0.10), 52 | trip_time_p25 = percentile(trip_time, 0.25), 53 | trip_time_p50 = percentile(trip_time, 0.50), 54 | trip_time_p75 = percentile(trip_time, 0.75), 55 | trip_time_p90 = percentile(trip_time, 0.90)) 56 | 57 | # Collect results 58 | pickup_dropoff <- collect(pickup_dropoff_tbl) 59 | 60 | # Plot 61 | ggplot(pickup_dropoff, aes(x = pickup_hour)) + 62 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 63 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, 64 | alpha = "25–75th percentile")) + 65 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, 66 | alpha = "10–90th percentile")) + 67 | scale_y_continuous("trip duration in seconds") 68 | ``` 69 | 70 | ## Disconnect 71 | 72 | ```{r disconnect} 73 | sparklyr::livy_service_stop() 74 | ``` 75 | 76 | 77 | -------------------------------------------------------------------------------- /dev/h2o-demo/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position =
"none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /dev/h2o-demo/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) { 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) 
{ 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /dev/h2o/01_h2o_setup.R: -------------------------------------------------------------------------------- 1 | library(devtools) 2 | library(sparklyr) 3 | 4 | # Remove previous versions of h2o R package 5 | if ("package:h2o" %in% search()) detach("package:h2o", unload=TRUE) 6 | if ("h2o" %in% rownames(installed.packages())) remove.packages("h2o") 7 | 8 | # Next, we download R package dependencies 9 | pkgs <- c("methods","statmod","stats","graphics", 10 | "RCurl","jsonlite","tools","utils") 11 | for (pkg in pkgs) { 12 | if (!(pkg %in% rownames(installed.packages()))) install.packages(pkg) 13 | } 14 | 15 | # Download h2o package version 3.10.0.6 16 | install.packages("h2o", type = "source", 17 | repos = "http://h2o-release.s3.amazonaws.com/h2o/rel-turing/6/R") 18 | 19 | # Install from github 20 | devtools::install_github("h2oai/sparkling-water", subdir = "/r/rsparkling") 21 | 22 | # Make sure spark is also installed in local mode 23 | spark_install(version = "1.6.2") 24 | -------------------------------------------------------------------------------- /dev/h2o/02_h2o_rsparkling.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Intro to H2O rsparkling" 3 | output: html_notebook 4 | --- 5 | 6 | ## Setup 7 | 8 | ```{r, message=FALSE, warning=FALSE} 9 | library(sparklyr) 10 | library(h2o) 11 | library(rsparkling) 12 | library(dplyr) 13 | library(ggplot2) 14 | 15 | # Connect 16 | sc <- spark_connect("local", version = "1.6.2") 17 | mtcars_tbl <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE) 18 | ``` 19 | 20 | ## Partition into test and training 21 | 22 | ```{r} 23 | # Transform our data set, and then partition into 'training', 'test' 24 | partitions <- mtcars_tbl %>% 25 | filter(hp >= 100) %>% 26 | mutate(cyl8 = cyl == 8) %>% 27 | sdf_partition(training = 0.5, test = 0.5, seed = 1099) 28 | 29 | # Convert to H20 Frame 30 | training <- as_h2o_frame(sc, partitions$training) 31 | test <- as_h2o_frame(sc, partitions$test) 32 | ``` 33 | 34 | ## Train a linear model 35 | 36 | ```{r} 37 | # Fit a linear model to the training dataset 38 | glm_model <- h2o.glm(x = c("wt", "cyl"), 39 | y = "mpg", 40 | training_frame = training, 41 | lambda_search = TRUE) 42 | # Examine model 43 | summary(glm_model) 44 | ``` 45 | 46 | ## Score test data and compare to actuals 47 | 48 | ```{r} 49 | # Compute predicted values on our test dataset 50 | pred <- h2o.predict(glm_model, newdata = test) 51 | 52 | # Extract the true 'mpg' values from our test dataset 53 | actual <- partitions$test %>% 54 | select(mpg) %>% 55 | rename(actual = mpg) 56 | 57 | # Collect the results 58 | data <- data.frame( 59 | collect(as_spark_dataframe(sc, pred)), 60 | collect(actual) 61 | ) 62 | ``` 63 | 64 | ## Plot predicted vs actuals values 65 | 66 | ```{r} 67 | # 
plot predicted vs. actual values 68 | ggplot(data, aes(x = actual, y = predict)) + 69 | geom_abline(lty = "dashed", col = "red") + 70 | geom_point() + 71 | theme(plot.title = element_text(hjust = 0.5)) + 72 | coord_fixed(ratio = 1) + 73 | labs( 74 | x = "Actual Fuel Consumption", 75 | y = "Predicted Fuel Consumption", 76 | title = "Predicted vs. Actual Fuel Consumption" 77 | ) 78 | ``` 79 | 80 | -------------------------------------------------------------------------------- /dev/h2o/03_h2o_ml.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "H2O Sparkling Water Machine Learning" 3 | output: html_notebook 4 | --- 5 | 6 | ## Setup 7 | 8 | ```{r, message=FALSE, warning=FALSE} 9 | library(rsparkling) 10 | library(dplyr) 11 | library(ggplot2) 12 | 13 | sc <- spark_connect("local", version = "1.6.2") 14 | iris_tbl <- copy_to(sc, iris, "iris", overwrite = TRUE) 15 | iris_hf <- as_h2o_frame(sc, iris_tbl) 16 | ``` 17 | 18 | ## K means clustering 19 | 20 | ```{r} 21 | kmeans_model <- h2o.kmeans(training_frame = iris_hf, 22 | x = 3:4, 23 | k = 3, 24 | seed = 1) 25 | h2o.centers(kmeans_model) 26 | h2o.centroid_stats(kmeans_model) 27 | ``` 28 | 29 | ## Logistic 30 | 31 | ```{r} 32 | beaver <- beaver2 33 | beaver$activ <- factor(beaver$activ, labels = c("Non-Active", "Active")) 34 | beaver_hf <- as.h2o(beaver) # Send data from R memory to H2O cluster 35 | 36 | y <- "activ" 37 | x <- setdiff(names(beaver_hf), y) 38 | glm_model <- h2o.glm(x = x, 39 | y = y, 40 | training_frame = beaver_hf, 41 | family = "binomial", 42 | nfolds = 3, 43 | seed = 1) 44 | 45 | h2o.performance(glm_model, xval = TRUE) 46 | ``` 47 | 48 | ## PCA 49 | 50 | ```{r} 51 | pca_model <- h2o.prcomp(training_frame = iris_hf, 52 | x = 1:4, 53 | k = 4, 54 | seed = 1) 55 | print(pca_model) 56 | ``` 57 | 58 | ## Random Forest 59 | 60 | ```{r} 61 | y <- "Species" 62 | x <- setdiff(names(iris_hf), y) 63 | iris_hf[,y] <- as.factor(iris_hf[,y]) 64 | 65 | splits <- h2o.splitFrame(iris_hf, seed = 1) 66 | 67 | rf_model <- h2o.randomForest(x = x, 68 | y = y, 69 | training_frame = splits[[1]], 70 | validation_frame = splits[[2]], 71 | nbins = 32, 72 | max_depth = 5, 73 | ntrees = 20, 74 | seed = 1) 75 | 76 | h2o.confusionMatrix(rf_model, valid = TRUE) 77 | 78 | h2o.varimp_plot(rf_model) 79 | ``` 80 | 81 | ## Gradient Boosted Model 82 | 83 | ```{r} 84 | gbm_model <- h2o.gbm(x = x, 85 | y = y, 86 | training_frame = splits[[1]], 87 | validation_frame = splits[[2]], 88 | ntrees = 20, 89 | max_depth = 3, 90 | learn_rate = 0.01, 91 | col_sample_rate = 0.7, 92 | seed = 1) 93 | 94 | h2o.confusionMatrix(gbm_model, valid = TRUE) 95 | 96 | path <- system.file("extdata", "prostate.csv", 97 | package = "h2o") 98 | 99 | prostate_hf <- h2o.importFile(path) 100 | str(prostate_hf) 101 | head(prostate_hf) 102 | 103 | splits <- h2o.splitFrame(prostate_hf, seed = 1) 104 | ``` 105 | 106 | ## Deep learning 107 | 108 | ```{r} 109 | y <- "VOL" 110 | x <- setdiff(names(prostate_hf), c("ID", y)) 111 | 112 | dl_fit <- h2o.deeplearning(x = x, y = y, 113 | training_frame = splits[[1]], 114 | epochs = 15, 115 | activation = "Rectifier", 116 | hidden = c(10, 5, 10), 117 | input_dropout_ratio = 0.7) 118 | 119 | h2o.performance(dl_fit, newdata = splits[[2]]) 120 | 121 | path <- system.file("extdata", "prostate.csv", package = "h2o") 122 | prostate_hf <- h2o.importFile(path) 123 | splits <- h2o.splitFrame(prostate_hf, seed = 1) 124 | ``` 125 | -------------------------------------------------------------------------------- 
/dev/h2o/04_h2o_grid.R: -------------------------------------------------------------------------------- 1 | ### 2 | 3 | y <- "VOL" 4 | #remove response and ID cols 5 | x <- setdiff(names(prostate_hf), c("ID", y)) 6 | 7 | # GBM hyperparamters 8 | gbm_params1 <- list(learn_rate = c(0.01, 0.1), 9 | max_depth = c(3, 5, 9), 10 | sample_rate = c(0.8, 1.0), 11 | col_sample_rate = c(0.2, 0.5, 1.0)) 12 | 13 | # Train and validate a grid of GBMs 14 | gbm_grid1 <- h2o.grid("gbm", x = x, y = y, 15 | grid_id = "gbm_grid1", 16 | training_frame = splits[[1]], 17 | validation_frame = splits[[1]], 18 | ntrees = 100, 19 | seed = 1, 20 | hyper_params = gbm_params1) 21 | 22 | # Get the grid results, sorted by validation MSE 23 | gbm_gridperf1 <- h2o.getGrid(grid_id = "gbm_grid1", 24 | sort_by = "mse", 25 | decreasing = FALSE) 26 | print(gbm_gridperf1) 27 | 28 | 29 | # GBM hyperparamters 30 | gbm_params2 <- list(learn_rate = seq(0.01, 0.1, 0.01), 31 | max_depth = seq(2, 10, 1), 32 | sample_rate = seq(0.5, 1.0, 0.1), 33 | col_sample_rate = seq(0.1, 1.0, 0.1)) 34 | search_criteria2 <- list(strategy = "RandomDiscrete", 35 | max_models = 50) 36 | 37 | # Train and validate a grid of GBMs 38 | gbm_grid2 <- h2o.grid("gbm", x = x, y = y, 39 | grid_id = "gbm_grid2", 40 | training_frame = splits[[1]], 41 | validation_frame = splits[[2]], 42 | ntrees = 100, 43 | seed = 1, 44 | hyper_params = gbm_params2, 45 | search_criteria = search_criteria2) 46 | 47 | # Get the grid results, sorted by validation MSE 48 | gbm_gridperf2 <- h2o.getGrid(grid_id = "gbm_grid2", 49 | sort_by = "mse", 50 | decreasing = FALSE) 51 | 52 | gbm_gridperf2@summary_table[1,] 53 | 54 | h2o.saveModel(gbm_model, path = "mymodel") 55 | 56 | h2o.download_pojo(gbm_model, path = "mymodel") 57 | -------------------------------------------------------------------------------- /dev/helloworld/derby.log: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------- 2 | Mon Sep 19 17:07:57 UTC 2016: 3 | Booting Derby version The Apache Software Foundation - Apache Derby - 10.11.1.1 - (1616546): instance a816c00e-0157-436b-2290-000014d05190 4 | on database directory memory:/home/nathan/spark/sparkDemos/dev/helloworld/databaseName=metastore_db with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@71526773 5 | Loaded from file:/home/nathan/.cache/spark/spark-2.0.0-bin-hadoop2.7/jars/derby-10.11.1.1.jar 6 | java.vendor=Oracle Corporation 7 | java.runtime.version=1.7.0_85-b01 8 | user.dir=/home/nathan/spark/sparkDemos/dev/helloworld 9 | os.name=Linux 10 | os.arch=amd64 11 | os.version=3.13.0-48-generic 12 | derby.system.home=null 13 | Database Class Loader started - derby.database.classpath='' 14 | -------------------------------------------------------------------------------- /dev/helloworld/helloWorld.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sparklyr" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(dplyr) 8 | library(sparklyr) 9 | 10 | sc <- spark_connect(master = "local", version = "2.0.0") 11 | iris_tbl <- copy_to(sc, iris, "iris") 12 | 13 | iris_tbl %>% 14 | group_by(Species) %>% 15 | summarize(n1 = as.numeric(n()), n2 = as.numeric(n())) 16 | ``` 17 | -------------------------------------------------------------------------------- /dev/hive/hiveJDBC.R: -------------------------------------------------------------------------------- 1 | #loading libraries 2 | 
library("DBI") 3 | library("rJava") 4 | library("RJDBC") 5 | 6 | #init of the classpath (works with hadoop 2.6 on CDH 5.4 installation) 7 | hivecp = c("/usr/lib/hive/lib/hive-jdbc.jar", "/usr/lib/hadoop/client/hadoop-common.jar", "/usr/lib/hive/lib/libthrift-0.9.2.jar", "/usr/lib/hive/lib/hive-service.jar", "/usr/lib/hive/lib/httpclient-4.2.5.jar", "/usr/lib/hive/lib/httpcore-4.2.5.jar", "/usr/lib/hive/lib/hive-jdbc-standalone.jar") 8 | .jinit(classpath=cp) 9 | 10 | #initialisation de la connexion 11 | drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/usr/lib/hive/lib/hive-jdbc.jar", identifier.quote="`") 12 | conn <- dbConnect(drv, "jdbc:hive2://localhost:10000/default", "myuser", "") 13 | 14 | #working with the connexion 15 | show_databases <- dbGetQuery(conn, "show databases") 16 | show_databases 17 | 18 | library("RJDBC") 19 | options( java.parameters = "-Xmx8g" ) 20 | drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/usr/lib/hive/lib/hive-jdbc.jar") 21 | conn <- dbConnect(drv, "jdbc:hive2://localhost:10000/default", "rstudio-user", "") 22 | sample_08 <- dbReadTable(conn, "airlines") 23 | 24 | 25 | jdbc:sqlserver://data.rsquaredltd.com\SandP 26 | jdbc:sqlserver://[serverName[\instanceName][:portNumber]][;property=value[;property=value]] 27 | 28 | install unixODBC unixODBC-devel 29 | -------------------------------------------------------------------------------- /dev/hive/hiveMetastore.R: -------------------------------------------------------------------------------- 1 | ### Connect to Spark 2 | library(sparklyr) 3 | library(dplyr) 4 | library(ggplot2) 5 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 6 | config <- spark_config() 7 | sc <- spark_connect(master = "yarn-client", config = config, version = '1.6.2') 8 | 9 | ### Load DBI 10 | library(DBI) 11 | 12 | ### Browse the Hive Metastore 13 | dbGetQuery(sc, "show databases") 14 | dbGetQuery(sc, "show tables in default") 15 | dbGetQuery(sc, "show tables in userdb") 16 | dbGetQuery(sc, "describe userdb.students") 17 | 18 | ### Create a new database, a new table, and insert data 19 | dbGetQuery(sc, "create database newdb") 20 | dbGetQuery(sc, "drop table if exists newdb.pageviews") 21 | dbGetQuery(sc, "create table newdb.pageviews (userid varchar(64), link string, came_from string)") 22 | dbGetQuery(sc, "insert into table newdb.pageviews values ('jsmith', 'mail.com', 'sports.com'), ('jdoe', 'mail.com', null)") 23 | 24 | ### This query does not work from R but works from the command prompt 25 | dbGetQuery(sc, "CREATE TABLE students (name VARCHAR(64), age INT, gpa DECIMAL(3, 2)) CLUSTERED BY (age) INTO 2 BUCKETS STORED AS ORC") 26 | 27 | dbGetQuery(sc, "use newdb") 28 | dbGetQuery(sc, "show tables in newdb") 29 | -------------------------------------------------------------------------------- /dev/hive/hiveMetastore.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Browse Hive Metastore" 3 | output: html_notebook 4 | --- 5 | 6 | ### Connect to Spark 7 | ```{r} 8 | library(sparklyr) 9 | library(dplyr) 10 | library(ggplot2) 11 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 12 | config <- spark_config() 13 | sc <- spark_connect(master = "yarn-client", config = config, version = '2.0.0') 14 | ``` 15 | 16 | ### Browse the Hive Metastore 17 | 18 | ```{r} 19 | library(DBI) 20 | dbGetQuery(sc, "show databases") 21 | dbGetQuery(sc, "show tables in default") 22 | dbGetQuery(sc, "show tables in userdb") 23 | dbGetQuery(sc, "describe userdb.students") 24 | ``` 25 | 26 | ### Create a new database, a new 
table, and insert data 27 | 28 | ```{r} 29 | dbGetQuery(sc, "drop table if exists newdb.pageviews") 30 | dbGetQuery(sc, "drop database if exists newdb") 31 | dbGetQuery(sc, "create database newdb") 32 | dbGetQuery(sc, "create table newdb.pageviews (userid varchar(64), link string, came_from string)") 33 | dbGetQuery(sc, "insert into table newdb.pageviews values ('jsmith', 'mail.com', 'sports.com'), ('jdoe', 'mail.com', null)") 34 | ``` 35 | 36 | ### This query does not work from R but does work from the command prompt 37 | 38 | ```{r} 39 | dbGetQuery(sc, "CREATE TABLE students (name VARCHAR(64), age INT, gpa DECIMAL(3, 2)) CLUSTERED BY (age) INTO 2 BUCKETS STORED AS ORC") 40 | ``` 41 | ``` 42 | Error: org.apache.spark.sql.catalyst.parser.ParseException: Operation not allowed: CREATE TABLE ... CLUSTERED BY(line 1, pos 0) == SQL == CREATE TABLE students (name VARCHAR(64), age INT, gpa DECIMAL(3, 2)) CLUSTERED BY (age) INTO 2 BUCKETS STORED AS ORC ^^^ at org.apache.spark.sql.catalyst.parser.ParserUtils$.operationNotAllowed(ParserUtils.scala:43) at org.apache.spark.sql.execution.SparkSqlAstBuilder$$anonfun$visitCreateTable$1.apply(SparkSqlParser.scala:913) at org.apache.spark.sql.execution.SparkSqlAstBuilder$$anonfun$visitCreateTable$1.apply(SparkSqlParser.scala:901) at org.apache.spark.sql.catalyst.parser.ParserUtils$.withOrigin(ParserUtils.scala:96) at org.apache.spark.sql.execution.SparkSqlAstBuilder.visitCreateTable(SparkSqlParser.scala:901) at org.apache.spark.sql.execution.SparkSqlAstBuilder.visitCreateTable(SparkSqlParser.scala:53) at org.apache.spark.sql.catalyst.parser.SqlBaseParser$CreateTableContext.accept(SqlBaseParser.java:474) at org.antlr.v4.runtime.tre 43 | ``` -------------------------------------------------------------------------------- /dev/nyc-taxi-data/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- /dev/nyc-taxi-data/taxiApp.R: -------------------------------------------------------------------------------- 1 | library(sparklyr) 2 | library(dplyr) 3 | library(shiny) 4 | 5 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 6 | config <- spark_config() 7 | sc <- spark_connect(master = "yarn-client", config = config, version = '1.6.1') 8 | 9 | tbl_cache(sc, 'trips_csv_2015_12') 10 | trips_tbl <- tbl(sc, 'trips_csv_2015_12') 11 | 12 | ui <- fluidPage( 13 | 14 | titlePanel("NYC Taxi Trips"), 15 | 16 | sidebarLayout( 17 | sidebarPanel( 18 | selectInput("hour", "Hour of the day", 0:23, 12) 19 | ), 20 | 21 | mainPanel( 22 | tableOutput("fare") 23 | ) 24 | ) 25 | ) 26 | 27 | server <- function(input, output) { 28 | 29 | fare <- reactive({ 30 | trips_tbl %>% 31 | mutate(pickup_hour = hour(pickup_datetime)) %>% 32 | filter(pickup_hour == input$hour) %>% 33 | summarize(fare_amount = mean(fare_amount)) %>% 34 | collect 35 | }) 36 | 37 | output$fare <- renderTable({ 38 | fare() 39 | }) 40 | 41 | } 42 | 43 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /dev/nyc-taxi-data/taxiApp/app.R: -------------------------------------------------------------------------------- 1 | 2 | global <- function() { 3 | 4 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 5 | config <- spark_config() 6 | sc <- spark_connect(master = "yarn-client", config = config, version = '1.6.1') 7 | 8 | tbl_cache(sc, 'trips_par') 9 | shiny_trips_tbl <<- tbl(sc, 'trips_par') 10 | 11 | distinct_gid <- function(data, gid, 
cutoff = 100000){ 12 | data %>% 13 | filter_(paste0("!is.na(", gid, ")")) %>% 14 | group_by_(gid) %>% 15 | count %>% 16 | filter(n > cutoff) %>% 17 | select_(gid) %>% 18 | arrange_(gid) %>% 19 | collect 20 | } 21 | 22 | pickup_nyct2010_gid <<- shiny_trips_tbl %>% 23 | distinct_gid("pickup_nyct2010_gid") %>% 24 | unlist %>% 25 | unname 26 | 27 | dropoff_nyct2010_gid <<- shiny_trips_tbl %>% 28 | distinct_gid("dropoff_nyct2010_gid") %>% 29 | unlist %>% 30 | unname 31 | 32 | } 33 | 34 | ui <- fluidPage( 35 | 36 | titlePanel("NYC Taxi Data"), 37 | 38 | sidebarLayout( 39 | sidebarPanel( 40 | selectInput("pickup", "Taxi origin", pickup_nyct2010_gid, 1250), 41 | selectInput("dropoff", "Taxi destination", dropoff_nyct2010_gid, 2056) 42 | ), 43 | 44 | mainPanel( 45 | plotOutput("distPlot") 46 | ) 47 | ) 48 | ) 49 | 50 | server <- function(input, output) { 51 | 52 | withProgress(message = "dplyr:", detail = "filter, mutate, summarize", { 53 | 54 | shiny_pickup_dropoff <- reactive({ 55 | shiny_trips_tbl %>% 56 | filter(pickup_nyct2010_gid == input$pickup & dropoff_nyct2010_gid == input$dropoff) %>% 57 | mutate(pickup_hour = hour(pickup_datetime)) %>% 58 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 59 | group_by(pickup_hour) %>% 60 | summarize(n = n(), 61 | trip_time_p10 = percentile(trip_time, 0.10), 62 | trip_time_p25 = percentile(trip_time, 0.25), 63 | trip_time_p50 = percentile(trip_time, 0.50), 64 | trip_time_p75 = percentile(trip_time, 0.75), 65 | trip_time_p90 = percentile(trip_time, 0.90)) %>% 66 | collect 67 | }) 68 | 69 | }) 70 | 71 | output$distPlot <- renderPlot({ 72 | ggplot(shiny_pickup_dropoff(), aes(x = pickup_hour)) + 73 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 74 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, alpha = "25–75th percentile")) + 75 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, alpha = "10–90th percentile")) + 76 | scale_y_continuous("trip duration in seconds") + 77 | ggtitle(paste("Pickup = ", input$pickup, ";", "Dropoff =", input$dropoff)) 78 | }) 79 | 80 | } 81 | 82 | shinyApp(ui = ui, server = server, onStart = global) 83 | 84 | -------------------------------------------------------------------------------- /dev/nyc-taxi-data/taxiDashboard.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NYC Taxi" 3 | author: "Nathan Stephens" 4 | output: 5 | flexdashboard::flex_dashboard: 6 | orientation: columns 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | library(ggplot2) 13 | library(flexdashboard) 14 | library(shiny) 15 | library(leaflet) 16 | ``` 17 | 18 | Detail 19 | ======================================================================= 20 | 21 | Inputs {.sidebar} 22 | ----------------------------------------------------------------------- 23 | 24 | ### NTA Code 25 | 26 | Select a neighborhood tabulation area (NTA) code to describe.
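The sidebar inputs in this dashboard are placeholders. A hedged sketch of how real NTA choices could be pulled from the cluster instead, assuming a sparklyr connection `sc` and the `trips_model_data` table used elsewhere in this repo:

```{r, eval=FALSE}
# Hypothetical: populate the input from distinct NTA names in Spark
nta_choices <- tbl(sc, "trips_model_data") %>%
  distinct(pickup_nta) %>%
  arrange(pickup_nta) %>%
  collect()
# selectInput('var1', 'Select NTA Code', nta_choices$pickup_nta)
```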
27 | 28 | ```{r} 29 | selectInput('var1','Select NTA Code',list('a'=1,'b'=2,'c'=3),1) 30 | ``` 31 | 32 | Column 33 | ----------------------------------------------------------------------- 34 | 35 | ### Pickups and dropoffs by hour 36 | 37 | ```{r} 38 | matplot(1:24, matrix(rnorm(48),24,2), type = 'l', col = 2:3, lty = 1) 39 | ``` 40 | 41 | ### Map 42 | 43 | ```{r} 44 | leaflet() %>% addTiles() %>% setView(-73.946832999999998,40.784374999999997, 12) 45 | ``` 46 | 47 | Column 48 | ----------------------------------------------------------------------- 49 | 50 | ### Cab Type 51 | 52 | ```{r} 53 | barplot(1:3, col = 2:5) 54 | ``` 55 | 56 | ### Distance 57 | 58 | ```{r} 59 | hist(rnorm(50), col='grey') 60 | ``` 61 | 62 | ### Cost 63 | 64 | ```{r} 65 | hist(rnorm(50), col='grey') 66 | ``` 67 | 68 | Route 69 | ======================================================================= 70 | 71 | Inputs {.sidebar} 72 | ----------------------------------------------------------------------- 73 | 74 | ### NTA Code 75 | 76 | ```{r} 77 | selectInput('var3','Select pickup',list('a'=1,'b'=2,'c'=3),1) 78 | 79 | selectInput('var4','Select dropoff',list('a'=1,'b'=2,'c'=3),1) 80 | ``` 81 | 82 | Column 83 | ----------------------------------------------------------------------- 84 | 85 | ### Travel time by hour 86 | 87 | ```{r} 88 | matplot(1:24, matrix(rnorm(48),24,2), type = 'l', col = 2:3, lty = 1) 89 | ``` 90 | 91 | ### Map 92 | 93 | ```{r} 94 | leaflet() %>% addTiles() %>% setView(-73.946832999999998,40.784374999999997, 12) 95 | ``` 96 | 97 | Column 98 | ----------------------------------------------------------------------- 99 | 100 | ### Cab Type 101 | 102 | ```{r} 103 | barplot(1:3, col = 2:5) 104 | ``` 105 | 106 | ### Distance 107 | 108 | ```{r} 109 | hist(rnorm(50), col='grey') 110 | ``` 111 | 112 | ### Cost 113 | 114 | ```{r} 115 | hist(rnorm(50), col='grey') 116 | ``` 117 | 118 | Pickups and Dropoffs 119 | ======================================================================= 120 | 121 | Inputs {.sidebar} 122 | ----------------------------------------------------------------------- 123 | 124 | ### NTA Code 125 | 126 | ```{r} 127 | selectInput('var5','Select dropoff',list('a'=1,'b'=2,'c'=3),1) 128 | ``` 129 | 130 | Column 131 | ----------------------------------------------------------------------- 132 | 133 | ### Pickup 134 | 135 | ```{r} 136 | leaflet() %>% addTiles() %>% setView(-73.983895000000004,40.723072000000002, 12) 137 | ``` 138 | 139 | 140 | Column 141 | ----------------------------------------------------------------------- 142 | 143 | ### Dropoff 144 | 145 | ```{r} 146 | leaflet() %>% addTiles() %>% setView(-73.961844999999997,40.767837999999998, 12) 147 | ``` 148 | 149 | -------------------------------------------------------------------------------- /dev/nycflights13/.gitignore: -------------------------------------------------------------------------------- 1 | rsconnect 2 | derby.log 3 | -------------------------------------------------------------------------------- /dev/nycflights13/nycflights13_flexdashboard_rdata.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach packages 13 | library(nycflights13) 14 | library(dplyr) 15 | library(ggplot2) 16 | library(DT) 17 | library(leaflet) 18 | library(geosphere) 19 | library(readr) 
20 | 21 | # Attach data 22 | data(flights) 23 | data(airports) 24 | ``` 25 | 26 | ```{r include=F} 27 | # Prepare model data 28 | model_data <- flights %>% 29 | filter(!is.na(arr_delay) & !is.na(dep_delay) & !is.na(distance)) %>% 30 | filter(dep_delay > 15 & dep_delay < 240) %>% 31 | filter(arr_delay > -60 & arr_delay < 360) %>% 32 | left_join(airlines, by = c("carrier" = "carrier")) %>% 33 | mutate(gain = dep_delay - arr_delay) %>% 34 | select(origin, dest, carrier, airline = name, distance, dep_delay, arr_delay, gain) 35 | 36 | # Training and validation 37 | set.seed(777) 38 | ind <- sample(n <- nrow(model_data), floor(n * 0.5)) 39 | train_data <- model_data[ind, ] 40 | valid_data <- model_data[-ind, ] 41 | 42 | # Model time gained as a function of distance, departure delay, and airline carrier 43 | lm1 <- lm(gain ~ distance + dep_delay + carrier, train_data) 44 | 45 | # Score data and aggregate flight route and carrier 46 | pred_data <- valid_data %>% 47 | mutate(pred = predict.lm(lm1, valid_data)) %>% 48 | group_by(origin, dest, carrier, airline) %>% 49 | summarize( 50 | flights = n(), 51 | distance = mean(distance), 52 | avg_dep_delay = mean(dep_delay), 53 | avg_arr_delay = mean(arr_delay), 54 | avg_gain = mean(gain), 55 | pred_gain = mean(pred) 56 | ) 57 | ``` 58 | 59 | Summary 60 | ======================================================================== 61 | 62 | Inputs {.sidebar} 63 | ----------------------------------------------------------------------- 64 | 65 | ### Select Airports 66 | 67 | ```{r} 68 | # Shiny inputs for flight origin and destination 69 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 70 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 71 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 72 | selectInput("dest", "Flight destination", carrier_dest, selected = "SFO") 73 | ``` 74 | 75 | ### Background 76 | 77 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 78 | your airline carrier will make up time en route? Some of the most significant factors 79 | for making up time are flight distance and airline carrier. The data model behind 80 | this dashboard is based on flights from NYC airports in 2013. 
81 | 82 | 83 | Row 84 | ----------------------------------------------------------------------- 85 | 86 | ### Observed versus predicted time gain 87 | 88 | ```{r} 89 | # Aggregate time gain by carrier and by route 90 | plot_data <- reactive({ 91 | req(input$origin, input$dest) 92 | pred_data %>% 93 | filter(origin==input$origin & dest==input$dest) %>% 94 | ungroup() %>% 95 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 96 | }) 97 | 98 | # Plot observed versus predicted time gain for carriers and route 99 | renderPlot({ 100 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 101 | geom_bar(stat = "identity", fill = '#2780E3') + 102 | geom_point(aes(factor(airline), avg_gain)) + 103 | coord_flip() + 104 | labs(x = "", y = "Time gained in flight (minutes)") + 105 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 106 | }) 107 | ``` 108 | 109 | ### Route 110 | 111 | ```{r} 112 | # Identify origin lat and long 113 | origin <- reactive({ 114 | req(input$origin) 115 | filter(airports, faa == input$origin) 116 | }) 117 | 118 | # Identify destination lat and long 119 | dest <- reactive({ 120 | req(input$dest) 121 | filter(airports, faa == input$dest) 122 | }) 123 | 124 | # Plot route 125 | renderLeaflet({ 126 | gcIntermediate( 127 | select(origin(), lon, lat), 128 | select(dest(), lon, lat), 129 | n=100, addStartEnd=TRUE, sp=TRUE 130 | ) %>% 131 | leaflet() %>% 132 | addProviderTiles("CartoDB.Positron") %>% 133 | addPolylines() 134 | }) 135 | ``` 136 | 137 | Row 138 | ----------------------------------------------------------------------- 139 | 140 | ### Data details 141 | 142 | ```{r} 143 | # Print table of observed and predicted gains by airline 144 | renderDataTable( 145 | datatable(plot_data()) %>% 146 | formatRound(c("flights", "distance"), 0) %>% 147 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 148 | ) 149 | ``` 150 | 151 | Model Output 152 | ======================================================================== 153 | 154 | ```{r} 155 | renderPrint(summary(lm1)) 156 | ``` -------------------------------------------------------------------------------- /dev/titanic/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | notebook-classification_v1.nb.html 3 | notebook-classification_v1.Rmd 4 | -------------------------------------------------------------------------------- /dev/titanic/rmarkdown-classification_files/figure-html/auc-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/rmarkdown-classification_files/figure-html/auc-1.png -------------------------------------------------------------------------------- /dev/titanic/rmarkdown-classification_files/figure-html/importance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/rmarkdown-classification_files/figure-html/importance-1.png -------------------------------------------------------------------------------- /dev/titanic/rmarkdown-classification_files/figure-html/lift-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/rmarkdown-classification_files/figure-html/lift-1.png -------------------------------------------------------------------------------- /dev/titanic/titanic-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /dev/titanic/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc -------------------------------------------------------------------------------- /dev/titanic/titanic-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/titanic-parquet/_SUCCESS -------------------------------------------------------------------------------- /dev/titanic/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet -------------------------------------------------------------------------------- /img/sparklyr-illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-illustration.png -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.001.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.001.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.002.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.002.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.003.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.003.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.004.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.004.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.005.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.005.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.006.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.006.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.007.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.007.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.008.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.008.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.009.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.009.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.010.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.010.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.011.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.011.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.012.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.012.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.013.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.013.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.014.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.014.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.015.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.015.jpeg 
-------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.016.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.016.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.017.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.017.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.018.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.018.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.019.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.019.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.020.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.020.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.021.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.021.jpeg -------------------------------------------------------------------------------- /prod/apps/iris-k-means/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Title: Iris with Spark Backend 2 | Author: RStudio, Inc. 
3 | AuthorUrl: http://www.rstudio.com/ 4 | License: GPL-3 5 | DisplayMode: Showcase 6 | Tags: sparklyr 7 | Type: Shiny 8 | -------------------------------------------------------------------------------- /prod/apps/iris-k-means/app.R: -------------------------------------------------------------------------------- 1 | library(sparklyr) 2 | library(dplyr) 3 | library(shiny) 4 | 5 | #Connect to Spark 6 | sc <- spark_connect(master = "local") 7 | 8 | #Read in Parquet Data 9 | spark_read_parquet(sc, "iris", "iris-parquet") 10 | iris_tbl <- tbl(sc, "iris") 11 | opts <- tbl_vars(iris_tbl)[-which(tbl_vars(iris_tbl) == "Species")] 12 | 13 | ui <- pageWithSidebar( 14 | headerPanel('Iris k-means clustering'), 15 | sidebarPanel( 16 | selectInput('xcol', 'X Variable', opts), 17 | selectInput('ycol', 'Y Variable', opts, 18 | selected = opts[2]), 19 | numericInput('clusters', 'Cluster count', 3, 20 | min = 2, max = 9) 21 | ), 22 | mainPanel( 23 | plotOutput('plot1') 24 | ) 25 | ) 26 | 27 | server <- function(input, output, session) { 28 | 29 | # Nothing is evaluated in Spark at this step 30 | selectedData <- reactive({ 31 | iris_tbl %>% select_(input$xcol, input$ycol) 32 | }) 33 | 34 | # The Spark data frame is constructed and kmeans is run 35 | clusters <- reactive({ 36 | selectedData() %>% 37 | ml_kmeans(centers = input$clusters) 38 | }) 39 | 40 | output$plot1 <- renderPlot({ 41 | par(mar = c(5.1, 4.1, 0, 1)) 42 | 43 | #score the results in Spark, pull in results to R 44 | scored <- predict(clusters(), iris_tbl) + 1 45 | 46 | #collect brings the data into R 47 | selectedData() %>% 48 | collect() %>% 49 | plot(col = scored, 50 | pch = 20, cex = 4) 51 | 52 | points(clusters()$centers, 53 | pch = 4, cex = 4, lwd = 4) 54 | }) 55 | 56 | } 57 | 58 | shinyApp(ui = ui, server = server) 59 | -------------------------------------------------------------------------------- /prod/apps/iris-k-means/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 2G 4 | -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/._common_metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/._common_metadata.crc -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/._metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/._metadata.crc -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/.part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/.part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc 
-------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/.part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/.part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/_SUCCESS -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/_common_metadata -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/_metadata -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet -------------------------------------------------------------------------------- /prod/apps/nycflights13-app-spark/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Title: NYCFlights13 Time Gained in Flight 2 | Author: RStudio, Inc. 3 | AuthorUrl: http://www.rstudio.com/ 4 | License: GPL-3 5 | DisplayMode: Showcase 6 | Tags: sparklyr 7 | Type: Shiny 8 | -------------------------------------------------------------------------------- /prod/apps/nycflights13-app-spark/Readme.md: -------------------------------------------------------------------------------- 1 | Given that your flight was delayed by 15 minutes or more, what is the likelihood your airline carrier will make up time en route? Some of the most significant factors for making up time are flight distance and airline carrier. The data model behind this dashboard is based on flights from NYC airports in 2013. 
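
To make the metric concrete before the app code below: the "gain" modeled throughout these apps is simply departure delay minus arrival delay, so a positive gain means the carrier made up time en route. A tiny base-R illustration with hypothetical delays (an editor's sketch, not part of the app):

```r
# Hypothetical delays in minutes for three flights
dep_delay <- c(30, 45, 20)
arr_delay <- c(12, 50, -5)

# A flight that left 30 minutes late but arrived only 12 minutes late
# gained 18 minutes; a negative gain means the flight lost time en route
gain <- dep_delay - arr_delay
gain
#> [1] 18 -5 25
```
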
2 | -------------------------------------------------------------------------------- /prod/apps/nycflights13-app-spark/app.R: -------------------------------------------------------------------------------- 1 | # R Packages 2 | library(nycflights13) 3 | library(dplyr) 4 | library(ggplot2) 5 | library(DT) 6 | library(leaflet) 7 | library(geosphere) 8 | library(sparklyr) 9 | 10 | # Connect to local Spark instance 11 | sc <- spark_connect(master = "local", version = '2.0.0') 12 | 13 | # Copy flights data into Spark 14 | copy_to(sc, flights, "flights_s", overwrite = TRUE) 15 | flights_tbl <- tbl(sc, 'flights_s') 16 | 17 | # Copy airlines data into Spark 18 | copy_to(sc, airlines, "airlines_s", overwrite = TRUE) 19 | airlines_tbl <- tbl(sc, 'airlines_s') 20 | 21 | # Prepare model data 22 | model_data <- flights_tbl %>% 23 | filter(!is.na(arr_delay) & !is.na(dep_delay) & !is.na(distance)) %>% 24 | filter(dep_delay > 15 & dep_delay < 240) %>% 25 | filter(arr_delay > -60 & arr_delay < 360) %>% 26 | left_join(airlines_tbl, by = c("carrier" = "carrier")) %>% 27 | mutate(gain = dep_delay - arr_delay) %>% 28 | select(origin, dest, carrier, airline = name, distance, dep_delay, arr_delay, gain) 29 | 30 | # Partition data into train and validation 31 | partitions <- model_data %>% 32 | sdf_partition(train_data = 0.5, valid_data = 0.5, seed = 777) 33 | 34 | # Train a linear model in Spark 35 | lm1 <- ml_linear_regression(partitions$train_data, gain ~ distance + dep_delay + carrier) 36 | 37 | # Score the validation data 38 | pred_tbl <- sdf_predict(lm1, partitions$valid_data) 39 | 40 | # Create scored look up data for Shiny app 41 | lookup_tbl <- pred_tbl %>% 42 | group_by(origin, dest, carrier, airline) %>% 43 | summarize( 44 | flights = n(), 45 | distance = mean(distance), 46 | avg_dep_delay = mean(dep_delay), 47 | avg_arr_delay = mean(arr_delay), 48 | avg_gain = mean(gain), 49 | pred_gain = mean(prediction) 50 | ) 51 | 52 | # Cache the look up table 53 | sdf_register(lookup_tbl, "lookup") 54 | tbl_cache(sc, "lookup") 55 | 56 | # Find distinct airport codes 57 | carrier_origin <- c("JFK", "LGA", "EWR") 58 | carrier_dest <- c("BOS", "DCA", "DEN", "HNL", "LAX", "SEA", "SFO", "STL") 59 | 60 | # Shiny UI 61 | ui <- fluidPage( 62 | 63 | # Set display mode to bottom 64 | tags$script(' var setInitialCodePosition = function() 65 | { setCodePosition(false, false); }; '), 66 | 67 | # Title 68 | titlePanel("NYCFlights13 Time Gained in Flight"), 69 | 70 | # Create sidebar 71 | sidebarLayout( 72 | sidebarPanel( 73 | radioButtons("origin", "Flight origin:", 74 | carrier_origin, selected = "JFK"), 75 | br(), 76 | 77 | radioButtons("dest", "Flight destination:", 78 | carrier_dest, selected = "SFO") 79 | 80 | ), 81 | 82 | # Show a tabset that includes a plot, model, and table view 83 | mainPanel( 84 | tabsetPanel(type = "tabs", 85 | tabPanel("Plot", plotOutput("plot")), 86 | tabPanel("Map", leafletOutput("map")), 87 | tabPanel("Data", dataTableOutput("datatable")) 88 | ) 89 | ) 90 | ) 91 | ) 92 | 93 | # Shiny server function 94 | server <- function(input, output) { 95 | 96 | # Identify origin lat and long 97 | origin <- reactive({ 98 | req(input$origin) 99 | filter(nycflights13::airports, faa == input$origin) 100 | }) 101 | 102 | # Identify destination lat and long 103 | dest <- reactive({ 104 | req(input$dest) 105 | filter(nycflights13::airports, faa == input$dest) 106 | }) 107 | 108 | # Create plot data 109 | plot_data <- reactive({ 110 | req(input$origin, input$dest) 111 | lookup_tbl %>% 112 | filter(origin==input$origin 
& dest==input$dest) %>% 113 | ungroup() %>% 114 | select(airline, flights, distance, avg_gain, pred_gain) %>% 115 | collect 116 | }) 117 | 118 | # Plot observed versus predicted time gain for carriers and route 119 | output$plot <- renderPlot({ 120 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 121 | geom_bar(stat = "identity", fill = '#2780E3') + 122 | geom_point(aes(factor(airline), avg_gain)) + 123 | coord_flip() + 124 | labs(x = "", y = "Time gained in flight (minutes)") + 125 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 126 | }) 127 | 128 | # Output the route map 129 | output$map <- renderLeaflet({ 130 | gcIntermediate( 131 | select(origin(), lon, lat), 132 | select(dest(), lon, lat), 133 | n=100, addStartEnd=TRUE, sp=TRUE 134 | ) %>% 135 | leaflet() %>% 136 | addProviderTiles("CartoDB.Positron") %>% 137 | addPolylines() 138 | }) 139 | 140 | # Print table of observed and predicted gains by airline 141 | output$datatable <- renderDataTable( 142 | datatable(plot_data()) %>% 143 | formatRound(c("flights", "distance"), 0) %>% 144 | formatRound(c("avg_gain", "pred_gain"), 1) 145 | ) 146 | 147 | } 148 | 149 | # Run Shiny 150 | shinyApp(ui = ui, server = server) 151 | -------------------------------------------------------------------------------- /prod/apps/nycflights13-app-spark/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 2G 4 | -------------------------------------------------------------------------------- /prod/apps/titanic-classification/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- /prod/apps/titanic-classification/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Title: Spark ML Classifier Performance - Titanic 2 | Author: RStudio, Inc. 3 | AuthorUrl: http://www.rstudio.com/ 4 | License: GPL-3 5 | DisplayMode: Showcase 6 | Tags: sparklyr 7 | Type: Shiny 8 | -------------------------------------------------------------------------------- /prod/apps/titanic-classification/app.R: -------------------------------------------------------------------------------- 1 | library(sparklyr) 2 | library(dplyr) 3 | library(shiny) 4 | library(ggplot2) 5 | library(tidyr) 6 | source('helpers.R') 7 | 8 | 9 | #Connect to Spark 10 | sc <- spark_connect(master = "local", version = "2.0.0") 11 | 12 | #Read in Parquet Data 13 | spark_read_parquet(sc, "titanic", "titanic-parquet") 14 | titanic_tbl <- tbl(sc, "titanic") 15 | 16 | # Add features 17 | titanic_final <- titanic_tbl %>% 18 | mutate(Family_Size = SibSp + Parch + 1L) %>% 19 | mutate(Pclass = as.character(Pclass)) %>% 20 | filter(!is.na(Embarked)) %>% 21 | mutate(Age = if_else(is.na(Age), mean(Age), Age)) %>% 22 | mutate(Family_Size = as.numeric(Family_Size)) %>% 23 | sdf_mutate( 24 | Family_Sizes = ft_bucketizer(Family_Size, splits = c(1,2,5,12)) 25 | ) %>% 26 | mutate(Family_Sizes = as.character(as.integer(Family_Sizes))) %>% 27 | mutate(Survived = as.numeric(Survived), SibSp = as.numeric(SibSp), Parch = as.numeric(Parch)) %>% 28 | select(Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked, Family_Sizes) %>% 29 | sdf_register("titanic_final") 30 | 31 | features <- tbl_vars(titanic_final) %>% 32 | .[-which(. 
== "Survived")] 33 | 34 | 35 | ui <- pageWithSidebar( 36 | headerPanel('ML Titanic Classification'), 37 | sidebarPanel( 38 | selectizeInput('selfeatures', 'Select Features', features, multiple = TRUE), 39 | numericInput('trainingFrac', 'Training Proportion', min = 0.1, max = 0.9, value = 0.75), 40 | actionButton('fit', "Fit Models") 41 | ), 42 | mainPanel( 43 | plotOutput('liftPlot'), 44 | plotOutput('auc_accuracy') 45 | ) 46 | ) 47 | 48 | server <- function(input, output, session) { 49 | 50 | ml_score <- eventReactive(input$fit, { 51 | withProgress(message = "Fitting Spark Models", value = 0.1, { 52 | incProgress(0.2, detail = "Partitioning Training / Testing") 53 | partition <- sdf_partition(titanic_final, train = input$trainingFrac, test= 1-input$trainingFrac) 54 | train_tbl <- partition$train 55 | test_tbl <- partition$test 56 | 57 | ml_formula <- formula(paste("Survived ~", paste(input$selfeatures, collapse = "+"))) 58 | 59 | incProgress(0.5, detail = "Fitting Models") 60 | ml_models <- list( 61 | "Logistic" = ml_logistic_regression(train_tbl, ml_formula), 62 | "Decision Tree" = ml_decision_tree(train_tbl, ml_formula), 63 | "Random Forest" = ml_random_forest(train_tbl, ml_formula), 64 | "Gradient Boosted Trees" = ml_gradient_boosted_trees(train_tbl, ml_formula), 65 | "Naive Bayes" = ml_naive_bayes(train_tbl, ml_formula) 66 | ) 67 | 68 | incProgress(0.75, detail = "Scoring Models") 69 | lapply(ml_models, score_test_data, test_tbl) # helpers.R 70 | }) 71 | }) 72 | 73 | output$liftPlot <- renderPlot({ 74 | 75 | ml_gains <- data.frame(bin = 1:10, prop = seq(0, 1, len = 10), model = "Base") 76 | for (i in names(ml_score())) { 77 | ml_gains <- ml_score()[[i]] %>% 78 | calculate_lift %>% # helpers.R 79 | mutate(model = i) %>% 80 | rbind(ml_gains, .) 81 | } 82 | ggplot(ml_gains, aes(x = bin, y = prop, colour = model)) + 83 | geom_point() + geom_line() + 84 | ggtitle("Lift Chart for Predicting Survival - Test Data Set") + 85 | xlab("") + ylab("") 86 | 87 | }) 88 | 89 | output$auc_accuracy <- renderPlot({ 90 | # Calculate AUC and accuracy 91 | perf_metrics <- data.frame( 92 | model = names(ml_score()), 93 | AUC = 100 * sapply(ml_score(), ml_binary_classification_eval, "Survived", "prediction"), 94 | Accuracy = 100 * sapply(ml_score(), calc_accuracy), 95 | row.names = NULL, stringsAsFactors = FALSE) 96 | 97 | # Plot results 98 | gather(perf_metrics, metric, value, AUC, Accuracy) %>% 99 | ggplot(aes(reorder(model, value), value, fill = metric)) + 100 | geom_bar(stat = "identity", position = "dodge") + 101 | coord_flip() + 102 | xlab("") + 103 | ylab("Percent") + 104 | ggtitle("Performance Metrics") 105 | 106 | }) 107 | 108 | } 109 | 110 | shinyApp(ui = ui, server = server) 111 | -------------------------------------------------------------------------------- /prod/apps/titanic-classification/helpers.R: -------------------------------------------------------------------------------- 1 | calculate_lift <- function(scored_data) { 2 | scored_data %>% 3 | mutate(bin = ntile(desc(prediction), 10)) %>% 4 | group_by(bin) %>% 5 | summarize(count = sum(Survived)) %>% 6 | mutate(prop = count / sum(count)) %>% 7 | arrange(bin) %>% 8 | mutate(prop = cumsum(prop)) %>% 9 | select(-count) %>% 10 | collect() %>% 11 | as.data.frame() 12 | } 13 | 14 | score_test_data <- function(model, data=test_tbl){ 15 | pred <- sdf_predict(model, data) 16 | select(pred, Survived, prediction) 17 | } 18 | 19 | calc_accuracy <- function(data, cutpoint = 0.5){ 20 | data %>% 21 | mutate(prediction = if_else(prediction > cutpoint, 
1.0, 0.0)) %>% 22 | ml_classification_eval("prediction", "Survived", "accuracy") 23 | } -------------------------------------------------------------------------------- /prod/apps/titanic-classification/titanic-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /prod/apps/titanic-classification/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/titanic-classification/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc -------------------------------------------------------------------------------- /prod/apps/titanic-classification/titanic-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/titanic-classification/titanic-parquet/_SUCCESS -------------------------------------------------------------------------------- /prod/apps/titanic-classification/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/titanic-classification/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet -------------------------------------------------------------------------------- /prod/conf/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 1G 4 | -------------------------------------------------------------------------------- /prod/conf/shiny-server.conf: -------------------------------------------------------------------------------- 1 | run_as shiny; 2 | auth_pam; 3 | 4 | server { 5 | 6 | listen 80; 7 | 8 | utilization_scheduler 20 0 1; # max of 20 connections and 1 R process per app 9 | app_session_timeout 300; # close idle connection in seconds 10 | app_idle_timeout 86400; # close idle R process in seconds 11 | app_init_timeout 600; # cancel startup in seconds 12 | 13 | log_dir /var/log/shiny-server; 14 | google_analytics_id UA-20375833-15; 15 | 16 | #location /dashboards/ggplot2-brushing { 17 | # app_dir /srv/shiny-server/sparkDemos/prod/dashboards/ggplot2-brushing; 18 | #} 19 | 20 | location /dashboards/diamonds-explorer { 21 | app_dir /srv/shiny-server/sparkDemos/prod/dashboards/diamonds-explorer; 22 | } 23 | 24 | location /dashboards/nycflights13-dash-spark { 25 | app_dir /srv/shiny-server/sparkDemos/prod/dashboards/nycflights13-dash-spark; 26 | } 27 | 28 | location /apps/titanic-classification { 29 | app_dir /srv/shiny-server/sparkDemos/prod/apps/titanic-classification; 30 | } 31 | 32 | location /apps/iris-k-means { 33 | app_dir /srv/shiny-server/sparkDemos/prod/apps/iris-k-means; 34 | } 35 | 36 | location /apps/nycflights13-app-spark { 37 | app_dir /srv/shiny-server/sparkDemos/prod/apps/nycflights13-app-spark; 38 | } 39 | 40 | } 41 | 42 | admin 4151 { 43 | required_group shiny-admins; 44 | } -------------------------------------------------------------------------------- 
/prod/dashboards/diamonds-explorer/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 2G 4 | -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/._common_metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/._common_metadata.crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/._metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/._metadata.crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/.part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/.part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/.part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/.part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/_SUCCESS -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/_common_metadata -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/_metadata -------------------------------------------------------------------------------- 
/prod/dashboards/diamonds-explorer/diamonds-parquet/part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/flexdashboard-shiny-diamonds.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ggplot2 Diamonds Explorer" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: columns 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r global, include=FALSE} 12 | library(ggplot2) 13 | library(mgcv) 14 | library(flexdashboard) 15 | library(sparklyr) 16 | library(dplyr) 17 | 18 | sc <- spark_connect(master = "local") 19 | spark_read_parquet(sc, "diamonds", path = "diamonds-parquet") 20 | diamonds_tbl <- tbl(sc, "diamonds") 21 | ``` 22 | 23 | Inputs {.sidebar} 24 | ----------------------------------------------------------------------- 25 | 26 | ```{r} 27 | n <- (count(diamonds_tbl) %>% as.data.frame())$n 28 | sliderInput('sampleSize', 'Sample Size', min = 1, max = n, 29 | value = min(1000, n), step = 1000, round = 0) 30 | 31 | checkboxInput('jitter', 'Jitter', value = TRUE) 32 | checkboxInput('smooth', 'Smooth', value = TRUE) 33 | 34 | selectInput('x', 'X', tbl_vars(diamonds_tbl)) 35 | selectInput('y', 'Y', tbl_vars(diamonds_tbl), tbl_vars(diamonds_tbl)[2]) 36 | selectInput('color', 'Color', c('None', tbl_vars(diamonds_tbl))) 37 | 38 | # Determine column type and select only strings 39 | factor_cols <- sparklyr:::sdf_schema(diamonds_tbl) %>% 40 | sapply(unlist) %>% 41 | t() %>% 42 | as.data.frame() %>% 43 | filter(type == "StringType") %>% 44 | select(name) 45 | 46 | selectInput('facet_row', 'Facet Row', c(None='.', factor_cols)) 47 | selectInput('facet_col', 'Facet Column', c(None='.', factor_cols)) 48 | ``` 49 | 50 | Outputs 51 | ----------------------------------------------------------------------- 52 | 53 | ### Diamonds 54 | 55 | ```{r} 56 | dataset <- reactive({ 57 | diamonds_tbl %>% 58 | sdf_sample(fraction = input$sampleSize / n) %>% 59 | collect() 60 | }) 61 | 62 | renderPlot({ 63 | p <- ggplot(dataset(), aes_string(x = input$x, y = input$y)) + geom_point() 64 | 65 | if (input$color != 'None') 66 | p <- p + aes_string(color = input$color) 67 | 68 | facets <- paste(input$facet_row, '~', input$facet_col) 69 | if (facets != '. 
~ .') 70 | p <- p + facet_grid(facets) 71 | 72 | if (input$jitter) 73 | p <- p + geom_jitter() 74 | if (input$smooth) 75 | p <- p + geom_smooth() 76 | 77 | print(p) 78 | }) 79 | ``` 80 | -------------------------------------------------------------------------------- /prod/dashboards/ggplot2-brushing/ggplot2Brushing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ggplot2 Brushing" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: columns 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r global, include=FALSE} 12 | # load data in 'global' chunk so it can be shared by all users of the dashboard 13 | library(datasets) 14 | library(flexdashboard) 15 | library(sparklyr) 16 | library(dplyr) 17 | 18 | sc <- spark_connect(master = "local", version = "2.0.0") 19 | mtcars2_tbl <- copy_to(sc, mtcars[, c("mpg", "cyl", "wt")], "mtcars") 20 | ``` 21 | 22 | 23 | ```{r} 24 | # Reactive that returns the whole dataset if there is no brush 25 | selectedData <- reactive({ 26 | data <- brushedPoints(collect(mtcars2_tbl), input$plot1_brush) 27 | if (nrow(data) == 0) 28 | data <- collect(mtcars2_tbl) 29 | data 30 | }) 31 | ``` 32 | 33 | Column {data-width=650} 34 | ----------------------------------------------------------------------- 35 | 36 | ### Miles Per Gallon vs. Weight {data-width=600} 37 | 38 | ```{r} 39 | library(ggplot2) 40 | plotOutput("plot1", brush = brushOpts(id = "plot1_brush")) 41 | output$plot1 <- renderPlot({ 42 | ggplot(collect(mtcars2_tbl), aes(wt, mpg)) + geom_point() 43 | }) 44 | ``` 45 | 46 | ### Miles Per Gallon and Cylinders 47 | 48 | ```{r} 49 | renderPlot({ 50 | ggplot(selectedData(), aes(factor(cyl), mpg)) + geom_boxplot() 51 | }) 52 | ``` 53 | 54 | Column {data-width=350} 55 | ----------------------------------------------------------------------- 56 | 57 | ### Car Details {data-width=400} 58 | 59 | ```{r} 60 | renderTable({ 61 | selectedData() 62 | }) 63 | ``` -------------------------------------------------------------------------------- /prod/dashboards/nycflights13-dash-spark/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 2G 4 | -------------------------------------------------------------------------------- /prod/dashboards/tor-project/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- /prod/dashboards/tor-project/metricsgraphicsTorProject.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MetricsGraphics: Tor Project" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | --- 9 | 10 | ```{r global, include=FALSE, message = FALSE} 11 | library(flexdashboard) 12 | library(metricsgraphics) 13 | library(readr) 14 | library(dplyr) 15 | library(tidyr) 16 | library(sparklyr) 17 | 18 | sc <- spark_connect(master = "local", version = "2.0.0") 19 | 20 | servers <- read_csv("https://metrics.torproject.org/stats/servers.csv", 21 | col_types="ccccccii") 22 | hidden <- read_csv("https://metrics.torproject.org/stats/hidserv.csv", 23 | col_types="ccddddd") 24 | 25 | raw_servers_tbl <- copy_to(sc, servers, "servers") 26 | raw_hidden_tbl <- copy_to(sc, hidden, "hidden") 27 | 28 | servers_tbl <- raw_servers_tbl %>% 
mutate(date = from_unixtime(unix_timestamp(date, 'yyyy-MM-dd'))) %>% 30 | filter(date >= '2016-01-01') 31 | 32 | hidden <- raw_hidden_tbl %>% 33 | mutate(date = from_unixtime(unix_timestamp(date, 'yyyy-MM-dd'))) %>% 34 | filter(date >= '2016-01-01' & type=="dir-onions-seen") %>% 35 | collect 36 | 37 | relays <- servers_tbl %>% 38 | filter(!is.na(relays)) %>% 39 | count(date, wt = relays) %>% 40 | collect 41 | 42 | filter(servers_tbl, !is.na(relays)) %>% 43 | mutate(platform=ifelse(is.na(platform), "Linux", platform)) %>% 44 | count(date, platform, wt=relays) %>% 45 | collect %>% 46 | spread(platform, n) -> relays_by_platform 47 | 48 | filter(servers_tbl, !is.na(relays)) %>% 49 | count(date, flag, wt=relays) %>% 50 | filter(!is.na(flag)) %>% 51 | collect %>% 52 | spread(flag, n) -> relays_by_flag 53 | 54 | filter(servers_tbl, !is.na(relays)) %>% 55 | count(date, version, wt=relays) %>% 56 | filter(!is.na(version)) %>% 57 | collect %>% 58 | mutate(version=gsub("^0", "v0", version)) %>% 59 | spread(version, n) -> relays_by_version 60 | ``` 61 | 62 | Row {data-height=600} 63 | ----------------------------------------------------------------------- 64 | 65 | ### Active Relays in the Tor Network 66 | 67 | ```{r} 68 | mjs_plot(relays, date, n, top=0, left=30) %>% 69 | mjs_line(area=TRUE) %>% 70 | mjs_axis_x(xax_format="date") %>% 71 | mjs_add_mouseover("function(d, i) { 72 | $('{{ID}} svg .mg-active-datapoint') 73 | .html('Relay count' + 74 | d3.time.format('%Y-%m-%d')(d.date) + ': ' + 75 | d3.format('0,000')(d.n)); 76 | }") 77 | ``` 78 | 79 | ### Hidden-service statistics 80 | 81 | ```{r} 82 | mjs_plot(hidden, date, "wmean", top=0, left=30) %>% 83 | mjs_line() %>% 84 | mjs_add_line("wmedian") %>% 85 | mjs_add_line("wiqm") %>% 86 | mjs_axis_x(xax_format="date") %>% 87 | mjs_add_legend(c("wmean", "wmedian", "wiqm")) 88 | ``` 89 | 90 | Row {.tabset} 91 | ----------------------------------------------------------------------- 92 | 93 | ### Relays with Exit, Fast, Guard, HSDir & Stable flags 94 | 95 | ```{r} 96 | mjs_plot(relays_by_flag, date, Exit, top=0, left=30) %>% 97 | mjs_line() %>% 98 | mjs_add_line(Fast) %>% 99 | mjs_add_line(Guard) %>% 100 | mjs_add_line(HSDir) %>% 101 | mjs_add_line(Stable) %>% 102 | mjs_axis_x(xax_format="date") %>% 103 | mjs_add_legend(c("Exit", "Fast", "Guard", "HSDir", "Stable")) 104 | ``` 105 | 106 | ### Relays by OS (log scale) 107 | 108 | ```{r} 109 | mjs_plot(relays_by_platform, date, BSD, top=0, left=30) %>% 110 | mjs_line() %>% 111 | mjs_add_line(Darwin) %>% 112 | mjs_add_line(Linux) %>% 113 | mjs_add_line(Other) %>% 114 | mjs_add_line(Windows) %>% 115 | mjs_axis_x(xax_format="date") %>% 116 | mjs_axis_y(y_scale_type="log") %>% 117 | mjs_add_legend(c("BSD", "Darwin", "Linux", "Other", "Windows")) 118 | ``` 119 | 120 | ### Relays by version 121 | 122 | ```{r} 123 | mjs_plot(relays_by_version, date, "v0.2.4", top=0, left=30) %>% 124 | mjs_line() %>% 125 | mjs_add_line("v0.2.5") %>% 126 | mjs_add_line("v0.2.6") %>% 127 | mjs_add_line("v0.2.7") %>% 128 | mjs_add_line("v0.2.8") %>% 129 | mjs_add_line("Other") %>% 130 | mjs_axis_x(xax_format="date") %>% 131 | mjs_add_legend(c("v0.2.4", "v0.2.5", "v0.2.6", "v0.2.7", "v0.2.8", "Other")) 132 | ``` 133 | -------------------------------------------------------------------------------- /prod/notebooks/babynames/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- 
/prod/notebooks/end-to-end-flights/end-to-end-flights-flexdashboard.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach packages 13 | library(dplyr) 14 | library(ggplot2) 15 | library(DT) 16 | library(leaflet) 17 | library(geosphere) 18 | load('flights_pred_2008.RData') 19 | airports <- mutate(airports, lat = as.numeric(lat), lon = as.numeric(lon)) 20 | ``` 21 | 22 | 23 | Summary 24 | ======================================================================== 25 | 26 | Inputs {.sidebar} 27 | ----------------------------------------------------------------------- 28 | 29 | ### Select Airports 30 | 31 | ```{r} 32 | # Shiny inputs for flight origin and destination 33 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 34 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 35 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 36 | selectInput("dest", "Flight destination", carrier_dest, selected = "LAS") 37 | ``` 38 | 39 | ### Background 40 | 41 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 42 | your airline carrier will make up time en route? Some of the most significant factors 43 | for making up time are flight distance and airline carrier. The data model behind 44 | this dashboard is based on flights from NYC airports in 2013. 45 | 46 | 47 | Row 48 | ----------------------------------------------------------------------- 49 | 50 | ### Observed versus predicted time gain 51 | 52 | ```{r} 53 | # Aggregate time gain by carrier and by route 54 | plot_data <- reactive({ 55 | req(input$origin, input$dest) 56 | pred_data %>% 57 | filter(origin==input$origin & dest==input$dest) %>% 58 | ungroup() %>% 59 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 60 | }) 61 | 62 | # Plot observed versus predicted time gain for carriers and route 63 | renderPlot({ 64 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 65 | geom_bar(stat = "identity", fill = '#2780E3') + 66 | geom_point(aes(factor(airline), avg_gain)) + 67 | coord_flip() + 68 | labs(x = "", y = "Time gained in flight (minutes)") + 69 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 70 | }) 71 | ``` 72 | 73 | ### Route 74 | 75 | ```{r} 76 | # Identify origin lat and long 77 | origin <- reactive({ 78 | req(input$origin) 79 | filter(airports, faa == input$origin) 80 | }) 81 | 82 | # Identify destination lat and long 83 | dest <- reactive({ 84 | req(input$dest) 85 | filter(airports, faa == input$dest) 86 | }) 87 | 88 | # Plot route 89 | renderLeaflet({ 90 | gcIntermediate( 91 | select(origin(), lon, lat), 92 | select(dest(), lon, lat), 93 | n=100, addStartEnd=TRUE, sp=TRUE 94 | ) %>% 95 | leaflet() %>% 96 | addProviderTiles("CartoDB.Positron") %>% 97 | addPolylines() 98 | }) 99 | ``` 100 | 101 | Row 102 | ----------------------------------------------------------------------- 103 | 104 | ### Data details 105 | 106 | ```{r} 107 | # Print table of observed and predicted gains by airline 108 | renderDataTable( 109 | datatable(plot_data()) %>% 110 | formatRound(c("flights", "distance"), 0) %>% 111 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 112 | ) 113 | ``` 114 | 115 | Model Details 116 | 
======================================================================== 117 | 118 | ```{r} 119 | renderPrint(ml1_summary) 120 | ``` 121 | -------------------------------------------------------------------------------- /prod/notebooks/end-to-end-flights/flights_pred_2008.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/notebooks/end-to-end-flights/flights_pred_2008.RData -------------------------------------------------------------------------------- /prod/notebooks/ml_classification_titanic/titanic-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /prod/notebooks/ml_classification_titanic/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/notebooks/ml_classification_titanic/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc -------------------------------------------------------------------------------- /prod/notebooks/ml_classification_titanic/titanic-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/notebooks/ml_classification_titanic/titanic-parquet/_SUCCESS -------------------------------------------------------------------------------- /prod/notebooks/ml_classification_titanic/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/notebooks/ml_classification_titanic/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet -------------------------------------------------------------------------------- /prod/notebooks/taxi_demo/readme.md: -------------------------------------------------------------------------------- 1 | TO DO -------------------------------------------------------------------------------- /prod/presentations/cazena/01_taxiR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NYC Taxi - One month in R" 3 | output: html_notebook 4 | --- 5 | 6 |
7 | ![R for Data Science http://r4ds.had.co.nz/](http://r4ds.had.co.nz/diagrams/data-science.png) 8 |
9 | 10 | # Load tidyverse 11 | 12 | ```{r tidyverse} 13 | library(tidyverse) 14 | library(lubridate) 15 | ``` 16 | 17 | # Download 18 | 19 | ```{r download, eval=FALSE} 20 | download.file( 21 | "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.csv", 22 | "yellow_tripdata_2015-12.csv") 23 | ``` 24 | 25 | # Import Dataset 26 | 27 | ```{r import, message=FALSE, warning=FALSE} 28 | trips <- read_csv("yellow_tripdata_2015-12.csv", n_max = 1000000) 29 | ``` 30 | 31 | # Tidy 32 | 33 | ```{r tidy} 34 | # pickups 35 | select(trips, tpep_pickup_datetime, pickup_latitude, pickup_longitude) 36 | 37 | # dropoffs 38 | select(trips, tpep_dropoff_datetime, dropoff_latitude, dropoff_longitude) 39 | 40 | # trips 41 | trips 42 | ``` 43 | 44 | # Transform 45 | 46 | ```{r transform} 47 | tripsHour <- trips %>% 48 | filter(payment_type %in% c(1, 2)) %>% 49 | mutate(pay_type = ifelse(payment_type == 1, "credit", "cash")) %>% 50 | mutate(trip_time_sec = tpep_dropoff_datetime - tpep_pickup_datetime) %>% 51 | mutate(trip_time_min = as.numeric(trip_time_sec / 60)) %>% 52 | mutate(hour = round_date(tpep_pickup_datetime, "hour")) %>% 53 | group_by(pay_type, hour) %>% 54 | summarize(n = n(), 55 | tip_amount = mean(tip_amount), 56 | fare_amount = mean(fare_amount), 57 | passenger_count = mean(passenger_count), 58 | trip_time = mean(trip_time_min), 59 | trip_distance = mean(trip_distance)) 60 | tripsHour 61 | ``` 62 | 63 | # Visualize 64 | 65 | ```{r visualize} 66 | ggplot(tripsHour, aes(fare_amount, color = pay_type)) + 67 | geom_density() + 68 | labs(title = "NYC taxi fare amount", x = "Fare Amount", y = "Density", caption = '2015-12') 69 | 70 | qplot(trip_distance, data=tripsHour, geom="density", log="x", facets = ~pay_type) 71 | ``` 72 | 73 | # Model 74 | 75 | ```{r model} 76 | # Formula 77 | model_formula <- formula(tip_amount ~ fare_amount + pay_type + passenger_count) 78 | 79 | # Model data (rows with missing values removed) 80 | tripsModel <- tripsHour %>% 81 | select(tip_amount, fare_amount, pay_type, passenger_count) %>% 82 | na.omit 83 | 84 | # Linear Model 85 | m1 <- lm(model_formula, data = tripsModel) 86 | summary(m1) 87 | 88 | # Decision tree 89 | library(rpart) 90 | m2 <- rpart(model_formula, tripsModel) 91 | summary(m2) 92 | 93 | # Predict 94 | pred <- tripsHour %>% 95 | ungroup %>% 96 | mutate(lm_fit = predict(m1, tripsHour)) %>% 97 | mutate(lm_res = tip_amount - lm_fit) %>% 98 | mutate(rpart_fit = predict(m2, tripsHour)) %>% 99 | mutate(rpart_res = tip_amount - rpart_fit) 100 | 101 | # MSE 102 | pred %>% 103 | na.omit() %>% 104 | summarize(lm_mse = mean(lm_res^2), rpart_mse = mean(rpart_res^2)) 105 | 106 | # Plot 107 | ggplot(pred, aes(rpart_fit, lm_fit)) + geom_point() + geom_smooth(method="lm") 108 | ``` 109 | 110 | # Communicate 111 | 112 | This analysis of one month of NYC Taxi data shows that you can predict tip amount as a function of fare amount, pay type, and passenger count. For a detailed explanation of the code you can view this report in the following formats: 113 | 114 | * HTML 115 | * PDF 116 | * Word 117 | -------------------------------------------------------------------------------- /prod/presentations/cazena/README.md: -------------------------------------------------------------------------------- 1 | # Analyze data with sparklyr 2 | 3 | ## Abstract 4 | 5 | Sparklyr is an R package that lets you analyze data in Spark while using familiar tools in R. Sparklyr supports a complete backend for dplyr, a popular tool for working with data frame objects both in memory and out of memory.
You can use dplyr to translate R code into Spark SQL. Sparklyr also supports MLlib so you can run classifiers, regressions, clustering, decision trees, and many more machine learning algorithms on your distributed data in Spark. With sparklyr you can analyze large amounts of data that would not traditionally fit into R memory. Then you can collect results from Spark into R for further visualization and documentation. 6 | 7 | 8 | -------------------------------------------------------------------------------- /prod/presentations/cazena/emr_setup.sh: -------------------------------------------------------------------------------- 1 | ### Build EMR master node with Taxi Data 2 | ### Nathan Stephens 3 | ### 3/27/2017 4 | 5 | ########################################### 6 | ### Run as root 7 | ########################################### 8 | 9 | ## RSP 10 | 11 | # Update 12 | sudo yum update 13 | 14 | # R 15 | sudo yum install -y R libcurl-devel openssl-devel git 16 | 17 | # install RSP 18 | wget -q https://download2.rstudio.org/current.ver -O /tmp/rsp.current.ver 19 | wget -O /tmp/rstudio-server-rhel.rpm https://s3.amazonaws.com/rstudio-dailybuilds/rstudio-server-rhel-pro-$(cat /tmp/rsp.current.ver)-x86_64.rpm 20 | sudo yum install -y --nogpgcheck /tmp/rstudio-server-rhel.rpm 21 | 22 | # install packages 23 | sudo Rscript -e 'install.packages("sparklyr", repos = "http://cran.rstudio.com/")' 24 | sudo Rscript -e 'install.packages("devtools", repos = "http://cran.rstudio.com/")' 25 | sudo Rscript -e 'install.packages("tidyverse", repos = "http://cran.rstudio.com/")' 26 | sudo Rscript -e 'install.packages("leaflet", repos = "http://cran.rstudio.com/")' 27 | sudo Rscript -e 'install.packages("DT", repos = "http://cran.rstudio.com/")' 28 | 29 | ########################################### 30 | 31 | ## Add rstudio user 32 | sudo useradd -m rstudio 33 | sudo echo rstudio | passwd rstudio --stdin 34 | sudo usermod -a -G hadoop rstudio 35 | sudo usermod -a -G hive rstudio 36 | 37 | 38 | ########################################### 39 | ### Run as rstudio 40 | ########################################### 41 | 42 | ## switch user 43 | su rstudio 44 | cd ~ 45 | 46 | ## add rstudio directory 47 | hadoop fs -mkdir /user/rstudio 48 | hadoop fs -chown rstudio:rstudio /user/rstudio 49 | 50 | ## clone project 51 | git clone https://github.com/rstudio/sparkDemos.git /home/rstudio/sparkDemos 52 | cat >/home/rstudio/sparkDemos/sparkDemos.Rproj <> nyct2010.log & 71 | nohup /usr/bin/s3-dist-cp --src=s3n://rstudio-sparkdemo-data/nyc-taxi/parquet_nohead/trips --dest=hdfs:///user/rstudio/trips_par >> trips_par.log & 72 | nohup /usr/bin/s3-dist-cp --src=s3n://rstudio-sparkdemo-data/nyc-taxi/parquet/trips_model_data --dest=hdfs:///user/rstudio/trips_model_data >> trips_model_data.log & 73 | 74 | 75 | ########################################### 76 | ### Open Hive 77 | ########################################### 78 | 79 | hive 80 | 81 | # Hive 1 82 | 83 | CREATE EXTERNAL TABLE IF NOT EXISTS nyct2010( 84 | gid int, 85 | ctlabel float, 86 | borocode int, 87 | boroname string, 88 | ct2010 int, 89 | boroct2010 int, 90 | cdeligibil string, 91 | ntacode string, 92 | ntaname string, 93 | puma int) 94 | ROW FORMAT DELIMITED 95 | FIELDS TERMINATED BY ',' 96 | LINES TERMINATED BY '\n' 97 | ; 98 | 99 | LOAD DATA INPATH '/user/rstudio/nyct2010' INTO TABLE nyct2010; 100 | 101 | # Hive 2 102 | 103 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_par( 104 | id int, 105 | cab_type_id int, 106 | vendor_id string, 107 | pickup_datetime timestamp, 108
| dropoff_datetime timestamp, 109 | store_and_fwd_flag string, 110 | rate_code_id string, 111 | pickup_longitude float, 112 | pickup_latitude float, 113 | dropoff_longitude float, 114 | dropoff_latitude float, 115 | passenger_count bigint, 116 | trip_distance float, 117 | fare_amount float, 118 | extra bigint, 119 | mta_tax string, 120 | tip_amount float, 121 | tolls_amount float, 122 | ehail_fee string, 123 | improvement_surcharge string, 124 | total_amount float, 125 | payment_type string, 126 | trip_type string, 127 | pickup_nyct2010_gid int, 128 | dropoff_nyct2010_gid int) 129 | stored as parquet; 130 | 131 | LOAD DATA INPATH '/user/rstudio/trips_par' INTO TABLE trips_par; 132 | 133 | 134 | # Hive 3 135 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_model_data( 136 | pickup_datetime timestamp, 137 | pickup_latitude float, 138 | pickup_longitude float, 139 | pickup_nyct2010_gid int, 140 | pickup_boro string, 141 | pickup_nta string, 142 | dropoff_datetime timestamp, 143 | dropoff_latitude float, 144 | dropoff_longitude float, 145 | dropoff_nyct2010_gid int, 146 | dropoff_boro string, 147 | dropoff_nta string, 148 | cab_type string, 149 | passenger_count bigint, 150 | trip_distance float, 151 | pay_type string, 152 | fare_amount float, 153 | tip_amount float, 154 | other_amount float, 155 | total_amount float) 156 | stored as parquet; 157 | 158 | LOAD DATA INPATH '/user/rstudio/trips_model_data' INTO TABLE trips_model_data; 159 | 160 | -------------------------------------------------------------------------------- /prod/presentations/cazena/kerberos.R: -------------------------------------------------------------------------------- 1 | system("echo '' | kinit ") 2 | -------------------------------------------------------------------------------- /prod/presentations/cazena/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 
44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /prod/presentations/cazena/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) { 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) 
{ 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /prod/presentations/cloudera/livy-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/cloudera/livy-architecture.png -------------------------------------------------------------------------------- /prod/presentations/cloudera/livy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Connecting to Spark through Livy" 3 | output: html_notebook 4 | --- 5 | 6 | With Livy you can analyze data in your Spark cluster via R on your desktop. 7 | 8 | ## Livy 9 | 10 | Livy is a service that enables easy interaction with an Apache Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as SparkContext management, all via a simple REST interface or an RPC client library. Livy also simplifies the interaction between Spark and application servers, thus enabling the use of Spark for interactive web/mobile applications. 11 | 12 |
13 | ![Image](http://livy.io/img/livy-architecture.png) 14 |
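To make the REST interface concrete, here is a minimal sketch of talking to Livy directly from R with httr. It is illustrative only: sparklyr issues these calls for you when you connect with `method = "livy"`, and the host, port, and payloads below are assumptions based on Livy's documented `/sessions` and `/sessions/{id}/statements` endpoints.

```{r, eval=FALSE}
# Illustrative sketch only; sparklyr normally manages these REST calls.
# Assumes a Livy server listening on localhost:8998.
library(httr)

# Create a new Spark session (POST /sessions)
resp <- POST("http://localhost:8998/sessions",
             body = list(kind = "spark"), encode = "json")
session <- content(resp)

# Submit a snippet of Spark (Scala) code to that session
POST(paste0("http://localhost:8998/sessions/", session$id, "/statements"),
     body = list(code = "1 + 1"), encode = "json")
```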
15 | 16 | ## Start Livy 17 | 18 | Set home environment variables and start a Livy server to handle local requests. 19 | 20 | ```{bash} 21 | export JAVA_HOME=/usr/lib/jvm/java-7-oracle-cloudera 22 | export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark 23 | /home/ubuntu/livy/livy-server-0.2.0/bin/livy-server 24 | ``` 25 | 26 | ## Connect to Spark 27 | 28 | Use `method = "livy"` to connect to the cluster. 29 | 30 | ```{r} 31 | library(sparklyr) 32 | library(dplyr) 33 | sc <- spark_connect( 34 | master = "http://ec2-***.us-west-2.compute.amazonaws.com:8998", 35 | method = "livy") 36 | ``` 37 | 38 | ## Analyze 39 | 40 | Use R code on your workstation as you normally would. Your R commands will be sent to the cluster via Livy for processing. Collect your results back to the desktop for further processing in R. 41 | 42 | ```{r} 43 | library(ggplot2) 44 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 45 | pickup_dropoff_tbl <- trips_model_data_tbl %>% 46 | filter(pickup_nta == "Turtle Bay-East Midtown" & dropoff_nta == "Airport") %>% 47 | mutate(pickup_hour = hour(pickup_datetime)) %>% 48 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 49 | group_by(pickup_hour) %>% 50 | summarize(n = n(), 51 | trip_time_mean = mean(trip_time), 52 | trip_time_p10 = percentile(trip_time, 0.10), 53 | trip_time_p25 = percentile(trip_time, 0.25), 54 | trip_time_p50 = percentile(trip_time, 0.50), 55 | trip_time_p75 = percentile(trip_time, 0.75), 56 | trip_time_p90 = percentile(trip_time, 0.90)) 57 | 58 | # Collect results 59 | pickup_dropoff <- collect(pickup_dropoff_tbl) 60 | 61 | # Plot 62 | ggplot(pickup_dropoff, aes(x = pickup_hour)) + 63 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 64 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, 65 | alpha = "25–75th percentile")) + 66 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, 67 | alpha = "10–90th percentile")) + 68 | scale_y_continuous("trip duration in minutes") 69 | ``` 70 | -------------------------------------------------------------------------------- /prod/presentations/cloudera/readme.md: -------------------------------------------------------------------------------- 1 | # Demo using CDH 5.9 2 | 3 | This repo contains files for demonstrating Spark and R on Cloudera using sparklyr.
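The histogram and raster wrappers listed under Scripts below can be sourced into any of the demos. A hedged usage sketch, assuming an open sparklyr connection `sc` and the `trips_model_data` table created by the setup script (the plot titles are placeholders):

```r
# Usage sketch for the sqlvis wrapper scripts in this folder.
# Assumes an open sparklyr connection `sc` and the demo's
# trips_model_data table registered in the metastore.
library(sparklyr)
library(dplyr)
library(ggplot2)
source("sqlvis_histogram.R")
source("sqlvis_raster.R")

trips <- tbl(sc, "trips_model_data")

# Histogram: bins computed in Spark, plotted locally
trips %>%
  sqlvis_compute_histogram("fare_amount", bins = 30) %>%
  sqlvis_ggplot_histogram(title = "Fare amount")

# Raster (tile) plot of pickup locations
trips %>%
  sqlvis_compute_raster("pickup_longitude", "pickup_latitude", resolution = 300) %>%
  sqlvis_ggplot_raster(title = "Pickup density")
```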
4 | 5 | ### Scripts 6 | 7 | * Taxi Demo 8 | * Livy Connection 9 | * Histogram wrappers 10 | * Raster wrappers 11 | 12 | ### Reports 13 | 14 | * [Data Science Toolchain with Spark and R](http://colorado.rstudio.com:3939/content/262/taxiDemoCloudera3.nb.html) 15 | * [Connecting to Spark through Livy](http://colorado.rstudio.com:3939/content/259/livy.nb.html) 16 | 17 | ### Reference 18 | 19 | * [spark.rstudio.com](http://spark.rstudio.com/) -------------------------------------------------------------------------------- /prod/presentations/cloudera/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /prod/presentations/cloudera/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) 
{ 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) { 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/README.md: -------------------------------------------------------------------------------- 1 | # Analyze data with sparklyr 2 | 3 | ## Abstract 4 | 5 | Sparklyr is an R package that lets you analyze data in Spark while using familiar tools in R. Sparklyr supports a complete backend for dplyr, a popular tool for working with data frame objects both in memory and out of memory. You can use dplyr to translate R code into Spark SQL. Sparklyr also supports MLlib so you can run classifiers, regressions, clustering, decision trees, and many more machine learning algorithms on your distributed data in Spark. With sparklyr you can analyze large amounts of data that would not traditionally fit into R memory. Then you can collect results from Spark into R for further visualization and documentation. 6 | 7 | Sparklyr is also extensible. You can create R packages that depend on sparklyr to call the full Spark API. One example of an extension is H2O’s rsparkling, an R package that works with H2O’s machine learning algorithms. With sparklyr and rsparkling you have access to all the tools in H2O for analysis with R and Spark.
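As a concrete illustration of the workflow described in the abstract, here is a minimal sketch. It assumes only a local Spark installation and a recent sparklyr; the `mtcars` data and the model are placeholders, not part of the conference demos.

```r
library(sparklyr)
library(dplyr)

sc <- spark_connect(master = "local")
mtcars_tbl <- copy_to(sc, mtcars)

# dplyr verbs are translated into Spark SQL
avg_mpg <- mtcars_tbl %>%
  group_by(cyl) %>%
  summarize(mpg = mean(mpg))
show_query(avg_mpg)  # inspect the generated SQL

# MLlib through sparklyr
fit <- ml_linear_regression(mtcars_tbl, mpg ~ wt + cyl)

# Collect results back into R for visualization and documentation
local_df <- collect(avg_mpg)

spark_disconnect(sc)
```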
8 | 9 | ## Documents 10 | 11 | * [Data Science Toolchain with Spark and R](http://colorado.rstudio.com:3939/content/276/taxiDemoH2O.nb.html) 12 | * [Connecting to Spark through Livy](http://colorado.rstudio.com:3939/content/289/livy.nb.html) 13 | 14 | ## Slides 15 | 16 | ![](img/img.001.jpeg) 17 | 18 | *** 19 | 20 | ![](img/img.002.jpeg) 21 | 22 | *** 23 | 24 | ![](img/img.003.jpeg) 25 | 26 | *** 27 | 28 | ![](img/img.004.jpeg) 29 | 30 | *** 31 | 32 | ![](img/img.005.jpeg) 33 | 34 | *** 35 | 36 | ![](img/img.006.jpeg) 37 | 38 | *** 39 | 40 | ![](img/img.007.jpeg) 41 | 42 | *** 43 | 44 | ![](img/img.008.jpeg) 45 | 46 | *** 47 | 48 | ![](img/img.009.jpeg) 49 | 50 | *** 51 | 52 | ![](img/img.010.jpeg) 53 | 54 | *** 55 | 56 | ![](img/img.011.jpeg) 57 | 58 | *** 59 | 60 | ![](img/img.012.jpeg) 61 | 62 | *** 63 | 64 | ![](img/img.013.jpeg) 65 | 66 | *** 67 | 68 | ![](img/img.014.jpeg) 69 | 70 | *** 71 | 72 | ![](img/img.015.jpeg) 73 | 74 | *** 75 | 76 | ![](img/img.016.jpeg) 77 | 78 | *** 79 | 80 | ![](img/img.017.jpeg) 81 | -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.001.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.001.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.002.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.002.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.003.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.003.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.004.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.004.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.005.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.005.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.006.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.006.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.007.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.007.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.008.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.008.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.009.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.009.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.010.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.010.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.011.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.011.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.012.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.012.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.013.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.013.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.014.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.014.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.015.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.015.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.016.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.016.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.017.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.017.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/livy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Connecting to Spark through Livy" 3 | output: html_notebook 4 | --- 5 | 6 | With Livy you can analyze data in your Spark cluster via R on your desktop. 7 | 8 | ## Livy 9 | 10 | Livy is a service that enables easy interaction with an Apache Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as SparkContext management, all via a simple REST interface or an RPC client library. Livy also simplifies the interaction between Spark and application servers, thus enabling the use of Spark for interactive web/mobile applications. 11 | 12 |
13 | ![](http://livy.io/img/livy-architecture.png) 14 |
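If the Livy endpoint requires authentication, recent sparklyr versions provide `livy_config()` to attach credentials to the connection used in the steps below; a hedged sketch with placeholder host and credentials:

```{r, eval=FALSE}
# Hedged sketch: connecting to a password-protected Livy endpoint.
# The host, user, and password are placeholders.
library(sparklyr)

cfg <- livy_config(username = "user", password = "password")
sc <- spark_connect(
  master = "http://localhost:8998",
  method = "livy",
  config = cfg)
```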
15 | 16 | ## Start Livy [Server Side] 17 | 18 | Install Livy and start a Livy service to handle local requests. 19 | 20 | ```{r, eval=FALSE} 21 | sparklyr::livy_install() 22 | sparklyr::livy_service_start() 23 | ``` 24 | 25 | ## Connect to Spark [Client Side] 26 | 27 | Use `method = "livy"` to connect to the cluster. 28 | 29 | ```{r warning=FALSE, eval=FALSE} 30 | library(sparklyr) 31 | library(dplyr) 32 | sc <- spark_connect( 33 | master = "http://ec2-***-**-***-**.compute-1.amazonaws.com:8998/", 34 | method = "livy") 35 | ``` 36 | 37 | ## Analyze [Client Side] 38 | 39 | Use R code on your workstation as you normally would. Your R commands will be sent to the cluster via Livy for processing. Collect your results back to the desktop for further processing in R. 40 | 41 | ```{r eval=FALSE} 42 | library(ggplot2) 43 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 44 | pickup_dropoff_tbl <- trips_model_data_tbl %>% 45 | filter(pickup_nta == "Turtle Bay-East Midtown" & dropoff_nta == "Airport") %>% 46 | mutate(pickup_hour = hour(pickup_datetime)) %>% 47 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 48 | group_by(pickup_hour) %>% 49 | summarize(n = n(), 50 | trip_time_mean = mean(trip_time), 51 | trip_time_p10 = percentile(trip_time, 0.10), 52 | trip_time_p25 = percentile(trip_time, 0.25), 53 | trip_time_p50 = percentile(trip_time, 0.50), 54 | trip_time_p75 = percentile(trip_time, 0.75), 55 | trip_time_p90 = percentile(trip_time, 0.90)) 56 | 57 | # Collect results 58 | pickup_dropoff <- collect(pickup_dropoff_tbl) 59 | 60 | # Plot 61 | ggplot(pickup_dropoff, aes(x = pickup_hour)) + 62 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 63 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, 64 | alpha = "25–75th percentile")) + 65 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, 66 | alpha = "10–90th percentile")) + 67 | scale_y_continuous("trip duration in minutes") 68 | ``` 69 | 70 | ## Disconnect [Server Side] 71 | 72 | ```{r disconnect, eval=FALSE} 73 | sparklyr::livy_service_stop() 74 | ``` 75 | 76 | 77 | -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 |
geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) { 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) 
{ 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /prod/presentations/tidyverse/01_taxiR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NYC Taxi - One month in R" 3 | output: html_notebook 4 | --- 5 | 6 |
7 | ![R for Data Science http://r4ds.had.co.nz/](http://r4ds.had.co.nz/diagrams/data-science.png) 8 |
9 | 10 | # Load tidyverse 11 | 12 | ```{r tidyverse} 13 | library(tidyverse) 14 | library(lubridate) 15 | ``` 16 | 17 | # Download 18 | 19 | ```{r download, eval=FALSE} 20 | download.file( 21 | "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.csv", 22 | "yellow_tripdata_2015-12.csv") 23 | ``` 24 | 25 | # Import Dataset 26 | 27 | ```{r import, message=FALSE, warning=FALSE} 28 | trips <- read_csv("~/sparkDemos/prod/presentations/tidyverse/yellow_tripdata_2015-12.csv") 29 | ``` 30 | 31 | # Tidy 32 | 33 | ```{r tidy} 34 | # pickups 35 | select(trips, tpep_pickup_datetime, pickup_latitude, pickup_longitude) 36 | 37 | # dropoffs 38 | select(trips, tpep_dropoff_datetime, dropoff_latitude, dropoff_longitude) 39 | 40 | # trips 41 | trips 42 | ``` 43 | 44 | # Transform 45 | 46 | ```{r transform} 47 | tripsHour <- trips %>% 48 | filter(payment_type %in% c(1, 2)) %>% 49 | mutate(pay_type = ifelse(payment_type == 1, "credit", "cash")) %>% 50 | mutate(trip_time_sec = tpep_dropoff_datetime - tpep_pickup_datetime) %>% 51 | mutate(trip_time_min = as.numeric(trip_time_sec / 60)) %>% 52 | mutate(hour = round_date(tpep_pickup_datetime, "hour")) %>% 53 | group_by(pay_type, hour) %>% 54 | summarize(n = n(), 55 | tip_amount = mean(tip_amount), 56 | fare_amount = mean(fare_amount), 57 | passenger_count = mean(passenger_count), 58 | trip_time = mean(trip_time_min), 59 | trip_distance = mean(trip_distance)) 60 | tripsHour 61 | ``` 62 | 63 | # Visualize 64 | 65 | ```{r visualize} 66 | ggplot(tripsHour, aes(trip_time, trip_distance, color = pay_type)) + 67 | geom_point() + geom_smooth() + 68 | labs(title = "NYC Taxi by hour by day", x = "Minutes", y = "Miles", caption = '2015-12') 69 | 70 | ggplot(tripsHour, aes(trip_distance, fare_amount)) + 71 | geom_point() + geom_smooth() + facet_grid(~pay_type) + 72 | labs(title = "NYC Taxi by hour by day", x = "Distance", y = "Dollars", caption = '2015-12') 73 | ``` 74 | 75 | # Model 76 | 77 | ```{r model} 78 | # Formula 79 | model_formula <- formula(tip_amount ~ fare_amount + pay_type + passenger_count) 80 | 81 | # Model data (rows with missing values removed) 82 | tripsModel <- tripsHour %>% 83 | select(tip_amount, fare_amount, pay_type, passenger_count) %>% 84 | na.omit 85 | 86 | # Linear Model 87 | m1 <- lm(model_formula, data = tripsModel) 88 | summary(m1) 89 | 90 | # Decision tree 91 | library(rpart) 92 | m2 <- rpart(model_formula, tripsModel) 93 | summary(m2) 94 | 95 | # Predict 96 | pred <- tripsHour %>% 97 | ungroup %>% 98 | mutate(lm_fit = predict(m1, tripsHour)) %>% 99 | mutate(lm_res = tip_amount - lm_fit) %>% 100 | mutate(rpart_fit = predict(m2, tripsHour)) %>% 101 | mutate(rpart_res = tip_amount - rpart_fit) 102 | 103 | # MSE 104 | pred %>% 105 | na.omit() %>% 106 | summarize(lm_mse = mean(lm_res^2), rpart_mse = mean(rpart_res^2)) 107 | 108 | # Plot 109 | ggplot(pred, aes(rpart_fit, lm_fit)) + geom_point() + geom_smooth(method="lm") 110 | ``` 111 | 112 | # Communicate 113 | 114 | This analysis of one month of NYC Taxi data shows that you can predict tip amount as a function of fare amount, pay type, and passenger count.
For a detailed explanation of the code you can view this report in the following formats: 115 | 116 | * HTML 117 | * PDF 118 | * Word 119 | -------------------------------------------------------------------------------- /prod/presentations/tidyverse/README.md: -------------------------------------------------------------------------------- 1 | # Analyze data with sparklyr 2 | 3 | ## Abstract 4 | 5 | Sparklyr is an R package that lets you analyze data in Spark while using familiar tools in R. Sparklyr supports a complete backend for dplyr, a popular tool for working with data frame objects both in memory and out of memory. You can use dplyr to translate R code into Spark SQL. Sparklyr also supports MLlib so you can run classifiers, regressions, clustering, decision trees, and many more machine learning algorithms on your distributed data in Spark. With sparklyr you can analyze large amounts of data that would not traditionally fit into R memory. Then you can collect results from Spark into R for further visualization and documentation. 6 | 7 | ## Documents 8 | 9 | These documents are for understanding the toolchain and the tidyverse using the famous NYC taxi data. 10 | 11 | * [Data Science Toolchain with Spark and R](http://colorado.rstudio.com:3939/content/420/taxiDemo.nb.html) 12 | * [Tidyverse and R Notebooks with NYC Taxi Data](http://colorado.rstudio.com:3939/content/421/taxiR.nb.html) 13 | 14 | ## Slides 15 | 16 | ![](img/tidyverse.001.jpeg) 17 | 18 | *** 19 | 20 | ![](img/tidyverse.002.jpeg) 21 | 22 | *** 23 | 24 | ![](img/tidyverse.003.jpeg) 25 | 26 | *** 27 | 28 | ![](img/tidyverse.004.jpeg) 29 | 30 | *** 31 | 32 | ![](img/tidyverse.005.jpeg) 33 | 34 | *** 35 | 36 | ![](img/tidyverse.006.jpeg) 37 | 38 | *** 39 | 40 | ![](img/tidyverse.007.jpeg) 41 | 42 | *** 43 | 44 | ![](img/tidyverse.008.jpeg) 45 | 46 | *** 47 | 48 | ![](img/tidyverse.009.jpeg) 49 | 50 | *** 51 | 52 | ![](img/tidyverse.010.jpeg) 53 | 54 | *** 55 | 56 | ![](img/tidyverse.011.jpeg) 57 | 58 | *** 59 | 60 | ![](img/tidyverse.012.jpeg) 61 | 62 | *** 63 | 64 | ![](img/tidyverse.013.jpeg) 65 | 66 | *** 67 | 68 | ![](img/tidyverse.014.jpeg) 69 | 70 | *** 71 | 72 | ![](img/tidyverse.015.jpeg) 73 | 74 | *** 75 | 76 | ![](img/tidyverse.016.jpeg) 77 | 78 | -------------------------------------------------------------------------------- /prod/presentations/tidyverse/emr_setup.sh: -------------------------------------------------------------------------------- 1 | ### Build EMR master node with Taxi Data 2 | ### Nathan Stephens 3 | ### 3/27/2017 4 | 5 | ########################################### 6 | ### Run as root 7 | ########################################### 8 | 9 | ## RSP 10 | 11 | # Update 12 | sudo yum update 13 | 14 | # R 15 | sudo yum install -y R libcurl-devel openssl-devel git 16 | 17 | # install RSP 18 | wget -q https://download2.rstudio.org/current.ver -O /tmp/rsp.current.ver 19 | wget -O /tmp/rstudio-server-rhel.rpm https://s3.amazonaws.com/rstudio-dailybuilds/rstudio-server-rhel-pro-$(cat /tmp/rsp.current.ver)-x86_64.rpm 20 | sudo yum install -y --nogpgcheck /tmp/rstudio-server-rhel.rpm 21 | 22 | # install packages 23 | sudo Rscript -e 'install.packages("sparklyr", repos = "http://cran.rstudio.com/")' 24 | sudo Rscript -e 'install.packages("devtools", repos = "http://cran.rstudio.com/")' 25 | sudo Rscript -e 'install.packages("tidyverse", repos = "http://cran.rstudio.com/")' 26 | sudo Rscript -e 'install.packages("leaflet", repos = "http://cran.rstudio.com/")' 27 | sudo Rscript -e 'install.packages("DT", repos
= "http://cran.rstudio.com/")' 28 | 29 | ########################################### 30 | 31 | ## Add rstudio user 32 | sudo useradd -m rstudio 33 | sudo echo rstudio | passwd rstudio --stdin 34 | sudo usermod -a -G hadoop rstudio 35 | sudo usermod -a -G hive rstudio 36 | 37 | 38 | ########################################### 39 | ### Run as rstudio 40 | ########################################### 41 | 42 | ## switch user 43 | su rstudio 44 | cd ~ 45 | 46 | ## add rstudio directory 47 | hadoop fs -mkdir /user/rstudio 48 | hadoop fs -chown rstudio:rstudio /user/rstudio 49 | 50 | ## clone project 51 | git clone https://github.com/rstudio/sparkDemos.git /home/rstudio/sparkDemos 52 | cat >/home/rstudio/sparkDemos/sparkDemos.Rproj <> nyct2010.log & 71 | nohup /usr/bin/s3-dist-cp --src=s3n://rstudio-sparkdemo-data/nyc-taxi/parquet_nohead/trips --dest=hdfs:///user/rstudio/trips_par >> trips_par.log & 72 | nohup /usr/bin/s3-dist-cp --src=s3n://rstudio-sparkdemo-data/nyc-taxi/parquet/trips_model_data --dest=hdfs:///user/rstudio/trips_model_data >> trips_model_data.log & 73 | 74 | 75 | ########################################### 76 | ### Open Hive 77 | ########################################### 78 | 79 | hive 80 | 81 | # Hive 1 82 | 83 | CREATE EXTERNAL TABLE IF NOT EXISTS nyct2010( 84 | gid int, 85 | ctlabel float, 86 | borocode int, 87 | boroname string, 88 | ct2010 int, 89 | boroct2010 int, 90 | cdeligibil string, 91 | ntacode string, 92 | ntaname string, 93 | puma int) 94 | ROW FORMAT DELIMITED 95 | FIELDS TERMINATED BY ',' 96 | LINES TERMINATED BY '\n' 97 | ; 98 | 99 | LOAD DATA INPATH '/user/rstudio/nyct2010' INTO TABLE nyct2010; 100 | 101 | # Hive 3 102 | 103 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_par( 104 | id int, 105 | cab_type_id int, 106 | vendor_id string, 107 | pickup_datetime timestamp, 108 | dropoff_datetime timestamp, 109 | store_and_fwd_flag string, 110 | rate_code_id string, 111 | pickup_longitude float, 112 | pickup_latitude float, 113 | dropoff_longitude float, 114 | dropoff_latitude float, 115 | passenger_count bigint, 116 | trip_distance float, 117 | fare_amount float, 118 | extra bigint, 119 | mta_tax string, 120 | tip_amount float, 121 | tolls_amount float, 122 | ehail_fee string, 123 | improvement_surcharge string, 124 | total_amount float, 125 | payment_type string, 126 | trip_type string, 127 | pickup_nyct2010_gid int, 128 | dropoff_nyct2010_gid int) 129 | stored as parquet; 130 | 131 | LOAD DATA INPATH '/user/rstudio/trips_par' INTO TABLE trips_par; 132 | 133 | 134 | # Hive 3 135 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_model_data( 136 | pickup_datetime timestamp, 137 | pickup_latitude float, 138 | pickup_longitude float, 139 | pickup_nyct2010_gid int, 140 | pickup_boro string, 141 | pickup_nta string, 142 | dropoff_datetime timestamp, 143 | dropoff_latitude float, 144 | dropoff_longitude float, 145 | dropoff_nyct2010_gid int, 146 | dropoff_boro string, 147 | dropoff_nta string, 148 | cab_type string, 149 | passenger_count bigint, 150 | trip_distance float, 151 | pay_type string, 152 | fare_amount float, 153 | tip_amount float, 154 | other_amount float, 155 | total_amount float) 156 | stored as parquet; 157 | 158 | LOAD DATA INPATH '/user/rstudio/trips_model_data' INTO TABLE trips_model_data; 159 | 160 | -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.001.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.001.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.002.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.002.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.003.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.003.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.004.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.004.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.005.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.005.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.006.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.006.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.007.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.007.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.008.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.008.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.009.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.009.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.010.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.010.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.011.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.011.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.012.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.012.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.013.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.013.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.014.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.014.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.015.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.015.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.016.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.016.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = 
/prod/presentations/tidyverse/sqlvis_raster.R:
--------------------------------------------------------------------------------
### Big data tile plot
### Bins two numeric columns onto a fixed grid inside Spark and collects
### only the per-cell counts, keeping the raw data in the cluster.

library(dplyr)
library(sparklyr)
library(ggplot2)

# Example inputs used during development:
# data <- tbl(sc, "trips_model_data")
# x_field <- "pickup_longitude"
# y_field <- "pickup_latitude"
# resolution <- 50

sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){

  data_prep <- data %>%
    select_(x = x_field, y = y_field) %>%
    filter(!is.na(x), !is.na(y))

  # Ranges computed remotely; used to scale each point onto the grid
  s <- data_prep %>%
    summarise(max_x = max(x),
              max_y = max(y),
              min_x = min(x),
              min_y = min(y)) %>%
    mutate(rng_x = max_x - min_x,
           rng_y = max_y - min_y,
           resolution = resolution) %>%
    collect()

  # Map every point to a grid cell and count points per cell
  counts <- data_prep %>%
    mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0),
           res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>%
    count(res_x, res_y) %>%
    collect()

  list(counts = counts,
       limits = s,
       vnames = c(x_field, y_field))

}

sqlvis_ggplot_raster <- function(data, ...) {

  d <- data$counts
  s <- data$limits
  v <- data$vnames

  # Axis breaks in grid units, labelled with the original data values
  xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6), 2))
  yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6), 2))

  ggplot(d, aes(res_x, res_y)) +
    geom_raster(aes(fill = n)) +
    coord_fixed() +
    scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") +
    scale_x_continuous(breaks = xx, labels = names(xx)) +
    scale_y_continuous(breaks = yy, labels = names(yy)) +
    labs(x = v[1], y = v[2], ...)

}

### Facets

sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){

  data_prep <- data %>%
    mutate_(group = g_field) %>%
    select_(g = "group", x = x_field, y = y_field) %>%
    filter(!is.na(x), !is.na(y))

  s <- data_prep %>%
    summarise(max_x = max(x),
              max_y = max(y),
              min_x = min(x),
              min_y = min(y)) %>%
    mutate(rng_x = max_x - min_x,
           rng_y = max_y - min_y,
           resolution = resolution) %>%
    collect()

  counts <- data_prep %>%
    mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0),
           res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>%
    count(g, res_x, res_y) %>%
    collect()

  list(counts = counts,
       limits = s,
       vnames = c(x_field, y_field))

}

sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) {

  s <- data$limits
  d <- data$counts
  v <- data$vnames

  xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1))
  yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1))

  ggplot(d, aes(res_x, res_y)) +
    geom_raster(aes(fill = n)) +
    coord_fixed() +
    facet_wrap(~ g, ncol = ncol) +
    scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") +
    scale_x_continuous(breaks = xx, labels = names(xx)) +
    scale_y_continuous(breaks = yy, labels = names(yy)) +
    labs(x = v[1], y = v[2], ...)

}
--------------------------------------------------------------------------------
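A matching usage sketch (not part of the original file), reusing the `trips` table and connection from the histogram example above; `payment_type` is a hypothetical low-cardinality column used only to illustrate the faceted variant:

source("sqlvis_raster.R")

# Plain density raster: a 300 x 300 grid of pickup counts
pickups <- sqlvis_compute_raster(trips, "pickup_longitude", "pickup_latitude",
                                 resolution = 300)
sqlvis_ggplot_raster(pickups, title = "Pickup density")

# Faceted raster: one panel per distinct value of the grouping column
by_type <- sqlvis_compute_raster_g(trips, "pickup_longitude", "pickup_latitude",
                                   "payment_type")
sqlvis_ggplot_raster_g(by_type, ncol = 2)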
/prod/presentations/tidyverse/tidyverseAndSpark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/tidyverseAndSpark.pdf
--------------------------------------------------------------------------------