├── .gitignore ├── README.md ├── dev ├── babynames │ ├── babynames-dplyr.Rmd │ ├── babynames-dplyr.nb.html │ └── derby.log ├── cloudera │ ├── bigvis_tile.R │ ├── livy-architecture.png │ ├── livy.Rmd │ ├── livy.sh │ ├── livy_connection.Rmd │ ├── nyct2010r.csv │ ├── spark_ml_classification_titanic.Rmd │ ├── spark_plot_boxbin.R │ ├── spark_plot_hist.R │ ├── spark_plot_point.R │ ├── spark_toolchain.Rmd │ ├── sqlvis_histogram.R │ ├── sqlvis_raster.R │ ├── taxiDemoCloudera.Rmd │ ├── taxiDemoCloudera.nb.html │ ├── taxiDemoCloudera2.Rmd │ ├── taxiDemoCloudera3.Rmd │ ├── taxiDemoCloudera_backup.Rmd │ └── testCloudera.R ├── flights-cdh │ ├── flights_pred_2008.RData │ ├── images │ │ └── clusterDemo │ │ │ ├── data-analysis-1.png │ │ │ ├── flex-1.png │ │ │ ├── forecast-1.png │ │ │ ├── hue-metastore-1.png │ │ │ ├── manager-landing-page.png │ │ │ ├── performance-1.png │ │ │ ├── sign-in-1.png │ │ │ ├── spark-history-server-1.png │ │ │ ├── spark-pane-1.png │ │ │ ├── spark-rdd-1.png │ │ │ └── tables-1.png │ ├── nycflights_flexdashboard.Rmd │ ├── sparkClusterDemo-source.R │ ├── sparkClusterDemo.Rmd │ └── sparkClusterDemo.html ├── flights │ ├── flightsAnalysis.Rmd │ ├── flightsAnalysis.nb.html │ ├── flightsApp │ │ └── app.R │ ├── flightsApp2 │ │ ├── global.R │ │ ├── server.R │ │ └── ui.R │ ├── flights_pred_2008.RData │ ├── images │ │ └── clusterDemo │ │ │ ├── awsClusterConnect.png │ │ │ ├── awsCreateCluster.png │ │ │ ├── awsCreateCluster2.png │ │ │ ├── awsNewSecurityGroup.png │ │ │ ├── awsSecurityGroup.png │ │ │ ├── awsSecurityGroup2.png │ │ │ ├── emrArchitecture.png │ │ │ ├── emrConfigStep1.png │ │ │ ├── emrConfigStep2.png │ │ │ ├── emrConfigStep3.png │ │ │ ├── emrConfigStep4.png │ │ │ ├── emrLogin.png │ │ │ ├── flightsDashboard.png │ │ │ ├── flightsDeciles.png │ │ │ ├── flightsDecilesDesc.png │ │ │ ├── flightsPredicted.png │ │ │ ├── rstudio.png │ │ │ ├── rstudioData.png │ │ │ ├── rstudioLogin.png │ │ │ ├── rstudioModel.png │ │ │ ├── rstudioModelDetail.png │ │ │ ├── rstudioSparkPane.png │ │ │ ├── workflow.png │ │ │ ├── workflowCommands.png │ │ │ ├── workflowRSC.png │ │ │ └── workflowShare.png │ ├── nycflights_flexdashboard.Rmd │ ├── nycflights_flexdashboard_spark.Rmd │ ├── recode_for_prediction.R │ ├── sparkClusterDemo.Rmd │ └── sparkClusterDemo.html ├── h2o-demo │ ├── emr_h2o_setup.sh │ ├── h2oHadoop.Rmd │ ├── h2oModels.Rmd │ ├── h2oSetup.R │ ├── h2oSetup.Rmd │ ├── h2oSetup.nb.html │ ├── h2oSetup_2_0_0.R │ ├── iris.csv │ ├── livy.R │ ├── livy.Rmd │ ├── nyct2010.csv │ ├── sqlvis_histogram.R │ ├── sqlvis_raster.R │ ├── taxiDemoH2O.Rmd │ └── taxiDemoH2O.nb.html ├── h2o │ ├── 01_h2o_setup.R │ ├── 02_h2o_rsparkling.Rmd │ ├── 02_h2o_rsparkling.nb.html │ ├── 03_h2o_ml.Rmd │ ├── 03_h2o_ml.nb.html │ └── 04_h2o_grid.R ├── helloworld │ ├── derby.log │ ├── helloWorld.Rmd │ ├── helloWorld.html │ └── helloWorld.nb.html ├── hive │ ├── hiveJDBC.R │ ├── hiveMetastore.R │ ├── hiveMetastore.Rmd │ └── hiveMetastore.nb.html ├── nyc-taxi-data │ ├── .gitignore │ ├── taxiAnalysis.R │ ├── taxiApp.R │ ├── taxiApp │ │ └── app.R │ ├── taxiDashboard.Rmd │ ├── taxiDemo.Rmd │ └── taxiDemo.nb.html ├── nycflights13 │ ├── .gitignore │ ├── dplyr.Rmd │ ├── dplyr.nb.html │ ├── nycflights13_flexdashboard_rdata.Rmd │ └── nycflights13_flexdashboard_sparkdata.Rmd ├── performance │ ├── collect.Rmd │ └── collect.html └── titanic │ ├── .gitignore │ ├── notebook-classification-rdata.Rmd │ ├── notebook-classification-rdata.nb.html │ ├── notebook-classification.Rmd │ ├── notebook-classification.html │ ├── notebook-classification.nb.html │ ├── 
rmarkdown-classification.Rmd │ ├── rmarkdown-classification_files │ └── figure-html │ │ ├── auc-1.png │ │ ├── importance-1.png │ │ └── lift-1.png │ └── titanic-parquet │ ├── ._SUCCESS.crc │ ├── .part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc │ ├── _SUCCESS │ └── part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet ├── img ├── sparklyr-illustration.png ├── sparklyr-presentation-demos.001.jpeg ├── sparklyr-presentation-demos.002.jpeg ├── sparklyr-presentation-demos.003.jpeg ├── sparklyr-presentation-demos.004.jpeg ├── sparklyr-presentation-demos.005.jpeg ├── sparklyr-presentation-demos.006.jpeg ├── sparklyr-presentation-demos.007.jpeg ├── sparklyr-presentation-demos.008.jpeg ├── sparklyr-presentation-demos.009.jpeg ├── sparklyr-presentation-demos.010.jpeg ├── sparklyr-presentation-demos.011.jpeg ├── sparklyr-presentation-demos.012.jpeg ├── sparklyr-presentation-demos.013.jpeg ├── sparklyr-presentation-demos.014.jpeg ├── sparklyr-presentation-demos.015.jpeg ├── sparklyr-presentation-demos.016.jpeg ├── sparklyr-presentation-demos.017.jpeg ├── sparklyr-presentation-demos.018.jpeg ├── sparklyr-presentation-demos.019.jpeg ├── sparklyr-presentation-demos.020.jpeg └── sparklyr-presentation-demos.021.jpeg └── prod ├── apps ├── iris-k-means │ ├── DESCRIPTION │ ├── app.R │ ├── config.yml │ └── iris-parquet │ │ ├── ._SUCCESS.crc │ │ ├── ._common_metadata.crc │ │ ├── ._metadata.crc │ │ ├── .part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc │ │ ├── .part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc │ │ ├── _SUCCESS │ │ ├── _common_metadata │ │ ├── _metadata │ │ ├── part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet │ │ └── part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet ├── nycflights13-app-spark │ ├── DESCRIPTION │ ├── Readme.md │ ├── app.R │ └── config.yml └── titanic-classification │ ├── .gitignore │ ├── DESCRIPTION │ ├── app.R │ ├── helpers.R │ └── titanic-parquet │ ├── ._SUCCESS.crc │ ├── .part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc │ ├── _SUCCESS │ └── part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet ├── conf ├── config.yml └── shiny-server.conf ├── dashboards ├── diamonds-explorer │ ├── config.yml │ ├── diamonds-parquet │ │ ├── ._SUCCESS.crc │ │ ├── ._common_metadata.crc │ │ ├── ._metadata.crc │ │ ├── .part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc │ │ ├── .part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc │ │ ├── _SUCCESS │ │ ├── _common_metadata │ │ ├── _metadata │ │ ├── part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet │ │ └── part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet │ └── flexdashboard-shiny-diamonds.Rmd ├── ggplot2-brushing │ └── ggplot2Brushing.Rmd ├── nycflights13-dash-spark │ ├── config.yml │ └── nycflights13-dash-spark.Rmd └── tor-project │ ├── .gitignore │ ├── metricsgraphicsTorProject.Rmd │ └── metricsgraphicsTorProject.html ├── notebooks ├── babynames │ ├── .gitignore │ ├── babynames-dplyr.Rmd │ └── babynames-dplyr.nb.html ├── end-to-end-flights │ ├── end-to-end-flights-flexdashboard.Rmd │ ├── end-to-end-flights-htmldoc.html │ ├── end-to-end-flights.Rmd │ └── flights_pred_2008.RData ├── ml_classification_titanic │ ├── spark_ml_classification_titanic.Rmd │ ├── spark_ml_classification_titanic.html │ ├── spark_ml_classification_titanic.nb.html │ └── titanic-parquet │ │ ├── ._SUCCESS.crc │ │ ├── .part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── 
part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet └── taxi_demo │ ├── readme.md │ ├── taxiDemo.Rmd │ └── taxiDemo.nb.html └── presentations ├── cazena ├── 01_taxiR.Rmd ├── 02_taxiDemo.Rmd ├── 03_taxiGadget.Rmd ├── README.md ├── emr_setup.sh ├── kerberos.R ├── nyct2010.csv ├── sqlvis_histogram.R └── sqlvis_raster.R ├── cloudera ├── livy-architecture.png ├── livy.Rmd ├── readme.html ├── readme.md ├── sqlvis_histogram.R ├── sqlvis_raster.R └── taxiDemoCloudera.Rmd ├── sparkSummitEast ├── README.md ├── img │ ├── img.001.jpeg │ ├── img.002.jpeg │ ├── img.003.jpeg │ ├── img.004.jpeg │ ├── img.005.jpeg │ ├── img.006.jpeg │ ├── img.007.jpeg │ ├── img.008.jpeg │ ├── img.009.jpeg │ ├── img.010.jpeg │ ├── img.011.jpeg │ ├── img.012.jpeg │ ├── img.013.jpeg │ ├── img.014.jpeg │ ├── img.015.jpeg │ ├── img.016.jpeg │ └── img.017.jpeg ├── livy.Rmd ├── nyct2010.csv ├── sqlvis_histogram.R ├── sqlvis_raster.R └── taxiDemoH2O.Rmd └── tidyverse ├── 01_taxiR.Rmd ├── 02_taxiDemo.Rmd ├── 03_taxiGadget.Rmd ├── README.md ├── emr_setup.sh ├── img ├── tidyverse.001.jpeg ├── tidyverse.002.jpeg ├── tidyverse.003.jpeg ├── tidyverse.004.jpeg ├── tidyverse.005.jpeg ├── tidyverse.006.jpeg ├── tidyverse.007.jpeg ├── tidyverse.008.jpeg ├── tidyverse.009.jpeg ├── tidyverse.010.jpeg ├── tidyverse.011.jpeg ├── tidyverse.012.jpeg ├── tidyverse.013.jpeg ├── tidyverse.014.jpeg ├── tidyverse.015.jpeg └── tidyverse.016.jpeg ├── nyct2010.csv ├── sqlvis_histogram.R ├── sqlvis_raster.R └── tidyverseAndSpark.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | sparkDemos.Rproj 6 | rsconnect 7 | derby.log 8 | *.nb.html 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sparklyr Demos" 3 | output: html_document 4 | --- 5 | 6 | ![](img/sparklyr-presentation-demos.001.jpeg) 7 | 8 | *** 9 | 10 | ![](img/sparklyr-presentation-demos.002.jpeg) 11 | 12 | *** 13 | 14 | ![](img/sparklyr-presentation-demos.003.jpeg) 15 | 16 | *** 17 | 18 | ![](img/sparklyr-presentation-demos.004.jpeg) 19 | 20 | *** 21 | 22 | ![](img/sparklyr-presentation-demos.005.jpeg) 23 | 24 | *** 25 | 26 | ![](img/sparklyr-presentation-demos.006.jpeg) 27 | 28 | *** 29 | 30 | ![](img/sparklyr-presentation-demos.007.jpeg) 31 | 32 | *** 33 | 34 | ![](img/sparklyr-presentation-demos.008.jpeg) 35 | 36 | *** 37 | 38 | ![](img/sparklyr-presentation-demos.009.jpeg) 39 | 40 | *** 41 | 42 | ![](img/sparklyr-presentation-demos.010.jpeg) 43 | 44 | *** 45 | 46 | ![](img/sparklyr-presentation-demos.011.jpeg) 47 | 48 | *** 49 | 50 | ![](img/sparklyr-presentation-demos.012.jpeg) 51 | 52 | *** 53 | 54 | ![](img/sparklyr-presentation-demos.013.jpeg) 55 | 56 | *** 57 | 58 | ![](img/sparklyr-presentation-demos.014.jpeg) 59 | 60 | *** 61 | 62 | ![](img/sparklyr-presentation-demos.015.jpeg) 63 | 64 | *** 65 | 66 | ![](img/sparklyr-presentation-demos.016.jpeg) 67 | 68 | *** 69 | 70 | ![](img/sparklyr-presentation-demos.017.jpeg) 71 | 72 | *** 73 | 74 | ![](img/sparklyr-presentation-demos.018.jpeg) 75 | 76 | *** 77 | 78 | ![](img/sparklyr-presentation-demos.019.jpeg) 79 | 80 | *** 81 | 82 | ![](img/sparklyr-presentation-demos.020.jpeg) 83 | 84 | *** 85 | 86 | ![](img/sparklyr-presentation-demos.021.jpeg) 87 | -------------------------------------------------------------------------------- /dev/babynames/babynames-dplyr.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Analysis of babynames with dplyr" 3 | output: html_notebook 4 | --- 5 | 6 | Use dplyr syntax to write Apache Spark SQL queries. Use select, where, group by, joins, and window functions in Apache Spark SQL. 7 | 8 | ## Setup 9 | 10 | ```{r setup} 11 | knitr::opts_chunk$set(warning = FALSE, message = FALSE) 12 | library(sparklyr) 13 | library(dplyr) 14 | library(babynames) 15 | library(ggplot2) 16 | library(dygraphs) 17 | library(rbokeh) 18 | ``` 19 | 20 | ## Connect to Spark 21 | 22 | Install and connect to a local Spark instance. Copy data into Spark DataFrames. 23 | 24 | ```{r} 25 | #spark_install("2.0.0") 26 | sc <- spark_connect(master = "local", version = "2.0.0") 27 | babynames_tbl <- copy_to(sc, babynames, "babynames") 28 | applicants_tbl <- copy_to(sc, applicants, "applicants") 29 | ``` 30 | 31 | ## Total US births 32 | 33 | Plot total US births recorded from the Social Security Administration. 34 | 35 | ```{r} 36 | birthsYearly <- applicants_tbl %>% 37 | mutate(male = ifelse(sex == "M", n_all, 0), female = ifelse(sex == "F", n_all, 0)) %>% 38 | group_by(year) %>% 39 | summarize(Male = sum(male) / 1000000, Female = sum(female) / 1000000) %>% 40 | arrange(year) %>% 41 | collect 42 | 43 | birthsYearly %>% 44 | dygraph(main = "Total US Births (SSN)", ylab = "Millions") %>% 45 | dySeries("Female") %>% 46 | dySeries("Male") %>% 47 | dyOptions(stackedGraph = TRUE) %>% 48 | dyRangeSelector(height = 20) 49 | ``` 50 | 51 | ## Aggregate data by name 52 | 53 | Use Spark SQL to create a lookup table. Register and cache the lookup table in Spark for future queries. 54 | 55 | ```{r} 56 | topNames_tbl <- babynames_tbl %>% 57 | filter(year >= 1986) %>% 58 | group_by(name, sex) %>% 59 | summarize(count = as.numeric(sum(n))) %>% 60 | filter(count > 1000) %>% 61 | select(name, sex) 62 | 63 | filteredNames_tbl <- babynames_tbl %>% 64 | filter(year >= 1986) %>% 65 | inner_join(topNames_tbl) 66 | 67 | yearlyNames_tbl <- filteredNames_tbl %>% 68 | group_by(year, name, sex) %>% 69 | summarize(count = as.numeric(sum(n))) 70 | 71 | sdf_register(yearlyNames_tbl, "yearlyNames") 72 | tbl_cache(sc, "yearlyNames") 73 | ``` 74 | 75 | ## Most popular names (1986) 76 | 77 | Identify the top 5 male and female names from 1986. Visualize the popularity trend over time. 78 | 79 | ```{r} 80 | topNames1986_tbl <- yearlyNames_tbl %>% 81 | filter(year == 1986) %>% 82 | group_by(name, sex) %>% 83 | summarize(count = sum(count)) %>% 84 | group_by(sex) %>% 85 | mutate(rank = min_rank(desc(count))) %>% 86 | filter(rank < 5) %>% 87 | arrange(sex, rank) %>% 88 | select(name, sex, rank) %>% 89 | sdf_register("topNames1986") 90 | 91 | tbl_cache(sc, "topNames1986") 92 | 93 | topNames1986Yearly <- yearlyNames_tbl %>% 94 | inner_join(topNames1986_tbl) %>% 95 | collect 96 | 97 | ggplot(topNames1986Yearly, aes(year, count, color=name)) + 98 | facet_grid(~sex) + 99 | geom_line() + 100 | ggtitle("Most Popular Names of 1986") 101 | ``` 102 | 103 | ## Most popular names (2014) 104 | 105 | Identify the top 5 male and female names from 2014. Visualize the popularity trend over time.
106 | 107 | ```{r} 108 | topNames2014_tbl <- yearlyNames_tbl %>% 109 | filter(year == 2014) %>% 110 | group_by(name, sex) %>% 111 | summarize(count = sum(count)) %>% 112 | group_by(sex) %>% 113 | mutate(rank = min_rank(desc(count))) %>% 114 | filter(rank < 5) %>% 115 | arrange(sex, rank) %>% 116 | select(name, sex, rank) %>% 117 | sdf_register("topNames2014") 118 | 119 | tbl_cache(sc, "topNames2014") 120 | 121 | topNames2014Yearly <- yearlyNames_tbl %>% 122 | inner_join(topNames2014_tbl) %>% 123 | collect 124 | 125 | ggplot(topNames2014Yearly, aes(year, count, color=name)) + 126 | facet_grid(~sex) + 127 | geom_line() + 128 | ggtitle("Most Popular Names of 2014") 129 | ``` 130 | 131 | ## Shared names 132 | 133 | Visualize the most popular names that are shared by both males and females. 134 | 135 | ```{r} 136 | sharedName <- babynames_tbl %>% 137 | mutate(male = ifelse(sex == "M", n, 0), female = ifelse(sex == "F", n, 0)) %>% 138 | group_by(name) %>% 139 | summarize(Male = as.numeric(sum(male)), 140 | Female = as.numeric(sum(female)), 141 | count = as.numeric(sum(n)), 142 | AvgYear = round(as.numeric(sum(year * n) / sum(n)),0)) %>% 143 | filter(Male > 30000 & Female > 30000) %>% 144 | collect 145 | 146 | figure(width = NULL, height = NULL, 147 | xlab = "Log10 Number of Males", 148 | ylab = "Log10 Number of Females", 149 | title = "Top shared names (1880 - 2014)") %>% 150 | ly_points(log10(Male), log10(Female), data = sharedName, 151 | color = AvgYear, size = scale(sqrt(count)), 152 | hover = list(name, Male, Female, AvgYear), legend = FALSE) 153 | ``` -------------------------------------------------------------------------------- /dev/babynames/derby.log: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------- 2 | Wed Feb 15 12:46:01 UTC 2017: 3 | Booting Derby version The Apache Software Foundation - Apache Derby - 10.11.1.1 - (1616546): instance a816c00e-015a-41ce-df39-000016b90d28 4 | on database directory memory:/home/nathan/projects/spark/sparkDemos/dev/babynames/databaseName=metastore_db with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@53a3cfef 5 | Loaded from file:/home/nathan/.cache/spark/spark-2.0.0-bin-hadoop2.7/jars/derby-10.11.1.1.jar 6 | java.vendor=Oracle Corporation 7 | java.runtime.version=1.7.0_85-b01 8 | user.dir=/home/nathan/projects/spark/sparkDemos/dev/babynames 9 | os.name=Linux 10 | os.arch=amd64 11 | os.version=3.13.0-48-generic 12 | derby.system.home=null 13 | Database Class Loader started - derby.database.classpath='' 14 | -------------------------------------------------------------------------------- /dev/cloudera/bigvis_tile.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | bigvis_compute_tiles <- function(data, x_field, y_field, resolution = 500){ 4 | 5 | data_prep <- data %>% 6 | select_(x = x_field, y = y_field) %>% 7 | filter(!is.na(x), !is.na(y)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x), 11 | max_y = max(y), 12 | min_x = min(x), 13 | min_y = min(y)) %>% 14 | mutate(rng_x = max_x - min_x, 15 | rng_y = max_y - min_y) %>% 16 | collect() 17 | 18 | image_frame_pre <- data_prep %>% 19 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 20 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 21 | count(res_x, res_y) %>% 22 | collect 23 | 24 | image_frame_pre %>% 25 | rename(freq = n) %>% 26 | mutate(alpha = round(freq / max(freq), 2)) %>% 27 | 
rename_(.dots=setNames(list("res_x", "res_y"), c(x_field, y_field))) 28 | 29 | } 30 | 31 | bigvis_ggplot_tiles <- function(data){ 32 | data %>% 33 | select(x = 1, y = 2, Freq = 4) %>% 34 | ggplot(aes(x, y)) + 35 | geom_tile(aes(fill = Freq)) + 36 | xlab(colnames(data)[1]) + 37 | ylab(colnames(data)[2]) 38 | } 39 | -------------------------------------------------------------------------------- /dev/cloudera/livy-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/cloudera/livy-architecture.png -------------------------------------------------------------------------------- /dev/cloudera/livy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Connecting to Spark through Livy" 3 | output: html_notebook 4 | --- 5 | 6 | With Livy you can analyze data in your Spark cluster via R on your desktop. 7 | 8 | ## Livy 9 | 10 | Livy is a service that enables easy interaction with an Apache Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as SparkContext management, all via a simple REST interface or an RPC client library. Livy also simplifies the interaction between Spark and application servers, thus enabling the use of Spark for interactive web/mobile applications. 11 | 12 |
13 | ![Image](http://livy.io/img/livy-architecture.png) 14 |
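To make the REST exchange concrete, here is a minimal sketch (not part of the original demo) of the kind of HTTP calls a Livy client issues, written with the `httr` package. The host, port, and payloads are illustrative assumptions; when you connect with `method = "livy"`, sparklyr performs the equivalent exchange for you.

```{r, eval=FALSE}
library(httr)

livy_url <- "http://localhost:8998"  # assumed Livy host and port

# Create a new interactive Spark session on the cluster
session <- POST(paste0(livy_url, "/sessions"),
                body = list(kind = "spark"), encode = "json")
session_id <- content(session)$id

# Submit a snippet of Spark code to that session
POST(sprintf("%s/sessions/%s/statements", livy_url, session_id),
     body = list(code = "1 + 1"), encode = "json")

# Poll the statements endpoint for results, then tear the session down
content(GET(sprintf("%s/sessions/%s/statements", livy_url, session_id)))
DELETE(sprintf("%s/sessions/%s", livy_url, session_id))
```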
15 | 16 | ## Start Livy 17 | 18 | Set home environment variables and start a Livy server to handle local requests. 19 | 20 | ```{bash} 21 | export JAVA_HOME=/usr/lib/jvm/java-7-oracle-cloudera 22 | export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark 23 | /home/ubuntu/livy/livy-server-0.2.0/bin/livy-server 24 | ``` 25 | 26 | ## Connect to Spark 27 | 28 | Use `method = "livy"` to connect to the cluster. 29 | 30 | ```{r} 31 | library(sparklyr) 32 | library(dplyr) 33 | sc <- spark_connect( 34 | master = "http://ec2-***.us-west-2.compute.amazonaws.com:8998", 35 | method = "livy") 36 | ``` 37 | 38 | ## Analyze 39 | 40 | Use R code on your workstation as you normally would. Your R commands will be sent to the cluster via Livy for processing. Collect your results back to the desktop for further processing in R. 41 | 42 | ```{r} 43 | library(ggplot2) 44 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 45 | pickup_dropoff_tbl <- trips_model_data_tbl %>% 46 | filter(pickup_nta == "Turtle Bay-East Midtown" & dropoff_nta == "Airport") %>% 47 | mutate(pickup_hour = hour(pickup_datetime)) %>% 48 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 49 | group_by(pickup_hour) %>% 50 | summarize(n = n(), 51 | trip_time_mean = mean(trip_time), 52 | trip_time_p10 = percentile(trip_time, 0.10), 53 | trip_time_p25 = percentile(trip_time, 0.25), 54 | trip_time_p50 = percentile(trip_time, 0.50), 55 | trip_time_p75 = percentile(trip_time, 0.75), 56 | trip_time_p90 = percentile(trip_time, 0.90)) 57 | 58 | # Collect results 59 | pickup_dropoff <- collect(pickup_dropoff_tbl) 60 | 61 | # Plot 62 | ggplot(pickup_dropoff, aes(x = pickup_hour)) + 63 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 64 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, 65 | alpha = "25–75th percentile")) + 66 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, 67 | alpha = "10–90th percentile")) + 68 | scale_y_continuous("trip duration in minutes") 69 | ``` 70 | -------------------------------------------------------------------------------- /dev/cloudera/livy.sh: -------------------------------------------------------------------------------- 1 | export JAVA_HOME=/usr/lib/jvm/java-7-oracle-cloudera 2 | export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark 3 | /home/ubuntu/livy/livy-server-0.2.0/bin/livy-server 4 | -------------------------------------------------------------------------------- /dev/cloudera/livy_connection.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Livy Connection" 3 | output: html_notebook 4 | --- 5 | 6 | ```{bash} 7 | export JAVA_HOME=/usr/lib/jvm/java-7-oracle-cloudera 8 | export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark 9 | /home/ubuntu/livy/livy-server-0.2.0/bin/livy-server 10 | ``` 11 | -------------------------------------------------------------------------------- /dev/cloudera/spark_plot_hist.R: -------------------------------------------------------------------------------- 1 | spark_plot_hist <- function(data, 2 | x_field, 3 | breaks=30) 4 | { 5 | #----- Pre calculating the max x brings down the time considerably 6 | max_x <- data %>% 7 | select_(x=x_field) %>% 8 | summarise(xmax = max(x)) %>% 9 | collect() 10 | max_x <- max_x$xmax[1] 11 | 12 | #----- The entire function is one long pipe 13 | data %>% 14 | select_(x=x_field) %>% 15 | filter(!is.na(x)) %>% 16 | mutate(bucket = round(x/(max_x/(breaks-1)),0)) %>% 17 | group_by(bucket) %>% 18 | summarise(top=max(x), 19 | 
bottom=min(x), 20 | count=n()) %>% 21 | arrange(bucket) %>% 22 | collect %>% 23 | ggplot() + 24 | geom_bar(aes(x=((top-bottom)/2)+bottom, y=count), color="black", stat = "identity") + 25 | labs(x=x_field) + 26 | theme_minimal() + 27 | theme(legend.position="none")} -------------------------------------------------------------------------------- /dev/cloudera/spark_plot_point.R: -------------------------------------------------------------------------------- 1 | spark_plot_point<- function(data, 2 | x_field=NULL, 3 | y_field=NULL, 4 | color_field=NULL) 5 | { 6 | 7 | data %>% 8 | select_(x=x_field, y=y_field) %>% 9 | group_by(x,y) %>% 10 | tally() %>% 11 | collect() %>% 12 | ggplot() + 13 | geom_point(aes(x=x, y=y, color=n)) + 14 | labs(x=x_field, y=y_field) 15 | 16 | } -------------------------------------------------------------------------------- /dev/cloudera/spark_toolchain.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Science Tool Chain with Spark" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(sparklyr) 8 | library(dplyr) 9 | library(ggplot2) 10 | 11 | Sys.setenv(JAVA_HOME="/usr/lib/jvm/java-7-oracle-cloudera/") 12 | Sys.setenv(SPARK_HOME = '/opt/cloudera/parcels/CDH/lib/spark') 13 | 14 | conf <- spark_config() 15 | conf$spark.executor.cores <- 16 16 | conf$spark.executor.memory <- "24G" 17 | conf$spark.yarn.am.cores <- 16 18 | conf$spark.yarn.am.memory <- "24G" 19 | 20 | sc <- spark_connect(master = "yarn-client", version="1.6.0", config = conf) 21 | 22 | nyct2010_tbl <- tbl(sc, "nyct2010") 23 | trips_par_tbl <- tbl(sc, "trips_par") 24 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 25 | ``` 26 | 27 | ### Histogram 28 | 29 | ```{r} 30 | source("bigvis_histogram.R") 31 | 32 | bigvis_compute_histogram(nyct2010_tbl, "ct2010") %>% 33 | bigvis_ggplot_histogram 34 | 35 | ``` 36 | 37 | ### Tile plot 38 | 39 | ```{r} 40 | source("bigvis_tile.R") 41 | 42 | trips_model_data_tbl %>% 43 | bigvis_compute_tiles("pickup_longitude", "pickup_latitude", 500) %>% 44 | bigvis_ggplot_tiles 45 | 46 | ``` 47 | 48 | -------------------------------------------------------------------------------- /dev/cloudera/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = 
"cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /dev/cloudera/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) { 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) 
{ 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /dev/flights-cdh/flights_pred_2008.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/flights_pred_2008.RData -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/data-analysis-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/data-analysis-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/flex-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/flex-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/forecast-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/forecast-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/hue-metastore-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/hue-metastore-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/manager-landing-page.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/manager-landing-page.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/performance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/performance-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/sign-in-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/sign-in-1.png 
-------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/spark-history-server-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/spark-history-server-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/spark-pane-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/spark-pane-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/spark-rdd-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/spark-rdd-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/images/clusterDemo/tables-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights-cdh/images/clusterDemo/tables-1.png -------------------------------------------------------------------------------- /dev/flights-cdh/nycflights_flexdashboard.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach packages 13 | library(dplyr) 14 | library(ggplot2) 15 | library(DT) 16 | library(leaflet) 17 | library(geosphere) 18 | load('flights_pred_2008.RData') 19 | airports <- mutate(airports, lat = as.numeric(lat), lon = as.numeric(lon)) 20 | ``` 21 | 22 | 23 | Summary 24 | ======================================================================== 25 | 26 | Inputs {.sidebar} 27 | ----------------------------------------------------------------------- 28 | 29 | ### Select Airports 30 | 31 | ```{r} 32 | # Shiny inputs for flight origin and destination 33 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 34 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 35 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 36 | selectInput("dest", "Flight destination", carrier_dest, selected = "LAS") 37 | ``` 38 | 39 | ### Background 40 | 41 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 42 | your airline carrier will make up time en route? Some of the most significant factors 43 | for making up time are flight distance and airline carrier. The data model behind 44 | this dashboard was trained on US airline flights from 2003 to 2007 and scored on 2008 flights (see `sparkClusterDemo-source.R`).
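For context, the `pred_gain` values loaded from `flights_pred_2008.RData` come from a linear model fit on the Spark cluster by the companion script `sparkClusterDemo-source.R` (reproduced in full below). A condensed sketch of that fit, shown here for reference only and not run by the dashboard:

```{r, eval=FALSE}
# Condensed from sparkClusterDemo-source.R: the target is
# gain = depdelay - arrdelay, modeled on distance, departure delay, and carrier
model_partition <- model_data %>%
  sdf_partition(train = 0.8, valid = 0.2, seed = 5555)

ml1 <- model_partition$train %>%
  ml_linear_regression(gain ~ distance + depdelay + uniquecarrier)
```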
45 | 46 | 47 | Row 48 | ----------------------------------------------------------------------- 49 | 50 | ### Observed versus predicted time gain 51 | 52 | ```{r} 53 | # Aggregate time gain by carrier and by route 54 | plot_data <- reactive({ 55 | req(input$origin, input$dest) 56 | pred_data %>% 57 | filter(origin==input$origin & dest==input$dest) %>% 58 | ungroup() %>% 59 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 60 | }) 61 | 62 | # Plot observed versus predicted time gain for carriers and route 63 | renderPlot({ 64 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 65 | geom_bar(stat = "identity", fill = '#2780E3') + 66 | geom_point(aes(factor(airline), avg_gain)) + 67 | coord_flip() + 68 | labs(x = "", y = "Time gained in flight (minutes)") + 69 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 70 | }) 71 | ``` 72 | 73 | ### Route 74 | 75 | ```{r} 76 | # Identify origin lat and long 77 | origin <- reactive({ 78 | req(input$origin) 79 | filter(airports, faa == input$origin) 80 | }) 81 | 82 | # Identify destination lat and long 83 | dest <- reactive({ 84 | req(input$dest) 85 | filter(airports, faa == input$dest) 86 | }) 87 | 88 | # Plot route 89 | renderLeaflet({ 90 | gcIntermediate( 91 | select(origin(), lon, lat), 92 | select(dest(), lon, lat), 93 | n=100, addStartEnd=TRUE, sp=TRUE 94 | ) %>% 95 | leaflet() %>% 96 | addProviderTiles("CartoDB.Positron") %>% 97 | addPolylines() 98 | }) 99 | ``` 100 | 101 | Row 102 | ----------------------------------------------------------------------- 103 | 104 | ### Data details 105 | 106 | ```{r} 107 | # Print table of observed and predicted gains by airline 108 | renderDataTable( 109 | datatable(plot_data()) %>% 110 | formatRound(c("flights", "distance"), 0) %>% 111 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 112 | ) 113 | ``` 114 | 115 | Model Details 116 | ======================================================================== 117 | 118 | ```{r} 119 | renderPrint(ml1_summary) 120 | ``` 121 | -------------------------------------------------------------------------------- /dev/flights-cdh/sparkClusterDemo-source.R: -------------------------------------------------------------------------------- 1 | 2 | library(sparklyr) 3 | library(dplyr) 4 | library(ggplot2) 5 | 6 | Sys.setenv(HADOOP_CONF_DIR='/etc/hadoop/conf.cloudera.hdfs') 7 | Sys.setenv(YARN_CONF_DIR='/etc/hadoop/conf.cloudera.yarn') 8 | #Sys.setenv(SPARK_HOME="/home/ubuntu/spark-1.6.0") 9 | #Sys.setenv(SPARK_HOME_VERSION="1.6.0") 10 | 11 | sc <- spark_connect(master = "yarn-client", version="1.6.0", spark_home = '/opt/cloudera/parcels/CDH/lib/spark/') 12 | 13 | #--------------------------------------------------------- 14 | 15 | # Cache flights Hive table into Spark 16 | tbl_cache(sc, 'flights') 17 | flights_tbl <- tbl(sc, 'flights') 18 | 19 | # Cache airlines Hive table into Spark 20 | tbl_cache(sc, 'airlines') 21 | airlines_tbl <- tbl(sc, 'airlines') 22 | 23 | # Cache airports Hive table into Spark 24 | tbl_cache(sc, 'airports') 25 | airports_tbl <- tbl(sc, 'airports') 26 | 27 | #--------------------------------------------------------- 28 | 29 | # Filter records and create target variable 'gain' 30 | model_data <- flights_tbl %>% 31 | filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>% 32 | filter(depdelay > 15 & depdelay < 240) %>% 33 | filter(arrdelay > -60 & arrdelay < 360) %>% 34 | filter(year >= 2003 & year <= 2007) %>% 35 | left_join(airlines_tbl, by =
c("uniquecarrier" = "code")) %>% 36 | mutate(gain = depdelay - arrdelay) %>% 37 | select(year, month, arrdelay, depdelay, distance, uniquecarrier, description, gain) 38 | 39 | # Summarize data by carrier 40 | model_data %>% 41 | group_by(uniquecarrier) %>% 42 | summarize(description = min(description), gain=mean(gain), 43 | distance=mean(distance), depdelay=mean(depdelay)) %>% 44 | select(description, gain, distance, depdelay) %>% 45 | arrange(gain) 46 | 47 | #--------------------------------------------------------- 48 | 49 | # Partition the data into training and validation sets 50 | model_partition <- model_data %>% 51 | sdf_partition(train = 0.8, valid = 0.2, seed = 5555) 52 | 53 | # Fit a linear model 54 | ml1 <- model_partition$train %>% 55 | ml_linear_regression(gain ~ distance + depdelay + uniquecarrier) 56 | 57 | # Summarize the linear model 58 | summary(ml1) 59 | 60 | #--------------------------------------------------------- 61 | 62 | # Calculate average gains by predicted decile 63 | model_deciles <- lapply(model_partition, function(x) { 64 | sdf_predict(ml1, x) %>% 65 | mutate(decile = ntile(desc(prediction), 10)) %>% 66 | group_by(decile) %>% 67 | summarize(gain = mean(gain)) %>% 68 | select(decile, gain) %>% 69 | collect() 70 | }) 71 | 72 | # Create a summary dataset for plotting 73 | deciles <- rbind( 74 | data.frame(data = 'train', model_deciles$train), 75 | data.frame(data = 'valid', model_deciles$valid), 76 | make.row.names = FALSE 77 | ) 78 | 79 | # Plot average gains by predicted decile 80 | deciles %>% 81 | ggplot(aes(factor(decile), gain, fill = data)) + 82 | geom_bar(stat = 'identity', position = 'dodge') + 83 | labs(title = 'Average gain by predicted decile', x = 'Decile', y = 'Minutes') 84 | 85 | #--------------------------------------------------------- 86 | 87 | # Select data from an out of time sample 88 | data_2008 <- flights_tbl %>% 89 | filter(!is.na(arrdelay) & !is.na(depdelay) & !is.na(distance)) %>% 90 | filter(depdelay > 15 & depdelay < 240) %>% 91 | filter(arrdelay > -60 & arrdelay < 360) %>% 92 | filter(year == 2008) %>% 93 | left_join(airlines_tbl, by = c("uniquecarrier" = "code")) %>% 94 | mutate(gain = depdelay - arrdelay) %>% 95 | select(year, month, arrdelay, depdelay, distance, uniquecarrier, description, gain, origin,dest) 96 | 97 | # Summarize data by carrier 98 | carrier <- sdf_predict(ml1, data_2008) %>% 99 | group_by(description) %>% 100 | summarize(gain = mean(gain), prediction = mean(prediction), freq = n()) %>% 101 | filter(freq > 10000) %>% 102 | collect 103 | 104 | # Plot actual gains and predicted gains by airline carrier 105 | ggplot(carrier, aes(gain, prediction)) + 106 | geom_point(alpha = 0.75, color = 'red', shape = 3) + 107 | geom_abline(intercept = 0, slope = 1, alpha = 0.15, color = 'blue') + 108 | geom_text(aes(label = substr(description, 1, 20)), size = 3, alpha = 0.75, vjust = -1) + 109 | labs(title='Average Gains Forecast', x = 'Actual', y = 'Predicted') 110 | 111 | #--------------------------------------------------------- 112 | 113 | # Summarize by origin, destination, and carrier 114 | summary_2008 <- sdf_predict(ml1, data_2008) %>% 115 | rename(carrier = uniquecarrier, airline = description) %>% 116 | group_by(origin, dest, carrier, airline) %>% 117 | summarize( 118 | flights = n(), 119 | distance = mean(distance), 120 | avg_dep_delay = mean(depdelay), 121 | avg_arr_delay = mean(arrdelay), 122 | avg_gain = mean(gain), 123 | pred_gain = mean(prediction) 124 | ) 125 | 126 | # Collect and save objects 127 | pred_data <- 
collect(summary_2008) 128 | airports <- collect(select(airports_tbl, name, faa, lat, lon)) 129 | ml1_summary <- capture.output(summary(ml1)) 130 | save(pred_data, airports, ml1_summary, file = 'flights_pred_2008.RData') 131 | 132 | 133 | -------------------------------------------------------------------------------- /dev/flights/flightsApp/app.R: -------------------------------------------------------------------------------- 1 | library(shiny) 2 | 3 | ui <- fluidPage( 4 | 5 | # Application title 6 | titlePanel("Old Faithful Geyser Data"), 7 | 8 | # Sidebar with a slider input for number of bins 9 | sidebarLayout( 10 | sidebarPanel( 11 | sliderInput("bins", 12 | "Number of bins:", 13 | min = 1, 14 | max = 50, 15 | value = 30) 16 | ), 17 | 18 | # Show a plot of the generated distribution 19 | mainPanel( 20 | plotOutput("distPlot") 21 | ) 22 | ) 23 | ) 24 | 25 | server <- function(input, output) { 26 | 27 | output$distPlot <- renderPlot({ 28 | # generate bins based on input$bins from ui.R 29 | x <- faithful[, 2] 30 | bins <- seq(min(x), max(x), length.out = input$bins + 1) 31 | 32 | # draw the histogram with the specified number of bins 33 | hist(x, breaks = bins, col = 'darkgray', border = 'white') 34 | }) 35 | } 36 | 37 | shinyApp(ui = ui, server = server) 38 | 39 | -------------------------------------------------------------------------------- /dev/flights/flightsApp2/global.R: -------------------------------------------------------------------------------- 1 | library(nycflights13) 2 | library(tibble) 3 | library(ggplot2) 4 | library(dplyr) 5 | library(sparklyr) 6 | library(lubridate) 7 | library(MASS) 8 | 9 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 10 | system.time(sc <- spark_connect(master = "yarn-client", version = '2.0.0')) 11 | 12 | # Cache airlines Hive table into Spark 13 | #system.time(tbl_cache(sc, 'airlines')) 14 | 15 | # We use a small subset of airlines in this application 16 | #system.time(airlines_tbl <- tbl(sc, 'airlines')) 17 | #system.time(airlines_tbl <- spark_read_csv(sc, "airlines", "hdfs:///airlines/airlines.csv", memory=TRUE)) 18 | #airlines_r <- airlines_tbl %>% arrange(description) %>% collect 19 | airlines_r <- tibble::tibble( 20 | code = c("B6", "UA", "AA", "DL", "WN", "US"), 21 | description = c("JetBlue Airways","United Air Lines Inc.", 22 | "American Airlines Inc." , "Delta Air Lines Inc.", 23 | "Southwest Airlines Co.","US Airways Inc.") 24 | ) 25 | 26 | # We use the airports from nycflights13 package in this application 27 | # airports_tbl <- copy_to(sc, nycflights13::airports, "airports", overwrite = TRUE) 28 | # airports <- airports_tbl %>% collect 29 | airports <- nycflights13::airports 30 | 31 | # Cache flights Hive table into Spark 32 | #system.time(tbl_cache(sc, 'flights')) 33 | #system.time(flights_tbl <- tbl(sc, 'flights')) 34 | 35 | #Instead of caching the flights data (which takes very long), we load the data in Parquet 36 | #format from HDFS. First the following 2 commented lines must be run to save the data. 
37 | #system.time(flights_tbl <- tbl(sc, 'flights')) 38 | #system.time(spark_write_parquet(flights_tbl, "hdfs:///flights-parquet-all")) 39 | system.time(flights_tbl <- spark_read_parquet(sc, "flights_s", "hdfs:///flights-parquet-all", memory=FALSE)) 40 | 41 | years <- tibble::tibble(year = c(1987:2008)) 42 | years_sub <- tibble::tibble(year = c(1999:2008)) 43 | dests <- c("LAX","ORD","ATL","HNL") 44 | 45 | delay <- flights_tbl %>% 46 | group_by(tailnum) %>% 47 | summarise(count = n(), 48 | dist = mean(distance), 49 | delay = mean(arrdelay), 50 | arrdelay_mean = mean(arrdelay), 51 | depdelay_mean = mean(depdelay)) %>% 52 | filter(count > 20, 53 | dist < 2000, 54 | !is.na(delay)) %>% 55 | collect 56 | 57 | -------------------------------------------------------------------------------- /dev/flights/flightsApp2/server.R: -------------------------------------------------------------------------------- 1 | library(shinydashboard) 2 | library(dplyr) 3 | library(maps) 4 | library(geosphere) 5 | library(lubridate) 6 | library(MASS) 7 | 8 | source("global.R") 9 | 10 | function(input, output, session) { 11 | 12 | selected_carriers <- reactive(input$airline_selections) 13 | selected_density <- reactive(input$density_selection) 14 | selected_year <- reactive(input$years_selection) 15 | selected_airline <- reactive(filter(airlines_r, description==input$carrier_selection)) 16 | selected_carrier <- reactive(selected_airline()$code) 17 | selected_dest_year <- reactive(input$years_dest_selection) 18 | selected_cancel_year <- reactive(input$years_cancel_selection) 19 | selected_day_year <- reactive(input$day_selection) 20 | 21 | output$yearsPlot <- renderPlot ({ 22 | xlim <- c(-171.738281, -56.601563) 23 | ylim <- c(12.039321, 71.856229) 24 | pal <- colorRampPalette(c("#f2f2f2", "red")) 25 | colors <- pal(100) 26 | map("world", col="#f2f2f2", fill=TRUE, bg="black", lwd=0.05, xlim=xlim, ylim=ylim) 27 | #map("world", col="#191919", fill=TRUE, bg="#000000", lwd=0.05, xlim=xlim, ylim=ylim) 28 | year_selected = selected_year() 29 | flights_count <- flights_tbl %>% filter(year == year_selected) %>% 30 | group_by(uniquecarrier, origin, dest) %>% 31 | summarize( count = n()) %>% 32 | collect 33 | flights_count$count <- unlist(flights_count$count) 34 | fsub <- filter(flights_count, uniquecarrier == selected_carrier(), count > 200) 35 | fsub <- fsub[order(fsub$count),] 36 | maxcnt <- max(fsub$count) 37 | for (j in 1:length(fsub$uniquecarrier)) { 38 | air1 <- airports[airports$faa == fsub[j,]$origin,] 39 | air2 <- airports[airports$faa == fsub[j,]$dest,] 40 | if (dim(air1)[1] != 0 & dim(air2)[1] != 0) { 41 | inter <- gcIntermediate(c(air1[1,]$lon, air1[1,]$lat), c(air2[1,]$lon, air2[1,]$lat), n=100, addStartEnd=TRUE) 42 | colindex <- round( (fsub[j,]$count / maxcnt) * length(colors) ) 43 | 44 | lines(inter, col=colors[colindex], lwd=0.8) 45 | lines(inter, col="black", lwd=0.8) 46 | } 47 | } 48 | 49 | }) 50 | 51 | output$densityPlot <- renderPlot ({ 52 | r <- ggplot(delay, aes_string("dist", selected_density())) + 53 | geom_point(aes(size = count), alpha = 1/2) + 54 | geom_smooth() + 55 | scale_size_area(max_size = 2) 56 | print(r) 57 | }) 58 | 59 | output$destPlot <- renderPlot ({ 60 | year_selected <- selected_dest_year() 61 | flights_by_dest <- flights_tbl %>% filter(year == year_selected) %>% 62 | filter(dest %in% dests) %>% 63 | group_by(dest, dayofweek, month, uniquecarrier) %>% 64 | select(dest, dayofweek, month, uniquecarrier) %>% 65 | collect 66 | d <- ggplot(data = flights_by_dest, aes(x = month, fill=dest)) 
+ stat_density() 67 | r <- ggplot(data = flights_by_dest) + 68 | geom_bar(mapping = aes(x = month, fill = dest), position = "dodge") 69 | print(d) 70 | }) 71 | 72 | output$cancelPlot <- renderPlot ({ 73 | c_year_selected <- selected_cancel_year() 74 | flights_cancelled <- flights_tbl %>% 75 | filter(year == c_year_selected) %>% 76 | group_by(dest, month, cancelled) %>% 77 | summarise( 78 | count = n(), 79 | delay = mean(arrdelay, na.rm = TRUE), 80 | arrdelay_mean = mean(arrdelay, na.rm = TRUE), 81 | depdelay_mean = mean(depdelay, na.rm = TRUE) 82 | ) %>% 83 | filter(count > 20, dest != "HNL", cancelled == 1) %>% 84 | collect 85 | 86 | c <- ggplot(flights_cancelled, aes_string("month", "count")) + 87 | geom_point(alpha = 1/2, position = "jitter") + 88 | geom_smooth() + 89 | scale_size_area(max_size = 2) 90 | print(c) 91 | }) 92 | 93 | output$dayPlot <- renderPlot ({ 94 | year_day_selected <- selected_day_year() 95 | flights_by_year <- flights_tbl %>% 96 | filter(year == year_day_selected, dest %in% dests) %>% 97 | group_by(year, month, dayofmonth, dest) %>% 98 | summarise(n = n()) %>% 99 | collect 100 | 101 | daily <- flights_by_year %>% 102 | mutate(date = make_datetime(year, month, dayofmonth)) %>% 103 | group_by(date) 104 | 105 | daily <- daily %>% 106 | mutate(wday = wday(date, label = TRUE)) 107 | 108 | d <- ggplot(daily, aes(wday, n, color=dest)) + 109 | geom_boxplot() 110 | print(d) 111 | }) 112 | } -------------------------------------------------------------------------------- /dev/flights/flightsApp2/ui.R: -------------------------------------------------------------------------------- 1 | library(shinydashboard) 2 | 3 | header <- dashboardHeader( 4 | title = "Flights Data Analysis" 5 | ) 6 | sidebar <- dashboardSidebar( 7 | sidebarMenu( 8 | menuItem("Flights by year and airline", tabName = "years"), 9 | menuItem("Delay Density", tabName = "delay_density"), 10 | menuItem("Cancelled flights", tabName = "cancelled"), 11 | menuItem("Flights by day of week", tabName = "dayofweek") 12 | ) 13 | ) 14 | 15 | 16 | body <- dashboardBody( 17 | tabItems( 18 | tabItem("years", 19 | fluidRow( 20 | column(width = 8, 21 | box(width = NULL, solidHeader = TRUE, 22 | plotOutput('yearsPlot') 23 | ) 24 | ), 25 | column(width = 3, 26 | box(width = NULL, status = "warning", 27 | uiOutput("years_selection"), 28 | radioButtons("years_selection", label = h3("Select a year"), 29 | years_sub$year, selected = 2000) 30 | ) 31 | ), 32 | column(width = 3, 33 | box(width = NULL, status = "warning", 34 | uiOutput("carrier_selection"), 35 | radioButtons("carrier_selection", label = h3("Select an airline"), 36 | airlines_r$description, selected = "American Airlines Inc.") 37 | ) 38 | ) 39 | 40 | ) 41 | ), 42 | tabItem("delay_density", 43 | fluidRow( 44 | column(width = 9, 45 | box(width = NULL, solidHeader = TRUE, 46 | plotOutput('densityPlot') 47 | ) 48 | ), 49 | column(width = 3, 50 | box(width = NULL, status = "warning", 51 | uiOutput("density_selection"), 52 | radioButtons("density_selection", label = h3("Select arrival or departure"), 53 | choices = c( 54 | Departure = "depdelay_mean", 55 | Arrival = "arrdelay_mean" 56 | ), 57 | selected = "arrdelay_mean") 58 | ) 59 | ) 60 | 61 | ) 62 | ), 63 | tabItem("cancelled", 64 | fluidRow( 65 | column(width = 9, 66 | box(width = NULL, solidHeader = TRUE, 67 | plotOutput('cancelPlot') 68 | ) 69 | ), 70 | column(width = 3, 71 | box(width = NULL, status = "warning", 72 | uiOutput("years_cancel_selection"), 73 | radioButtons("years_cancel_selection", label = h3("Select a
year"), 74 | years_sub$year, selected = 2008) 75 | ) 76 | ) 77 | ) 78 | ), 79 | tabItem("dayofweek", 80 | fluidRow( 81 | column(width = 9, 82 | box(width = NULL, solidHeader = TRUE, 83 | plotOutput('dayPlot') 84 | ) 85 | ), 86 | column(width = 3, 87 | box(width = NULL, status = "warning", 88 | uiOutput("day_selection"), 89 | radioButtons("day_selection", label = h3("Select a year"), 90 | years_sub$year, selected = 2008) 91 | ) 92 | ) 93 | ) 94 | ) 95 | 96 | ) 97 | ) 98 | 99 | dashboardPage( 100 | header, 101 | sidebar, 102 | body 103 | ) -------------------------------------------------------------------------------- /dev/flights/flights_pred_2008.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/flights_pred_2008.RData -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsClusterConnect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsClusterConnect.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsCreateCluster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsCreateCluster.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsCreateCluster2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsCreateCluster2.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsNewSecurityGroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsNewSecurityGroup.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsSecurityGroup.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsSecurityGroup.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/awsSecurityGroup2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/awsSecurityGroup2.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrArchitecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrArchitecture.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrConfigStep1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrConfigStep1.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrConfigStep2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrConfigStep2.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrConfigStep3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrConfigStep3.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrConfigStep4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrConfigStep4.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/emrLogin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/emrLogin.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/flightsDashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/flightsDashboard.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/flightsDeciles.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/flightsDeciles.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/flightsDecilesDesc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/flightsDecilesDesc.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/flightsPredicted.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/flightsPredicted.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudio.png -------------------------------------------------------------------------------- 
/dev/flights/images/clusterDemo/rstudioData.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioData.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudioLogin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioLogin.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudioModel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioModel.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudioModelDetail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioModelDetail.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/rstudioSparkPane.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/rstudioSparkPane.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/workflow.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/workflowCommands.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/workflowCommands.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/workflowRSC.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/workflowRSC.png -------------------------------------------------------------------------------- /dev/flights/images/clusterDemo/workflowShare.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/flights/images/clusterDemo/workflowShare.png -------------------------------------------------------------------------------- /dev/flights/nycflights_flexdashboard.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach 
packages 13 | library(dplyr) 14 | library(ggplot2) 15 | library(DT) 16 | library(leaflet) 17 | library(geosphere) 18 | load('flights_pred_2008.RData') 19 | airports <- mutate(airports, lat = as.numeric(lat), lon = as.numeric(lon)) 20 | ``` 21 | 22 | 23 | Summary 24 | ======================================================================== 25 | 26 | Inputs {.sidebar} 27 | ----------------------------------------------------------------------- 28 | 29 | ### Select Airports 30 | 31 | ```{r} 32 | # Shiny inputs for flight origin and destination 33 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 34 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 35 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 36 | selectInput("dest", "Flight destination", carrier_dest, selected = "LAS") 37 | ``` 38 | 39 | ### Background 40 | 41 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 42 | your airline carrier will make up time en route? Some of the most significant factors 43 | for making up time are flight distance and airline carrier. The data model behind 44 | this dashboard is based on flights from NYC airports in 2013. 45 | 46 | 47 | Row 48 | ----------------------------------------------------------------------- 49 | 50 | ### Observed versus predicted time gain 51 | 52 | ```{r} 53 | # Aggregate time gain by carrier and by route 54 | plot_data <- reactive({ 55 | req(input$origin, input$dest) 56 | pred_data %>% 57 | filter(origin==input$origin & dest==input$dest) %>% 58 | ungroup() %>% 59 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 60 | }) 61 | 62 | # Plot observed versus predicted time gain for carriers and route 63 | renderPlot({ 64 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 65 | geom_bar(stat = "identity", fill = '#2780E3') + 66 | geom_point(aes(factor(airline), avg_gain)) + 67 | coord_flip() + 68 | labs(x = "", y = "Time gained in flight (minutes)") + 69 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 70 | }) 71 | ``` 72 | 73 | ### Route 74 | 75 | ```{r} 76 | # Identify origin lat and long 77 | origin <- reactive({ 78 | req(input$origin) 79 | filter(airports, faa == input$origin) 80 | }) 81 | 82 | # Identify destination lat and long 83 | dest <- reactive({ 84 | req(input$dest) 85 | filter(airports, faa == input$dest) 86 | }) 87 | 88 | # Plot route 89 | renderLeaflet({ 90 | gcIntermediate( 91 | select(origin(), lon, lat), 92 | select(dest(), lon, lat), 93 | n=100, addStartEnd=TRUE, sp=TRUE 94 | ) %>% 95 | leaflet() %>% 96 | addProviderTiles("CartoDB.Positron") %>% 97 | addPolylines() 98 | }) 99 | ``` 100 | 101 | Row 102 | ----------------------------------------------------------------------- 103 | 104 | ### Data details 105 | 106 | ```{r} 107 | # Print table of observed and predicted gains by airline 108 | renderDataTable( 109 | datatable(plot_data()) %>% 110 | formatRound(c("flights", "distance"), 0) %>% 111 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 112 | ) 113 | ``` 114 | 115 | Model Details 116 | ======================================================================== 117 | 118 | ```{r} 119 | renderPrint(ml1_summary) 120 | ``` 121 | -------------------------------------------------------------------------------- /dev/flights/nycflights_flexdashboard_spark.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3
| output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach packages 13 | library(dplyr) 14 | library(ggplot2) 15 | library(DT) 16 | library(leaflet) 17 | library(geosphere) 18 | library(sparklyr) 19 | library(dplyr) 20 | 21 | #Sys.setenv(SPARK_HOME = "/home/sean/.cache/spark/spark-1.6.2-bin-hadoop2.6") 22 | #sc <- spark_connect(master = "local", version = "1.6.2") 23 | #spark_read_csv(sc, "nyc_taxi_sample", path = "../../nathan/sol-eng-nyc-taxi-data/csv/trips/nyc_taxi_trips_2015-11.csv") 24 | 25 | # Connect to Spark 26 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 27 | config <- spark_config() 28 | sc <- spark_connect(master = "yarn-client", config = config, version = '1.6.2') 29 | pred_data_tbl <- tbl(sc, 'summary_2008') 30 | 31 | #load('flights_pred_2008.RData') 32 | #airports <- mutate(airports, lat = as.numeric(lat), lon = as.numeric(lon)) 33 | 34 | # Load summary data from flights forecast 35 | #pred_data_tbl <- tbl(sc, 'summary_2008') 36 | pred_data <- collect(pred_data_tbl) 37 | 38 | # Load airports data 39 | airports <- tbl(sc, 'airports') %>% 40 |   mutate(lat = as.numeric(lat), lon = as.numeric(lon)) %>% 41 |   collect 42 | ``` 43 | 44 | 45 | Summary 46 | ======================================================================== 47 | 48 | Inputs {.sidebar} 49 | ----------------------------------------------------------------------- 50 | 51 | ### Select Airports 52 | 53 | ```{r} 54 | # Shiny inputs for flight origin and destination 55 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 56 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 57 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 58 | selectInput("dest", "Flight destination", carrier_dest, selected = "LAS") 59 | ``` 60 | 61 | ### Background 62 | 63 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 64 | your airline carrier will make up time en route? Some of the most significant factors 65 | for making up time are flight distance and airline carrier. The data model behind 66 | this dashboard is based on flights from NYC airports in 2013.
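Because every reactive below queries `summary_2008` through this connection, it may be worth pinning the table in cluster memory once during setup. A minimal sketch, assuming the table fits in executor memory (`tbl_cache()` is the same sparklyr call used in this repo's taxi demos):

```{r, eval=FALSE}
# Optionally cache the summary table so repeated dplyr queries
# from the reactives do not rescan the underlying files
tbl_cache(sc, "summary_2008")
```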
67 | 68 | 69 | Row 70 | ----------------------------------------------------------------------- 71 | 72 | ### Observed versus predicted time gain 73 | 74 | ```{r} 75 | # Aggregate time gain by carrier and by route 76 | plot_data <- reactive({ 77 | req(input$origin, input$dest) 78 | pred_data %>% 79 | filter(origin==input$origin & dest==input$dest) %>% 80 | ungroup() %>% 81 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 82 | }) 83 | 84 | # Plot observed versus predicted time gain for carriers and route 85 | renderPlot({ 86 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 87 | geom_bar(stat = "identity", fill = '#2780E3') + 88 | geom_point(aes(factor(airline), avg_gain)) + 89 | coord_flip() + 90 | labs(x = "", y = "Time gained in flight (minutes)") + 91 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 92 | }) 93 | ``` 94 | 95 | ### Route 96 | 97 | ```{r} 98 | # Identify origin lat and long 99 | origin <- reactive({ 100 | req(input$origin) 101 | filter(airports, faa == input$origin) 102 | }) 103 | 104 | # Identify destination lat and long 105 | dest <- reactive({ 106 | req(input$dest) 107 | filter(airports, faa == input$dest) 108 | }) 109 | 110 | # Plot route 111 | renderLeaflet({ 112 | gcIntermediate( 113 | select(origin(), lon, lat), 114 | select(dest(), lon, lat), 115 | n=100, addStartEnd=TRUE, sp=TRUE 116 | ) %>% 117 | leaflet() %>% 118 | addProviderTiles("CartoDB.Positron") %>% 119 | addPolylines() 120 | }) 121 | ``` 122 | 123 | Row 124 | ----------------------------------------------------------------------- 125 | 126 | ### Data details 127 | 128 | ```{r} 129 | # Print table of observed and predicted gains by airline 130 | renderDataTable( 131 | datatable(plot_data()) %>% 132 | formatRound(c("flights", "distance"), 0) %>% 133 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 134 | ) 135 | ``` 136 | 137 | -------------------------------------------------------------------------------- /dev/flights/recode_for_prediction.R: -------------------------------------------------------------------------------- 1 | #data_2008 %>% group_by(crsarrtime) %>% summarize(freq = n()) %>% arrange(desc(freq)) 2 | #mutate(uniquecarrier = ifelse(crsarrtime == 351, "DH", uniquecarrier)) %>% 3 | #mutate(uniquecarrier = ifelse(crsarrtime == 120, "HP", uniquecarrier)) %>% 4 | #mutate(uniquecarrier = ifelse(crsarrtime == 347, "TZ", uniquecarrier)) %>% 5 | -------------------------------------------------------------------------------- /dev/h2o-demo/emr_h2o_setup.sh: -------------------------------------------------------------------------------- 1 | ### Build EMR for H2O 2 | ### Nathan Stephens 3 | ### 1/28/2017 4 | 5 | ########################################### 6 | ### Run as root 7 | ########################################### 8 | 9 | ## RSP 10 | 11 | # Update 12 | sudo yum update 13 | 14 | # R 15 | sudo yum install -y R libcurl-devel openssl-devel git 16 | 17 | # install RSP 18 | wget -q https://download2.rstudio.org/current.ver -O /tmp/rsp.current.ver 19 | wget -O /tmp/rstudio-server-rhel.rpm https://s3.amazonaws.com/rstudio-dailybuilds/rstudio-server-rhel-pro-$(cat /tmp/rsp.current.ver)-x86_64.rpm 20 | sudo yum install -y --nogpgcheck /tmp/rstudio-server-rhel.rpm 21 | 22 | # install packages 23 | sudo Rscript -e 'install.packages("sparklyr", repos = "http://cran.rstudio.com/")' 24 | sudo Rscript -e 'install.packages("devtools", repos = "http://cran.rstudio.com/")' 25 | sudo Rscript -e 'install.packages("tidyverse",
repos = "http://cran.rstudio.com/")' 26 | sudo Rscript -e 'install.packages("leaflet", repos = "http://cran.rstudio.com/")' 27 | sudo Rscript -e 'install.packages("DT", repos = "http://cran.rstudio.com/")' 28 | 29 | ########################################### 30 | 31 | ## add rstudio directory 32 | 33 | hadoop fs -mkdir /user/rstudio 34 | hadoop fs -chown rstudio:rstudio /user/rstudio 35 | 36 | ## Add rstudio user 37 | 38 | sudo useradd -m rstudio 39 | sudo echo rstudio | passwd rstudio --stdin 40 | sudo usermod -a -G hadoop rstudio 41 | sudo usermod -a -G hive rstudio 42 | 43 | 44 | ########################################### 45 | ### Run as rstudio 46 | ########################################### 47 | 48 | ## switch user 49 | su rstudio 50 | cd ~ 51 | 52 | ## clone project 53 | git clone https://github.com/rstudio/sparkDemos.git /home/rstudio/sparkDemos 54 | cat >/home/rstudio/sparkDemos/sparkDemos.Rproj <> nyct2010.log & 73 | nohup /usr/bin/s3-dist-cp --src=s3n://***/nyc-taxi/parquet_nohead/trips --dest=hdfs:///user/rstudio/trips_par >> trips_par.log & 74 | nohup /usr/bin/s3-dist-cp --src=s3n://***/nyc-taxi/parquet/trips_model_data --dest=hdfs:///user/rstudio/trips_model_data >> trips_model_data.log & 75 | 76 | 77 | ########################################### 78 | ### Open Hive 79 | ########################################### 80 | 81 | hive 82 | 83 | # Hive 1 84 | 85 | CREATE EXTERNAL TABLE IF NOT EXISTS nyct2010( 86 | gid int, 87 | ctlabel float, 88 | borocode int, 89 | boroname string, 90 | ct2010 int, 91 | boroct2010 int, 92 | cdeligibil string, 93 | ntacode string, 94 | ntaname string, 95 | puma int) 96 | ROW FORMAT DELIMITED 97 | FIELDS TERMINATED BY ',' 98 | LINES TERMINATED BY '\n' 99 | ; 100 | 101 | LOAD DATA INPATH '/user/rstudio/nyct2010' INTO TABLE nyct2010; 102 | 103 | # Hive 3 104 | 105 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_par( 106 | id int, 107 | cab_type_id int, 108 | vendor_id string, 109 | pickup_datetime timestamp, 110 | dropoff_datetime timestamp, 111 | store_and_fwd_flag string, 112 | rate_code_id string, 113 | pickup_longitude float, 114 | pickup_latitude float, 115 | dropoff_longitude float, 116 | dropoff_latitude float, 117 | passenger_count bigint, 118 | trip_distance float, 119 | fare_amount float, 120 | extra bigint, 121 | mta_tax string, 122 | tip_amount float, 123 | tolls_amount float, 124 | ehail_fee string, 125 | improvement_surcharge string, 126 | total_amount float, 127 | payment_type string, 128 | trip_type string, 129 | pickup_nyct2010_gid int, 130 | dropoff_nyct2010_gid int) 131 | stored as parquet; 132 | 133 | LOAD DATA INPATH '/user/rstudio/trips_par' INTO TABLE trips_par; 134 | 135 | 136 | # Hive 3 137 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_model_data( 138 | pickup_datetime timestamp, 139 | pickup_latitude float, 140 | pickup_longitude float, 141 | pickup_nyct2010_gid int, 142 | pickup_boro string, 143 | pickup_nta string, 144 | dropoff_datetime timestamp, 145 | dropoff_latitude float, 146 | dropoff_longitude float, 147 | dropoff_nyct2010_gid int, 148 | dropoff_boro string, 149 | dropoff_nta string, 150 | cab_type string, 151 | passenger_count bigint, 152 | trip_distance float, 153 | pay_type string, 154 | fare_amount float, 155 | tip_amount float, 156 | other_amount float, 157 | total_amount float) 158 | stored as parquet; 159 | 160 | LOAD DATA INPATH '/user/rstudio/trips_model_data' INTO TABLE trips_model_data; 161 | 162 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oHadoop.Rmd: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: "Run H2O on Hadoop" 3 | output: html_notebook 4 | --- 5 | 6 | ```{bash} 7 | wget http://h2o-release.s3.amazonaws.com/h2o/rel-turnbull/2/h2o-3.10.1.2-hdp2.4.zip 8 | unzip h2o-3.10.1.2-hdp2.4.zip 9 | cd h2o-3.10.1.2-hdp2.4 10 | hadoop jar h2odriver.jar -nodes 4 -mapperXmx 6g -output hdfsOutputDirName3 11 | ``` 12 | 13 | ```{r} 14 | library(h2o) 15 | h2o.init("10.233.190.198") 16 | h2o.clusterStatus() 17 | ``` 18 | 19 | ```{r} 20 | write.table(iris, "iris.csv", quote = F, col.names = T, row.names = F, sep = ",") 21 | data <- h2o.importFile("iris.csv") 22 | data <- h2o.importFile("https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data") 23 | data 24 | ``` 25 | 26 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oModels.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "H2O Models" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | #devtools::install_github("rstudio/sparklyr") # used for sample_n 8 | ``` 9 | 10 | ```{r connect, message=FALSE, warning=FALSE} 11 | # Load libraries 12 | library(sparklyr) 13 | library(tidyverse) 14 | library(leaflet) 15 | library(rsparkling) 16 | library(h2o) 17 | library(DT) 18 | 19 | # Set environ vars 20 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 21 | 22 | options(rsparkling.sparklingwater.version = '2.0.3') 23 | 24 | # Configure cluster (c3.4xlarge 30G 16core 320disk) 25 | conf <- spark_config() 26 | conf$'sparklyr.shell.executor-memory' <- "20g" 27 | conf$'sparklyr.shell.driver-memory' <- "20g" 28 | conf$spark.executor.cores <- 16 29 | conf$spark.executor.memory <- "20G" 30 | conf$spark.yarn.am.cores <- 16 31 | conf$spark.yarn.am.memory <- "20G" 32 | conf$spark.executor.instances <- 4 33 | conf$spark.dynamicAllocation.enabled <- "false" 34 | conf$maximizeResourceAllocation <- "true" 35 | conf$spark.default.parallelism <- 32 36 | 37 | # Connect to cluster 38 | sc <- spark_connect(master = "yarn-client", config = conf, version = '2.0.0') 39 | 40 | # Check H2O 41 | h2o_context(sc) 42 | ``` 43 | 44 | ```{r} 45 | # Table ref 46 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 47 | model_tbl <- trips_model_data_tbl %>% 48 | filter(fare_amount > 0 & fare_amount < 20) %>% 49 | filter(tip_amount >= 0 & tip_amount < 5) %>% 50 | filter(passenger_count > 0 & passenger_count < 2) %>% 51 | select(tip_amount, fare_amount, pay_type, cab_type, passenger_count) 52 | trips_train_tbl <- sdf_register(model_tbl, "model_tbl") 53 | #tbl_cache(sc, "model_tbl") 54 | ``` 55 | 56 | ```{r convert} 57 | model_h2o_tbl <- as_h2o_frame(sc, trips_train_tbl) 58 | m2 <- h2o.glm(c("fare_amount", "pay_type", "cab_type", "passenger_count"), "tip_amount", model_h2o_tbl, alpha=0, lambda=0) 59 | summary(m2) 60 | 61 | #m3 <- h2o.deeplearning(c("fare_amount", "pay_type", "cab_type", "passenger_count"), "tip_amount", training_frame = model_h2o_tbl) 62 | #summary(m3) 63 | 64 | ``` 65 | 66 | ```{r model} 67 | model_formula <- formula(tip_amount ~ fare_amount + pay_type + cab_type + passenger_count) 68 | m1 <- ml_linear_regression(trips_train_tbl, model_formula) 69 | summary(m1) 70 | ``` 71 | 72 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oSetup.R: -------------------------------------------------------------------------------- 1 | ### rsparkling hello world 2 | ### requires R packages: statmod, RCurl, and devtools 3 | 4 | 
install.packages("h2o", type = "source", repos = "http://h2o-release.s3.amazonaws.com/h2o/rel-turnbull/2/R") 5 | install.packages("rsparkling") 6 | 7 | library(rsparkling) 8 | library(sparklyr) 9 | library(dplyr) 10 | library(h2o) 11 | 12 | options(rsparkling.sparklingwater.version = "2.0.3") 13 | 14 | conf <- spark_config() 15 | conf$'sparklyr.shell.executor-memory' <- "20g" 16 | conf$'sparklyr.shell.driver-memory' <- "20g" 17 | conf$spark.executor.cores <- 16 18 | conf$spark.executor.memory <- "20G" 19 | conf$spark.yarn.am.cores <- 16 20 | conf$spark.yarn.am.memory <- "20G" 21 | conf$spark.dynamicAllocation.enabled <- "false" 22 | 23 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 24 | sc <- spark_connect(master = "yarn-client", config = conf, version = "2.0.0") 25 | 26 | mtcars_tbl <- copy_to(sc, mtcars, overwrite = TRUE) 27 | mtcars_hf <- as_h2o_frame(sc, mtcars_tbl) 28 | 29 | glm_model <- h2o.glm(x = c("wt", "cyl"), 30 | y = "mpg", 31 | training_frame = mtcars_hf, 32 | lambda_search = TRUE) 33 | summary(glm_model) 34 | 35 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oSetup.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Set Up H2O" 3 | output: html_notebook 4 | --- 5 | 6 | ## Install 7 | 8 | ```{r} 9 | # Remove previous versions of h2o R package 10 | if ("package:h2o" %in% search()) detach("package:h2o", unload=TRUE) 11 | if ("h2o" %in% rownames(installed.packages())) remove.packages("h2o") 12 | 13 | # Next, we download R package dependencies 14 | pkgs <- c("methods","statmod","stats","graphics", 15 | "RCurl","jsonlite","tools","utils") 16 | for (pkg in pkgs) { 17 | if (!(pkg %in% rownames(installed.packages()))) install.packages(pkg) 18 | } 19 | 20 | # Download h2o package version 3.10.0.6 21 | install.packages("h2o", type = "source", 22 | repos = "http://h2o-release.s3.amazonaws.com/h2o/rel-turing/6/R") 23 | 24 | library(devtools) 25 | devtools::install_github("h2oai/rsparkling", ref = "stable") 26 | 27 | #spark_install(version = "1.6.0") # for local (documentation say v1.6.2) 28 | ``` 29 | 30 | ## Test 1 31 | 32 | ```{r} 33 | library(sparklyr) 34 | library(rsparkling) 35 | library(dplyr) 36 | 37 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 38 | #Sys.setenv(JAVA_HOME="/etc/alternatives/jre") 39 | 40 | conf <- spark_config() 41 | conf$'sparklyr.shell.executor-memory' <- "20g" 42 | conf$'sparklyr.shell.driver-memory' <- "20g" 43 | conf$spark.executor.cores <- 16 44 | conf$spark.executor.memory <- "20G" 45 | conf$spark.yarn.am.cores <- 16 46 | conf$spark.yarn.am.memory <- "20G" 47 | conf$spark.dynamicAllocation.enabled <- "false" 48 | options(rsparkling.sparklingwater.version = '1.6.7') 49 | 50 | sc <- spark_connect(master = "yarn-client", config = conf, version = '1.6.0') 51 | airlines_tbl <- tbl(sc, "airlines") 52 | h2oframe <- as_h2o_frame(sc, airlines_tbl) 53 | ``` 54 | 55 | ## Test 2 56 | 57 | ```{r} 58 | library(sparklyr) 59 | library(rsparkling) 60 | library(dplyr) 61 | library(h2o) 62 | 63 | mtcars_tbl <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE) 64 | partitions <- mtcars_tbl %>% 65 | filter(hp >= 100) %>% 66 | mutate(cyl8 = cyl == 8) %>% 67 | sdf_partition(training = 0.5, test = 0.5, seed = 1099) 68 | training <- as_h2o_frame(sc, partitions$training) 69 | test <- as_h2o_frame(sc, partitions$test) 70 | glm_model <- h2o.glm(x = c("wt", "cyl"), 71 | y = "mpg", 72 | training_frame = training, 73 | lambda_search = TRUE) 74 | print(glm_model) 75 | ``` 76 | 77 | ### Test 3 78 
| 79 | ```{r} 80 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 81 | trips_model_data_tbl %>% count 82 | trips_h2o <- as_h2o_frame(sc, trips_model_data_tbl) 83 | 84 | 85 | model_formula <- formula(tip_amount ~ fare_amount + pay_type + cab_type + passenger_count) 86 | m1 <- ml_linear_regression(trips_train_tbl, model_formula) 87 | summary(m1) 88 | 89 | ``` 90 | -------------------------------------------------------------------------------- /dev/h2o-demo/h2oSetup_2_0_0.R: -------------------------------------------------------------------------------- 1 | ### rsparkling hello world 2 | ### requires R packages: statmod, RCurl, and devtools 3 | 4 | install.packages("h2o", type = "source", repos = "http://h2o-release.s3.amazonaws.com/h2o/rel-turnbull/2/R") 5 | install.packages("rsparkling") 6 | 7 | library(rsparkling) 8 | library(sparklyr) 9 | library(dplyr) 10 | library(h2o) 11 | 12 | options(rsparkling.sparklingwater.version = "2.0.3") 13 | 14 | conf <- spark_config() 15 | conf$'sparklyr.shell.executor-memory' <- "20g" 16 | conf$'sparklyr.shell.driver-memory' <- "20g" 17 | conf$spark.executor.cores <- 16 18 | conf$spark.executor.memory <- "20G" 19 | conf$spark.yarn.am.cores <- 16 20 | conf$spark.yarn.am.memory <- "20G" 21 | conf$spark.dynamicAllocation.enabled <- "false" 22 | 23 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 24 | sc <- spark_connect(master = "yarn-client", config = conf, version = "2.0.0") 25 | 26 | mtcars_tbl <- copy_to(sc, mtcars, overwrite = TRUE) 27 | mtcars_hf <- as_h2o_frame(sc, mtcars_tbl) 28 | 29 | glm_model <- h2o.glm(x = c("wt", "cyl"), 30 | y = "mpg", 31 | training_frame = mtcars_hf, 32 | lambda_search = TRUE) 33 | summary(glm_model) 34 | 35 | -------------------------------------------------------------------------------- /dev/h2o-demo/iris.csv: -------------------------------------------------------------------------------- 1 | Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species 2 | 5.1,3.5,1.4,0.2,setosa 3 | 4.9,3,1.4,0.2,setosa 4 | 4.7,3.2,1.3,0.2,setosa 5 | 4.6,3.1,1.5,0.2,setosa 6 | 5,3.6,1.4,0.2,setosa 7 | 5.4,3.9,1.7,0.4,setosa 8 | 4.6,3.4,1.4,0.3,setosa 9 | 5,3.4,1.5,0.2,setosa 10 | 4.4,2.9,1.4,0.2,setosa 11 | 4.9,3.1,1.5,0.1,setosa 12 | 5.4,3.7,1.5,0.2,setosa 13 | 4.8,3.4,1.6,0.2,setosa 14 | 4.8,3,1.4,0.1,setosa 15 | 4.3,3,1.1,0.1,setosa 16 | 5.8,4,1.2,0.2,setosa 17 | 5.7,4.4,1.5,0.4,setosa 18 | 5.4,3.9,1.3,0.4,setosa 19 | 5.1,3.5,1.4,0.3,setosa 20 | 5.7,3.8,1.7,0.3,setosa 21 | 5.1,3.8,1.5,0.3,setosa 22 | 5.4,3.4,1.7,0.2,setosa 23 | 5.1,3.7,1.5,0.4,setosa 24 | 4.6,3.6,1,0.2,setosa 25 | 5.1,3.3,1.7,0.5,setosa 26 | 4.8,3.4,1.9,0.2,setosa 27 | 5,3,1.6,0.2,setosa 28 | 5,3.4,1.6,0.4,setosa 29 | 5.2,3.5,1.5,0.2,setosa 30 | 5.2,3.4,1.4,0.2,setosa 31 | 4.7,3.2,1.6,0.2,setosa 32 | 4.8,3.1,1.6,0.2,setosa 33 | 5.4,3.4,1.5,0.4,setosa 34 | 5.2,4.1,1.5,0.1,setosa 35 | 5.5,4.2,1.4,0.2,setosa 36 | 4.9,3.1,1.5,0.2,setosa 37 | 5,3.2,1.2,0.2,setosa 38 | 5.5,3.5,1.3,0.2,setosa 39 | 4.9,3.6,1.4,0.1,setosa 40 | 4.4,3,1.3,0.2,setosa 41 | 5.1,3.4,1.5,0.2,setosa 42 | 5,3.5,1.3,0.3,setosa 43 | 4.5,2.3,1.3,0.3,setosa 44 | 4.4,3.2,1.3,0.2,setosa 45 | 5,3.5,1.6,0.6,setosa 46 | 5.1,3.8,1.9,0.4,setosa 47 | 4.8,3,1.4,0.3,setosa 48 | 5.1,3.8,1.6,0.2,setosa 49 | 4.6,3.2,1.4,0.2,setosa 50 | 5.3,3.7,1.5,0.2,setosa 51 | 5,3.3,1.4,0.2,setosa 52 | 7,3.2,4.7,1.4,versicolor 53 | 6.4,3.2,4.5,1.5,versicolor 54 | 6.9,3.1,4.9,1.5,versicolor 55 | 5.5,2.3,4,1.3,versicolor 56 | 6.5,2.8,4.6,1.5,versicolor 57 | 5.7,2.8,4.5,1.3,versicolor 58 | 6.3,3.3,4.7,1.6,versicolor 59 | 
4.9,2.4,3.3,1,versicolor 60 | 6.6,2.9,4.6,1.3,versicolor 61 | 5.2,2.7,3.9,1.4,versicolor 62 | 5,2,3.5,1,versicolor 63 | 5.9,3,4.2,1.5,versicolor 64 | 6,2.2,4,1,versicolor 65 | 6.1,2.9,4.7,1.4,versicolor 66 | 5.6,2.9,3.6,1.3,versicolor 67 | 6.7,3.1,4.4,1.4,versicolor 68 | 5.6,3,4.5,1.5,versicolor 69 | 5.8,2.7,4.1,1,versicolor 70 | 6.2,2.2,4.5,1.5,versicolor 71 | 5.6,2.5,3.9,1.1,versicolor 72 | 5.9,3.2,4.8,1.8,versicolor 73 | 6.1,2.8,4,1.3,versicolor 74 | 6.3,2.5,4.9,1.5,versicolor 75 | 6.1,2.8,4.7,1.2,versicolor 76 | 6.4,2.9,4.3,1.3,versicolor 77 | 6.6,3,4.4,1.4,versicolor 78 | 6.8,2.8,4.8,1.4,versicolor 79 | 6.7,3,5,1.7,versicolor 80 | 6,2.9,4.5,1.5,versicolor 81 | 5.7,2.6,3.5,1,versicolor 82 | 5.5,2.4,3.8,1.1,versicolor 83 | 5.5,2.4,3.7,1,versicolor 84 | 5.8,2.7,3.9,1.2,versicolor 85 | 6,2.7,5.1,1.6,versicolor 86 | 5.4,3,4.5,1.5,versicolor 87 | 6,3.4,4.5,1.6,versicolor 88 | 6.7,3.1,4.7,1.5,versicolor 89 | 6.3,2.3,4.4,1.3,versicolor 90 | 5.6,3,4.1,1.3,versicolor 91 | 5.5,2.5,4,1.3,versicolor 92 | 5.5,2.6,4.4,1.2,versicolor 93 | 6.1,3,4.6,1.4,versicolor 94 | 5.8,2.6,4,1.2,versicolor 95 | 5,2.3,3.3,1,versicolor 96 | 5.6,2.7,4.2,1.3,versicolor 97 | 5.7,3,4.2,1.2,versicolor 98 | 5.7,2.9,4.2,1.3,versicolor 99 | 6.2,2.9,4.3,1.3,versicolor 100 | 5.1,2.5,3,1.1,versicolor 101 | 5.7,2.8,4.1,1.3,versicolor 102 | 6.3,3.3,6,2.5,virginica 103 | 5.8,2.7,5.1,1.9,virginica 104 | 7.1,3,5.9,2.1,virginica 105 | 6.3,2.9,5.6,1.8,virginica 106 | 6.5,3,5.8,2.2,virginica 107 | 7.6,3,6.6,2.1,virginica 108 | 4.9,2.5,4.5,1.7,virginica 109 | 7.3,2.9,6.3,1.8,virginica 110 | 6.7,2.5,5.8,1.8,virginica 111 | 7.2,3.6,6.1,2.5,virginica 112 | 6.5,3.2,5.1,2,virginica 113 | 6.4,2.7,5.3,1.9,virginica 114 | 6.8,3,5.5,2.1,virginica 115 | 5.7,2.5,5,2,virginica 116 | 5.8,2.8,5.1,2.4,virginica 117 | 6.4,3.2,5.3,2.3,virginica 118 | 6.5,3,5.5,1.8,virginica 119 | 7.7,3.8,6.7,2.2,virginica 120 | 7.7,2.6,6.9,2.3,virginica 121 | 6,2.2,5,1.5,virginica 122 | 6.9,3.2,5.7,2.3,virginica 123 | 5.6,2.8,4.9,2,virginica 124 | 7.7,2.8,6.7,2,virginica 125 | 6.3,2.7,4.9,1.8,virginica 126 | 6.7,3.3,5.7,2.1,virginica 127 | 7.2,3.2,6,1.8,virginica 128 | 6.2,2.8,4.8,1.8,virginica 129 | 6.1,3,4.9,1.8,virginica 130 | 6.4,2.8,5.6,2.1,virginica 131 | 7.2,3,5.8,1.6,virginica 132 | 7.4,2.8,6.1,1.9,virginica 133 | 7.9,3.8,6.4,2,virginica 134 | 6.4,2.8,5.6,2.2,virginica 135 | 6.3,2.8,5.1,1.5,virginica 136 | 6.1,2.6,5.6,1.4,virginica 137 | 7.7,3,6.1,2.3,virginica 138 | 6.3,3.4,5.6,2.4,virginica 139 | 6.4,3.1,5.5,1.8,virginica 140 | 6,3,4.8,1.8,virginica 141 | 6.9,3.1,5.4,2.1,virginica 142 | 6.7,3.1,5.6,2.4,virginica 143 | 6.9,3.1,5.1,2.3,virginica 144 | 5.8,2.7,5.1,1.9,virginica 145 | 6.8,3.2,5.9,2.3,virginica 146 | 6.7,3.3,5.7,2.5,virginica 147 | 6.7,3,5.2,2.3,virginica 148 | 6.3,2.5,5,1.9,virginica 149 | 6.5,3,5.2,2,virginica 150 | 6.2,3.4,5.4,2.3,virginica 151 | 5.9,3,5.1,1.8,virginica 152 | -------------------------------------------------------------------------------- /dev/h2o-demo/livy.R: -------------------------------------------------------------------------------- 1 | library(sparklyr) 2 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 3 | sc <- spark_connect(master = "yarn-client", version = '2.0.0') 4 | livy_service_start() 5 | livy_service_stop() 6 | -------------------------------------------------------------------------------- /dev/h2o-demo/livy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Connecting to Spark through Livy" 3 | output: html_notebook 4 | --- 5 | 6 | With Livy you can analyze
data in your Spark cluster via R on your desktop. 7 | 8 | ## Livy 9 | 10 | Livy is a service that enables easy interaction with an Apache Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as SparkContext management, all via a simple REST interface or an RPC client library. Livy also simplifies the interaction between Spark and application servers, thus enabling the use of Spark for interactive web/mobile applications. 11 | 12 |
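For orientation, here is a hedged sketch of what those REST calls look like from R. The server URL, port (8998 is Livy's default), and session/statement ids below are illustrative assumptions, not values taken from this repo; sparklyr issues equivalent requests for you under the hood.

```{r, eval=FALSE}
library(httr)

# Open an interactive Spark session on the Livy server
resp <- POST("http://localhost:8998/sessions",
             body = list(kind = "spark"), encode = "json")

# Submit a snippet of Spark code to session 0 ...
POST("http://localhost:8998/sessions/0/statements",
     body = list(code = "sc.parallelize(1 to 10).sum()"), encode = "json")

# ... and poll for its result
GET("http://localhost:8998/sessions/0/statements/0")
```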
13 | ![](http://livy.io/img/livy-architecture.png) 14 |
15 | 16 | ## Start Livy [Server Side] 17 | 18 | Set home environment variables and start a Livy server to handle local requests. 19 | 20 | ```{r, eval=FALSE} 21 | sparklyr::livy_install() 22 | sparklyr::livy_service_start() 23 | ``` 24 | 25 | ## Connect to Spark [Client Side] 26 | 27 | Use `method = "livy"` to connect to the cluster. 28 | 29 | ```{r warning=FALSE} 30 | library(sparklyr) 31 | library(dplyr) 32 | sc <- spark_connect( 33 | master = "http://ec2-107-20-106-40.compute-1.amazonaws.com:8998/", 34 | method = "livy") 35 | ``` 36 | 37 | ## Analyze 38 | 39 | Use R code on your workstation as you normally would. Your R commands will be sent to the cluster via Livy for processing. Collect your results back to the desktop for further processing in R. 40 | 41 | ```{r} 42 | library(ggplot2) 43 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 44 | pickup_dropoff_tbl <- trips_model_data_tbl %>% 45 | filter(pickup_nta == "Turtle Bay-East Midtown" & dropoff_nta == "Airport") %>% 46 | mutate(pickup_hour = hour(pickup_datetime)) %>% 47 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 48 | group_by(pickup_hour) %>% 49 | summarize(n = n(), 50 | trip_time_mean = mean(trip_time), 51 | trip_time_p10 = percentile(trip_time, 0.10), 52 | trip_time_p25 = percentile(trip_time, 0.25), 53 | trip_time_p50 = percentile(trip_time, 0.50), 54 | trip_time_p75 = percentile(trip_time, 0.75), 55 | trip_time_p90 = percentile(trip_time, 0.90)) 56 | 57 | # Collect results 58 | pickup_dropoff <- collect(pickup_dropoff_tbl) 59 | 60 | # Plot 61 | ggplot(pickup_dropoff, aes(x = pickup_hour)) + 62 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 63 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, 64 | alpha = "25–75th percentile")) + 65 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, 66 | alpha = "10–90th percentile")) + 67 | scale_y_continuous("trip duration in seconds") 68 | ``` 69 | 70 | ## Disconnect 71 | 72 | ```{r disconnect} 73 | sparklyr::livy_service_stop() 74 | ``` 75 | 76 | 77 | -------------------------------------------------------------------------------- /dev/h2o-demo/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position =
"none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /dev/h2o-demo/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) { 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) 
{ 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /dev/h2o/01_h2o_setup.R: -------------------------------------------------------------------------------- 1 | library(devtools) 2 | library(sparklyr) 3 | 4 | # Remove previous versions of h2o R package 5 | if ("package:h2o" %in% search()) detach("package:h2o", unload=TRUE) 6 | if ("h2o" %in% rownames(installed.packages())) remove.packages("h2o") 7 | 8 | # Next, we download R package dependencies 9 | pkgs <- c("methods","statmod","stats","graphics", 10 | "RCurl","jsonlite","tools","utils") 11 | for (pkg in pkgs) { 12 | if (!(pkg %in% rownames(installed.packages()))) install.packages(pkg) 13 | } 14 | 15 | # Download h2o package version 3.10.0.6 16 | install.packages("h2o", type = "source", 17 | repos = "http://h2o-release.s3.amazonaws.com/h2o/rel-turing/6/R") 18 | 19 | # Install from github 20 | devtools::install_github("h2oai/sparkling-water", subdir = "/r/rsparkling") 21 | 22 | # Make sure spark is also installed in local mode 23 | spark_install(version = "1.6.2") 24 | -------------------------------------------------------------------------------- /dev/h2o/02_h2o_rsparkling.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Intro to H2O rsparkling" 3 | output: html_notebook 4 | --- 5 | 6 | ## Setup 7 | 8 | ```{r, message=FALSE, warning=FALSE} 9 | library(sparklyr) 10 | library(h2o) 11 | library(rsparkling) 12 | library(dplyr) 13 | library(ggplot2) 14 | 15 | # Connect 16 | sc <- spark_connect("local", version = "1.6.2") 17 | mtcars_tbl <- copy_to(sc, mtcars, "mtcars", overwrite = TRUE) 18 | ``` 19 | 20 | ## Partition into test and training 21 | 22 | ```{r} 23 | # Transform our data set, and then partition into 'training', 'test' 24 | partitions <- mtcars_tbl %>% 25 | filter(hp >= 100) %>% 26 | mutate(cyl8 = cyl == 8) %>% 27 | sdf_partition(training = 0.5, test = 0.5, seed = 1099) 28 | 29 | # Convert to H20 Frame 30 | training <- as_h2o_frame(sc, partitions$training) 31 | test <- as_h2o_frame(sc, partitions$test) 32 | ``` 33 | 34 | ## Train a linear model 35 | 36 | ```{r} 37 | # Fit a linear model to the training dataset 38 | glm_model <- h2o.glm(x = c("wt", "cyl"), 39 | y = "mpg", 40 | training_frame = training, 41 | lambda_search = TRUE) 42 | # Examine model 43 | summary(glm_model) 44 | ``` 45 | 46 | ## Score test data and compare to actuals 47 | 48 | ```{r} 49 | # Compute predicted values on our test dataset 50 | pred <- h2o.predict(glm_model, newdata = test) 51 | 52 | # Extract the true 'mpg' values from our test dataset 53 | actual <- partitions$test %>% 54 | select(mpg) %>% 55 | rename(actual = mpg) 56 | 57 | # Collect the results 58 | data <- data.frame( 59 | collect(as_spark_dataframe(sc, pred)), 60 | collect(actual) 61 | ) 62 | ``` 63 | 64 | ## Plot predicted vs actuals values 65 | 66 | ```{r} 67 | # 
plot predicted vs. actual values 68 | ggplot(data, aes(x = actual, y = predict)) + 69 | geom_abline(lty = "dashed", col = "red") + 70 | geom_point() + 71 | theme(plot.title = element_text(hjust = 0.5)) + 72 | coord_fixed(ratio = 1) + 73 | labs( 74 | x = "Actual Fuel Consumption", 75 | y = "Predicted Fuel Consumption", 76 | title = "Predicted vs. Actual Fuel Consumption" 77 | ) 78 | ``` 79 | 80 | -------------------------------------------------------------------------------- /dev/h2o/03_h2o_ml.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "H2O Sparkling Water Machine Learning" 3 | output: html_notebook 4 | --- 5 | 6 | ## Setup 7 | 8 | ```{r, message=FALSE, warning=FALSE} 9 | library(rsparkling) 10 | library(dplyr) 11 | library(ggplot2) 12 | 13 | sc <- spark_connect("local", version = "1.6.2") 14 | iris_tbl <- copy_to(sc, iris, "iris", overwrite = TRUE) 15 | iris_hf <- as_h2o_frame(sc, iris_tbl) 16 | ``` 17 | 18 | ## K means clustering 19 | 20 | ```{r} 21 | kmeans_model <- h2o.kmeans(training_frame = iris_hf, 22 | x = 3:4, 23 | k = 3, 24 | seed = 1) 25 | h2o.centers(kmeans_model) 26 | h2o.centroid_stats(kmeans_model) 27 | ``` 28 | 29 | ## Logistic 30 | 31 | ```{r} 32 | beaver <- beaver2 33 | beaver$activ <- factor(beaver$activ, labels = c("Non-Active", "Active")) 34 | beaver_hf <- as.h2o(beaver) # Send data from R memory to H2O cluster 35 | 36 | y <- "activ" 37 | x <- setdiff(names(beaver_hf), y) 38 | glm_model <- h2o.glm(x = x, 39 | y = y, 40 | training_frame = beaver_hf, 41 | family = "binomial", 42 | nfolds = 3, 43 | seed = 1) 44 | 45 | h2o.performance(glm_model, xval = TRUE) 46 | ``` 47 | 48 | ## PCA 49 | 50 | ```{r} 51 | pca_model <- h2o.prcomp(training_frame = iris_hf, 52 | x = 1:4, 53 | k = 4, 54 | seed = 1) 55 | print(pca_model) 56 | ``` 57 | 58 | ## Random Forest 59 | 60 | ```{r} 61 | y <- "Species" 62 | x <- setdiff(names(iris_hf), y) 63 | iris_hf[,y] <- as.factor(iris_hf[,y]) 64 | 65 | splits <- h2o.splitFrame(iris_hf, seed = 1) 66 | 67 | rf_model <- h2o.randomForest(x = x, 68 | y = y, 69 | training_frame = splits[[1]], 70 | validation_frame = splits[[2]], 71 | nbins = 32, 72 | max_depth = 5, 73 | ntrees = 20, 74 | seed = 1) 75 | 76 | h2o.confusionMatrix(rf_model, valid = TRUE) 77 | 78 | h2o.varimp_plot(rf_model) 79 | ``` 80 | 81 | ## Gradient Boosted Model 82 | 83 | ```{r} 84 | gbm_model <- h2o.gbm(x = x, 85 | y = y, 86 | training_frame = splits[[1]], 87 | validation_frame = splits[[2]], 88 | ntrees = 20, 89 | max_depth = 3, 90 | learn_rate = 0.01, 91 | col_sample_rate = 0.7, 92 | seed = 1) 93 | 94 | h2o.confusionMatrix(gbm_model, valid = TRUE) 95 | 96 | path <- system.file("extdata", "prostate.csv", 97 | package = "h2o") 98 | 99 | prostate_hf <- h2o.importFile(path) 100 | str(prostate_hf) 101 | head(prostate_hf) 102 | 103 | splits <- h2o.splitFrame(prostate_hf, seed = 1) 104 | ``` 105 | 106 | ## Deep learning 107 | 108 | ```{r} 109 | y <- "VOL" 110 | x <- setdiff(names(prostate_hf), c("ID", y)) 111 | 112 | dl_fit <- h2o.deeplearning(x = x, y = y, 113 | training_frame = splits[[1]], 114 | epochs = 15, 115 | activation = "Rectifier", 116 | hidden = c(10, 5, 10), 117 | input_dropout_ratio = 0.7) 118 | 119 | h2o.performance(dl_fit, newdata = splits[[2]]) 120 | 121 | path <- system.file("extdata", "prostate.csv", package = "h2o") 122 | prostate_hf <- h2o.importFile(path) 123 | splits <- h2o.splitFrame(prostate_hf, seed = 1) 124 | ``` 125 | -------------------------------------------------------------------------------- 
/dev/h2o/04_h2o_grid.R: -------------------------------------------------------------------------------- 1 | ### 2 | 3 | y <- "VOL" 4 | #remove response and ID cols 5 | x <- setdiff(names(prostate_hf), c("ID", y)) 6 | 7 | # GBM hyperparamters 8 | gbm_params1 <- list(learn_rate = c(0.01, 0.1), 9 | max_depth = c(3, 5, 9), 10 | sample_rate = c(0.8, 1.0), 11 | col_sample_rate = c(0.2, 0.5, 1.0)) 12 | 13 | # Train and validate a grid of GBMs 14 | gbm_grid1 <- h2o.grid("gbm", x = x, y = y, 15 | grid_id = "gbm_grid1", 16 | training_frame = splits[[1]], 17 | validation_frame = splits[[1]], 18 | ntrees = 100, 19 | seed = 1, 20 | hyper_params = gbm_params1) 21 | 22 | # Get the grid results, sorted by validation MSE 23 | gbm_gridperf1 <- h2o.getGrid(grid_id = "gbm_grid1", 24 | sort_by = "mse", 25 | decreasing = FALSE) 26 | print(gbm_gridperf1) 27 | 28 | 29 | # GBM hyperparamters 30 | gbm_params2 <- list(learn_rate = seq(0.01, 0.1, 0.01), 31 | max_depth = seq(2, 10, 1), 32 | sample_rate = seq(0.5, 1.0, 0.1), 33 | col_sample_rate = seq(0.1, 1.0, 0.1)) 34 | search_criteria2 <- list(strategy = "RandomDiscrete", 35 | max_models = 50) 36 | 37 | # Train and validate a grid of GBMs 38 | gbm_grid2 <- h2o.grid("gbm", x = x, y = y, 39 | grid_id = "gbm_grid2", 40 | training_frame = splits[[1]], 41 | validation_frame = splits[[2]], 42 | ntrees = 100, 43 | seed = 1, 44 | hyper_params = gbm_params2, 45 | search_criteria = search_criteria2) 46 | 47 | # Get the grid results, sorted by validation MSE 48 | gbm_gridperf2 <- h2o.getGrid(grid_id = "gbm_grid2", 49 | sort_by = "mse", 50 | decreasing = FALSE) 51 | 52 | gbm_gridperf2@summary_table[1,] 53 | 54 | h2o.saveModel(gbm_model, path = "mymodel") 55 | 56 | h2o.download_pojo(gbm_model, path = "mymodel") 57 | -------------------------------------------------------------------------------- /dev/helloworld/derby.log: -------------------------------------------------------------------------------- 1 | ---------------------------------------------------------------- 2 | Mon Sep 19 17:07:57 UTC 2016: 3 | Booting Derby version The Apache Software Foundation - Apache Derby - 10.11.1.1 - (1616546): instance a816c00e-0157-436b-2290-000014d05190 4 | on database directory memory:/home/nathan/spark/sparkDemos/dev/helloworld/databaseName=metastore_db with class loader org.apache.spark.sql.hive.client.IsolatedClientLoader$$anon$1@71526773 5 | Loaded from file:/home/nathan/.cache/spark/spark-2.0.0-bin-hadoop2.7/jars/derby-10.11.1.1.jar 6 | java.vendor=Oracle Corporation 7 | java.runtime.version=1.7.0_85-b01 8 | user.dir=/home/nathan/spark/sparkDemos/dev/helloworld 9 | os.name=Linux 10 | os.arch=amd64 11 | os.version=3.13.0-48-generic 12 | derby.system.home=null 13 | Database Class Loader started - derby.database.classpath='' 14 | -------------------------------------------------------------------------------- /dev/helloworld/helloWorld.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Sparklyr" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(dplyr) 8 | library(sparklyr) 9 | 10 | sc <- spark_connect(master = "local", version = "2.0.0") 11 | iris_tbl <- copy_to(sc, iris, "iris") 12 | 13 | iris_tbl %>% 14 | group_by(Species) %>% 15 | summarize(n1 = as.numeric(n()), n2 = as.numeric(n())) 16 | ``` 17 | -------------------------------------------------------------------------------- /dev/hive/hiveJDBC.R: -------------------------------------------------------------------------------- 1 | #loading libraries 2 | 
library("DBI") 3 | library("rJava") 4 | library("RJDBC") 5 | 6 | #init of the classpath (works with hadoop 2.6 on CDH 5.4 installation) 7 | hivecp = c("/usr/lib/hive/lib/hive-jdbc.jar", "/usr/lib/hadoop/client/hadoop-common.jar", "/usr/lib/hive/lib/libthrift-0.9.2.jar", "/usr/lib/hive/lib/hive-service.jar", "/usr/lib/hive/lib/httpclient-4.2.5.jar", "/usr/lib/hive/lib/httpcore-4.2.5.jar", "/usr/lib/hive/lib/hive-jdbc-standalone.jar") 8 | .jinit(classpath=cp) 9 | 10 | #initialisation de la connexion 11 | drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/usr/lib/hive/lib/hive-jdbc.jar", identifier.quote="`") 12 | conn <- dbConnect(drv, "jdbc:hive2://localhost:10000/default", "myuser", "") 13 | 14 | #working with the connexion 15 | show_databases <- dbGetQuery(conn, "show databases") 16 | show_databases 17 | 18 | library("RJDBC") 19 | options( java.parameters = "-Xmx8g" ) 20 | drv <- JDBC("org.apache.hive.jdbc.HiveDriver", "/usr/lib/hive/lib/hive-jdbc.jar") 21 | conn <- dbConnect(drv, "jdbc:hive2://localhost:10000/default", "rstudio-user", "") 22 | sample_08 <- dbReadTable(conn, "airlines") 23 | 24 | 25 | jdbc:sqlserver://data.rsquaredltd.com\SandP 26 | jdbc:sqlserver://[serverName[\instanceName][:portNumber]][;property=value[;property=value]] 27 | 28 | install unixODBC unixODBC-devel 29 | -------------------------------------------------------------------------------- /dev/hive/hiveMetastore.R: -------------------------------------------------------------------------------- 1 | ### Connect to Spark 2 | library(sparklyr) 3 | library(dplyr) 4 | library(ggplot2) 5 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 6 | config <- spark_config() 7 | sc <- spark_connect(master = "yarn-client", config = config, version = '1.6.2') 8 | 9 | ### Load DBI 10 | library(DBI) 11 | 12 | ### Browse the Hive Metastore 13 | dbGetQuery(sc, "show databases") 14 | dbGetQuery(sc, "show tables in default") 15 | dbGetQuery(sc, "show tables in userdb") 16 | dbGetQuery(sc, "describe userdb.students") 17 | 18 | ### Create a new database, a new table, and insert data 19 | dbGetQuery(sc, "create database newdb") 20 | dbGetQuery(sc, "drop table if exists newdb.pageviews") 21 | dbGetQuery(sc, "create table newdb.pageviews (userid varchar(64), link string, came_from string)") 22 | dbGetQuery(sc, "insert into table newdb.pageviews values ('jsmith', 'mail.com', 'sports.com'), ('jdoe', 'mail.com', null)") 23 | 24 | ### This query does not work from R but works from the command prompt 25 | dbGetQuery(sc, "CREATE TABLE students (name VARCHAR(64), age INT, gpa DECIMAL(3, 2)) CLUSTERED BY (age) INTO 2 BUCKETS STORED AS ORC") 26 | 27 | dbGetQuery(sc, "use newdb") 28 | dbGetQuery(sc, "show tables in newdb") 29 | -------------------------------------------------------------------------------- /dev/hive/hiveMetastore.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Browse Hive Metastore" 3 | output: html_notebook 4 | --- 5 | 6 | ### Connect to Spark 7 | ```{r} 8 | library(sparklyr) 9 | library(dplyr) 10 | library(ggplot2) 11 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 12 | config <- spark_config() 13 | sc <- spark_connect(master = "yarn-client", config = config, version = '2.0.0') 14 | ``` 15 | 16 | ### Browse the Hive Metastore 17 | 18 | ```{r} 19 | library(DBI) 20 | dbGetQuery(sc, "show databases") 21 | dbGetQuery(sc, "show tables in default") 22 | dbGetQuery(sc, "show tables in userdb") 23 | dbGetQuery(sc, "describe userdb.students") 24 | ``` 25 | 26 | ### Create a new database, a new 
table, and insert data 27 | 28 | ```{r} 29 | dbGetQuery(sc, "drop table if exists newdb.pageviews") 30 | dbGetQuery(sc, "drop database if exists newdb") 31 | dbGetQuery(sc, "create database newdb") 32 | dbGetQuery(sc, "create table newdb.pageviews (userid varchar(64), link string, came_from string)") 33 | dbGetQuery(sc, "insert into table newdb.pageviews values ('jsmith', 'mail.com', 'sports.com'), ('jdoe', 'mail.com', null)") 34 | ``` 35 | 36 | ### This query does not work from R but does work from the command prompt 37 | 38 | ```{r} 39 | dbGetQuery(sc, "CREATE TABLE students (name VARCHAR(64), age INT, gpa DECIMAL(3, 2)) CLUSTERED BY (age) INTO 2 BUCKETS STORED AS ORC") 40 | ``` 41 | ``` 42 | Error: org.apache.spark.sql.catalyst.parser.ParseException: Operation not allowed: CREATE TABLE ... CLUSTERED BY(line 1, pos 0) == SQL == CREATE TABLE students (name VARCHAR(64), age INT, gpa DECIMAL(3, 2)) CLUSTERED BY (age) INTO 2 BUCKETS STORED AS ORC ^^^ at org.apache.spark.sql.catalyst.parser.ParserUtils$.operationNotAllowed(ParserUtils.scala:43) at org.apache.spark.sql.execution.SparkSqlAstBuilder$$anonfun$visitCreateTable$1.apply(SparkSqlParser.scala:913) at org.apache.spark.sql.execution.SparkSqlAstBuilder$$anonfun$visitCreateTable$1.apply(SparkSqlParser.scala:901) at org.apache.spark.sql.catalyst.parser.ParserUtils$.withOrigin(ParserUtils.scala:96) at org.apache.spark.sql.execution.SparkSqlAstBuilder.visitCreateTable(SparkSqlParser.scala:901) at org.apache.spark.sql.execution.SparkSqlAstBuilder.visitCreateTable(SparkSqlParser.scala:53) at org.apache.spark.sql.catalyst.parser.SqlBaseParser$CreateTableContext.accept(SqlBaseParser.java:474) at org.antlr.v4.runtime.tre 43 | ``` -------------------------------------------------------------------------------- /dev/nyc-taxi-data/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- /dev/nyc-taxi-data/taxiApp.R: -------------------------------------------------------------------------------- 1 | library(sparklyr) 2 | library(dplyr) 3 | library(shiny) 4 | 5 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 6 | config <- spark_config() 7 | sc <- spark_connect(master = "yarn-client", config = config, version = '1.6.1') 8 | 9 | tbl_cache(sc, 'trips_csv_2015_12') 10 | trips_tbl <- tbl(sc, 'trips_csv_2015_12') 11 | 12 | ui <- fluidPage( 13 | 14 | titlePanel("NYC Taxi Trips"), 15 | 16 | sidebarLayout( 17 | sidebarPanel( 18 | selectInput("hour", "Hour of the day", 0:23, 12) 19 | ), 20 | 21 | mainPanel( 22 | tableOutput("fare") 23 | ) 24 | ) 25 | ) 26 | 27 | server <- function(input, output) { 28 | 29 | fare <- reactive({ 30 | trips_tbl %>% 31 | mutate(pickup_hour = hour(pickup_datetime)) %>% 32 | filter(pickup_hour == input$hour) %>% 33 | summarize(fare_amount = mean(fare_amount)) %>% 34 | collect 35 | }) 36 | 37 | output$fare <- renderTable({ 38 | fare() 39 | }) 40 | 41 | } 42 | 43 | shinyApp(ui = ui, server = server) -------------------------------------------------------------------------------- /dev/nyc-taxi-data/taxiApp/app.R: -------------------------------------------------------------------------------- 1 | 2 | global <- function() { 3 | 4 | Sys.setenv(SPARK_HOME="/usr/lib/spark") 5 | config <- spark_config() 6 | sc <- spark_connect(master = "yarn-client", config = config, version = '1.6.1') 7 | 8 | tbl_cache(sc, 'trips_par') 9 | shiny_trips_tbl <<- tbl(sc, 'trips_par') 10 | 11 | distinct_gid <- function(data, gid, 
cutoff = 100000){ 12 | data %>% 13 | filter_(paste0("!is.na(", gid, ")")) %>% 14 | group_by_(gid) %>% 15 | count %>% 16 | filter(n > cutoff) %>% 17 | select_(gid) %>% 18 | arrange_(gid) %>% 19 | collect 20 | } 21 | 22 | pickup_nyct2010_gid <<- shiny_trips_tbl %>% 23 | distinct_gid("pickup_nyct2010_gid") %>% 24 | unlist %>% 25 | unname 26 | 27 | dropoff_nyct2010_gid <<- shiny_trips_tbl %>% 28 | distinct_gid("dropoff_nyct2010_gid") %>% 29 | unlist %>% 30 | unname 31 | 32 | } 33 | 34 | ui <- fluidPage( 35 | 36 | titlePanel("NYC Taxi Data"), 37 | 38 | sidebarLayout( 39 | sidebarPanel( 40 | selectInput("pickup", "Taxi origin", pickup_nyct2010_gid, 1250), 41 | selectInput("dropoff", "Taxi destination", dropoff_nyct2010_gid, 2056) 42 | ), 43 | 44 | mainPanel( 45 | plotOutput("distPlot") 46 | ) 47 | ) 48 | ) 49 | 50 | server <- function(input, output) { 51 | 52 | withProgress(message = "dplyr:", detail = "filter, mutate, summarize", { 53 | 54 | shiny_pickup_dropoff <- reactive({ 55 | shiny_trips_tbl %>% 56 | filter(pickup_nyct2010_gid == input$pickup & dropoff_nyct2010_gid == input$dropoff) %>% 57 | mutate(pickup_hour = hour(pickup_datetime)) %>% 58 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 59 | group_by(pickup_hour) %>% 60 | summarize(n = n(), 61 | trip_time_p10 = percentile(trip_time, 0.10), 62 | trip_time_p25 = percentile(trip_time, 0.25), 63 | trip_time_p50 = percentile(trip_time, 0.50), 64 | trip_time_p75 = percentile(trip_time, 0.75), 65 | trip_time_p90 = percentile(trip_time, 0.90)) %>% 66 | collect 67 | }) 68 | 69 | }) 70 | 71 | output$distPlot <- renderPlot({ 72 | ggplot(shiny_pickup_dropoff(), aes(x = pickup_hour)) + 73 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 74 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, alpha = "25–75th percentile")) + 75 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, alpha = "10–90th percentile")) + 76 | scale_y_continuous("trip duration in seconds") + 77 | ggtitle(paste("Pickup = ", input$pickup, ";", "Dropoff =", input$dropoff)) 78 | }) 79 | 80 | } 81 | 82 | shinyApp(ui = ui, server = server, onStart = global) 83 | 84 | -------------------------------------------------------------------------------- /dev/nyc-taxi-data/taxiDashboard.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NYC Taxi" 3 | author: "Nathan Stephens" 4 | output: 5 | flexdashboard::flex_dashboard: 6 | orientation: columns 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=FALSE} 12 | library(ggplot2) 13 | library(flexdashboard) 14 | library(shiny) 15 | library(leaflet) 16 | ``` 17 | 18 | Detail 19 | ======================================================================= 20 | 21 | Inputs {.sidebar} 22 | ----------------------------------------------------------------------- 23 | 24 | ### NTA Code 25 | 26 | Select a neighborhood tabulation area (NTA) code to describe.
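The sidebar inputs in this dashboard are placeholders. A hedged sketch of how real NTA choices could be pulled from the cluster instead, assuming a sparklyr connection `sc` and the `trips_model_data` table used elsewhere in this repo:

```{r, eval=FALSE}
# Hypothetical: populate the input from distinct NTA names in Spark
nta_choices <- tbl(sc, "trips_model_data") %>%
  distinct(pickup_nta) %>%
  arrange(pickup_nta) %>%
  collect()
# selectInput('var1', 'Select NTA Code', nta_choices$pickup_nta)
```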
27 | 28 | ```{r} 29 | selectInput('var1','Select NTA Code',list('a'=1,'b'=2,'c'=3),1) 30 | ``` 31 | 32 | Column 33 | ----------------------------------------------------------------------- 34 | 35 | ### Pickups and dropoffs by hour 36 | 37 | ```{r} 38 | matplot(1:24, matrix(rnorm(48),24,2), type = 'l', col = 2:3, lty = 1) 39 | ``` 40 | 41 | ### Map 42 | 43 | ```{r} 44 | leaflet() %>% addTiles() %>% setView(-73.946832999999998,40.784374999999997, 12) 45 | ``` 46 | 47 | Column 48 | ----------------------------------------------------------------------- 49 | 50 | ### Cab Type 51 | 52 | ```{r} 53 | barplot(1:3, col = 2:5) 54 | ``` 55 | 56 | ### Distance 57 | 58 | ```{r} 59 | hist(rnorm(50), col='grey') 60 | ``` 61 | 62 | ### Cost 63 | 64 | ```{r} 65 | hist(rnorm(50), col='grey') 66 | ``` 67 | 68 | Route 69 | ======================================================================= 70 | 71 | Inputs {.sidebar} 72 | ----------------------------------------------------------------------- 73 | 74 | ### NTA Code 75 | 76 | ```{r} 77 | selectInput('var3','Select pickup',list('a'=1,'b'=2,'c'=3),1) 78 | 79 | selectInput('var4','Select dropoff',list('a'=1,'b'=2,'c'=3),1) 80 | ``` 81 | 82 | Column 83 | ----------------------------------------------------------------------- 84 | 85 | ### Travel time by hour 86 | 87 | ```{r} 88 | matplot(1:24, matrix(rnorm(48),24,2), type = 'l', col = 2:3, lty = 1) 89 | ``` 90 | 91 | ### Map 92 | 93 | ```{r} 94 | leaflet() %>% addTiles() %>% setView(-73.946832999999998,40.784374999999997, 12) 95 | ``` 96 | 97 | Column 98 | ----------------------------------------------------------------------- 99 | 100 | ### Cab Type 101 | 102 | ```{r} 103 | barplot(1:3, col = 2:5) 104 | ``` 105 | 106 | ### Distance 107 | 108 | ```{r} 109 | hist(rnorm(50), col='grey') 110 | ``` 111 | 112 | ### Cost 113 | 114 | ```{r} 115 | hist(rnorm(50), col='grey') 116 | ``` 117 | 118 | Pickups and Dropoffs 119 | ======================================================================= 120 | 121 | Inputs {.sidebar} 122 | ----------------------------------------------------------------------- 123 | 124 | ### NTA Code 125 | 126 | ```{r} 127 | selectInput('var5','Select dropoff',list('a'=1,'b'=2,'c'=3),1) 128 | ``` 129 | 130 | Column 131 | ----------------------------------------------------------------------- 132 | 133 | ### Pickup 134 | 135 | ```{r} 136 | leaflet() %>% addTiles() %>% setView(-73.983895000000004,40.723072000000002, 12) 137 | ``` 138 | 139 | 140 | Column 141 | ----------------------------------------------------------------------- 142 | 143 | ### Dropoff 144 | 145 | ```{r} 146 | leaflet() %>% addTiles() %>% setView(-73.961844999999997,40.767837999999998, 12) 147 | ``` 148 | 149 | -------------------------------------------------------------------------------- /dev/nycflights13/.gitignore: -------------------------------------------------------------------------------- 1 | rsconnect 2 | derby.log 3 | -------------------------------------------------------------------------------- /dev/nycflights13/nycflights13_flexdashboard_rdata.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach packages 13 | library(nycflights13) 14 | library(dplyr) 15 | library(ggplot2) 16 | library(DT) 17 | library(leaflet) 18 | library(geosphere) 19 | library(readr) 
20 | 21 | # Attach data 22 | data(flights) 23 | data(airports) 24 | ``` 25 | 26 | ```{r include=F} 27 | # Prepare model data 28 | model_data <- flights %>% 29 | filter(!is.na(arr_delay) & !is.na(dep_delay) & !is.na(distance)) %>% 30 | filter(dep_delay > 15 & dep_delay < 240) %>% 31 | filter(arr_delay > -60 & arr_delay < 360) %>% 32 | left_join(airlines, by = c("carrier" = "carrier")) %>% 33 | mutate(gain = dep_delay - arr_delay) %>% 34 | select(origin, dest, carrier, airline = name, distance, dep_delay, arr_delay, gain) 35 | 36 | # Training and validation 37 | set.seed(777) 38 | ind <- sample(n <- nrow(model_data), floor(n * 0.5)) 39 | train_data <- model_data[ind, ] 40 | valid_data <- model_data[-ind, ] 41 | 42 | # Model time gained as a function of distance, departure delay, and airline carrier 43 | lm1 <- lm(gain ~ distance + dep_delay + carrier, train_data) 44 | 45 | # Score data and aggregate flight route and carrier 46 | pred_data <- valid_data %>% 47 | mutate(pred = predict.lm(lm1, valid_data)) %>% 48 | group_by(origin, dest, carrier, airline) %>% 49 | summarize( 50 | flights = n(), 51 | distance = mean(distance), 52 | avg_dep_delay = mean(dep_delay), 53 | avg_arr_delay = mean(arr_delay), 54 | avg_gain = mean(gain), 55 | pred_gain = mean(pred) 56 | ) 57 | ``` 58 | 59 | Summary 60 | ======================================================================== 61 | 62 | Inputs {.sidebar} 63 | ----------------------------------------------------------------------- 64 | 65 | ### Select Airports 66 | 67 | ```{r} 68 | # Shiny inputs for flight origin and destination 69 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 70 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 71 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 72 | selectInput("dest", "Flight destination", carrier_dest, selected = "SFO") 73 | ``` 74 | 75 | ### Background 76 | 77 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 78 | your airline carrier will make up time en route? Some of the most significant factors 79 | for making up time are flight distance and airline carrier. The data model behind 80 | this dashboard is based on flights from NYC airports in 2013. 
81 | 82 | 83 | Row 84 | ----------------------------------------------------------------------- 85 | 86 | ### Observed versus predicted time gain 87 | 88 | ```{r} 89 | # Aggregate time gain by carrier and by route 90 | plot_data <- reactive({ 91 | req(input$origin, input$dest) 92 | pred_data %>% 93 | filter(origin==input$origin & dest==input$dest) %>% 94 | ungroup() %>% 95 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 96 | }) 97 | 98 | # Plot observed versus predicted time gain for carriers and route 99 | renderPlot({ 100 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 101 | geom_bar(stat = "identity", fill = '#2780E3') + 102 | geom_point(aes(factor(airline), avg_gain)) + 103 | coord_flip() + 104 | labs(x = "", y = "Time gained in flight (minutes)") + 105 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 106 | }) 107 | ``` 108 | 109 | ### Route 110 | 111 | ```{r} 112 | # Identify origin lat and long 113 | origin <- reactive({ 114 | req(input$origin) 115 | filter(airports, faa == input$origin) 116 | }) 117 | 118 | # Identify destination lat and long 119 | dest <- reactive({ 120 | req(input$dest) 121 | filter(airports, faa == input$dest) 122 | }) 123 | 124 | # Plot route 125 | renderLeaflet({ 126 | gcIntermediate( 127 | select(origin(), lon, lat), 128 | select(dest(), lon, lat), 129 | n=100, addStartEnd=TRUE, sp=TRUE 130 | ) %>% 131 | leaflet() %>% 132 | addProviderTiles("CartoDB.Positron") %>% 133 | addPolylines() 134 | }) 135 | ``` 136 | 137 | Row 138 | ----------------------------------------------------------------------- 139 | 140 | ### Data details 141 | 142 | ```{r} 143 | # Print table of observed and predicted gains by airline 144 | renderDataTable( 145 | datatable(plot_data()) %>% 146 | formatRound(c("flights", "distance"), 0) %>% 147 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 148 | ) 149 | ``` 150 | 151 | Model Output 152 | ======================================================================== 153 | 154 | ```{r} 155 | renderPrint(summary(lm1)) 156 | ``` -------------------------------------------------------------------------------- /dev/titanic/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | notebook-classification_v1.nb.html 3 | notebook-classification_v1.Rmd 4 | -------------------------------------------------------------------------------- /dev/titanic/rmarkdown-classification_files/figure-html/auc-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/rmarkdown-classification_files/figure-html/auc-1.png -------------------------------------------------------------------------------- /dev/titanic/rmarkdown-classification_files/figure-html/importance-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/rmarkdown-classification_files/figure-html/importance-1.png -------------------------------------------------------------------------------- /dev/titanic/rmarkdown-classification_files/figure-html/lift-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/rmarkdown-classification_files/figure-html/lift-1.png -------------------------------------------------------------------------------- /dev/titanic/titanic-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /dev/titanic/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc -------------------------------------------------------------------------------- /dev/titanic/titanic-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/titanic-parquet/_SUCCESS -------------------------------------------------------------------------------- /dev/titanic/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/dev/titanic/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet -------------------------------------------------------------------------------- /img/sparklyr-illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-illustration.png -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.001.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.001.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.002.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.002.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.003.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.003.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.004.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.004.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.005.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.005.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.006.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.006.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.007.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.007.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.008.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.008.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.009.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.009.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.010.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.010.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.011.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.011.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.012.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.012.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.013.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.013.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.014.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.014.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.015.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.015.jpeg 
-------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.016.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.016.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.017.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.017.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.018.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.018.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.019.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.019.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.020.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.020.jpeg -------------------------------------------------------------------------------- /img/sparklyr-presentation-demos.021.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/img/sparklyr-presentation-demos.021.jpeg -------------------------------------------------------------------------------- /prod/apps/iris-k-means/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Title: Iris with Spark Backend 2 | Author: RStudio, Inc. 
3 | AuthorUrl: http://www.rstudio.com/ 4 | License: GPL-3 5 | DisplayMode: Showcase 6 | Tags: sparklyr 7 | Type: Shiny 8 | -------------------------------------------------------------------------------- /prod/apps/iris-k-means/app.R: -------------------------------------------------------------------------------- 1 | library(sparklyr) 2 | library(dplyr) 3 | library(shiny) 4 | 5 | #Connect to Spark 6 | sc <- spark_connect(master = "local") 7 | 8 | #Read in Parquet Data 9 | spark_read_parquet(sc, "iris", "iris-parquet") 10 | iris_tbl <- tbl(sc, "iris") 11 | opts <- tbl_vars(iris_tbl)[-which(tbl_vars(iris_tbl) == "Species")] 12 | 13 | ui <- pageWithSidebar( 14 | headerPanel('Iris k-means clustering'), 15 | sidebarPanel( 16 | selectInput('xcol', 'X Variable', opts), 17 | selectInput('ycol', 'Y Variable', opts, 18 | selected = opts[2]), 19 | numericInput('clusters', 'Cluster count', 3, 20 | min = 2, max = 9) 21 | ), 22 | mainPanel( 23 | plotOutput('plot1') 24 | ) 25 | ) 26 | 27 | server <- function(input, output, session) { 28 | 29 | # Nothing is evaluated in Spark at this step 30 | selectedData <- reactive({ 31 | iris_tbl %>% select_(input$xcol, input$ycol) 32 | }) 33 | 34 | # The Spark data frame is constructed and kmeans is run 35 | clusters <- reactive({ 36 | selectedData() %>% 37 | ml_kmeans(centers = input$clusters) 38 | }) 39 | 40 | output$plot1 <- renderPlot({ 41 | par(mar = c(5.1, 4.1, 0, 1)) 42 | 43 | #score the results in Spark, pull in results to R 44 | scored <- predict(clusters(), iris_tbl) + 1 45 | 46 | #collect brings the data into R 47 | selectedData() %>% 48 | collect() %>% 49 | plot(col = scored, 50 | pch = 20, cex = 4) 51 | 52 | points(clusters()$centers, 53 | pch = 4, cex = 4, lwd = 4) 54 | }) 55 | 56 | } 57 | 58 | shinyApp(ui = ui, server = server) 59 | -------------------------------------------------------------------------------- /prod/apps/iris-k-means/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 2G 4 | -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/._common_metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/._common_metadata.crc -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/._metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/._metadata.crc -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/.part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/.part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc 
-------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/.part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/.part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet.crc -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/_SUCCESS -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/_common_metadata -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/_metadata -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/part-r-00000-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet -------------------------------------------------------------------------------- /prod/apps/iris-k-means/iris-parquet/part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/iris-k-means/iris-parquet/part-r-00001-30bd49c2-226d-41ad-8ec3-ccf9ddc0ccf5.gz.parquet -------------------------------------------------------------------------------- /prod/apps/nycflights13-app-spark/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Title: NYCFlights13 Time Gained in Flight 2 | Author: RStudio, Inc. 3 | AuthorUrl: http://www.rstudio.com/ 4 | License: GPL-3 5 | DisplayMode: Showcase 6 | Tags: sparklyr 7 | Type: Shiny 8 | -------------------------------------------------------------------------------- /prod/apps/nycflights13-app-spark/Readme.md: -------------------------------------------------------------------------------- 1 | Given that your flight was delayed by 15 minutes or more, what is the likelihood your airline carrier will make up time en route? Some of the most significant factors for making up time are flight distance and airline carrier. The data model behind this dashboard is based on flights from NYC airports in 2013. 
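
To make the metric concrete before the app code below: the "gain" modeled throughout these apps is simply departure delay minus arrival delay, so a positive gain means the carrier made up time en route. A tiny base-R illustration with hypothetical delays (an editor's sketch, not part of the app):

```r
# Hypothetical delays in minutes for three flights
dep_delay <- c(30, 45, 20)
arr_delay <- c(12, 50, -5)

# A flight that left 30 minutes late but arrived only 12 minutes late
# gained 18 minutes; a negative gain means the flight lost time en route
gain <- dep_delay - arr_delay
gain
#> [1] 18 -5 25
```
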
2 | -------------------------------------------------------------------------------- /prod/apps/nycflights13-app-spark/app.R: -------------------------------------------------------------------------------- 1 | # R Packages 2 | library(nycflights13) 3 | library(dplyr) 4 | library(ggplot2) 5 | library(DT) 6 | library(leaflet) 7 | library(geosphere) 8 | library(sparklyr) 9 | 10 | # Connect to local Spark instance 11 | sc <- spark_connect(master = "local", version = '2.0.0') 12 | 13 | # Copy flights data into Spark 14 | copy_to(sc, flights, "flights_s", overwrite = TRUE) 15 | flights_tbl <- tbl(sc, 'flights_s') 16 | 17 | # Copy airlines data into Spark 18 | copy_to(sc, airlines, "airlines_s", overwrite = TRUE) 19 | airlines_tbl <- tbl(sc, 'airlines_s') 20 | 21 | # Prepare model data 22 | model_data <- flights_tbl %>% 23 | filter(!is.na(arr_delay) & !is.na(dep_delay) & !is.na(distance)) %>% 24 | filter(dep_delay > 15 & dep_delay < 240) %>% 25 | filter(arr_delay > -60 & arr_delay < 360) %>% 26 | left_join(airlines_tbl, by = c("carrier" = "carrier")) %>% 27 | mutate(gain = dep_delay - arr_delay) %>% 28 | select(origin, dest, carrier, airline = name, distance, dep_delay, arr_delay, gain) 29 | 30 | # Partition data into train and validation 31 | partitions <- model_data %>% 32 | sdf_partition(train_data = 0.5, valid_data = 0.5, seed = 777) 33 | 34 | # Train a linear model in Spark 35 | lm1 <- ml_linear_regression(partitions$train_data, gain ~ distance + dep_delay + carrier) 36 | 37 | # Score the validation data 38 | pred_tbl <- sdf_predict(lm1, partitions$valid_data) 39 | 40 | # Create scored look up data for Shiny app 41 | lookup_tbl <- pred_tbl %>% 42 | group_by(origin, dest, carrier, airline) %>% 43 | summarize( 44 | flights = n(), 45 | distance = mean(distance), 46 | avg_dep_delay = mean(dep_delay), 47 | avg_arr_delay = mean(arr_delay), 48 | avg_gain = mean(gain), 49 | pred_gain = mean(prediction) 50 | ) 51 | 52 | # Cache the look up table 53 | sdf_register(lookup_tbl, "lookup") 54 | tbl_cache(sc, "lookup") 55 | 56 | # Find distinct airport codes 57 | carrier_origin <- c("JFK", "LGA", "EWR") 58 | carrier_dest <- c("BOS", "DCA", "DEN", "HNL", "LAX", "SEA", "SFO", "STL") 59 | 60 | # Shiny UI 61 | ui <- fluidPage( 62 | 63 | # Set display mode to bottom 64 | tags$script(' var setInitialCodePosition = function() 65 | { setCodePosition(false, false); }; '), 66 | 67 | # Title 68 | titlePanel("NYCFlights13 Time Gained in Flight"), 69 | 70 | # Create sidebar 71 | sidebarLayout( 72 | sidebarPanel( 73 | radioButtons("origin", "Flight origin:", 74 | carrier_origin, selected = "JFK"), 75 | br(), 76 | 77 | radioButtons("dest", "Flight destination:", 78 | carrier_dest, selected = "SFO") 79 | 80 | ), 81 | 82 | # Show a tabset that includes a plot, model, and table view 83 | mainPanel( 84 | tabsetPanel(type = "tabs", 85 | tabPanel("Plot", plotOutput("plot")), 86 | tabPanel("Map", leafletOutput("map")), 87 | tabPanel("Data", dataTableOutput("datatable")) 88 | ) 89 | ) 90 | ) 91 | ) 92 | 93 | # Shiny server function 94 | server <- function(input, output) { 95 | 96 | # Identify origin lat and long 97 | origin <- reactive({ 98 | req(input$origin) 99 | filter(nycflights13::airports, faa == input$origin) 100 | }) 101 | 102 | # Identify destination lat and long 103 | dest <- reactive({ 104 | req(input$dest) 105 | filter(nycflights13::airports, faa == input$dest) 106 | }) 107 | 108 | # Create plot data 109 | plot_data <- reactive({ 110 | req(input$origin, input$dest) 111 | lookup_tbl %>% 112 | filter(origin==input$origin 
& dest==input$dest) %>% 113 | ungroup() %>% 114 | select(airline, flights, distance, avg_gain, pred_gain) %>% 115 | collect 116 | }) 117 | 118 | # Plot observed versus predicted time gain for carriers and route 119 | output$plot <- renderPlot({ 120 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 121 | geom_bar(stat = "identity", fill = '#2780E3') + 122 | geom_point(aes(factor(airline), avg_gain)) + 123 | coord_flip() + 124 | labs(x = "", y = "Time gained in flight (minutes)") + 125 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 126 | }) 127 | 128 | # Output the route map 129 | output$map <- renderLeaflet({ 130 | gcIntermediate( 131 | select(origin(), lon, lat), 132 | select(dest(), lon, lat), 133 | n=100, addStartEnd=TRUE, sp=TRUE 134 | ) %>% 135 | leaflet() %>% 136 | addProviderTiles("CartoDB.Positron") %>% 137 | addPolylines() 138 | }) 139 | 140 | # Print table of observed and predicted gains by airline 141 | output$datatable <- renderDataTable( 142 | datatable(plot_data()) %>% 143 | formatRound(c("flights", "distance"), 0) %>% 144 | formatRound(c("avg_gain", "pred_gain"), 1) 145 | ) 146 | 147 | } 148 | 149 | # Run Shiny 150 | shinyApp(ui = ui, server = server) 151 | -------------------------------------------------------------------------------- /prod/apps/nycflights13-app-spark/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 2G 4 | -------------------------------------------------------------------------------- /prod/apps/titanic-classification/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- /prod/apps/titanic-classification/DESCRIPTION: -------------------------------------------------------------------------------- 1 | Title: Spark ML Classifier Performance - Titanic 2 | Author: RStudio, Inc. 3 | AuthorUrl: http://www.rstudio.com/ 4 | License: GPL-3 5 | DisplayMode: Showcase 6 | Tags: sparklyr 7 | Type: Shiny 8 | -------------------------------------------------------------------------------- /prod/apps/titanic-classification/app.R: -------------------------------------------------------------------------------- 1 | library(sparklyr) 2 | library(dplyr) 3 | library(shiny) 4 | library(ggplot2) 5 | library(tidyr) 6 | source('helpers.R') 7 | 8 | 9 | #Connect to Spark 10 | sc <- spark_connect(master = "local", version = "2.0.0") 11 | 12 | #Read in Parquet Data 13 | spark_read_parquet(sc, "titanic", "titanic-parquet") 14 | titanic_tbl <- tbl(sc, "titanic") 15 | 16 | # Add features 17 | titanic_final <- titanic_tbl %>% 18 | mutate(Family_Size = SibSp + Parch + 1L) %>% 19 | mutate(Pclass = as.character(Pclass)) %>% 20 | filter(!is.na(Embarked)) %>% 21 | mutate(Age = if_else(is.na(Age), mean(Age), Age)) %>% 22 | mutate(Family_Size = as.numeric(Family_Size)) %>% 23 | sdf_mutate( 24 | Family_Sizes = ft_bucketizer(Family_Size, splits = c(1,2,5,12)) 25 | ) %>% 26 | mutate(Family_Sizes = as.character(as.integer(Family_Sizes))) %>% 27 | mutate(Survived = as.numeric(Survived), SibSp = as.numeric(SibSp), Parch = as.numeric(Parch)) %>% 28 | select(Survived, Pclass, Sex, Age, SibSp, Parch, Fare, Embarked, Family_Sizes) %>% 29 | sdf_register("titanic_final") 30 | 31 | features <- tbl_vars(titanic_final) %>% 32 | .[-which(. 
== "Survived")] 33 | 34 | 35 | ui <- pageWithSidebar( 36 | headerPanel('ML Titanic Classification'), 37 | sidebarPanel( 38 | selectizeInput('selfeatures', 'Select Features', features, multiple = TRUE), 39 | numericInput('trainingFrac', 'Training Proportion', min = 0.1, max = 0.9, value = 0.75), 40 | actionButton('fit', "Fit Models") 41 | ), 42 | mainPanel( 43 | plotOutput('liftPlot'), 44 | plotOutput('auc_accuracy') 45 | ) 46 | ) 47 | 48 | server <- function(input, output, session) { 49 | 50 | ml_score <- eventReactive(input$fit, { 51 | withProgress(message = "Fitting Spark Models", value = 0.1, { 52 | incProgress(0.2, detail = "Partitioning Training / Testing") 53 | partition <- sdf_partition(titanic_final, train = input$trainingFrac, test= 1-input$trainingFrac) 54 | train_tbl <- partition$train 55 | test_tbl <- partition$test 56 | 57 | ml_formula <- formula(paste("Survived ~", paste(input$selfeatures, collapse = "+"))) 58 | 59 | incProgress(0.5, detail = "Fitting Models") 60 | ml_models <- list( 61 | "Logistic" = ml_logistic_regression(train_tbl, ml_formula), 62 | "Decision Tree" = ml_decision_tree(train_tbl, ml_formula), 63 | "Random Forest" = ml_random_forest(train_tbl, ml_formula), 64 | "Gradient Boosted Trees" = ml_gradient_boosted_trees(train_tbl, ml_formula), 65 | "Naive Bayes" = ml_naive_bayes(train_tbl, ml_formula) 66 | ) 67 | 68 | incProgress(0.75, detail = "Scoring Models") 69 | lapply(ml_models, score_test_data, test_tbl) # helpers.R 70 | }) 71 | }) 72 | 73 | output$liftPlot <- renderPlot({ 74 | 75 | ml_gains <- data.frame(bin = 1:10, prop = seq(0, 1, len = 10), model = "Base") 76 | for (i in names(ml_score())) { 77 | ml_gains <- ml_score()[[i]] %>% 78 | calculate_lift %>% # helpers.R 79 | mutate(model = i) %>% 80 | rbind(ml_gains, .) 81 | } 82 | ggplot(ml_gains, aes(x = bin, y = prop, colour = model)) + 83 | geom_point() + geom_line() + 84 | ggtitle("Lift Chart for Predicting Survival - Test Data Set") + 85 | xlab("") + ylab("") 86 | 87 | }) 88 | 89 | output$auc_accuracy <- renderPlot({ 90 | # Calculate AUC and accuracy 91 | perf_metrics <- data.frame( 92 | model = names(ml_score()), 93 | AUC = 100 * sapply(ml_score(), ml_binary_classification_eval, "Survived", "prediction"), 94 | Accuracy = 100 * sapply(ml_score(), calc_accuracy), 95 | row.names = NULL, stringsAsFactors = FALSE) 96 | 97 | # Plot results 98 | gather(perf_metrics, metric, value, AUC, Accuracy) %>% 99 | ggplot(aes(reorder(model, value), value, fill = metric)) + 100 | geom_bar(stat = "identity", position = "dodge") + 101 | coord_flip() + 102 | xlab("") + 103 | ylab("Percent") + 104 | ggtitle("Performance Metrics") 105 | 106 | }) 107 | 108 | } 109 | 110 | shinyApp(ui = ui, server = server) 111 | -------------------------------------------------------------------------------- /prod/apps/titanic-classification/helpers.R: -------------------------------------------------------------------------------- 1 | calculate_lift <- function(scored_data) { 2 | scored_data %>% 3 | mutate(bin = ntile(desc(prediction), 10)) %>% 4 | group_by(bin) %>% 5 | summarize(count = sum(Survived)) %>% 6 | mutate(prop = count / sum(count)) %>% 7 | arrange(bin) %>% 8 | mutate(prop = cumsum(prop)) %>% 9 | select(-count) %>% 10 | collect() %>% 11 | as.data.frame() 12 | } 13 | 14 | score_test_data <- function(model, data=test_tbl){ 15 | pred <- sdf_predict(model, data) 16 | select(pred, Survived, prediction) 17 | } 18 | 19 | calc_accuracy <- function(data, cutpoint = 0.5){ 20 | data %>% 21 | mutate(prediction = if_else(prediction > cutpoint, 
1.0, 0.0)) %>% 22 | ml_classification_eval("prediction", "Survived", "accuracy") 23 | } -------------------------------------------------------------------------------- /prod/apps/titanic-classification/titanic-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /prod/apps/titanic-classification/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/titanic-classification/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc -------------------------------------------------------------------------------- /prod/apps/titanic-classification/titanic-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/titanic-classification/titanic-parquet/_SUCCESS -------------------------------------------------------------------------------- /prod/apps/titanic-classification/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/apps/titanic-classification/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet -------------------------------------------------------------------------------- /prod/conf/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 1G 4 | -------------------------------------------------------------------------------- /prod/conf/shiny-server.conf: -------------------------------------------------------------------------------- 1 | run_as shiny; 2 | auth_pam; 3 | 4 | server { 5 | 6 | listen 80; 7 | 8 | utilization_scheduler 20 0 1; # max of 20 connections and 1 R process per app 9 | app_session_timeout 300; # close idle connection in seconds 10 | app_idle_timeout 86400; # close idle R process in seconds 11 | app_init_timeout 600; # cancel startup in seconds 12 | 13 | log_dir /var/log/shiny-server; 14 | google_analytics_id UA-20375833-15; 15 | 16 | #location /dashboards/ggplot2-brushing { 17 | # app_dir /srv/shiny-server/sparkDemos/prod/dashboards/ggplot2-brushing; 18 | #} 19 | 20 | location /dashboards/diamonds-explorer { 21 | app_dir /srv/shiny-server/sparkDemos/prod/dashboards/diamonds-explorer; 22 | } 23 | 24 | location /dashboards/nycflights13-dash-spark { 25 | app_dir /srv/shiny-server/sparkDemos/prod/dashboards/nycflights13-dash-spark; 26 | } 27 | 28 | location /apps/titanic-classification { 29 | app_dir /srv/shiny-server/sparkDemos/prod/apps/titanic-classification; 30 | } 31 | 32 | location /apps/iris-k-means { 33 | app_dir /srv/shiny-server/sparkDemos/prod/apps/iris-k-means; 34 | } 35 | 36 | location /apps/nycflights13-app-spark { 37 | app_dir /srv/shiny-server/sparkDemos/prod/apps/nycflights13-app-spark; 38 | } 39 | 40 | } 41 | 42 | admin 4151 { 43 | required_group shiny-admins; 44 | } -------------------------------------------------------------------------------- 
/prod/dashboards/diamonds-explorer/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 2G 4 | -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/._common_metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/._common_metadata.crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/._metadata.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/._metadata.crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/.part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/.part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/.part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/.part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet.crc -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/_SUCCESS -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/_common_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/_common_metadata -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/_metadata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/_metadata -------------------------------------------------------------------------------- 
/prod/dashboards/diamonds-explorer/diamonds-parquet/part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/part-r-00000-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/diamonds-parquet/part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/dashboards/diamonds-explorer/diamonds-parquet/part-r-00001-d6c7e259-62a6-415e-a145-f1715444705a.gz.parquet -------------------------------------------------------------------------------- /prod/dashboards/diamonds-explorer/flexdashboard-shiny-diamonds.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ggplot2 Diamonds Explorer" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: columns 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r global, include=FALSE} 12 | library(ggplot2) 13 | library(mgcv) 14 | library(flexdashboard) 15 | library(sparklyr) 16 | library(dplyr) 17 | 18 | sc <- spark_connect(master = "local") 19 | spark_read_parquet(sc, "diamonds", path = "diamonds-parquet") 20 | diamonds_tbl <- tbl(sc, "diamonds") 21 | ``` 22 | 23 | Inputs {.sidebar} 24 | ----------------------------------------------------------------------- 25 | 26 | ```{r} 27 | n <- (count(diamonds_tbl) %>% as.data.frame())$n 28 | sliderInput('sampleSize', 'Sample Size', min = 1, max = n, 29 | value = min(1000, n), step = 1000, round = 0) 30 | 31 | checkboxInput('jitter', 'Jitter', value = TRUE) 32 | checkboxInput('smooth', 'Smooth', value = TRUE) 33 | 34 | selectInput('x', 'X', tbl_vars(diamonds_tbl)) 35 | selectInput('y', 'Y', tbl_vars(diamonds_tbl), tbl_vars(diamonds_tbl)[2]) 36 | selectInput('color', 'Color', c('None', tbl_vars(diamonds_tbl))) 37 | 38 | # Determine column type and select only strings 39 | factor_cols <- sparklyr:::sdf_schema(diamonds_tbl) %>% 40 | sapply(unlist) %>% 41 | t() %>% 42 | as.data.frame() %>% 43 | filter(type == "StringType") %>% 44 | select(name) 45 | 46 | selectInput('facet_row', 'Facet Row', c(None='.', factor_cols)) 47 | selectInput('facet_col', 'Facet Column', c(None='.', factor_cols)) 48 | ``` 49 | 50 | Outputs 51 | ----------------------------------------------------------------------- 52 | 53 | ### Diamonds 54 | 55 | ```{r} 56 | dataset <- reactive({ 57 | diamonds_tbl %>% 58 | sdf_sample(fraction = input$sampleSize / n) %>% 59 | collect() 60 | }) 61 | 62 | renderPlot({ 63 | p <- ggplot(dataset(), aes_string(x = input$x, y = input$y)) + geom_point() 64 | 65 | if (input$color != 'None') 66 | p <- p + aes_string(color = input$color) 67 | 68 | facets <- paste(input$facet_row, '~', input$facet_col) 69 | if (facets != '. 
~ .') 70 | p <- p + facet_grid(facets) 71 | 72 | if (input$jitter) 73 | p <- p + geom_jitter() 74 | if (input$smooth) 75 | p <- p + geom_smooth() 76 | 77 | print(p) 78 | }) 79 | ``` 80 | -------------------------------------------------------------------------------- /prod/dashboards/ggplot2-brushing/ggplot2Brushing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "ggplot2 Brushing" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: columns 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r global, include=FALSE} 12 | # load data in 'global' chunk so it can be shared by all users of the dashboard 13 | library(datasets) 14 | library(flexdashboard) 15 | library(sparklyr) 16 | library(dplyr) 17 | 18 | sc <- spark_connect(master = "local", version = "2.0.0") 19 | mtcars2_tbl <- copy_to(sc, mtcars[, c("mpg", "cyl", "wt")], "mtcars") 20 | ``` 21 | 22 | 23 | ```{r} 24 | # Reactive that returns the whole dataset if there is no brush 25 | selectedData <- reactive({ 26 | data <- brushedPoints(collect(mtcars2_tbl), input$plot1_brush) 27 | if (nrow(data) == 0) 28 | data <- collect(mtcars2_tbl) 29 | data 30 | }) 31 | ``` 32 | 33 | Column {data-width=650} 34 | ----------------------------------------------------------------------- 35 | 36 | ### Miles Per Gallon vs. Weight {data-width=600} 37 | 38 | ```{r} 39 | library(ggplot2) 40 | plotOutput("plot1", brush = brushOpts(id = "plot1_brush")) 41 | output$plot1 <- renderPlot({ 42 | ggplot(collect(mtcars2_tbl), aes(wt, mpg)) + geom_point() 43 | }) 44 | ``` 45 | 46 | ### Miles Per Gallon and Cylinders 47 | 48 | ```{r} 49 | renderPlot({ 50 | ggplot(selectedData(), aes(factor(cyl), mpg)) + geom_boxplot() 51 | }) 52 | ``` 53 | 54 | Column {data-width=350} 55 | ----------------------------------------------------------------------- 56 | 57 | ### Car Details {data-width=400} 58 | 59 | ```{r} 60 | renderTable({ 61 | selectedData() 62 | }) 63 | ``` -------------------------------------------------------------------------------- /prod/dashboards/nycflights13-dash-spark/config.yml: -------------------------------------------------------------------------------- 1 | default: 2 | sparklyr.cores.local: 1 3 | sparklyr.shell.driver-memory: 2G 4 | -------------------------------------------------------------------------------- /prod/dashboards/tor-project/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- /prod/dashboards/tor-project/metricsgraphicsTorProject.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "MetricsGraphics: Tor Project" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | --- 9 | 10 | ```{r global, include=FALSE, message = FALSE} 11 | library(flexdashboard) 12 | library(metricsgraphics) 13 | library(readr) 14 | library(dplyr) 15 | library(tidyr) 16 | library(sparklyr) 17 | 18 | sc <- spark_connect(master = "local", version = "2.0.0") 19 | 20 | servers <- read_csv("https://metrics.torproject.org/stats/servers.csv", 21 | col_types="ccccccii") 22 | hidden <- read_csv("https://metrics.torproject.org/stats/hidserv.csv", 23 | col_types="ccddddd") 24 | 25 | raw_servers_tbl <- copy_to(sc, servers, "servers") 26 | raw_hidden_tbl <- copy_to(sc, hidden, "hidden") 27 | 28 | servers_tbl <- raw_servers_tbl %>% 
mutate(date = from_unixtime(unix_timestamp(date, 'yyyy-MM-dd'))) %>% 30 | filter(date >= '2016-01-01') 31 | 32 | hidden <- raw_hidden_tbl %>% 33 | mutate(date = from_unixtime(unix_timestamp(date, 'yyyy-MM-dd'))) %>% 34 | filter(date >= '2016-01-01' & type=="dir-onions-seen") %>% 35 | collect 36 | 37 | relays <- servers_tbl %>% 38 | filter(!is.na(relays)) %>% 39 | count(date, wt = relays) %>% 40 | collect 41 | 42 | filter(servers_tbl, !is.na(relays)) %>% 43 | mutate(platform=ifelse(is.na(platform), "Linux", platform)) %>% 44 | count(date, platform, wt=relays) %>% 45 | collect %>% 46 | spread(platform, n) -> relays_by_platform 47 | 48 | filter(servers_tbl, !is.na(relays)) %>% 49 | count(date, flag, wt=relays) %>% 50 | filter(!is.na(flag)) %>% 51 | collect %>% 52 | spread(flag, n) -> relays_by_flag 53 | 54 | filter(servers_tbl, !is.na(relays)) %>% 55 | count(date, version, wt=relays) %>% 56 | filter(!is.na(version)) %>% 57 | collect %>% 58 | mutate(version=gsub("^0", "v0", version)) %>% 59 | spread(version, n) -> relays_by_version 60 | ``` 61 | 62 | Row {data-height=600} 63 | ----------------------------------------------------------------------- 64 | 65 | ### Active Relays in the Tor Network 66 | 67 | ```{r} 68 | mjs_plot(relays, date, n, top=0, left=30) %>% 69 | mjs_line(area=TRUE) %>% 70 | mjs_axis_x(xax_format="date") %>% 71 | mjs_add_mouseover("function(d, i) { 72 | $('{{ID}} svg .mg-active-datapoint') 73 | .html('Relay count' + 74 | d3.time.format('%Y-%m-%d')(d.date) + ': ' + 75 | d3.format('0,000')(d.n)); 76 | }") 77 | ``` 78 | 79 | ### Hidden-service statistics 80 | 81 | ```{r} 82 | mjs_plot(hidden, date, "wmean", top=0, left=30) %>% 83 | mjs_line() %>% 84 | mjs_add_line("wmedian") %>% 85 | mjs_add_line("wiqm") %>% 86 | mjs_axis_x(xax_format="date") %>% 87 | mjs_add_legend(c("wmean", "wmedian", "wiqm")) 88 | ``` 89 | 90 | Row {.tabset} 91 | ----------------------------------------------------------------------- 92 | 93 | ### Relays with Exit, Fast, Guard, HSDir & Stable flags 94 | 95 | ```{r} 96 | mjs_plot(relays_by_flag, date, Exit, top=0, left=30) %>% 97 | mjs_line() %>% 98 | mjs_add_line(Fast) %>% 99 | mjs_add_line(Guard) %>% 100 | mjs_add_line(HSDir) %>% 101 | mjs_add_line(Stable) %>% 102 | mjs_axis_x(xax_format="date") %>% 103 | mjs_add_legend(c("Exit", "Fast", "Guard", "HSDir", "Stable")) 104 | ``` 105 | 106 | ### Relays by OS (log scale) 107 | 108 | ```{r} 109 | mjs_plot(relays_by_platform, date, BSD, top=0, left=30) %>% 110 | mjs_line() %>% 111 | mjs_add_line(Darwin) %>% 112 | mjs_add_line(Linux) %>% 113 | mjs_add_line(Other) %>% 114 | mjs_add_line(Windows) %>% 115 | mjs_axis_x(xax_format="date") %>% 116 | mjs_axis_y(y_scale_type="log") %>% 117 | mjs_add_legend(c("BSD", "Darwin", "Linux", "Other", "Windows")) 118 | ``` 119 | 120 | ### Relays by version 121 | 122 | ```{r} 123 | mjs_plot(relays_by_version, date, "v0.2.4", top=0, left=30) %>% 124 | mjs_line() %>% 125 | mjs_add_line("v0.2.5") %>% 126 | mjs_add_line("v0.2.6") %>% 127 | mjs_add_line("v0.2.7") %>% 128 | mjs_add_line("v0.2.8") %>% 129 | mjs_add_line("Other") %>% 130 | mjs_axis_x(xax_format="date") %>% 131 | mjs_add_legend(c("v0.2.4", "v0.2.5", "v0.2.6", "v0.2.7", "v0.2.8", "Other")) 132 | ``` 133 | -------------------------------------------------------------------------------- /prod/notebooks/babynames/.gitignore: -------------------------------------------------------------------------------- 1 | derby.log 2 | -------------------------------------------------------------------------------- 
/prod/notebooks/end-to-end-flights/end-to-end-flights-flexdashboard.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time Gained in Flight" 3 | output: 4 | flexdashboard::flex_dashboard: 5 | orientation: rows 6 | social: menu 7 | source_code: embed 8 | runtime: shiny 9 | --- 10 | 11 | ```{r setup, include=F} 12 | # Attach packages 13 | library(dplyr) 14 | library(ggplot2) 15 | library(DT) 16 | library(leaflet) 17 | library(geosphere) 18 | load('flights_pred_2008.RData') 19 | airports <- mutate(airports, lat = as.numeric(lat), lon = as.numeric(lon)) 20 | ``` 21 | 22 | 23 | Summary 24 | ======================================================================== 25 | 26 | Inputs {.sidebar} 27 | ----------------------------------------------------------------------- 28 | 29 | ### Select Airports 30 | 31 | ```{r} 32 | # Shiny inputs for flight origin and destination 33 | carrier_origin <- ungroup(pred_data) %>% distinct(origin) %>% .[['origin']] 34 | carrier_dest <- ungroup(pred_data) %>% distinct(dest) %>% .[['dest']] 35 | selectInput("origin", "Flight origin", carrier_origin, selected = "JFK") 36 | selectInput("dest", "Flight destination", carrier_dest, selected = "LAS") 37 | ``` 38 | 39 | ### Background 40 | 41 | Given that your flight was delayed by 15 minutes or more, what is the likelihood 42 | your airline carrier will make up time en route? Some of the most significant factors 43 | for making up time are flight distance and airline carrier. The data model behind 44 | this dashboard is based on flights from NYC airports in 2013. 45 | 46 | 47 | Row 48 | ----------------------------------------------------------------------- 49 | 50 | ### Observed versus predicted time gain 51 | 52 | ```{r} 53 | # Aggregate time gain by carrier and by route 54 | plot_data <- reactive({ 55 | req(input$origin, input$dest) 56 | pred_data %>% 57 | filter(origin==input$origin & dest==input$dest) %>% 58 | ungroup() %>% 59 | select(airline, flights, distance, avg_dep_delay, avg_arr_delay, avg_gain, pred_gain) 60 | }) 61 | 62 | # Plot observed versus predicted time gain for carriers and route 63 | renderPlot({ 64 | ggplot(plot_data(), aes(factor(airline), pred_gain)) + 65 | geom_bar(stat = "identity", fill = '#2780E3') + 66 | geom_point(aes(factor(airline), avg_gain)) + 67 | coord_flip() + 68 | labs(x = "", y = "Time gained in flight (minutes)") + 69 | labs(title = "Observed gain (point) vs Predicted gain (bar)") 70 | }) 71 | ``` 72 | 73 | ### Route 74 | 75 | ```{r} 76 | # Identify origin lat and long 77 | origin <- reactive({ 78 | req(input$origin) 79 | filter(airports, faa == input$origin) 80 | }) 81 | 82 | # Identify destination lat and long 83 | dest <- reactive({ 84 | req(input$dest) 85 | filter(airports, faa == input$dest) 86 | }) 87 | 88 | # Plot route 89 | renderLeaflet({ 90 | gcIntermediate( 91 | select(origin(), lon, lat), 92 | select(dest(), lon, lat), 93 | n=100, addStartEnd=TRUE, sp=TRUE 94 | ) %>% 95 | leaflet() %>% 96 | addProviderTiles("CartoDB.Positron") %>% 97 | addPolylines() 98 | }) 99 | ``` 100 | 101 | Row 102 | ----------------------------------------------------------------------- 103 | 104 | ### Data details 105 | 106 | ```{r} 107 | # Print table of observed and predicted gains by airline 108 | renderDataTable( 109 | datatable(plot_data()) %>% 110 | formatRound(c("flights", "distance"), 0) %>% 111 | formatRound(c("avg_arr_delay", "avg_dep_delay", "avg_gain", "pred_gain"), 1) 112 | ) 113 | ``` 114 | 115 | Model Details 116 | 
======================================================================== 117 | 118 | ```{r} 119 | renderPrint(ml1_summary) 120 | ``` 121 | -------------------------------------------------------------------------------- /prod/notebooks/end-to-end-flights/flights_pred_2008.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/notebooks/end-to-end-flights/flights_pred_2008.RData -------------------------------------------------------------------------------- /prod/notebooks/ml_classification_titanic/titanic-parquet/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /prod/notebooks/ml_classification_titanic/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/notebooks/ml_classification_titanic/titanic-parquet/.part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet.crc -------------------------------------------------------------------------------- /prod/notebooks/ml_classification_titanic/titanic-parquet/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/notebooks/ml_classification_titanic/titanic-parquet/_SUCCESS -------------------------------------------------------------------------------- /prod/notebooks/ml_classification_titanic/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/notebooks/ml_classification_titanic/titanic-parquet/part-r-00000-69484ddd-e601-45e0-bea0-eb7b5b8b23eb.snappy.parquet -------------------------------------------------------------------------------- /prod/notebooks/taxi_demo/readme.md: -------------------------------------------------------------------------------- 1 | TO DO -------------------------------------------------------------------------------- /prod/presentations/cazena/01_taxiR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NYC Taxi - One month in R" 3 | output: html_notebook 4 | --- 5 | 6 |
7 | ![R for Data Science http://r4ds.had.co.nz/](http://r4ds.had.co.nz/diagrams/data-science.png) 8 |
9 | 10 | # Load tidyverse 11 | 12 | ```{r tidyverse} 13 | library(tidyverse) 14 | library(lubridate) 15 | ``` 16 | 17 | # Download 18 | 19 | ```{r download, eval=FALSE} 20 | download.file( 21 | "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.csv", 22 | "yellow_tripdata_2015-12.csv") 23 | ``` 24 | 25 | # Import Dataset 26 | 27 | ```{r import, message=FALSE, warning=FALSE} 28 | trips <- read_csv("yellow_tripdata_2015-12.csv", n_max = 1000000) 29 | ``` 30 | 31 | # Tidy 32 | 33 | ```{r tidy} 34 | # pickups 35 | select(trips, tpep_pickup_datetime, pickup_latitude, pickup_longitude) 36 | 37 | # dropoffs 38 | select(trips, tpep_dropoff_datetime, dropoff_latitude, dropoff_longitude) 39 | 40 | # trips 41 | trips 42 | ``` 43 | 44 | # Transform 45 | 46 | ```{r transform} 47 | tripsHour <- trips %>% 48 | filter(payment_type %in% c(1, 2)) %>% 49 | mutate(pay_type = ifelse(payment_type == 1, "credit", "cash")) %>% 50 | mutate(trip_time_sec = tpep_dropoff_datetime - tpep_pickup_datetime) %>% 51 | mutate(trip_time_min = as.numeric(trip_time_sec / 60)) %>% 52 | mutate(hour = round_date(tpep_pickup_datetime, "hour")) %>% 53 | group_by(pay_type, hour) %>% 54 | summarize(n = n(), 55 | tip_amount = mean(tip_amount), 56 | fare_amount = mean(fare_amount), 57 | passenger_count = mean(passenger_count), 58 | trip_time = mean(trip_time_min), 59 | trip_distance = mean(trip_distance)) 60 | tripsHour 61 | ``` 62 | 63 | # Visualize 64 | 65 | ```{r visualize} 66 | ggplot(tripsHour, aes(fare_amount, color = pay_type)) + 67 | geom_density() + 68 | labs(title = "NYC taxi fare amount", x = "Fare Amount", y = "Density", caption = '2015-12') 69 | 70 | qplot(trip_distance, data=tripsHour, geom="density", log="x", facets = ~pay_type) 71 | ``` 72 | 73 | # Model 74 | 75 | ```{r model} 76 | # Formula 77 | model_formula <- formula(tip_amount ~ fare_amount + pay_type + passenger_count) 78 | 79 | # Model data (rows with missing values removed) 80 | tripsModel <- tripsHour %>% 81 | select(tip_amount, fare_amount, pay_type, passenger_count) %>% 82 | na.omit 83 | 84 | # Linear Model 85 | m1 <- lm(model_formula, data = tripsModel) 86 | summary(m1) 87 | 88 | # Decision tree 89 | library(rpart) 90 | m2 <- rpart(model_formula, tripsModel) 91 | summary(m2) 92 | 93 | # Predict 94 | pred <- tripsHour %>% 95 | ungroup %>% 96 | mutate(lm_fit = predict(m1, tripsHour)) %>% 97 | mutate(lm_res = tip_amount - lm_fit) %>% 98 | mutate(rpart_fit = predict(m2, tripsHour)) %>% 99 | mutate(rpart_res = tip_amount - rpart_fit) 100 | 101 | # MSE 102 | pred %>% 103 | na.omit() %>% 104 | summarize(lm_mse = mean(lm_res^2), rpart_mse = mean(rpart_res^2)) 105 | 106 | # Plot 107 | ggplot(pred, aes(rpart_fit, lm_fit)) + geom_point() + geom_smooth(method="lm") 108 | ``` 109 | 110 | # Communicate 111 | 112 | This analysis of one month of NYC Taxi data shows that you can predict tip amount as a function of fare amount, pay type, and passenger count. For a detailed explanation of the code you can view this report in the following formats: 113 | 114 | * HTML 115 | * PDF 116 | * Word 117 | -------------------------------------------------------------------------------- /prod/presentations/cazena/README.md: -------------------------------------------------------------------------------- 1 | # Analyze data with sparklyr 2 | 3 | ## Abstract 4 | 5 | Sparklyr is an R package that lets you analyze data in Spark while using familiar tools in R. Sparklyr supports a complete backend for dplyr, a popular tool for working with data frame objects both in memory and out of memory.
You can use dplyr to translate R code into Spark SQL. Sparklyr also supports MLlib so you can run classifiers, regressions, clustering, decision trees, and many more machine learning algorithms on your distributed data in Spark. With sparklyr you can analyze large amounts of data that would not traditionally fit into R memory. Then you can collect results from Spark into R for further visualization and documentation. 6 | 7 | 8 | -------------------------------------------------------------------------------- /prod/presentations/cazena/emr_setup.sh: -------------------------------------------------------------------------------- 1 | ### Build EMR master node with Taxi Data 2 | ### Nathan Stephens 3 | ### 3/27/2017 4 | 5 | ########################################### 6 | ### Run as root 7 | ########################################### 8 | 9 | ## RSP 10 | 11 | # Update 12 | sudo yum update 13 | 14 | # R 15 | sudo yum install -y R libcurl-devel openssl-devel git 16 | 17 | # install RSP 18 | wget -q https://download2.rstudio.org/current.ver -O /tmp/rsp.current.ver 19 | wget -O /tmp/rstudio-server-rhel.rpm https://s3.amazonaws.com/rstudio-dailybuilds/rstudio-server-rhel-pro-$(cat /tmp/rsp.current.ver)-x86_64.rpm 20 | sudo yum install -y --nogpgcheck /tmp/rstudio-server-rhel.rpm 21 | 22 | # install packages 23 | sudo Rscript -e 'install.packages("sparklyr", repos = "http://cran.rstudio.com/")' 24 | sudo Rscript -e 'install.packages("devtools", repos = "http://cran.rstudio.com/")' 25 | sudo Rscript -e 'install.packages("tidyverse", repos = "http://cran.rstudio.com/")' 26 | sudo Rscript -e 'install.packages("leaflet", repos = "http://cran.rstudio.com/")' 27 | sudo Rscript -e 'install.packages("DT", repos = "http://cran.rstudio.com/")' 28 | 29 | ########################################### 30 | 31 | ## Add rstudio user 32 | sudo useradd -m rstudio 33 | sudo echo rstudio | passwd rstudio --stdin 34 | sudo usermod -a -G hadoop rstudio 35 | sudo usermod -a -G hive rstudio 36 | 37 | 38 | ########################################### 39 | ### Run as rstudio 40 | ########################################### 41 | 42 | ## switch user 43 | su rstudio 44 | cd ~ 45 | 46 | ## add rstudio directory 47 | hadoop fs -mkdir /user/rstudio 48 | hadoop fs -chown rstudio:rstudio /user/rstudio 49 | 50 | ## clone project 51 | git clone https://github.com/rstudio/sparkDemos.git /home/rstudio/sparkDemos 52 | cat >/home/rstudio/sparkDemos/sparkDemos.Rproj <> nyct2010.log & 71 | nohup /usr/bin/s3-dist-cp --src=s3n://rstudio-sparkdemo-data/nyc-taxi/parquet_nohead/trips --dest=hdfs:///user/rstudio/trips_par >> trips_par.log & 72 | nohup /usr/bin/s3-dist-cp --src=s3n://rstudio-sparkdemo-data/nyc-taxi/parquet/trips_model_data --dest=hdfs:///user/rstudio/trips_model_data >> trips_model_data.log & 73 | 74 | 75 | ########################################### 76 | ### Open Hive 77 | ########################################### 78 | 79 | hive 80 | 81 | # Hive 1 82 | 83 | CREATE EXTERNAL TABLE IF NOT EXISTS nyct2010( 84 | gid int, 85 | ctlabel float, 86 | borocode int, 87 | boroname string, 88 | ct2010 int, 89 | boroct2010 int, 90 | cdeligibil string, 91 | ntacode string, 92 | ntaname string, 93 | puma int) 94 | ROW FORMAT DELIMITED 95 | FIELDS TERMINATED BY ',' 96 | LINES TERMINATED BY '\n' 97 | ; 98 | 99 | LOAD DATA INPATH '/user/rstudio/nyct2010' INTO TABLE nyct2010; 100 | 101 | # Hive 2 102 | 103 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_par( 104 | id int, 105 | cab_type_id int, 106 | vendor_id string, 107 | pickup_datetime timestamp, 108
| dropoff_datetime timestamp, 109 | store_and_fwd_flag string, 110 | rate_code_id string, 111 | pickup_longitude float, 112 | pickup_latitude float, 113 | dropoff_longitude float, 114 | dropoff_latitude float, 115 | passenger_count bigint, 116 | trip_distance float, 117 | fare_amount float, 118 | extra bigint, 119 | mta_tax string, 120 | tip_amount float, 121 | tolls_amount float, 122 | ehail_fee string, 123 | improvement_surcharge string, 124 | total_amount float, 125 | payment_type string, 126 | trip_type string, 127 | pickup_nyct2010_gid int, 128 | dropoff_nyct2010_gid int) 129 | stored as parquet; 130 | 131 | LOAD DATA INPATH '/user/rstudio/trips_par' INTO TABLE trips_par; 132 | 133 | 134 | # Hive 3 135 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_model_data( 136 | pickup_datetime timestamp, 137 | pickup_latitude float, 138 | pickup_longitude float, 139 | pickup_nyct2010_gid int, 140 | pickup_boro string, 141 | pickup_nta string, 142 | dropoff_datetime timestamp, 143 | dropoff_latitude float, 144 | dropoff_longitude float, 145 | dropoff_nyct2010_gid int, 146 | dropoff_boro string, 147 | dropoff_nta string, 148 | cab_type string, 149 | passenger_count bigint, 150 | trip_distance float, 151 | pay_type string, 152 | fare_amount float, 153 | tip_amount float, 154 | other_amount float, 155 | total_amount float) 156 | stored as parquet; 157 | 158 | LOAD DATA INPATH '/user/rstudio/trips_model_data' INTO TABLE trips_model_data; 159 | 160 | -------------------------------------------------------------------------------- /prod/presentations/cazena/kerberos.R: -------------------------------------------------------------------------------- 1 | system("echo '' | kinit ") 2 | -------------------------------------------------------------------------------- /prod/presentations/cazena/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 
44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /prod/presentations/cazena/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) { 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) 
{ 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /prod/presentations/cloudera/livy-architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/cloudera/livy-architecture.png -------------------------------------------------------------------------------- /prod/presentations/cloudera/livy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Connecting to Spark through Livy" 3 | output: html_notebook 4 | --- 5 | 6 | With Livy you can analyze data in your Spark cluster via R on your desktop. 7 | 8 | ## Livy 9 | 10 | Livy is a service that enables easy interaction with an Apache Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as SparkContext management, all via a simple REST interface or an RPC client library. Livy also simplifies the interaction between Spark and application servers, thus enabling the use of Spark for interactive web/mobile applications. 11 | 12 |
13 | ![Image](http://livy.io/img/livy-architecture.png) 14 |
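To make the REST interface concrete, here is a minimal sketch of talking to Livy directly from R with httr. It is illustrative only: sparklyr issues these calls for you when you connect with `method = "livy"`, and the host, port, and payloads below are assumptions based on Livy's documented `/sessions` and `/sessions/{id}/statements` endpoints.

```{r, eval=FALSE}
# Illustrative sketch only; sparklyr normally manages these REST calls.
# Assumes a Livy server listening on localhost:8998.
library(httr)

# Create a new Spark session (POST /sessions)
resp <- POST("http://localhost:8998/sessions",
             body = list(kind = "spark"), encode = "json")
session <- content(resp)

# Submit a snippet of Spark (Scala) code to that session
POST(paste0("http://localhost:8998/sessions/", session$id, "/statements"),
     body = list(code = "1 + 1"), encode = "json")
```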
15 | 16 | ## Start Livy 17 | 18 | Set home environment variables and start a Livy server to handle local requests. 19 | 20 | ```{bash} 21 | export JAVA_HOME=/usr/lib/jvm/java-7-oracle-cloudera 22 | export SPARK_HOME=/opt/cloudera/parcels/CDH/lib/spark 23 | /home/ubuntu/livy/livy-server-0.2.0/bin/livy-server 24 | ``` 25 | 26 | ## Connect to Spark 27 | 28 | Use `method = "livy"` to connect to the cluster. 29 | 30 | ```{r} 31 | library(sparklyr) 32 | library(dplyr) 33 | sc <- spark_connect( 34 | master = "http://ec2-***.us-west-2.compute.amazonaws.com:8998", 35 | method = "livy") 36 | ``` 37 | 38 | ## Analyze 39 | 40 | Use R code on your workstation as you normally would. Your R commands will be sent to the cluster via Livy for processing. Collect your results back to the desktop for further processing in R. 41 | 42 | ```{r} 43 | library(ggplot2) 44 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 45 | pickup_dropoff_tbl <- trips_model_data_tbl %>% 46 | filter(pickup_nta == "Turtle Bay-East Midtown" & dropoff_nta == "Airport") %>% 47 | mutate(pickup_hour = hour(pickup_datetime)) %>% 48 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 49 | group_by(pickup_hour) %>% 50 | summarize(n = n(), 51 | trip_time_mean = mean(trip_time), 52 | trip_time_p10 = percentile(trip_time, 0.10), 53 | trip_time_p25 = percentile(trip_time, 0.25), 54 | trip_time_p50 = percentile(trip_time, 0.50), 55 | trip_time_p75 = percentile(trip_time, 0.75), 56 | trip_time_p90 = percentile(trip_time, 0.90)) 57 | 58 | # Collect results 59 | pickup_dropoff <- collect(pickup_dropoff_tbl) 60 | 61 | # Plot 62 | ggplot(pickup_dropoff, aes(x = pickup_hour)) + 63 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 64 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, 65 | alpha = "25–75th percentile")) + 66 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, 67 | alpha = "10–90th percentile")) + 68 | scale_y_continuous("trip duration in minutes") 69 | ``` 70 | -------------------------------------------------------------------------------- /prod/presentations/cloudera/readme.md: -------------------------------------------------------------------------------- 1 | # Demo using CDH 5.9 2 | 3 | This repo contains files for demonstrating Spark and R on Cloudera using sparklyr.
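The histogram and raster wrappers listed under Scripts below can be sourced into any of the demos. A hedged usage sketch, assuming an open sparklyr connection `sc` and the `trips_model_data` table created by the setup script (the plot titles are placeholders):

```r
# Usage sketch for the sqlvis wrapper scripts in this folder.
# Assumes an open sparklyr connection `sc` and the demo's
# trips_model_data table registered in the metastore.
library(sparklyr)
library(dplyr)
library(ggplot2)
source("sqlvis_histogram.R")
source("sqlvis_raster.R")

trips <- tbl(sc, "trips_model_data")

# Histogram: bins computed in Spark, plotted locally
trips %>%
  sqlvis_compute_histogram("fare_amount", bins = 30) %>%
  sqlvis_ggplot_histogram(title = "Fare amount")

# Raster (tile) plot of pickup locations
trips %>%
  sqlvis_compute_raster("pickup_longitude", "pickup_latitude", resolution = 300) %>%
  sqlvis_ggplot_raster(title = "Pickup density")
```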
4 | 5 | ### Scripts 6 | 7 | * Taxi Demo 8 | * Livy Connection 9 | * Histogram wrappers 10 | * Raster wrappers 11 | 12 | ### Reports 13 | 14 | * [Data Science Toolchain with Spark and R](http://colorado.rstudio.com:3939/content/262/taxiDemoCloudera3.nb.html) 15 | * [Connecting to Spark through Livy](http://colorado.rstudio.com:3939/content/259/livy.nb.html) 16 | 17 | ### Reference 18 | 19 | * [spark.rstudio.com](http://spark.rstudio.com/) -------------------------------------------------------------------------------- /prod/presentations/cloudera/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /prod/presentations/cloudera/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) 
{ 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) { 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/README.md: -------------------------------------------------------------------------------- 1 | # Analyze data with sparklyr 2 | 3 | ## Abstract 4 | 5 | Sparklyr is an R package that lets you analyze data in Spark while using familiar tools in R. Sparklyr supports a complete backend for dplyr, a popular tool for working with data frame objects both in memory and out of memory. You can use dplyr to translate R code into Spark SQL. Sparklyr also supports MLlib so you can run classifiers, regressions, clustering, decision trees, and many more machine learning algorithms on your distributed data in Spark. With sparklyr you can analyze large amounts of data that would not traditionally fit into R memory. Then you can collect results from Spark into R for further visualization and documentation. 6 | 7 | Sparklyr is also extensible. You can create R packages that depend on sparklyr to call the full Spark API. One example of an extension is H2O’s rsparkling, an R package that works with H2O’s machine learning algorithms. With sparklyr and rsparkling you have access to all the tools in H2O for analysis with R and Spark.
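As a concrete illustration of the workflow described in the abstract, here is a minimal sketch. It assumes only a local Spark installation and a recent sparklyr; the `mtcars` data and the model are placeholders, not part of the conference demos.

```r
library(sparklyr)
library(dplyr)

sc <- spark_connect(master = "local")
mtcars_tbl <- copy_to(sc, mtcars)

# dplyr verbs are translated into Spark SQL
avg_mpg <- mtcars_tbl %>%
  group_by(cyl) %>%
  summarize(mpg = mean(mpg))
show_query(avg_mpg)  # inspect the generated SQL

# MLlib through sparklyr
fit <- ml_linear_regression(mtcars_tbl, mpg ~ wt + cyl)

# Collect results back into R for visualization and documentation
local_df <- collect(avg_mpg)

spark_disconnect(sc)
```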
8 | 9 | ## Documents 10 | 11 | * [Data Science Toolchain with Spark and R](http://colorado.rstudio.com:3939/content/276/taxiDemoH2O.nb.html) 12 | * [Connecting to Spark through Livy](http://colorado.rstudio.com:3939/content/289/livy.nb.html) 13 | 14 | ## Slides 15 | 16 | ![](img/img.001.jpeg) 17 | 18 | *** 19 | 20 | ![](img/img.002.jpeg) 21 | 22 | *** 23 | 24 | ![](img/img.003.jpeg) 25 | 26 | *** 27 | 28 | ![](img/img.004.jpeg) 29 | 30 | *** 31 | 32 | ![](img/img.005.jpeg) 33 | 34 | *** 35 | 36 | ![](img/img.006.jpeg) 37 | 38 | *** 39 | 40 | ![](img/img.007.jpeg) 41 | 42 | *** 43 | 44 | ![](img/img.008.jpeg) 45 | 46 | *** 47 | 48 | ![](img/img.009.jpeg) 49 | 50 | *** 51 | 52 | ![](img/img.010.jpeg) 53 | 54 | *** 55 | 56 | ![](img/img.011.jpeg) 57 | 58 | *** 59 | 60 | ![](img/img.012.jpeg) 61 | 62 | *** 63 | 64 | ![](img/img.013.jpeg) 65 | 66 | *** 67 | 68 | ![](img/img.014.jpeg) 69 | 70 | *** 71 | 72 | ![](img/img.015.jpeg) 73 | 74 | *** 75 | 76 | ![](img/img.016.jpeg) 77 | 78 | *** 79 | 80 | ![](img/img.017.jpeg) 81 | -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.001.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.001.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.002.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.002.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.003.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.003.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.004.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.004.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.005.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.005.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.006.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.006.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.007.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.007.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.008.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.008.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.009.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.009.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.010.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.010.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.011.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.011.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.012.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.012.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.013.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.013.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.014.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.014.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.015.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.015.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.016.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.016.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/img/img.017.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/sparkSummitEast/img/img.017.jpeg -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/livy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Connecting to Spark through Livy" 3 | output: html_notebook 4 | --- 5 | 6 | With Livy you can analyze data in your Spark cluster via R on your desktop. 7 | 8 | ## Livy 9 | 10 | Livy is a service that enables easy interaction with an Apache Spark cluster over a REST interface. It enables easy submission of Spark jobs or snippets of Spark code, synchronous or asynchronous result retrieval, as well as SparkContext management, all via a simple REST interface or an RPC client library. Livy also simplifies the interaction between Spark and application servers, thus enabling the use of Spark for interactive web/mobile applications. 11 | 12 |
13 | ![](http://livy.io/img/livy-architecture.png) 14 |
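If the Livy endpoint requires authentication, recent sparklyr versions provide `livy_config()` to attach credentials to the connection used in the steps below; a hedged sketch with placeholder host and credentials:

```{r, eval=FALSE}
# Hedged sketch: connecting to a password-protected Livy endpoint.
# The host, user, and password are placeholders.
library(sparklyr)

cfg <- livy_config(username = "user", password = "password")
sc <- spark_connect(
  master = "http://localhost:8998",
  method = "livy",
  config = cfg)
```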
15 | 16 | ## Start Livy [Server Side] 17 | 18 | Install Livy and start a Livy service to handle local requests. 19 | 20 | ```{r, eval=FALSE} 21 | sparklyr::livy_install() 22 | sparklyr::livy_service_start() 23 | ``` 24 | 25 | ## Connect to Spark [Client Side] 26 | 27 | Use `method = "livy"` to connect to the cluster. 28 | 29 | ```{r warning=FALSE, eval=FALSE} 30 | library(sparklyr) 31 | library(dplyr) 32 | sc <- spark_connect( 33 | master = "http://ec2-***-**-***-**.compute-1.amazonaws.com:8998/", 34 | method = "livy") 35 | ``` 36 | 37 | ## Analyze [Client Side] 38 | 39 | Use R code on your workstation as you normally would. Your R commands will be sent to the cluster via Livy for processing. Collect your results back to the desktop for further processing in R. 40 | 41 | ```{r eval=FALSE} 42 | library(ggplot2) 43 | trips_model_data_tbl <- tbl(sc, "trips_model_data") 44 | pickup_dropoff_tbl <- trips_model_data_tbl %>% 45 | filter(pickup_nta == "Turtle Bay-East Midtown" & dropoff_nta == "Airport") %>% 46 | mutate(pickup_hour = hour(pickup_datetime)) %>% 47 | mutate(trip_time = unix_timestamp(dropoff_datetime) - unix_timestamp(pickup_datetime)) %>% 48 | group_by(pickup_hour) %>% 49 | summarize(n = n(), 50 | trip_time_mean = mean(trip_time), 51 | trip_time_p10 = percentile(trip_time, 0.10), 52 | trip_time_p25 = percentile(trip_time, 0.25), 53 | trip_time_p50 = percentile(trip_time, 0.50), 54 | trip_time_p75 = percentile(trip_time, 0.75), 55 | trip_time_p90 = percentile(trip_time, 0.90)) 56 | 57 | # Collect results 58 | pickup_dropoff <- collect(pickup_dropoff_tbl) 59 | 60 | # Plot 61 | ggplot(pickup_dropoff, aes(x = pickup_hour)) + 62 | geom_line(aes(y = trip_time_p50, alpha = "Median")) + 63 | geom_ribbon(aes(ymin = trip_time_p25, ymax = trip_time_p75, 64 | alpha = "25–75th percentile")) + 65 | geom_ribbon(aes(ymin = trip_time_p10, ymax = trip_time_p90, 66 | alpha = "10–90th percentile")) + 67 | scale_y_continuous("trip duration in minutes") 68 | ``` 69 | 70 | ## Disconnect [Server Side] 71 | 72 | ```{r disconnect, eval=FALSE} 73 | sparklyr::livy_service_stop() 74 | ``` 75 | 76 | 77 | -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 |
geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = colnames(plot_table)[3], y = colnames(plot_table)[2], ...) 44 | } 45 | 46 | sqlvis_ggvis_histogram <- function(plot_table, ...){ 47 | plot_table %>% 48 | select(x = 3, y = 2) %>% 49 | ggvis(x = ~x, y = ~y) %>% 50 | layer_bars() %>% 51 | add_axis("x", title = colnames(plot_table)[3]) %>% 52 | add_axis("y", title = colnames(plot_table)[2]) 53 | } 54 | 55 | -------------------------------------------------------------------------------- /prod/presentations/sparkSummitEast/sqlvis_raster.R: -------------------------------------------------------------------------------- 1 | ### Big data tile plot 2 | 3 | # data <- tbl(sc, "trips_model_data") 4 | # x_field <- "pickup_longitude" 5 | # y_field <- "pickup_latitude" 6 | # resolution <- 50 7 | 8 | sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){ 9 | 10 | data_prep <- data %>% 11 | select_(x = x_field, y = y_field) %>% 12 | filter(!is.na(x), !is.na(y)) 13 | 14 | s <- data_prep %>% 15 | summarise(max_x = max(x), 16 | max_y = max(y), 17 | min_x = min(x), 18 | min_y = min(y)) %>% 19 | mutate(rng_x = max_x - min_x, 20 | rng_y = max_y - min_y, 21 | resolution = resolution) %>% 22 | collect() 23 | 24 | counts <- data_prep %>% 25 | mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0), 26 | res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>% 27 | count(res_x, res_y) %>% 28 | collect 29 | 30 | list(counts = counts, 31 | limits = s, 32 | vnames = c(x_field, y_field) 33 | ) 34 | 35 | } 36 | 37 | sqlvis_ggplot_raster <- function(data, ...) { 38 | 39 | d <- data$counts 40 | s <- data$limits 41 | v <- data$vnames 42 | 43 | xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6),2)) 44 | yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6),2)) 45 | 46 | ggplot(d, aes(res_x, res_y)) + 47 | geom_raster(aes(fill = n)) + 48 | coord_fixed() + 49 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 50 | scale_x_continuous(breaks = xx, labels = names(xx)) + 51 | scale_y_continuous(breaks = yy, labels = names(yy)) + 52 | labs(x = v[1], y = v[2], ...) 53 | 54 | } 55 | 56 | ### Facets 57 | 58 | sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){ 59 | 60 | data_prep <- data %>% 61 | mutate_(group = g_field) %>% 62 | select_(g = "group", x = x_field, y = y_field) %>% 63 | filter(!is.na(x), !is.na(y)) 64 | 65 | s <- data_prep %>% 66 | summarise(max_x = max(x), 67 | max_y = max(y), 68 | min_x = min(x), 69 | min_y = min(y)) %>% 70 | mutate(rng_x = max_x - min_x, 71 | rng_y = max_y - min_y, 72 | resolution = resolution) %>% 73 | collect() 74 | 75 | counts <- data_prep %>% 76 | mutate(res_x = round((x-s$min_x)/s$rng_x*resolution, 0), 77 | res_y = round((y-s$min_y)/s$rng_y*resolution, 0)) %>% 78 | count(g, res_x, res_y) %>% 79 | collect 80 | 81 | list(counts = counts, 82 | limits = s, 83 | vnames = c(x_field, y_field) 84 | ) 85 | 86 | } 87 | 88 | sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) 
{ 89 | 90 | s <- data$limits 91 | d <- data$counts 92 | v <- data$vnames 93 | 94 | xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1)) 95 | yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1)) 96 | 97 | ggplot(d, aes(res_x, res_y)) + 98 | geom_raster(aes(fill = n)) + 99 | coord_fixed() + 100 | facet_wrap(~ g, ncol = ncol) + 101 | scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") + 102 | scale_x_continuous(breaks = xx, labels = names(xx)) + 103 | scale_y_continuous(breaks = yy, labels = names(yy)) + 104 | labs(x = v[1], y = v[2], ...) 105 | 106 | } 107 | -------------------------------------------------------------------------------- /prod/presentations/tidyverse/01_taxiR.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NYC Taxi - One month in R" 3 | output: html_notebook 4 | --- 5 | 6 |
7 | ![R for Data Science http://r4ds.had.co.nz/](http://r4ds.had.co.nz/diagrams/data-science.png) 8 |
9 | 10 | # Load tidyverse 11 | 12 | ```{r tidyverse} 13 | library(tidyverse) 14 | library(lubridate) 15 | ``` 16 | 17 | # Download 18 | 19 | ```{r download, eval=FALSE} 20 | download.file( 21 | "https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_2015-12.csv", 22 | "yellow_tripdata_2015-12.csv") 23 | ``` 24 | 25 | # Import Dataset 26 | 27 | ```{r import, message=FALSE, warning=FALSE} 28 | trips <- read_csv("~/sparkDemos/prod/presentations/tidyverse/yellow_tripdata_2015-12.csv") 29 | ``` 30 | 31 | # Tidy 32 | 33 | ```{r tidy} 34 | # pickups 35 | select(trips, tpep_pickup_datetime, pickup_latitude, pickup_longitude) 36 | 37 | # dropoffs 38 | select(trips, tpep_dropoff_datetime, dropoff_latitude, dropoff_longitude) 39 | 40 | # trips 41 | trips 42 | ``` 43 | 44 | # Transform 45 | 46 | ```{r transform} 47 | tripsHour <- trips %>% 48 | filter(payment_type %in% c(1, 2)) %>% 49 | mutate(pay_type = ifelse(payment_type == 1, "credit", "cash")) %>% 50 | mutate(trip_time_sec = tpep_dropoff_datetime - tpep_pickup_datetime) %>% 51 | mutate(trip_time_min = as.numeric(trip_time_sec / 60)) %>% 52 | mutate(hour = round_date(tpep_pickup_datetime, "hour")) %>% 53 | group_by(pay_type, hour) %>% 54 | summarize(n = n(), 55 | tip_amount = mean(tip_amount), 56 | fare_amount = mean(fare_amount), 57 | passenger_count = mean(passenger_count), 58 | trip_time = mean(trip_time_min), 59 | trip_distance = mean(trip_distance)) 60 | tripsHour 61 | ``` 62 | 63 | # Visualize 64 | 65 | ```{r visualize} 66 | ggplot(tripsHour, aes(trip_time, trip_distance, color = pay_type)) + 67 | geom_point() + geom_smooth() + 68 | labs(title = "NYC Taxi by hour by day", x = "Minutes", y = "Miles", caption = '2015-12') 69 | 70 | ggplot(tripsHour, aes(trip_distance, fare_amount)) + 71 | geom_point() + geom_smooth() + facet_grid(~pay_type) + 72 | labs(title = "NYC Taxi by hour by day", x = "Distance", y = "Dollars", caption = '2015-12') 73 | ``` 74 | 75 | # Model 76 | 77 | ```{r model} 78 | # Formula 79 | model_formula <- formula(tip_amount ~ fare_amount + pay_type + passenger_count) 80 | 81 | # Model data (rows with missing values removed) 82 | tripsModel <- tripsHour %>% 83 | select(tip_amount, fare_amount, pay_type, passenger_count) %>% 84 | na.omit 85 | 86 | # Linear Model 87 | m1 <- lm(model_formula, data = tripsModel) 88 | summary(m1) 89 | 90 | # Decision tree 91 | library(rpart) 92 | m2 <- rpart(model_formula, tripsModel) 93 | summary(m2) 94 | 95 | # Predict 96 | pred <- tripsHour %>% 97 | ungroup %>% 98 | mutate(lm_fit = predict(m1, tripsHour)) %>% 99 | mutate(lm_res = tip_amount - lm_fit) %>% 100 | mutate(rpart_fit = predict(m2, tripsHour)) %>% 101 | mutate(rpart_res = tip_amount - rpart_fit) 102 | 103 | # MSE 104 | pred %>% 105 | na.omit() %>% 106 | summarize(lm_mse = mean(lm_res^2), rpart_mse = mean(rpart_res^2)) 107 | 108 | # Plot 109 | ggplot(pred, aes(rpart_fit, lm_fit)) + geom_point() + geom_smooth(method="lm") 110 | ``` 111 | 112 | # Communicate 113 | 114 | This analysis of one month of NYC Taxi data shows that you can predict tip amount as a function of fare amount, pay type, and passenger count.
For a detailed explanation of the code you can view this report in the following formats: 115 | 116 | * HTML 117 | * PDF 118 | * Word 119 | -------------------------------------------------------------------------------- /prod/presentations/tidyverse/README.md: -------------------------------------------------------------------------------- 1 | # Analyze data with sparklyr 2 | 3 | ## Abstract 4 | 5 | Sparklyr is an R package that lets you analyze data in Spark while using familiar tools in R. Sparklyr supports a complete backend for dplyr, a popular tool for working with data frame objects both in memory and out of memory. You can use dplyr to translate R code into Spark SQL. Sparklyr also supports MLlib so you can run classifiers, regressions, clustering, decision trees, and many more machine learning algorithms on your distributed data in Spark. With sparklyr you can analyze large amounts of data that would not traditionally fit into R memory. Then you can collect results from Spark into R for further visualization and documentation. 6 | 7 | ## Documents 8 | 9 | These documents are for understanding the toolchain and the tidyverse using the famous NYC taxi data. 10 | 11 | * [Data Science Toolchain with Spark and R](http://colorado.rstudio.com:3939/content/420/taxiDemo.nb.html) 12 | * [Tidyverse and R Notebooks with NYC Taxi Data](http://colorado.rstudio.com:3939/content/421/taxiR.nb.html) 13 | 14 | ## Slides 15 | 16 | ![](img/tidyverse.001.jpeg) 17 | 18 | *** 19 | 20 | ![](img/tidyverse.002.jpeg) 21 | 22 | *** 23 | 24 | ![](img/tidyverse.003.jpeg) 25 | 26 | *** 27 | 28 | ![](img/tidyverse.004.jpeg) 29 | 30 | *** 31 | 32 | ![](img/tidyverse.005.jpeg) 33 | 34 | *** 35 | 36 | ![](img/tidyverse.006.jpeg) 37 | 38 | *** 39 | 40 | ![](img/tidyverse.007.jpeg) 41 | 42 | *** 43 | 44 | ![](img/tidyverse.008.jpeg) 45 | 46 | *** 47 | 48 | ![](img/tidyverse.009.jpeg) 49 | 50 | *** 51 | 52 | ![](img/tidyverse.010.jpeg) 53 | 54 | *** 55 | 56 | ![](img/tidyverse.011.jpeg) 57 | 58 | *** 59 | 60 | ![](img/tidyverse.012.jpeg) 61 | 62 | *** 63 | 64 | ![](img/tidyverse.013.jpeg) 65 | 66 | *** 67 | 68 | ![](img/tidyverse.014.jpeg) 69 | 70 | *** 71 | 72 | ![](img/tidyverse.015.jpeg) 73 | 74 | *** 75 | 76 | ![](img/tidyverse.016.jpeg) 77 | 78 | -------------------------------------------------------------------------------- /prod/presentations/tidyverse/emr_setup.sh: -------------------------------------------------------------------------------- 1 | ### Build EMR master node with Taxi Data 2 | ### Nathan Stephens 3 | ### 3/27/2017 4 | 5 | ########################################### 6 | ### Run as root 7 | ########################################### 8 | 9 | ## RSP 10 | 11 | # Update 12 | sudo yum update 13 | 14 | # R 15 | sudo yum install -y R libcurl-devel openssl-devel git 16 | 17 | # install RSP 18 | wget -q https://download2.rstudio.org/current.ver -O /tmp/rsp.current.ver 19 | wget -O /tmp/rstudio-server-rhel.rpm https://s3.amazonaws.com/rstudio-dailybuilds/rstudio-server-rhel-pro-$(cat /tmp/rsp.current.ver)-x86_64.rpm 20 | sudo yum install -y --nogpgcheck /tmp/rstudio-server-rhel.rpm 21 | 22 | # install packages 23 | sudo Rscript -e 'install.packages("sparklyr", repos = "http://cran.rstudio.com/")' 24 | sudo Rscript -e 'install.packages("devtools", repos = "http://cran.rstudio.com/")' 25 | sudo Rscript -e 'install.packages("tidyverse", repos = "http://cran.rstudio.com/")' 26 | sudo Rscript -e 'install.packages("leaflet", repos = "http://cran.rstudio.com/")' 27 | sudo Rscript -e 'install.packages("DT", repos
= "http://cran.rstudio.com/")' 28 | 29 | ########################################### 30 | 31 | ## Add rstudio user 32 | sudo useradd -m rstudio 33 | sudo echo rstudio | passwd rstudio --stdin 34 | sudo usermod -a -G hadoop rstudio 35 | sudo usermod -a -G hive rstudio 36 | 37 | 38 | ########################################### 39 | ### Run as rstudio 40 | ########################################### 41 | 42 | ## switch user 43 | su rstudio 44 | cd ~ 45 | 46 | ## add rstudio directory 47 | hadoop fs -mkdir /user/rstudio 48 | hadoop fs -chown rstudio:rstudio /user/rstudio 49 | 50 | ## clone project 51 | git clone https://github.com/rstudio/sparkDemos.git /home/rstudio/sparkDemos 52 | cat >/home/rstudio/sparkDemos/sparkDemos.Rproj <> nyct2010.log & 71 | nohup /usr/bin/s3-dist-cp --src=s3n://rstudio-sparkdemo-data/nyc-taxi/parquet_nohead/trips --dest=hdfs:///user/rstudio/trips_par >> trips_par.log & 72 | nohup /usr/bin/s3-dist-cp --src=s3n://rstudio-sparkdemo-data/nyc-taxi/parquet/trips_model_data --dest=hdfs:///user/rstudio/trips_model_data >> trips_model_data.log & 73 | 74 | 75 | ########################################### 76 | ### Open Hive 77 | ########################################### 78 | 79 | hive 80 | 81 | # Hive 1 82 | 83 | CREATE EXTERNAL TABLE IF NOT EXISTS nyct2010( 84 | gid int, 85 | ctlabel float, 86 | borocode int, 87 | boroname string, 88 | ct2010 int, 89 | boroct2010 int, 90 | cdeligibil string, 91 | ntacode string, 92 | ntaname string, 93 | puma int) 94 | ROW FORMAT DELIMITED 95 | FIELDS TERMINATED BY ',' 96 | LINES TERMINATED BY '\n' 97 | ; 98 | 99 | LOAD DATA INPATH '/user/rstudio/nyct2010' INTO TABLE nyct2010; 100 | 101 | # Hive 3 102 | 103 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_par( 104 | id int, 105 | cab_type_id int, 106 | vendor_id string, 107 | pickup_datetime timestamp, 108 | dropoff_datetime timestamp, 109 | store_and_fwd_flag string, 110 | rate_code_id string, 111 | pickup_longitude float, 112 | pickup_latitude float, 113 | dropoff_longitude float, 114 | dropoff_latitude float, 115 | passenger_count bigint, 116 | trip_distance float, 117 | fare_amount float, 118 | extra bigint, 119 | mta_tax string, 120 | tip_amount float, 121 | tolls_amount float, 122 | ehail_fee string, 123 | improvement_surcharge string, 124 | total_amount float, 125 | payment_type string, 126 | trip_type string, 127 | pickup_nyct2010_gid int, 128 | dropoff_nyct2010_gid int) 129 | stored as parquet; 130 | 131 | LOAD DATA INPATH '/user/rstudio/trips_par' INTO TABLE trips_par; 132 | 133 | 134 | # Hive 3 135 | CREATE EXTERNAL TABLE IF NOT EXISTS trips_model_data( 136 | pickup_datetime timestamp, 137 | pickup_latitude float, 138 | pickup_longitude float, 139 | pickup_nyct2010_gid int, 140 | pickup_boro string, 141 | pickup_nta string, 142 | dropoff_datetime timestamp, 143 | dropoff_latitude float, 144 | dropoff_longitude float, 145 | dropoff_nyct2010_gid int, 146 | dropoff_boro string, 147 | dropoff_nta string, 148 | cab_type string, 149 | passenger_count bigint, 150 | trip_distance float, 151 | pay_type string, 152 | fare_amount float, 153 | tip_amount float, 154 | other_amount float, 155 | total_amount float) 156 | stored as parquet; 157 | 158 | LOAD DATA INPATH '/user/rstudio/trips_model_data' INTO TABLE trips_model_data; 159 | 160 | -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.001.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.001.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.002.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.002.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.003.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.003.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.004.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.004.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.005.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.005.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.006.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.006.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.007.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.007.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.008.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.008.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.009.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.009.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.010.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.010.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.011.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.011.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.012.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.012.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.013.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.013.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.014.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.014.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.015.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.015.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/img/tidyverse.016.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/img/tidyverse.016.jpeg -------------------------------------------------------------------------------- /prod/presentations/tidyverse/sqlvis_histogram.R: -------------------------------------------------------------------------------- 1 | ### Big data histogram 2 | sqlvis_compute_histogram <- function(data, x_name, bins = 30){ 3 | 4 | data_prep <- data %>% 5 | select_(x_field = x_name) %>% 6 | filter(!is.na(x_field)) %>% 7 | mutate(x_field = as.double(x_field)) 8 | 9 | s <- data_prep %>% 10 | summarise(max_x = max(x_field), min_x = min(x_field)) %>% 11 | mutate(bin_value = (max_x - min_x) / bins) %>% 12 | collect() 13 | 14 | new_bins <- as.numeric(c((0:(bins - 1) * s$bin_value) + s$min_x, s$max_x)) 15 | 16 | plot_table <- data_prep %>% 17 | ft_bucketizer(input.col = "x_field", output.col = "key_bin", splits = new_bins) %>% 18 | group_by(key_bin) %>% 19 | tally() %>% 20 | collect() 21 | 22 | all_bins <- data.frame( 23 | key_bin = 0:(bins - 1), 24 | bin = 1:bins, 25 | bin_ceiling = head(new_bins, -1) 26 | ) 27 | 28 | plot_table %>% 29 | full_join(all_bins, by="key_bin") %>% 30 | arrange(key_bin) %>% 31 | mutate(n = ifelse(!is.na(n), n, 0)) %>% 32 | select(bin = key_bin, count = n, bin_ceiling) %>% 33 | rename_(.dots = setNames(list("bin_ceiling"), x_name)) 34 | 35 | } 36 | 37 | sqlvis_ggplot_histogram <- function(plot_table, ...){ 38 | plot_table %>% 39 | select(x = 3, y = 2) %>% 40 | ggplot(aes(x, y)) + 41 | geom_bar(stat = "identity", fill = "cornflowerblue") + 42 | theme(legend.position = "none") + 43 | labs(x = 
/prod/presentations/tidyverse/sqlvis_raster.R:
--------------------------------------------------------------------------------
### Big data tile plot
### Bins two numeric columns onto a fixed grid inside Spark and collects
### only the per-cell counts, keeping the raw data in the cluster.

library(dplyr)
library(sparklyr)
library(ggplot2)

# Example inputs used during development:
# data <- tbl(sc, "trips_model_data")
# x_field <- "pickup_longitude"
# y_field <- "pickup_latitude"
# resolution <- 50

sqlvis_compute_raster <- function(data, x_field, y_field, resolution = 300){

  data_prep <- data %>%
    select_(x = x_field, y = y_field) %>%
    filter(!is.na(x), !is.na(y))

  # Ranges computed remotely; used to scale each point onto the grid
  s <- data_prep %>%
    summarise(max_x = max(x),
              max_y = max(y),
              min_x = min(x),
              min_y = min(y)) %>%
    mutate(rng_x = max_x - min_x,
           rng_y = max_y - min_y,
           resolution = resolution) %>%
    collect()

  # Map every point to a grid cell and count points per cell
  counts <- data_prep %>%
    mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0),
           res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>%
    count(res_x, res_y) %>%
    collect()

  list(counts = counts,
       limits = s,
       vnames = c(x_field, y_field))

}

sqlvis_ggplot_raster <- function(data, ...) {

  d <- data$counts
  s <- data$limits
  v <- data$vnames

  # Axis breaks in grid units, labelled with the original data values
  xx <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_x, s$max_x, len = 6), 2))
  yy <- setNames(seq(1, s$resolution, len = 6), round(seq(s$min_y, s$max_y, len = 6), 2))

  ggplot(d, aes(res_x, res_y)) +
    geom_raster(aes(fill = n)) +
    coord_fixed() +
    scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") +
    scale_x_continuous(breaks = xx, labels = names(xx)) +
    scale_y_continuous(breaks = yy, labels = names(yy)) +
    labs(x = v[1], y = v[2], ...)

}

### Facets

sqlvis_compute_raster_g <- function(data, x_field, y_field, g_field, resolution = 300){

  data_prep <- data %>%
    mutate_(group = g_field) %>%
    select_(g = "group", x = x_field, y = y_field) %>%
    filter(!is.na(x), !is.na(y))

  s <- data_prep %>%
    summarise(max_x = max(x),
              max_y = max(y),
              min_x = min(x),
              min_y = min(y)) %>%
    mutate(rng_x = max_x - min_x,
           rng_y = max_y - min_y,
           resolution = resolution) %>%
    collect()

  counts <- data_prep %>%
    mutate(res_x = round((x - s$min_x) / s$rng_x * resolution, 0),
           res_y = round((y - s$min_y) / s$rng_y * resolution, 0)) %>%
    count(g, res_x, res_y) %>%
    collect()

  list(counts = counts,
       limits = s,
       vnames = c(x_field, y_field))

}

sqlvis_ggplot_raster_g <- function(data, ncol = 4, ...) {

  s <- data$limits
  d <- data$counts
  v <- data$vnames

  xx <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_x, s$max_x, len = 3), 1))
  yy <- setNames(seq(1, s$resolution, len = 3), round(seq(s$min_y, s$max_y, len = 3), 1))

  ggplot(d, aes(res_x, res_y)) +
    geom_raster(aes(fill = n)) +
    coord_fixed() +
    facet_wrap(~ g, ncol = ncol) +
    scale_fill_distiller(palette = "Spectral", trans = "log", name = "Frequency") +
    scale_x_continuous(breaks = xx, labels = names(xx)) +
    scale_y_continuous(breaks = yy, labels = names(yy)) +
    labs(x = v[1], y = v[2], ...)

}
--------------------------------------------------------------------------------
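A matching usage sketch (not part of the original file), reusing the `trips` table and connection from the histogram example above; `payment_type` is a hypothetical low-cardinality column used only to illustrate the faceted variant:

source("sqlvis_raster.R")

# Plain density raster: a 300 x 300 grid of pickup counts
pickups <- sqlvis_compute_raster(trips, "pickup_longitude", "pickup_latitude",
                                 resolution = 300)
sqlvis_ggplot_raster(pickups, title = "Pickup density")

# Faceted raster: one panel per distinct value of the grouping column
by_type <- sqlvis_compute_raster_g(trips, "pickup_longitude", "pickup_latitude",
                                   "payment_type")
sqlvis_ggplot_raster_g(by_type, ncol = 2)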
/prod/presentations/tidyverse/tidyverseAndSpark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rstudio/sparkDemos/7b5c55154ef6513a49889c52c961359e8ab2cf49/prod/presentations/tidyverse/tidyverseAndSpark.pdf
--------------------------------------------------------------------------------