├── .gitignore ├── LICENSE ├── README.md ├── _config.yml ├── forecasting ├── .gitignore ├── app.R └── forecasting.Rmd ├── ga-and-r-examples.Rproj ├── normal_dist.Rmd ├── page-analysis-two-metrics ├── app.R └── page-analysis-two-metrics.Rmd ├── regression ├── app.R └── regression.Rmd ├── shinyapps-deploy-script.R ├── site-search-analysis ├── .gitignore ├── app.R └── site-search-analysis.Rmd ├── time-normalized-pageviews ├── app.R └── time-normalized-pageviews.Rmd └── twitter-followers ├── app.R └── twitter-followers.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | # Session Data files 5 | .RData 6 | # Example code in package build process 7 | *-Ex.R 8 | # Output files from R CMD build 9 | /*.tar.gz 10 | # Output files from R CMD check 11 | /*.Rcheck/ 12 | # RStudio files 13 | .Rproj.user/ 14 | 15 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 16 | .httr-oauth 17 | # knitr and R markdown default cache directories 18 | /*_cache/ 19 | /cache/ 20 | # Temporary files created by R markdown 21 | *.utf8.md 22 | *.knit.md 23 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 24 | 25 | rsconnect/ 26 | .Rproj.user 27 | 28 | **/.httr-oauth 29 | **/.ga-httr-oauth 30 | **/ga-web-client.json 31 | 32 | # Exclude GTM container includes 33 | **/gtm.js 34 | 35 | # Exclude RNotebooks that are built from the .Rmd files 36 | **/*.nb.html 37 | 38 | # Exclude the .Renviorn 39 | **/.Renviron 40 | 41 | # I'm not even sure what this is...but seems like it should be excluded 42 | .DS_Store -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Search Discovery 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Google Analytics (and a little bit of Twitter) and R 2 | 3 | This repository has ~5 examples of using R with Google Analytics (plus one example of using R with Twitter). These examples are designed to work with the free version of Google Analytics with just the base tag (implemented). 
This makes the examples less impactful, but the idea is that it makes them a safe and easy jumping off point for anyone who wants to use them. 4 | 5 | There have been various presentations that, ultimately, point to this page for follow-up information. Depending on if/when you saw one of those presentations, you may be looking for different slides: 6 | 7 | * One version -- presented at Columbus Web Analytics Wednesday in January 2019 -- is available in [Google Slides](https://docs.google.com/presentation/d/1UBtbuIPmZ6yUj5VLPNYDcbV_0JSZzyyugvDPcAJd8Fs/edit?usp=sharing) or as slides with audio voiceover [on YouTube](https://youtu.be/SRZPLIcCvFU). 8 | * Another version that was presented at Superweek 2019 is available [on Slideshare](https://www.slideshare.net/tgwilson/superweek-2019-digital-analytics-meets-data-science). 9 | * A version that was presented at the Atlanta chapter of the Digital Analytics Association in September 2019 is also available in [Google Slides](https://docs.google.com/presentation/d/144XGhDQxTga3Q-QJ3IUhJmkSGAbZfWfSMlcY0OMvNfU/edit?usp=sharing). 10 | 11 | The presentations themselves cover different territory, but they all reference this page for examples. 12 | 13 | ## There are Two Versions of Each Example 14 | There are two versions of each example here: 15 | 16 | * **The R Notebook version** -- the RMarkdown version that has pretty extensively annotated code and explanations. This is the code you can use to tinker with and just run locally within RStudio 17 | * **The `Shiny` version (`app.R`)** -- this is the web-enabled experience that, if published to a Shiny server (including shinyapps.io) can be used by anyone to log in to their own account and explore the example _without_ seeing/experiencing any of the code. 18 | 19 | Links to "executed code showing output" (**RPubs**) as well as links to Shiny apps (**Shiny**) that can be tried out with no coding are included in the table below. 20 | 21 | | Example | Folder Name | Examples | 22 | |-----------------------------------------------------|---------------|:-------------:| 23 | | Time-Normalized Pageviews | `time-normalized-pageviews` | [RPubs](http://rpubs.com/tgwilson/time-normalized-pageviews) / [Shiny](https://gilligan.shinyapps.io/time-normalized/) | 24 | | Page-Level Analysis/Comparison with Two Metrics | `page-analysis-two-metrics` | [RPubs](http://rpubs.com/tgwilson/page-analysis) / [Shiny](https://gilligan.shinyapps.io/page-analysis/) | 25 | | (Light) Text Mining of On-Site Search Data | `site-search-analysis` | [RPubs](http://rpubs.com/tgwilson/site-search) / [Shiny](https://gilligan.shinyapps.io/site-search/) | 26 | | (Light) Text Mining of Twitter Followers | `twitter-followers` | [RPubs](http://rpubs.com/tgwilson/twitter-followers) | 27 | | Regression Analysis with a Nominal / Categorical Value (Day of Week) | `regression` | [RPubs](http://rpubs.com/tgwilson/day-of-week-regression) / [Shiny](https://gilligan.shinyapps.io/regression/) | 28 | | Time-Series Decomposition / Holt-Winters Forecasting / Anomaly Detection | `forecasting` | [RPubs](http://rpubs.com/tgwilson/forecasting) / [Shiny](https://gilligan.shinyapps.io/forecasting/) | 29 | 30 | ## A Few Notes on the Shiny Apps 31 | 32 | The Shiny apps listed above should work fairly well, although they have extremely limited error handling. So, if you're using them and run into an error, please do log an issue in this repository. 
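If you want to publish your own copy of one of these apps rather than use the hosted versions linked above, the repository root also contains a `shinyapps-deploy-script.R`. As a rough, illustrative sketch (not the repo's actual script), a shinyapps.io deployment generally looks like the following, where the account name, token, and secret placeholders come from your own shinyapps.io dashboard:

```r
# Illustrative sketch only: publish the forecasting example to your own
# shinyapps.io account. <account>, <token>, and <secret> are placeholders
# taken from the Tokens section of your shinyapps.io dashboard.
install.packages("rsconnect")
rsconnect::setAccountInfo(name = "<account>", token = "<token>", secret = "<secret>")
rsconnect::deployApp(appDir = "forecasting", appName = "forecasting")
```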
33 | 34 | If you are interested in using the Shiny code itself and modifying the code for your own purposes, there are a few things to be aware of: 35 | 36 | * You need to set up "Web service" credentials for a Google app that has Google Analytics API access 37 | * Download the JSON file for those credentials 38 | * Place that JSON file either in the same folder as `app.R` or somewhere accessible and then add `GAR_CLIENT_WEB_JSON="[path to that file]"` to the `.Renviron` file that will be referenced by that app 39 | * Basically, see the last post in [this issue](https://github.com/MarkEdmondson1234/googleAuthR/issues/136) for details. 40 | * The apps include Google Analytics tracking. If you want to track your own copy as well, you need to get the tracking JavaScript (using Google Tag Manager is recommended) and place it in an include file (`gtm.js`) in the same folder as the app. If you do not want to track activity on your app using Google Analytics, then simply remove this line of code from the app: `tags$head(includeScript("gtm.js")),`. 41 | 42 | ## Resources to Learn More 43 | 44 | For a list of sites, blogs, books, podcasts, and other educational resources related to this topic, check out [this resource](http://bit.ly/data-science-y). 45 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-slate -------------------------------------------------------------------------------- /forecasting/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /forecasting/app.R: -------------------------------------------------------------------------------- 1 | # Time-Series Decomposition / Holt-Winters Forecasting / Anomaly Detection 2 | 3 | # Load the necessary libraries. 4 | library(shiny) 5 | library(googleAuthR) # For authentication 6 | library(googleAnalyticsR) # How we actually get the Google Analytics data 7 | 8 | gar_set_client(web_json = "ga-web-client.json", 9 | scopes = "https://www.googleapis.com/auth/analytics.readonly") 10 | 11 | # To run locally, uncomment the localhost line (and update the port as needed) 12 | options(googleAuthR.redirect = "https://gilligan.shinyapps.io/forecasting/") 13 | # options(googleAuthR.redirect = "http://localhost:5003") 14 | 15 | library(tidyverse) # Includes dplyr, ggplot2, and others; very key!
16 | library(knitr) # Nicer looking tables 17 | library(plotly) # We're going to make the charts interactive 18 | library(DT) # Interactive tables 19 | library(scales) # Useful for some number formatting in the visualizations 20 | library(lubridate) # For working with dates a bit 21 | 22 | # Define the base theme for visualizations 23 | theme_base <- theme_bw() + 24 | theme(plot.title = element_text(size = 10, face = "bold", hjust = 0.5), 25 | plot.margin = margin(1.5,0,0,0,"cm"), 26 | axis.text.x = element_text(size = 12), 27 | axis.text.y = element_text(size = 14), 28 | axis.title.x = element_blank(), 29 | axis.title.y = element_blank(), 30 | axis.line.x = element_line(color = "gray50"), 31 | axis.line.y = element_blank(), 32 | axis.ticks = element_blank(), 33 | legend.title = element_blank(), 34 | legend.background = element_blank(), 35 | legend.position = "top", 36 | legend.justification = "center", 37 | panel.border = element_blank(), 38 | panel.grid.major.x = element_blank(), 39 | panel.grid.major.y = element_line(size=0.5, colour = "gray90"), 40 | panel.grid.minor = element_blank()) 41 | 42 | # And, a theme for the time-series decomposition 43 | theme_sparklines <- theme_bw() + 44 | theme(axis.text = element_text(size = 16), 45 | axis.text.x = element_text(face = "bold", margin = margin(0.25, 0, 0, 0, "cm")), 46 | axis.title = element_blank(), 47 | axis.ticks = element_blank(), 48 | axis.line.y = element_blank(), 49 | axis.line.x = element_line(colour = "grey10"), 50 | legend.title = element_blank(), 51 | legend.background = element_blank(), 52 | legend.position = "none", 53 | strip.text.x = element_text(face = "bold", size = 18, colour = "grey10"), 54 | strip.text.y = element_text(face = "bold", size = 18, colour = "grey10", 55 | angle = 180, hjust=1), 56 | strip.background = element_blank(), 57 | panel.border = element_blank(), 58 | panel.grid.major = element_blank(), 59 | panel.grid.minor = element_blank(), 60 | panel.spacing = unit(0.5,"in"), 61 | panel.background = element_rect(fill = NA, color = NA)) 62 | 63 | ## ui.R 64 | ui <- fluidPage(title = "Anomaly Detection through Holt-Winters Forecasting", 65 | tags$head(includeScript("gtm.js")), 66 | tags$h2("Anomaly Detection through Holt-Winters Forecasting*"), 67 | tags$div(paste("Select a Google Analytics view and date range and then pull the data. From there, explore", 68 | "time-series decomposition and forecasting as a means of identifying anomalies!")), 69 | tags$br(), 70 | sidebarLayout( 71 | sidebarPanel(tags$h4("Select Base Data Parameters"), 72 | # Account/Property/View Selection 73 | authDropdownUI("auth_menu", inColumns = FALSE), 74 | # Overall Date Range Selection 75 | dateRangeInput("assessment_period", 76 | label = "Select the overall date range to use:", 77 | start = Sys.Date()-90, 78 | end = Sys.Date()-1), 79 | # Whether or not to enable anti-sampling 80 | checkboxInput("anti_sampling", 81 | label = "Include anti-sampling (slows down app a bit).", 82 | value = TRUE), 83 | # Action button. We want the user to control when the 84 | # underlying call to Google Analytics occurs. 
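# (For reference: the get_ga_data() reactive in the server references
# input$query_data and wraps the google_analytics() call in isolate(), so
# data is only pulled or refreshed when this button is clicked.)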
85 | tags$div(style="text-align: center", 86 | actionButton("query_data", "Get/Refresh Data!", 87 | style="color: #fff; background-color: #337ab7; border-color: #2e6da4")), 88 | tags$br(), 89 | tags$hr(), 90 | # Assessment Days 91 | sliderInput("check_period", 92 | label = "How many of those days do you want to check for anomalies?", 93 | min = 3, max = 14, value = 7), 94 | # Prediction Interval 95 | sliderInput("prediction_interval", 96 | label = "Adjust the prediction interval:", 97 | min = 0.8, max = 0.95, value = 0.95, step = 0.05)), 98 | 99 | mainPanel(tabsetPanel(type = "tabs", 100 | tabPanel("Base Data", 101 | tags$br(), 102 | tags$div(paste("This is the base data and a visualization of the data", 103 | "that you queried. It should look pretty familiar!")), 104 | tags$br(), 105 | plotlyOutput("base_data_plot", height = "700px")), 106 | tabPanel("Training vs. Assessment", 107 | tags$br(), 108 | tags$div(paste("We've split the data into two groups: the data that will be", 109 | "used to train the model, and the data that we are actually", 110 | "evaluating for anomalies.")), 111 | tags$br(), 112 | plotlyOutput("train_vs_assess_plot", height = "700px")), 113 | tabPanel("Time-Series Decomposition", 114 | tags$br(), 115 | tags$div(paste("This is the time-series decomposition of the training data.", 116 | "We've broken out the actual data into three components:"), 117 | tags$ul(tags$li(tags$b("Seasonal:"), "the recurring 7-day pattern in the data"), 118 | tags$li(tags$b("Trend:"), "a moving average, basically (technically, exponential smoothing", 119 | "that shows how the data is trending over time"), 120 | tags$li(tags$b("Random:"), "the noise that remains after the Seasonality and Trend values", 121 | "have been removed from the Actual.")), 122 | paste("The y-axis scales vary from component to component to improve readability,", 123 | "but note that the magnitude of the components varies quite a bit.")), 124 | tags$br(), 125 | plotOutput("time_series_decomp_plot", height = "600px")), 126 | tabPanel("Forecast with a Prediction Interval", 127 | tags$br(), 128 | tags$div(paste("This is the final assessment of the data, which has used the seasonal", 129 | "and trend components to create a forecast, the random component to", 130 | "determine the prediction interval, and then the actual results are", 131 | "shown on top of that.")), 132 | tags$br(), 133 | tags$div(style="font-weight: bold;", textOutput("anomaly_message")), 134 | tags$br(), 135 | plotOutput("final_assessment", height = "700px")) 136 | # For troubleshooting, this would display the final table of data 137 | # tabPanel("Data Table", 138 | # tags$br(), 139 | # tags$div(paste("This is the full data table.")), 140 | # tags$br(), 141 | # dataTableOutput("final_data")) 142 | ))), 143 | tags$hr(), 144 | tags$div("*This app is part of a larger set of apps that demonstrate some uses of R in conjunction", 145 | "with Google Analytics (and Twitter). 
For the code for this app, as well as an R Notebook", 146 | "that includes more details, see:", tags$a(href = "https://github.com/SDITools/ga-and-r-examples/", 147 | "https://github.com/SDITools/ga-and-r-examples/"),"."), 148 | tags$br() 149 | ) 150 | 151 | ## server.R 152 | server <- function(input, output, session){ 153 | 154 | # Create a non-reactive access token 155 | gar_shiny_auth(session) 156 | 157 | # Populate the Account/Property/View dropdowns and return whatever the 158 | # selected view ID is 159 | account_list <- reactive(ga_account_list()) 160 | get_view_id <- callModule(authDropdown, "auth_menu", ga.table = account_list) 161 | 162 | # view_id <- callModule(authDropdown, "auth_menu", ga.table = ga_account_list) 163 | 164 | # Reactive function to pull the data. 165 | get_ga_data <- reactive({ 166 | 167 | # Only pull the data if the "Get Data" button is clicked 168 | input$query_data 169 | 170 | # Pull the data. Go ahead and shorten the weeday names 171 | # Pull the data. See ?google_analytics_4() for additional parameters. The anti_sample = TRUE 172 | # parameter will slow the query down a smidge and isn't strictly necessary, but it will 173 | # ensure you do not get sampled data. 174 | isolate(google_analytics(viewId = get_view_id(), 175 | date_range = input$assessment_period, 176 | metrics = "sessions", 177 | dimensions = "date", 178 | anti_sample = input$anti_sampling)) 179 | }) 180 | 181 | # Determine how many rows of the data will be used to build the forecast. This 182 | # is everything except the last week. 183 | rowcount_forecast <- reactive(nrow(get_ga_data()) - input$check_period) 184 | 185 | # Also figure out the date where the cutoff is between training and forecast. 186 | # We actually want to shift this over a little bit to fall between two points when we plot 187 | cutoff_date <- reactive({ 188 | 189 | # Get the data 190 | ga_data <- get_ga_data() 191 | 192 | # Figure out the cutoff date. 193 | cutoff_date <- ga_data$date[rowcount_forecast()] 194 | 195 | # This is the "shifting it over a bit" piece 196 | cutoff_date <- (2*as.numeric(cutoff_date) + 1)/2 197 | }) 198 | 199 | # Make a data set that removes the "rows to be evaluated." This will get 200 | # used both to generate the time series for the forecast as well as for modeling 201 | get_ga_data_training <- reactive({ 202 | get_ga_data() %>% 203 | top_n(-rowcount_forecast(), wt = date) 204 | }) 205 | 206 | # Get the date values for the forecast period 207 | dates_forecast <- reactive({ 208 | get_ga_data() %>% 209 | top_n(input$check_period, wt = date) %>% 210 | dplyr::select(date) 211 | }) 212 | 213 | # Make a time-series object using the data for the training period. 
This 214 | # is what we'll use to build the forecast 215 | get_ga_data_ts <- reactive({ 216 | get_ga_data_training() %>% 217 | dplyr::pull(sessions) %>% 218 | ts(frequency = 7) 219 | }) 220 | 221 | # Start building out our master data for plotting by adding a column that 222 | # has just the data being used for the training 223 | get_ga_data_plot <- reactive({ 224 | ga_data_plot <- get_ga_data() %>% 225 | left_join(get_ga_data_training(), by = c(date = "date")) 226 | 227 | # Rename columns to be a bit clearer 228 | names(ga_data_plot) <- c("date", "sessions_all", "sessions_training") 229 | 230 | # Add a column that is just the actuals data of interest 231 | ga_data_plot <- ga_data_plot %>% 232 | mutate(sessions_assess = ifelse(is.na(sessions_training), sessions_all, NA)) 233 | 234 | # Generate a Holt Winters forecast 235 | hw <- HoltWinters(get_ga_data_ts()) 236 | 237 | # Predict the next X days (the X days of interest). Go ahead and convert it to a data frame 238 | forecast_sessions <- predict(hw, n.ahead = input$check_period, prediction.interval = T, 239 | level = input$prediction_interval) %>% 240 | as.data.frame() 241 | 242 | # Add in the dates so we can join this with the original data. We know it was the 7 days 243 | # starting from cutoff_date 244 | forecast_sessions$date <- dates_forecast() %>% pull(date) 245 | 246 | # Add these columns to the original data and add a column that IDs anomaly points by 247 | # checking to see if the actual value is outside the upper or lower bounds. If it is, 248 | # put the value. We'll use this to highlight the anomalies. 249 | ga_data_plot <- ga_data_plot %>% 250 | left_join(forecast_sessions) %>% 251 | mutate(anomaly = ifelse(sessions_all < lwr | sessions_all > upr, sessions_all, NA)) 252 | 253 | }) 254 | 255 | ## Outputs 256 | 257 | # Output the base data plot 258 | output$base_data_plot <- renderPlotly({ 259 | 260 | # Get the data for plotting 261 | ga_data_plot <- get_ga_data_plot() 262 | 263 | # Get the upper limit for the plot. We'll use this for all of the plots just for clarity 264 | y_max <- max(ga_data_plot$sessions_all) * 1.03 265 | 266 | # Build a plot showing just the actual data 267 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 268 | geom_line(aes(y = ga_data_plot$sessions_all), color = "#0060AF", size = 0.75) + 269 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 270 | scale_x_date(date_breaks = "7 days", labels = date_format("%d-%b")) + 271 | theme_base 272 | 273 | # Plot the data 274 | ggplotly(ga_plot) %>% layout(autosize=TRUE) 275 | }) 276 | 277 | # Output the data split by training vs. assessment 278 | output$train_vs_assess_plot <- renderPlotly({ 279 | 280 | # Get the data for plotting 281 | ga_data_plot <- get_ga_data_plot() 282 | 283 | # Get the upper limit for the plot. 
We'll use this for all of the plots just for clarity 284 | y_max <- max(ga_data_plot$sessions_all) * 1.03 285 | 286 | # Same plot, with the training data highlighted 287 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 288 | geom_line(aes(y = ga_data_plot$sessions_training), color = "#0060AF", size = 0.75) + 289 | geom_line(aes(y = ga_data_plot$sessions_assess), color = "gray80", size = 0.75) + 290 | geom_vline(aes(xintercept = cutoff_date()), 291 | color = "gray40", linetype = "dashed", size = 1) + 292 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 293 | scale_x_date(date_breaks = "7 days", labels = date_format("%d-%b")) + 294 | theme_base 295 | 296 | # Plot the data 297 | ggplotly(ga_plot) %>% layout(autosize=TRUE) 298 | }) 299 | 300 | # Output the time-series decomposition 301 | output$time_series_decomp_plot <- renderPlot({ 302 | 303 | # Get the time-series data and the training data 304 | ga_data_ts <- get_ga_data_ts() 305 | ga_data_training <- get_ga_data_training() 306 | 307 | # Decompose the time-series data 308 | ga_stl <- stl(ga_data_ts, 309 | s.window = "periodic", 310 | t.window = 7) 311 | 312 | # Convert that to a long format data frame 313 | ga_stl_df <- data.frame(Actual = ga_data_ts %>% as.data.frame()) %>% 314 | cbind(ga_stl$time.series %>% as.data.frame()) %>% 315 | mutate(date = ga_data_training$date) %>% 316 | dplyr::select(date, 317 | Actual = x, 318 | Seasonal = seasonal, 319 | Trend = trend, 320 | Random = remainder) %>% 321 | mutate(date = ga_data_training$date) %>% 322 | gather(key, value, -date) 323 | 324 | # We want to control the order of the output, so make key a factor 325 | ga_stl_df$key <- factor(ga_stl_df$key, 326 | levels = c("Actual", "Seasonal", "Trend", "Random")) 327 | 328 | ## We can "decompose" that data. 329 | 330 | # Plot the values 331 | ga_plot <- ggplot(ga_stl_df, mapping = aes(x = date, y = value, colour = key)) + 332 | geom_line(size = 1.5) + 333 | facet_grid(key ~ ., scales = "free", switch = "y") + 334 | scale_color_manual(values=c("#0060AF", "#999999", "#999999", "#999999")) + 335 | scale_y_continuous(position = "right") + 336 | scale_x_date(date_breaks = "7 days", labels = date_format("%d-%b")) + 337 | theme_sparklines 338 | 339 | # Plot the data. Plotly jacks this up, so just going static visual for this one 340 | ga_plot 341 | }) 342 | 343 | # Output the actual forecast with a comparison to actuals 344 | output$final_assessment <- renderPlot({ 345 | 346 | # Get the data to use in the plot 347 | ga_data_plot <- get_ga_data_plot() 348 | 349 | # Replace any lower bound values that are negative with zero 350 | ga_data_plot <- ga_data_plot %>% 351 | mutate(lwr = ifelse(lwr < 0, 0, lwr)) 352 | 353 | # Get the upper limit for the plot. 
We'll use this for all of the plots just for clarity 354 | y_max <- max(max(ga_data_plot$sessions_all), max(ga_data_plot$upr)) * 1.03 355 | 356 | # Build the plot 357 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 358 | geom_ribbon(aes(ymin = ga_data_plot$lwr, ymax = ga_data_plot$upr), fill = "gray90") + 359 | geom_line(aes(y = ga_data_plot$sessions_all), color = "#0060AF", size = 1.5) + 360 | geom_line(aes(y = ga_data_plot$fit), color = "gray50", linetype = "dotted", size = 1) + 361 | geom_vline(aes(xintercept = cutoff_date()), 362 | color = "gray40", linetype = "dashed", size = 1) + 363 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 364 | scale_x_date(date_breaks = "7 days", labels = date_format("%d-%b")) + 365 | theme_base + 366 | # Not using plotly, so need to tweak some sizes 367 | theme(axis.text.y = element_text(size = 20), 368 | axis.text.x = element_text(size = 16, face = "bold", margin = margin(0.25, 0, 0, 0, "cm")), 369 | axis.line.x = element_line(colour = "gray20")) + 370 | if(sum(ga_data_plot$anomaly, na.rm = TRUE) > 0){ 371 | geom_point(aes(y = ga_data_plot$anomaly), color = "#F58220", size = 6) 372 | } 373 | 374 | # Plot the data. Plotly, again, doesn't play nice 375 | # withs ome part of this, so going with a static plot. 376 | ga_plot 377 | }) 378 | 379 | # Output the anomaly message 380 | output$anomaly_message <- renderText({ 381 | 382 | # Get the full data table 383 | ga_data_plot <- get_ga_data_plot() 384 | 385 | # Get the number of anomalies. This feels inelegant 386 | anomaly_flag_df <- ga_data_plot %>% 387 | mutate(anomaly_flag = ifelse(anomaly > 0, 1, 0)) 388 | 389 | anomaly_count <- sum(anomaly_flag_df$anomaly_flag, na.rm = TRUE) 390 | 391 | # Output message 392 | if(anomaly_count == 0){ 393 | message <- paste0("There were no anomalies in the ", input$check_period, 394 | " days that were assessed.") 395 | } else { 396 | if(anomaly_count == 1){ 397 | message <- paste0("There was 1 anomaly (orange circle) in the ", input$check_period, 398 | " days that were assessed.") 399 | } else { 400 | message <- paste0("There were ", anomaly_count," anomalies (orange circles) in the ", input$check_period, 401 | " days that were assessed.") 402 | } 403 | } 404 | }) 405 | 406 | # Output the table of raw data. This is commented out for actual display, 407 | # but is useful for troubleshooting. 408 | output$final_data <- renderDataTable({ 409 | # Get the data to use in the plot 410 | ga_data_plot <- get_ga_data_plot() 411 | }) 412 | 413 | } 414 | 415 | # shinyApp(gar_shiny_ui(ui, login_ui = gar_shiny_login_ui), server) 416 | shinyApp(gar_shiny_ui(ui, login_ui = silent_auth), server) -------------------------------------------------------------------------------- /forecasting/forecasting.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Anomaly Detection -- Signal vs. Noise through Time-Series Decomposition" 3 | output: html_notebook 4 | --- 5 | 6 | ### General Note 7 | This example is part of a larger set of examples of using Google Analytics with R: 8 | 9 | * For additional examples, downloadable code, and explanations of the overall effort, see: https://github.com/SDITools/ga-and-r-examples 10 | * To see this specific example in Shiny (web/interactive) form, see: https://gilligan.shinyapps.io/forecasting/ 11 | 12 | ### Overview 13 | 14 | This example is an exploration of using Holt-Winters forecasting for anomaly detection. 
Mainly, it's geared towards an explanation of time-series decomposition and how that technique can be used to identify whether some set of data falls outside of an expected range. 15 | 16 | ### Setup/Config 17 | 18 | Start with the initial configuration of variables and the theme. 19 | 20 | ```{r config} 21 | # Load the necessary libraries. 22 | if (!require("pacman")) install.packages("pacman") 23 | pacman::p_load(googleAnalyticsR, # How we actually get the Google Analytics data 24 | tidyverse, # Includes dplyr, ggplot2, and others; very key! 25 | splitstackshape, # For some data munging (probably better to use a tidyverse package) 26 | plotly, # We're going to make the charts interactive 27 | scales, # Useful for some number formatting in the visualizations 28 | lubridate, # For the wday function 29 | tools) # For our stepwise regression 30 | 31 | # Authorize GA. Depending on if you've done this already and a .ga-httr-oauth file has 32 | # been saved or not, this may pop you over to a browser to authenticate. 33 | ga_auth(token = ".httr-oauth", new_user = TRUE) 34 | 35 | # Set the total number of weeks to be used. The most recent week will be the "checked" week, 36 | # while the weeks before will be used to build the forecast. 37 | total_weeks <- 7 38 | 39 | # Set the view ID and the date range. If you want to, you can swap out the Sys.getenv() 40 | # call and just replace that with a hardcoded value for the view ID. 41 | view_id <- Sys.getenv("GA_VIEW_ID") 42 | end_date <- Sys.Date() - wday(Sys.Date()) # The most recent Saturday 43 | start_date <- end_date - total_weeks * 7 + 1 # Start date based on total_weeks 44 | 45 | # Define our theme -- this will be used later for visualizations of the full-sized charts 46 | theme_hw <- theme_bw() + 47 | theme(text = element_text(family="Nunito"), 48 | plot.title = element_text(size = 10, face = "bold", hjust = 0.5), 49 | plot.margin = margin(1.5,0,0,0,"cm"), 50 | axis.text.x = element_blank(), 51 | axis.text.y = element_text(size = 14), 52 | axis.title = element_blank(), 53 | axis.title.x = element_text(size = 10, hjust = 0.5), 54 | axis.ticks = element_blank(), 55 | axis.line.x = element_line(color = "gray50"), 56 | axis.line.y = element_blank(), 57 | legend.title = element_blank(), 58 | legend.background = element_blank(), 59 | legend.position = "top", 60 | legend.justification = "center", 61 | panel.border = element_blank(), 62 | panel.grid.major.x = element_blank(), 63 | panel.grid.major.y = element_line(size=0.5, colour = "gray90"), 64 | panel.grid.minor = element_blank()) 65 | 66 | # And, a theme for the time-series decomposition 67 | theme_sparklines <- theme_bw() + 68 | theme(axis.text = element_blank(), 69 | axis.title = element_blank(), 70 | axis.ticks = element_blank(), 71 | axis.line = element_blank(), 72 | legend.title = element_blank(), 73 | legend.background = element_blank(), 74 | legend.position = "none", 75 | legend.justification = "center", 76 | strip.text.x = element_text(face = "bold", size = 14, colour = "grey10", family="Nunito"), 77 | strip.text.y = element_text(face = "bold", size = 14, colour = "grey10", 78 | angle = 180, hjust=1, family="Nunito"), 79 | strip.background = element_blank(), 80 | panel.border = element_blank(), 81 | panel.grid.major = element_blank(), 82 | panel.grid.minor = element_blank(), 83 | panel.spacing = unit(0,"in"), 84 | panel.background = element_rect(fill = NA, color = NA)) 85 | 86 | ``` 87 | 88 | ```{r config-override, echo=FALSE} 89 | 90 | # Hardcode the start and end dates to something that 
will show anomalies 91 | start_date <- as.Date("2019-02-03") 92 | end_date <- as.Date("2019-03-23") 93 | 94 | ``` 95 | 96 | ### Pull the Data 97 | 98 | This is a simple data pull. We're just pulling data by day (because each day is a different day of the week) and sessions. And, since Google Analytics has a "Day of Week Name" dimension, we'll go ahead and pull that, too (although we could have easily calculated this afterwards). 99 | 100 | ```{r data-pull, message=FALSE, warning=FALSE, fig.width=10, fig.height=5} 101 | 102 | # Pull the data. See ?google_analytics_4() for additional parameters. The anti_sample = TRUE 103 | # parameter will slow the query down a smidge and isn't strictly necessary, but it will 104 | # ensure you do not get sampled data. 105 | ga_data <- google_analytics(viewId = view_id, 106 | date_range = c(start_date, end_date), 107 | metrics = "sessions", 108 | dimensions = "date", 109 | anti_sample = TRUE) 110 | 111 | ``` 112 | 113 | ```{r add-anomaly, message=FALSE, warning=FALSE, echo=FALSE} 114 | 115 | # We want this example to actually show some anomalies, so we're going to manually scale 3 of the 116 | # data points 117 | # ga_data$sessions[44] <- ga_data$sessions[44] * .95 118 | # ga_data$sessions[45:48] <- ga_data$sessions[45:48] * .83 119 | 120 | ``` 121 | 122 | ### Plot the Data as Weekly Data 123 | 124 | We pulled the data daily, so we're going to roll this up to be weekly and then plot it to show how the data might look in a dashboard that includes a weekly sparkline. 125 | 126 | ```{r weekly-plot, message=FALSE, warning=FALSE, fig.width=10, fig.height=5} 127 | 128 | # Aggregate the data to be weekly 129 | ga_data_weekly <- ga_data %>% 130 | mutate(week = date - wday(date) + 1) %>% 131 | group_by(week) %>% 132 | summarise(sessions = sum(sessions)) %>% 133 | mutate(last_week_sessions = ifelse(week == max(week), sessions, NA )) 134 | 135 | # Build a plot 136 | ga_plot_weekly <- ggplot(ga_data_weekly, mapping = aes(x = week, y = sessions)) + 137 | geom_line(color = "#0060AF", size = 1) + 138 | geom_point(aes(y=sessions), size=3, shape = 19, colour = "#0060AF") + 139 | geom_point(aes(y=sessions), size=3, shape = 20, colour = "white") + 140 | geom_point(aes(y=last_week_sessions), size=5, colour = "white") + 141 | geom_point(aes(y=last_week_sessions), size=3.5, colour = "#9A258F") + 142 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, max(ga_data_weekly$sessions) * 1.03)) + 143 | labs(x = " ") + 144 | theme_hw 145 | 146 | # Plot the data 147 | ga_plot_weekly 148 | 149 | ``` 150 | 151 | ### Data Munging 152 | 153 | A lot of this is for the purposes of the visualization, but, essentially, we need to break the data into the "data before last week" (this is the data we'll use to build forecast) and the "data to be evaluated" (the data from last week). 154 | 155 | ```{r data-munge, message=FALSE, warning=FALSE, fig.width=10, fig.height=5} 156 | 157 | # Determine how many rows of the data will be used to build the forecast. This 158 | # is everything except the last week. 159 | rowcount_forecast <- nrow(ga_data) - 7 160 | 161 | # Also figure out the date where the cutoff is between training and forecast 162 | cutoff_date <- ga_data[rowcount_forecast,1] 163 | 164 | # We actually want to shift this over a little bit to fall between two points when we plot 165 | cutoff_date <- (2*as.numeric(cutoff_date) + 1)/2 166 | 167 | # Make a data set that removes the "rows to be evaluated." 
This will get 168 | # used both to generate the time series for the forecast as well as for modeling 169 | ga_data_training <- ga_data %>% 170 | top_n(-rowcount_forecast, wt = date) 171 | 172 | # Get the date values for the forecast period 173 | dates_forecast <- ga_data %>% 174 | top_n(7, wt = date) %>% 175 | dplyr::select(date) 176 | 177 | # Make a time-series object using the data for the training period. This 178 | # is what we'll use to build the forecast 179 | ga_data_ts <- ga_data_training[[2]] %>% 180 | ts(frequency = 7) 181 | 182 | # Start building out our master data for plotting by adding a column that 183 | # has just the data being used for the training 184 | ga_data_plot <- ga_data %>% 185 | left_join(ga_data_training, by = c(date = "date")) 186 | 187 | # Rename columns to be a bit clearer 188 | names(ga_data_plot) <- c("date", "sessions_all", "sessions_training") 189 | 190 | # Add a column that is just the actuals data of interest 191 | ga_data_plot <- ga_data_plot %>% 192 | mutate(sessions_assess = ifelse(is.na(sessions_training), sessions_all, NA)) 193 | 194 | ``` 195 | 196 | ### Visualizing the Data 197 | 198 | The basic view of the raw data: 199 | 200 | ```{r echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 201 | 202 | # Get the upper limit for the plot. We'll use this for all of the plots just for clarity 203 | y_max <- max(ga_data_plot$sessions_all) * 1.03 204 | 205 | # Build a plot showing just the actual data 206 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 207 | geom_line(aes(y = ga_data_plot$sessions_all), color = "#0060AF", size = 0.75) + 208 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 209 | labs(x= " ") + 210 | theme_hw 211 | 212 | # Output the result 213 | ga_plot 214 | 215 | ``` 216 | 217 | We're going to split this data up into two different groups: 218 | 219 | ```{r echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 220 | 221 | # Same plot, but showing just the last week 222 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 223 | geom_line(aes(y = ga_data_plot$sessions_all), color = "#0060AF", size = 0.75) + 224 | geom_vline(aes(xintercept = cutoff_date), 225 | color = "gray40", linetype = "dashed", size = 1) + 226 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 227 | labs(x= " ") + 228 | theme_hw 229 | 230 | ga_plot 231 | 232 | ``` 233 | 234 | The data of interest is the last seven days: 235 | 236 | ```{r echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 237 | 238 | # Same plot, with the "training data highlighted"data of interest highlighted 239 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 240 | geom_line(aes(y = ga_data_plot$sessions_training), color = "gray80", size = 0.75) + 241 | geom_line(aes(y = ga_data_plot$sessions_assess), color = "#0060AF", size = 0.75) + 242 | geom_vline(aes(xintercept = cutoff_date), 243 | color = "gray40", linetype = "dashed", size = 1) + 244 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 245 | labs(x= " ") + 246 | theme_hw 247 | 248 | ga_plot 249 | 250 | ``` 251 | 252 | The earlier data is our data for context -- this is the data we'll actually use to build a forecast: 253 | 254 | ```{r echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 255 | 256 | # Same plot, with the training data highlighted 257 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 258 | geom_line(aes(y = 
ga_data_plot$sessions_training), color = "#0060AF", size = 0.75) + 259 | geom_line(aes(y = ga_data_plot$sessions_assess), color = "gray80", size = 0.75) + 260 | geom_vline(aes(xintercept = cutoff_date), 261 | color = "gray40", linetype = "dashed", size = 1) + 262 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 263 | labs(x= " ") + 264 | theme_hw 265 | 266 | ga_plot 267 | 268 | ``` 269 | 270 | So, really, we're just going to work with the data before the cutoff for now. This is the data that we're going to "decompose" and, ultimately, use to build a forecast: 271 | 272 | ```{r echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 273 | 274 | # Same plot, with the training data highlighted 275 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 276 | geom_line(aes(y = ga_data_plot$sessions_training), color = "#0060AF", size = 0.75) + 277 | geom_vline(aes(xintercept = cutoff_date), 278 | color = "gray40", linetype = "dashed", size = 1) + 279 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 280 | labs(x= " ") + 281 | theme_hw 282 | 283 | ga_plot 284 | 285 | ``` 286 | 287 | 288 | ### Explaining / Visualizing Time-Series Decomposition 289 | 290 | This step isn't strictly necessary for this exercise, either. But, for pedagogical purposes, we're going to walk through the actual "decomposing" of the training data. It's actually an oversimplification of what _actually_ happens in Holt Winters forecasting, in that Holt Winters applies "exponential smoothing" at each step. This matters and is important, but it's a little easier to intuit than the fundamental decomposition of the time-series in the first place, so that's what we're going to walk through here. 291 | 292 | The plot below shows the decomposition of the actual data into three components: 293 | 294 | * **Seasonal** -- this is a recurring pattern every 7 days, so think of this as each day being the mean value for that weekday across the entire date range (e.g., the Monday value in the **Seasonal** component is simply the mean of the traffic for all Mondays in the data set); in reality, Holt-Winters applies "exponential smoothing" _and_ can take into account a secondary seasonal pattern (like annual seasonality), but that's getting farther into the weeds than is necessary here (and, sure, it's farther in the weeds than I fully grok myself) 295 | * **Trend** -- again, we'll think of this in a simplified manner as simply a moving average of the values that are left _after_ the **Seasonal** values are subtracted from the **Actual** value. Again, it's a little more complicated than that, but that works conceptually, so we'll stick with that. 296 | * **Random** -- this is simply "what's left" after the **Seasonal** and **Trend** values have been subtracted from the **Actual** value 297 | 298 | So, if you're following along, this means that, for any day, the **Seasonal** component plus the **Trend** component plus the **Random** component _exactly equals_ the **Actual** value. 
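To make that additive relationship concrete, here is a small optional check. It is a sketch rather than part of the original walkthrough, and it assumes the `ga_stl` and `ga_data_ts` objects that get created in the chunk below: the three columns of `ga_stl$time.series` should sum back to the actual series (any difference would just be floating-point noise).

```{r decomposition-check, eval=FALSE}
# Sum the seasonal, trend, and remainder components and compare the result to
# the actual values in the time-series object. This should return TRUE.
components <- as.data.frame(ga_stl$time.series)
all.equal(components$seasonal + components$trend + components$remainder,
          as.numeric(ga_data_ts))
```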
299 | 300 | ```{r stl, echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 301 | 302 | # Decompose the time-series data 303 | ga_stl <- stl(ga_data_ts, 304 | s.window = "periodic", 305 | t.window = 7) 306 | 307 | # Convert that to a long format data frame 308 | ga_stl_df <- data.frame(Actual = ga_data_ts %>% as.data.frame()) %>% 309 | cbind(ga_stl$time.series %>% as.data.frame()) %>% 310 | mutate(date = ga_data_training$date) %>% 311 | dplyr::select(date, 312 | Actual = x, 313 | Seasonal = seasonal, 314 | Trend = trend, 315 | Random = remainder) %>% 316 | mutate(date = ga_data_training$date) %>% 317 | gather(key, value, -date) 318 | 319 | # We want to control the order of the output, so make key a factor 320 | ga_stl_df$key <- factor(ga_stl_df$key, 321 | levels = c("Actual", "Seasonal", "Trend", "Random")) 322 | 323 | ## We can "decompose" that data. 324 | 325 | # Plot the values 326 | ga_plot <- ggplot(ga_stl_df, mapping = aes(x = date, y = value, colour = key)) + 327 | geom_line(size = 1) + 328 | facet_grid(key ~ ., scales = "free", switch = "y") + 329 | scale_color_manual(values=c("#0060AF", "#999999", "#999999", "#999999")) + 330 | theme_sparklines 331 | 332 | ga_plot 333 | 334 | ``` 335 | 336 | ### Build the Forecast 337 | 338 | This is the actual building of the forecast. It does all of the time-series decomposition described above (but with a bit more complexity) and builds the model inherently within these function calls. **To be clear**, the time-series decomposition illustrated in the last step is not _exactly_ what is being used for the forecasting, but, conceptually, it's pretty much what is happening under the hood with the `HoltWinters` function. 339 | 340 | ```{r forecast, message=FALSE, warning=FALSE, fig.width=10, fig.height=5} 341 | 342 | # Generate a Holt Winters forecast 343 | hw <- HoltWinters(ga_data_ts) 344 | 345 | # Predict the next 7 days (the 7 days of interest). Go ahead and convert it to a data frame 346 | forecast_sessions <- predict(hw, n.ahead = 7, prediction.interval = T, interval_level = 0.95) %>% 347 | as.data.frame() 348 | 349 | # Add in the dates so we can join this with the original data. We know it was the 7 days 350 | # starting from cutoff_date 351 | forecast_sessions$date <- dates_forecast$date 352 | 353 | # Add these columns to the original data and add a column that IDs anomaly points by 354 | # checking to see if the actual value is outside the upper or lower bounds. If it is, 355 | # put the value. We'll use this to highlight the anomalies. 
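# (Note: the left_join() below is called without an explicit "by" argument, so it
# joins on the one column the two data frames share, which is date.)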
356 | ga_data_plot <- ga_data_plot %>% 357 | left_join(forecast_sessions) %>% 358 | mutate(anomaly = ifelse(sessions_all < lwr | sessions_all > upr, sessions_all, NA)) 359 | 360 | # Figure out the max value (for plotting) and then bump it up a bit 361 | max_y <- max(dplyr::select(ga_data_plot, -date)) 362 | 363 | ``` 364 | 365 | ### Plot the Forecast 366 | 367 | The forecast comes from the first two components of our time-series decomposition: 368 | 369 | * **Seasonal** -- this is simply a recurring cycle, so it can be easily extended (actual details a bit more complicated) 370 | * **Trend** -- this is essentially a moving average, so each "next point" can be calculated based on the average of the previous X points (again...actual details a bit more complicated) 371 | 372 | ```{r echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 373 | 374 | # Same plot, but showing the forecast 375 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 376 | geom_line(aes(y = ga_data_plot$sessions_training), color = "#0060AF", size = 0.75) + 377 | geom_vline(aes(xintercept = cutoff_date), 378 | color = "gray40", linetype = "dashed", size = 1) + 379 | geom_line(aes(y = ga_data_plot$fit), color = "gray50", linetype = "dotted", size = 1) + 380 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 381 | labs(x= " ") + 382 | theme_hw 383 | 384 | ga_plot 385 | 386 | ``` 387 | 388 | 389 | 390 | ### The Forecast Will Not Be Perfect 391 | 392 | We can add a "prediction interval" that is based on the variability of the **Random** component from our time-series decomposition. The more fluctuation there was in that component, the bigger the prediction interval (the farther off from the forecast an actual result can be and still be within an "expected" range). The below chart shows the "95% prediction interval," meaning that, if everything stays pretty much the same, then that is the range we would expect the _actual_ values to fall within 95% of the time. 393 | 394 | ```{r echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 395 | 396 | # If the random component is really large, the prediction interval will be really wide, 397 | # which means it could go below zero or above the max point of the plot. So, correct for 398 | # that. 399 | ga_data_plot <- ga_data_plot %>% 400 | mutate(lwr = ifelse(lwr < 0, 0, lwr), 401 | upr = ifelse(upr > y_max, y_max, upr)) 402 | 403 | # Same plot, with the prediction interval added 404 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 405 | geom_ribbon(aes(ymin = ga_data_plot$lwr, ymax = ga_data_plot$upr), fill = "gray90") + 406 | geom_line(aes(y = ga_data_plot$sessions_training), color = "#0060AF", size = 0.75) + 407 | geom_vline(aes(xintercept = cutoff_date), 408 | color = "gray40", linetype = "dashed", size = 1) + 409 | geom_line(aes(y = ga_data_plot$fit), color = "gray50", linetype = "dotted", size = 1) + 410 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 411 | labs(x= " ") + 412 | theme_hw 413 | 414 | ga_plot 415 | 416 | ``` 417 | 418 | ### Compare the Forecast to the Actuals 419 | 420 | Now, we can layer back in the actual values for those seven days and have some useful context. 
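(Optionally, if you also want a quick numeric summary to go along with the charts, something like the sketch below works; it uses the `anomaly` column created in the forecast chunk above and mirrors the message that the Shiny version of this example displays.)

```{r anomaly-count, eval=FALSE}
# Count how many of the assessed days fell outside the prediction interval.
# The anomaly column is NA for any day that stayed within the interval.
anomaly_count <- sum(!is.na(ga_data_plot$anomaly))
paste0("Anomalies in the last 7 days: ", anomaly_count)
```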
421 | 422 | ```{r echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 423 | 424 | # Same plot, with the actuals shown 425 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 426 | geom_ribbon(aes(ymin = ga_data_plot$lwr, ymax = ga_data_plot$upr), fill = "gray90") + 427 | geom_line(aes(y = ga_data_plot$sessions_all), color = "#0060AF", size = 0.75) + 428 | geom_vline(aes(xintercept = cutoff_date), 429 | color = "gray40", linetype = "dashed", size = 1) + 430 | geom_line(aes(y = ga_data_plot$fit), color = "gray50", linetype = "dotted", size = 1) + 431 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 432 | labs(x= " ") + 433 | theme_hw 434 | 435 | ga_plot 436 | 437 | ``` 438 | 439 | ### Now we have meaningful context! 440 | 441 | ```{r echo=FALSE, message=FALSE, warning = FALSE, fig.height=5, fig.width=10} 442 | 443 | ga_plot <- ggplot(ga_data_plot, mapping = aes(x = date)) + 444 | geom_ribbon(aes(ymin = ga_data_plot$lwr, ymax = ga_data_plot$upr), fill = "gray90") + 445 | geom_line(aes(y = ga_data_plot$sessions_all), color = "#0060AF", size = 0.75) + 446 | geom_line(aes(y = ga_data_plot$fit), color = "gray50", linetype = "dotted", size = 1) + 447 | geom_vline(aes(xintercept = cutoff_date), 448 | color = "gray40", linetype = "dashed", size = 1) + 449 | scale_y_continuous(label=comma, expand = c(0, 0), limits = c(0, y_max)) + 450 | labs(x= " ") + 451 | theme_hw + 452 | if(sum(ga_data_plot$anomaly, na.rm = TRUE) > 0){ 453 | geom_point(aes(y = ga_data_plot$anomaly), color = "#F58220", size = 2.5) 454 | } 455 | 456 | ga_plot 457 | 458 | ``` 459 | 460 | -------------------------------------------------------------------------------- /ga-and-r-examples.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /normal_dist.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Normal Distribution" 3 | output: html_notebook 4 | --- 5 | 6 | A simple normal distribution needed for a visualization in a presentation. 7 | 8 | ```{r fig.width=10, fig.height=5} 9 | library(tidyverse) 10 | library(cowplot) 11 | 12 | 13 | gg <- ggplot(data = data.frame(x = c(-3, 3)), aes(x)) + 14 | stat_function(fun = dnorm, n = 101, args = list(mean = 0, sd = 1)) + ylab("") + 15 | scale_y_continuous(breaks = NULL) + 16 | theme(axis.text = element_blank(), 17 | axis.ticks = element_blank(), 18 | axis.title = element_blank()) 19 | 20 | gg 21 | 22 | 23 | ``` 24 | 25 | -------------------------------------------------------------------------------- /page-analysis-two-metrics/app.R: -------------------------------------------------------------------------------- 1 | # Load the necessary libraries. 2 | library(shiny) 3 | library(googleAuthR) # For authentication 4 | library(googleAnalyticsR) # How we actually get the Google Analytics data 5 | 6 | gar_set_client(web_json = "ga-web-client.json", 7 | scopes = "https://www.googleapis.com/auth/analytics.readonly") 8 | options(googleAuthR.redirect = "https://gilligan.shinyapps.io/page-analysis/") 9 | 10 | library(tidyverse) # Includes dplyr, ggplot2, and others; very key! 
11 | library(knitr) # Nicer looking tables 12 | library(plotly) # We're going to make the charts interactive 13 | library(DT) # Interactive tables 14 | library(scales) # Useful for some number formatting in the visualizations 15 | 16 | # Create a date frame that we'll use to look up the various calculations for different 17 | # metrics. 18 | calcs_df <- data.frame(metric = c("Entrances", "Bounces", "Unique Pageviews", "Pageviews", "Exits", "Bounce Rate", "Exit Rate"), 19 | calculation = c("entrances", "bounces", "uniquePageviews", "pageviews", "exits", "bounces/entrances", "exits/pageviews"), 20 | metric_format = c("integer", "integer", "integer", "integer", "integer", "percentage", "percentage"), 21 | stringsAsFactors = FALSE) 22 | 23 | # Define the base theme for visualizations 24 | theme_base <- theme_light() + 25 | theme(panel.grid = element_blank(), 26 | panel.border = element_blank(), 27 | axis.ticks = element_blank(), 28 | axis.line = element_line(colour = "gray70"), 29 | strip.background = element_rect(fill = "white", color = NA), 30 | strip.text = element_text(colour = "gray20", face = "bold")) 31 | 32 | ## ui.R 33 | ui <- fluidPage(title = "Page Analysis of Two Metrics", 34 | tags$head(includeScript("gtm.js")), 35 | tags$h2("Page Analysis of Two Metrics*"), 36 | tags$div(paste("Select a Google Analytics view and date range and then pull the data. From there, explore combinations", 37 | "of metrics. Depending on the metrics you choose, the pages of interest will be in specific quadrants as", 38 | "opportunity pages or potential problem pages.")), 39 | tags$br(), 40 | sidebarLayout( 41 | sidebarPanel(tags$h4("Select Base Data Parameters"), 42 | # Account/Property/View Selection 43 | authDropdownUI("auth_menu", 44 | inColumns = FALSE), 45 | # Date Range Selection 46 | dateRangeInput("date_selection", 47 | label = "Select date range:", 48 | start = Sys.Date()-30, 49 | end = Sys.Date()-1), 50 | # Page filter (regEx) 51 | textInput("filter_regex", 52 | label = "Enter regEx to filter to the pages of interest:", 53 | value = ".*"), 54 | # Whether or not to enable anti-sampling 55 | checkboxInput("anti_sampling", 56 | label = "Include anti-sampling (slows down app a bit).", 57 | value = TRUE), 58 | # Action button. We want the user to control when the 59 | # underlying call to Google Analytics occurs. 
60 | tags$div(style="text-align: center", 61 | actionButton("query_data", "Get/Refresh Data!", 62 | style="color: #fff; background-color: #337ab7; border-color: #2e6da4")), 63 | tags$hr(), 64 | tags$h4("Explore the Data!"), 65 | selectInput("x_dim", 66 | label = "Metric for the x-axis:", 67 | choices = calcs_df$metric, 68 | selected = "Entrances"), 69 | selectInput("x_scale", 70 | label = "Scale for the x-axis:", 71 | choices = c("Linear", "Logarithmic"), 72 | selected = "Linear"), 73 | selectInput("y_dim", 74 | label = "Metric for the y-axis:", 75 | choices = calcs_df$metric, 76 | selected = "Bounce Rate"), 77 | selectInput("y_scale", 78 | label = "Scale for the y-axis:", 79 | choices = c("Linear", "Logarithmic"), 80 | selected = "Linear"), 81 | sliderInput("num_pages", 82 | label = "# of pages to include (by X metric):", 83 | min = 10, 84 | max = 200, 85 | value = 50, 86 | step = 5)), 87 | mainPanel(tabsetPanel(type = "tabs", 88 | tabPanel("Overall", 89 | tags$br(), 90 | tags$h4(textOutput("corr")), 91 | tags$hr(), 92 | plotlyOutput("plot_overall", height = "800px")), 93 | tabPanel("Channel Grouping and Device Category", 94 | plotlyOutput("plot_facets", height = "800px"))) 95 | ) 96 | ), 97 | tags$hr(), 98 | tags$div("*This app is part of a larger set of apps that demonstrate some uses of R in conjunction", 99 | "with Google Analytics (and Twitter). For the code for this app, as well as an R Notebook", 100 | "that includes more details, see:", tags$a(href = "https://github.com/SDITools/ga-and-r-examples/", 101 | "https://github.com/SDITools/ga-and-r-examples/"),"."), 102 | tags$br() 103 | ) 104 | 105 | ## server.R 106 | server <- function(input, output, session){ 107 | 108 | # Create a non-reactive access token 109 | gar_shiny_auth(session) 110 | 111 | # Populate the Account/Property/View dropdowns and return whatever the 112 | # selected view ID is 113 | view_id <- callModule(authDropdown, "auth_menu", ga.table = ga_account_list) 114 | 115 | # Reactive function to build the page filter object 116 | page_filter <- reactive({ 117 | # Create a dimension filter object. See ?dim_filter() for details. 118 | page_filter_object <- dim_filter("pagePath", 119 | operator = "REGEXP", 120 | expressions = input$filter_regex) 121 | # Now, put that filter object into a filter clause. 122 | filter_clause_ga4(list(page_filter_object), 123 | operator = "AND") 124 | }) 125 | 126 | # Reactive function to pull the data. 127 | get_ga_data <- reactive({ 128 | 129 | # Only pull the data if the "Get Data" button is clicked 130 | input$query_data 131 | 132 | # Pull the data. 133 | isolate(google_analytics(viewId = view_id(), 134 | date_range = input$date_selection, 135 | metrics = c("entrances", "bounces", "pageviews", 136 | "uniquePageviews", "exits"), 137 | dimensions = c("deviceCategory", "channelGrouping", "pagePath"), 138 | dim_filters = page_filter(), 139 | anti_sample = input$anti_sampling)) 140 | }) 141 | 142 | # Reactive function to get the formulas used to calculate the x metric 143 | get_formula_x <- reactive({ 144 | calc_details_x <- calcs_df %>% filter(metric == input$x_dim) 145 | formula_x <- calc_details_x$calculation 146 | }) 147 | 148 | # Reactive function to get the formulas used to calculate the y metric 149 | get_formula_y <- reactive({ 150 | calc_details_y <- calcs_df %>% filter(metric == input$y_dim) 151 | formula_y <- calc_details_y$calculation 152 | }) 153 | 154 | # Reactive function to get the plottable data. 
This includes calculating the metrics 155 | # for x and y dimensions This last step requires NSE,# which, I realize, has moved on beyond 156 | # the "_" notation, but I'll be damned if I could get that to work. 157 | get_data_overall <- reactive({ 158 | 159 | # Get the data 160 | data_pages <- get_ga_data() 161 | 162 | # Get the formulas 163 | formula_x <- get_formula_x() 164 | formula_y <- get_formula_y() 165 | 166 | data_overall <- data_pages %>% group_by(pagePath) %>% 167 | summarise(entrances = sum(entrances), bounces = sum(bounces), pageviews = sum(pageviews), 168 | uniquePageviews = sum(uniquePageviews), exits = sum(exits)) %>% 169 | mutate_(x = formula_x, 170 | y = formula_y) 171 | }) 172 | 173 | # Reactive function to get the top pages by the x-axis value. We'll use this for our overall plot 174 | get_top_pages <- reactive({ 175 | 176 | # Get the overall data (with x and y values calculated) 177 | data_overall <- get_data_overall() 178 | 179 | # Filter down to the num_pages pages 180 | top_pages <- data_overall %>% 181 | arrange(-x) %>% 182 | top_n(input$num_pages, x) %>% 183 | dplyr::select(pagePath, x, y) 184 | }) 185 | 186 | # Reactive function to get the top pages by the x-axis value, but to keep that data with 187 | # the device category and channel grouping breakouts. This will be used for the faceted plot. 188 | get_top_pages_faceted <- reactive({ 189 | 190 | # Get the overall data 191 | data_pages <- get_ga_data() 192 | 193 | # Get the top pages overall 194 | top_pages <- get_top_pages() 195 | 196 | # Get the formulas 197 | formula_x <- get_formula_x() 198 | formula_y <- get_formula_y() 199 | 200 | # Stitch that back to the broken-out (facetable) data 201 | top_pages_faceted <- top_pages %>% 202 | dplyr::select(pagePath) %>% 203 | left_join(data_pages) %>% 204 | mutate_(x = formula_x, y = formula_y) %>% 205 | dplyr::select(deviceCategory, channelGrouping, pagePath, x, y) 206 | }) 207 | 208 | 209 | # Output the correlation coefficient and R-squared 210 | output$corr <- renderText({ 211 | 212 | # Get the top pages 213 | top_pages <- get_top_pages() 214 | 215 | # Calculate the correlation coefficient (r) and the coefficient of determination (r^2) 216 | r <- round(cor(top_pages$x, top_pages$y),2) 217 | r_squared <- round(cor(top_pages$x, top_pages$y)^2,2) 218 | 219 | # Output the result 220 | paste0("These two metrics have a correlation coefficient of ", r, " and an R-squared of ", r_squared, ".") 221 | }) 222 | 223 | # We've got to do a lot of fiddling with the scales for both plots, so we're going 224 | # to put in a reactive function that figures all of that out and returns a list with 225 | # all of the various bits 226 | get_axis_settings <- reactive({ 227 | 228 | # Grab the format for x and y from the data frame 229 | x_format <- filter(calcs_df, metric == input$x_dim) %>% dplyr::select(metric_format) %>% as.character() 230 | y_format <- filter(calcs_df, metric == input$y_dim) %>% dplyr::select(metric_format) %>% as.character() 231 | 232 | # Set up the x and y scales. 
These vary based on the settings in the initial chunk 233 | format_x <- if(x_format == "integer"){comma} else {percent_format(accuracy = 1)} 234 | format_y <- if(y_format == "integer"){comma} else {percent_format(accuracy = 1)} 235 | 236 | # Get the plot scale (linear or log plus the format) for the axes 237 | if(input$x_scale == "Linear"){ 238 | scale_x <- scale_x_continuous(labels = format_x) 239 | } else { 240 | scale_x <- scale_x_log10(labels = format_x) 241 | } 242 | 243 | if(input$y_scale == "Linear"){ 244 | scale_y <- scale_y_continuous(labels = format_y) 245 | } else { 246 | scale_y <- scale_y_log10(labels = format_y) 247 | } 248 | 249 | # Get the (loose) quadrant divider hline and vline locations. These are in different spots 250 | # for the overall and for the faceted version (because the overall is rolled up totals -- which 251 | # means larger max #s) 252 | 253 | # Grab the top pages overall (for overall max/min) and the detailed pages (for the faceted max/min) 254 | top_pages <- get_top_pages() 255 | top_pages_faceted <- get_top_pages_faceted() 256 | 257 | # Set the vline values for the x-axis on both plots 258 | if(input$x_scale == "Linear" | x_format == "percentage"){ 259 | x_vline_overall <- max(top_pages$x, na.rm=TRUE)/2 260 | x_vline_faceted <- max(top_pages_faceted$x, na.rm=TRUE)/2 261 | } else { 262 | x_vline_overall <- max(top_pages$x, na.rm=TRUE) %>% sqrt() 263 | x_vline_faceted <- max(top_pages_faceted$x, na.rm=TRUE) %>% sqrt() 264 | } 265 | 266 | # Set the hline values for the y-axis on both plots 267 | if(input$y_scale == "Linear" | y_format == "percentage"){ 268 | y_hline_overall <- max(top_pages$y, na.rm=TRUE)/2 269 | y_hline_faceted <- max(top_pages_faceted$y, na.rm=TRUE)/2 270 | } else { 271 | y_hline_overall <- max(top_pages$y, na.rm=TRUE) %>% sqrt() 272 | y_hline_faceted <- max(top_pages_faceted$y, na.rm=TRUE) %>% sqrt() 273 | } 274 | 275 | # Pack all of this up into a list to return 276 | axis_settings <- list(format_x = format_x, 277 | format_y = format_y, 278 | scale_x = scale_x, 279 | scale_y = scale_y, 280 | x_vline_overall = x_vline_overall, 281 | x_vline_faceted = x_vline_faceted, 282 | y_hline_overall = y_hline_overall, 283 | y_hline_faceted = y_hline_faceted) 284 | }) 285 | 286 | # Output the overall plot (interactively) 287 | output$plot_overall <- renderPlotly({ 288 | 289 | # Get the top pages 290 | top_pages <- get_top_pages() 291 | 292 | # Get the axis settings 293 | axis_settings <- get_axis_settings() 294 | 295 | # Build the plot 296 | gg <- ggplot(top_pages, mapping = aes(x = x, y = y, text = pagePath)) + 297 | axis_settings$scale_x + 298 | geom_vline(xintercept = axis_settings$x_vline_overall, colour = "gray90") + # vertical line quadrant divider 299 | axis_settings$scale_y + 300 | geom_hline(yintercept = axis_settings$y_hline_overall, colour = "gray90") + # horizontal line quadrant divider 301 | geom_point(colour = "steelblue", alpha = 0.8) + 302 | labs(title = paste0("Page Analysis: Top ", input$num_pages, " Pages by Total ", input$x_dim, ": ", 303 | format(input$date_selection[1]), " to ", format(input$date_selection[2])), 304 | x = input$x_dim, y = input$y_dim) + 305 | theme_base 306 | 307 | ggplotly(gg) %>% layout(autosize=TRUE) 308 | 309 | }) 310 | 311 | # Output the plot broken down by device category and channel grouping 312 | output$plot_facets <- renderPlotly({ 313 | 314 | # Get the top page - faceteds 315 | top_pages_faceted <- get_top_pages_faceted() 316 | 317 | # Get the axis settings 318 | axis_settings <- get_axis_settings() 
319 | 320 | gg <- ggplot(top_pages_faceted, mapping = aes(x = x, y = y, text = pagePath)) + 321 | axis_settings$scale_x + 322 | geom_vline(xintercept = axis_settings$x_vline_faceted, colour = "gray90") + # vertical line quadrant divider 323 | axis_settings$scale_y + 324 | geom_hline(yintercept = axis_settings$y_hline_faceted, colour = "gray90") + # horizontal line quadrant divider 325 | geom_point(colour = "steelblue", alpha = 0.8) + 326 | labs(title = paste0("Page Analysis: Top ", input$num_pages, " Pages by Total ", input$x_dim, ": ", 327 | format(input$date_selection[1]), " to ", format(input$date_selection[2])), 328 | x = input$x_dim, y = input$y_dim) + 329 | facet_grid(channelGrouping ~ deviceCategory, switch = "y") + 330 | theme_base + 331 | theme(panel.border = element_rect(colour = "gray50", fill = NA)) 332 | 333 | ggplotly(gg) %>% layout(autosize=TRUE) 334 | 335 | }) 336 | 337 | } 338 | 339 | # shinyApp(gar_shiny_ui(ui, login_ui = gar_shiny_login_ui), server) 340 | shinyApp(gar_shiny_ui(ui, login_ui = silent_auth), server) 341 | -------------------------------------------------------------------------------- /page-analysis-two-metrics/page-analysis-two-metrics.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Google Analytics Page Analysis (2-Metric Comparison)" 3 | output: html_notebook 4 | --- 5 | 6 | ### General Note 7 | This example is part of a larger set of examples of using Google Analytics with R: 8 | 9 | * For additional examples, downloadable code, and explanations of the overall effort, see: https://github.com/SDITools/ga-and-r-examples 10 | * To see this specific example in Shiny (web/interactive) form, see: https://gilligan.shinyapps.io/page-analysis/ 11 | 12 | ### Overview 13 | 14 | This example compares two metrics at a page-by-page level to identify high/low performers. This is set to pull *Entrances* and *Bounce Rate* and compare them, but additional data is actually included in the query so that the `x_dim` and `y_dim` values in the first bit of code below can be updated to compare other metrics. The [Shiny app for this example](https://gilligan.shinyapps.io/page-analysis/) simply allows the user to choose which metrics to view. 15 | 16 | ```{r config, warning=FALSE, message=FALSE} 17 | 18 | # Load the necessary libraries. 19 | if (!require("pacman")) install.packages("pacman") 20 | pacman::p_load(googleAnalyticsR, # How we actually get the Google Analytics data 21 | tidyverse, # Includes dplyr, ggplot2, and others; very key! 22 | plotly, # For interactive charts 23 | scales) # Useful for some number formatting in the visualizations 24 | 25 | # Authorize GA. Depending on if you've done this already and a .ga-httr-oauth file has 26 | # been saved or not, this may pop you over to a browser to authenticate. 27 | ga_auth(token = ".httr-oauth") 28 | 29 | # Set the view ID and the date range. 30 | view_id <- Sys.getenv("GA_VIEW_ID") 31 | start_date <- Sys.Date() - 31 # 30 days back from yesterday 32 | end_date <- Sys.Date() - 1 # Yesterday 33 | 34 | # Set the number of pages to analyze 35 | num_pages <- 100 36 | 37 | # You likely won't want to do this for the *entire* site. If you do, then simply enter 38 | # ".*" for this value. Otherwise, enter regEx that filters to the pages of interest (you 39 | # can experiment/test your regEx by entering it in the Pages report in the web interface 40 | # for GA). 41 | filter_regex <- "/blog/.+" 42 | 43 | # Set the metrics to use for the x and y dimensions. 
These will be the "nice label" 44 | # dimensions. We'll look up if/how they need to be calculated. 45 | x_dim <- "Entrances" 46 | y_dim <- "Bounce Rate" 47 | 48 | # Set whether the two axes should use a log scale or a standard scale. Set to "log10" 49 | # to use a log scale and "linear" to use a (typical) linear scale. 50 | x_scale <- "log10" 51 | y_scale <- "linear" 52 | 53 | # Create a data frame that we'll use to look up the various calculations for different 54 | # metrics. 55 | calcs_df <- data.frame(metric = c("Entrances", "Bounces", "Unique Pageviews", "Pageviews", "Exits", "Bounce Rate", "Exit Rate"), 56 | calculation = c("entrances", "bounces", "uniquePageviews", "pageviews", "exits", "bounces/entrances", "exits/pageviews"), 57 | metric_format = c("integer", "integer", "integer", "integer", "integer", "percentage", "percentage"), 58 | stringsAsFactors = FALSE) 59 | 60 | # Grab the format for x and y from the data frame 61 | x_format <- filter(calcs_df, metric == x_dim) %>% select(metric_format) %>% as.character() 62 | y_format <- filter(calcs_df, metric == y_dim) %>% select(metric_format) %>% as.character() 63 | 64 | # Define the base theme for visualizations 65 | theme_base <- theme_light() + 66 | theme(panel.grid = element_blank(), 67 | panel.border = element_blank(), 68 | axis.ticks = element_blank(), 69 | axis.line = element_line(colour = "gray70"), 70 | strip.background = element_rect(fill = "white", color = NA), 71 | strip.text = element_text(colour = "gray20", face = "bold")) 72 | 73 | ``` 74 | 75 | ### Get the Data 76 | 77 | Pull the data and do a bit of wrangling of it. 78 | 79 | ```{r get-and-munge-data, warning=FALSE, message=FALSE} 80 | 81 | # Create a dimension filter object. See ?dim_filter() for details. 82 | page_filter_object <- dim_filter("pagePath", 83 | operator = "REGEXP", 84 | expressions = filter_regex) 85 | 86 | # Now, put that filter object into a filter clause. The "operator" argument is moot -- it 87 | # can be AND or OR...but you have to have it be something, even though it doesn't do anything 88 | # when there is only a single filter object. 89 | page_filter <- filter_clause_ga4(list(page_filter_object), 90 | operator = "AND") 91 | 92 | # Pull the pages data. We'll go ahead and pull a bunch of metrics so that 93 | # we won't have to re-pull if we change x_dim and y_dim. 94 | data_pages <- google_analytics(viewId = view_id, 95 | date_range = c(start_date, end_date), 96 | metrics = c("entrances", "bounces", "pageviews", 97 | "uniquePageviews", "exits"), 98 | dimensions = c("deviceCategory", "channelGrouping", "pagePath"), 99 | dim_filters = page_filter, 100 | anti_sample = TRUE) 101 | 102 | # Summarize the data using the metrics of interest -- do this overall and then 103 | # do this broken out by deviceCategory and Channel. 104 | 105 | # Get the formulas 106 | calc_details_x <- calcs_df %>% filter(metric == x_dim) 107 | formula_x <- calc_details_x$calculation 108 | 109 | calc_details_y <- calcs_df %>% filter(metric == y_dim) 110 | formula_y <- calc_details_y$calculation 111 | 112 | # First, we're going to roll everything up (removing deviceCategory and channelGrouping), 113 | # and then we're going to calculate our x and y dimensions. This last step requires NSE, 114 | # which, I realize, has moved on beyond the "_" notation, but I'll be damned if I could get 115 | # that to work.
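# (An aside, and purely a sketch that the rest of this example does not use: with current
# dplyr/rlang, the same calculation can be done without the deprecated mutate_() by parsing
# the formula strings and unquoting them, e.g.:
#   data_pages %>% mutate(x = !!rlang::parse_expr(formula_x),
#                         y = !!rlang::parse_expr(formula_y))
# This hasn't been wired into the code below, so it is left here as a comment only.)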
116 | data_overall <- data_pages %>% group_by(pagePath) %>% 117 | summarise(entrances = sum(entrances), bounces = sum(bounces), pageviews = sum(pageviews), 118 | uniquePageviews = sum(uniquePageviews), exits = sum(exits)) %>% 119 | mutate_(x = formula_x, 120 | y = formula_y) 121 | 122 | # Get the top pages by the x-axis value. We'll use this for our overall plot 123 | top_pages <- data_overall %>% 124 | arrange(-x) %>% 125 | top_n(num_pages, x) %>% 126 | select(pagePath, x, y) 127 | 128 | # Calculate the correlation coefficient (r) and the coefficient of determination (r^2) 129 | r <- cor(top_pages$x, top_pages$y) 130 | r_squared <- r^2 131 | 132 | # Join this back to the original data and then repeat the calculating. This will give us the 133 | # top num_pages overall...but broken out by device category and channel 134 | top_pages_by_devicecat_channel <- top_pages %>% 135 | select(pagePath) %>% 136 | left_join(data_pages) %>% 137 | mutate_(x = formula_x, y = formula_y) %>% 138 | select(deviceCategory, channelGrouping, pagePath, x, y) 139 | 140 | ``` 141 | 142 | ### Visualize the Results 143 | 144 | We'll do one visualization of the traffic overall and another visualization that is faceted. 145 | 146 | The correlation coefficient overall is: **`r r`**. 147 | 148 | The _coefficient of determination_ -- which is more commonly referred to as R^2^ -- is: **`r r^2`**. 149 | 150 | ```{r visualize, warning=FALSE, message=FALSE, fig.width=10, fig.height=6} 151 | 152 | # Set up the x and y scales. These vary based on the settings in the initial chunk 153 | format_x <- if(x_format == "integer"){comma} else {percent_format(accuracy = 1)} 154 | format_y <- if(y_format == "integer"){comma} else {percent_format(accuracy = 1)} 155 | 156 | if(x_scale == "linear"){ 157 | scale_x <- scale_x_continuous(labels = format_x) 158 | x_vline <- max(top_pages$x)/2 159 | } else { 160 | scale_x <- scale_x_log10(labels = format_x) 161 | x_vline <- max(top_pages$x) %>% sqrt() 162 | } 163 | 164 | if(y_scale == "linear"){ 165 | scale_y <- scale_y_continuous(labels = format_y) 166 | y_hline <- max(top_pages$y)/2 167 | } else { 168 | scale_y <- scale_y_log10(labels = format_y) 169 | y_hline <- max(top_pages$y) %>% sqrt() 170 | } 171 | 172 | gg <- ggplot(top_pages, mapping = aes(x = x, y = y, text = pagePath)) + 173 | scale_x + 174 | geom_vline(xintercept = x_vline, colour = "gray90") + # vertical line quadrant divider 175 | scale_y + 176 | geom_hline(yintercept = y_hline, colour = "gray90") + # horizontal line quadrant divider 177 | geom_point(colour = "steelblue", alpha = 0.8) + 178 | labs(title = paste("Page Analysis: Top", num_pages, "Pages by Total", x_dim, "-", start_date, "to", end_date), 179 | x = x_dim, y = y_dim) + 180 | theme_base 181 | 182 | ggplotly(gg) 183 | 184 | ## Same Visualization, but broken down by device category and channel grouping 185 | 186 | gg_facets <- ggplot(top_pages_by_devicecat_channel, mapping = aes(x = x, y = y, text = pagePath)) + 187 | scale_x + 188 | geom_vline(xintercept = x_vline, colour = "gray90") + # vertical line quadrant divider 189 | scale_y + 190 | geom_hline(yintercept = y_hline, colour = "gray90") + # horizontal line quadrant divider 191 | geom_point(colour = "steelblue", alpha = 0.8) + 192 | labs(title = paste("Page Analysis: Top", num_pages, "Pages by Total", x_dim, "-", start_date, "to", end_date), 193 | x = x_dim, y = y_dim) + 194 | facet_grid(channelGrouping ~ deviceCategory, switch = "y") + 195 | theme_base + 196 | theme(panel.border = element_rect(colour = "gray50", fill 
= NA)) 197 | 198 | ggplotly(gg_facets) 199 | 200 | ``` 201 | 202 | 203 | -------------------------------------------------------------------------------- /regression/app.R: -------------------------------------------------------------------------------- 1 | # Linear Regression with Day of Week as Nominal Variable 2 | 3 | # Load the necessary libraries. 4 | library(shiny) 5 | library(googleAuthR) # For authentication 6 | library(googleAnalyticsR) # How we actually get the Google Analytics data 7 | 8 | gar_set_client(web_json = "ga-web-client.json", 9 | scopes = "https://www.googleapis.com/auth/analytics.readonly") 10 | options(googleAuthR.redirect = "https://gilligan.shinyapps.io/regression/") 11 | 12 | library(tidyverse) # Includes dplyr, ggplot2, and others; very key! 13 | library(knitr) # Nicer looking tables 14 | library(plotly) # We're going to make the charts interactive 15 | library(DT) # Interactive tables 16 | library(scales) # Useful for some number formatting in the visualizations 17 | library(MASS) # For stepwise regression 18 | 19 | # Define the base theme for visualizations 20 | theme_base <- theme_light() + 21 | theme(panel.grid = element_blank(), 22 | panel.border = element_blank(), 23 | axis.ticks = element_blank(), 24 | axis.line = element_line(colour = "gray70"), 25 | strip.background = element_rect(fill = "white", color = NA), 26 | strip.text = element_text(colour = "gray20", face = "bold")) 27 | 28 | ## ui.R 29 | ui <- fluidPage(title = "Regression with Day of Week", 30 | tags$head(includeScript("gtm.js")), 31 | tags$h2("Regression with Day of Week*"), 32 | tags$div(paste("Select a Google Analytics view and date range and then pull the data. From there, explore", 33 | "a regression of a categorical/nominal variable -- day of week -- to see to what extent the", 34 | "day of the week is predictive of traffic to the site.")), 35 | tags$br(), 36 | sidebarLayout( 37 | sidebarPanel(tags$h4("Select Base Data Parameters"), 38 | # Account/Property/View Selection 39 | authDropdownUI("auth_menu", 40 | inColumns = FALSE), 41 | # Date Range Selection 42 | dateRangeInput("date_selection", 43 | label = "Select date range:", 44 | start = Sys.Date()-90, 45 | end = Sys.Date()-1), 46 | # Whether or not to enable anti-sampling 47 | checkboxInput("anti_sampling", 48 | label = "Include anti-sampling (slows down app a bit).", 49 | value = TRUE), 50 | # Action button. We want the user to control when the 51 | # underlying call to Google Analytics occurs. 52 | tags$div(style="text-align: center", 53 | actionButton("query_data", "Get/Refresh Data!", 54 | style="color: #fff; background-color: #337ab7; border-color: #2e6da4")), 55 | tags$br(), 56 | tags$hr(), 57 | tags$h4("Choose a Reference Variable!"), 58 | tags$div("It doesn't (really) matter what you choose for the reference variable (the", 59 | "value for the dummy variable that will be dropped when the dummies are created.", 60 | "The model will be (almost) the same (although the coefficients will change!),", 61 | "and the model summary statistics will change (but just a little bit):"), 62 | tags$br(), 63 | selectInput("dummy_select", 64 | label=NA, 65 | choices = c("SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"), 66 | selected = "SAT")), 67 | mainPanel(tabsetPanel(type = "tabs", 68 | tabPanel("Base Data", 69 | tags$br(), 70 | tags$div(paste("This is the base data and a visualization of the data", 71 | "that you queried. 
It should look pretty familiar!")), 72 | tags$br(), 73 | plotlyOutput("base_data_plot", height = "400px"), 74 | tags$br(), 75 | dataTableOutput("base_data_table")), 76 | tabPanel("Boxplot", 77 | tags$br(), 78 | tags$div(paste("Maybe this looks a little different than how you normally", 79 | "think of this data? It's grouping the data by day of week", 80 | "and showing the median and quartiles of the number of sessions", 81 | "recorded for each day.")), 82 | tags$br(), 83 | plotOutput("box_plot", height = "400px")), 84 | tabPanel("Dummies!", 85 | tags$br(), 86 | tags$div(paste("This is the exact same data, except columns have been added", 87 | "for six of seven weekdays with a 1 or 0 assigned based on whether", 88 | "that date is that weekday (we don't need a seventh column,", 89 | "because the seventh day is already implicit -- when all six day", 90 | "columns are all 0s.")), 91 | tags$br(), 92 | dataTableOutput("data_dummies_table")), 93 | tabPanel("Stepwise Summary", 94 | tags$br(), 95 | tags$div(paste("This is the raw summary of a stepwise regression model using", 96 | "the data set on the 'Dummies' tab. It's not super pretty, is it?")), 97 | tags$br(), 98 | verbatimTextOutput("model_summary")), 99 | tabPanel("Plain English", 100 | tags$br(), 101 | tags$div(paste("This is a somewhat easier on the eyes way of interpreting the", 102 | "summary of the model.")), 103 | tags$br(), 104 | tags$b(textOutput("p_value")), 105 | tags$br(), 106 | tags$b(textOutput("adj_r_sq")), 107 | tags$br(), 108 | tags$div("We can represent the model as an equation that looks like this:"), 109 | uiOutput("model_equation"), 110 | tags$div("Or, we can look at a table with the intercept and coefficients for each value:"), 111 | dataTableOutput("ind_vars")), 112 | tabPanel("Visualization", 113 | tags$br(), 114 | tags$div(paste("Let's bring it all together and look at a visualization that shows the", 115 | "actual values and the predicted values:")), 116 | tags$br(), 117 | plotlyOutput("final_plot", height = "400px"), 118 | tags$br(), 119 | tags$div(paste("If you want to try plugging in the numbers yourself, here is the equation:")), 120 | uiOutput("model_equation_2"), 121 | tags$br(), 122 | tags$div(paste("If you're wondering how you multiply a number by weekday, hopefully the table", 123 | "below makes that clear (most of the coefficients get multiplied by 0, but", 124 | "one gets multiplied by 1!")), 125 | tags$br(), 126 | tags$div(paste("And here is the original data (with dummy variables) with the actual and", 127 | "predicted values:")), 128 | tags$br(), 129 | dataTableOutput("predict_vs_actual") 130 | )))), 131 | tags$hr(), 132 | tags$div("*This app is part of a larger set of apps that demonstrate some uses of R in conjunction", 133 | "with Google Analytics (and Twitter). For the code for this app, as well as an R Notebook", 134 | "that includes more details, see:", tags$a(href = "https://github.com/SDITools/ga-and-r-examples/", 135 | "https://github.com/SDITools/ga-and-r-examples/"),"."), 136 | tags$br() 137 | ) 138 | 139 | ## server.R 140 | server <- function(input, output, session){ 141 | 142 | # Create a non-reactive access token 143 | gar_shiny_auth(session) 144 | 145 | # Populate the Account/Property/View dropdowns and return whatever the 146 | # selected view ID is 147 | view_id <- callModule(authDropdown, "auth_menu", ga.table = ga_account_list) 148 | 149 | # Reactive function to pull the data. 
150 | get_ga_data <- reactive({ 151 | 152 | # Only pull the data if the "Get Data" button is clicked 153 | input$query_data 154 | 155 | # Pull the data. Go ahead and shorten the weeday names 156 | isolate(google_analytics(viewId = view_id(), 157 | date_range = input$date_selection, 158 | metrics = "sessions", 159 | dimensions = c("date", "dayOfWeekName"), 160 | anti_sample = input$anti_sampling) %>% 161 | mutate(weekday = substring(dayOfWeekName, 1, 3)) %>% 162 | mutate(weekday = toupper(weekday)) %>% 163 | mutate(weekday = factor(weekday, 164 | levels = c("SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"))) %>% 165 | dplyr::select(date, weekday, sessions)) 166 | }) 167 | 168 | # Reactive function to create the dummy variables 169 | get_dummies <- reactive({ 170 | 171 | # Get the base data 172 | ga_data <- get_ga_data() 173 | 174 | # Set the list of days to include by *dropping* the selected dummy value 175 | wdays <- c("SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT") 176 | include_cols <- wdays[!wdays %in% input$dummy_select] 177 | include_cols <- c("date", include_cols, "sessions") 178 | 179 | # Get the data set up where each day of the week has its own column. That means one of the 180 | # columns will be a "1" and the rest will be "0s" for each date 181 | ga_data_dummies <- ga_data %>% 182 | mutate(var = 1) %>% # Put a 1 in all rows of a column 183 | spread(key = weekday, value = var, fill = 0) %>% # Create the dummy variables 184 | dplyr::select(one_of(include_cols)) # Re-order and drop the selected weekday 185 | }) 186 | 187 | # Get the stepwise model 188 | get_step_model <- reactive({ 189 | 190 | # Get the data and drop the date column 191 | analysis_data <- get_dummies() %>% dplyr::select(-date) 192 | 193 | # Fit the full model 194 | full_model <- lm(sessions ~., data = analysis_data) 195 | 196 | # Now, see if stepwise can knock some things out (it shouldn't in this case, 197 | # but it doesn't hurt to try) 198 | step_model <- stepAIC(full_model, direction = "both", trace = FALSE) 199 | }) 200 | 201 | # Get the model summary. We're going to use this quite a bit 202 | get_model_summary <- reactive({ 203 | step_model <- get_step_model() 204 | model_summary <- summary(step_model) 205 | }) 206 | 207 | # Get the intercept and coefficients in a data frame 208 | get_ind_vars <- reactive({ 209 | 210 | # Get model summary 211 | model_summary <- get_model_summary() 212 | 213 | # Make a data frame with just the coefficients and their confidence levels. This is converting 214 | # the p-values to confidence levels 215 | ind_vars <- model_summary$coefficients %>% as.data.frame() %>% 216 | mutate(Variable = rownames(.)) %>% 217 | mutate(`Confidence Level` = ifelse(`Pr(>|t|)` < 0.01, "99%", 218 | ifelse(`Pr(>|t|)` < 0.05, "95%", 219 | ifelse(`Pr(>|t|)` < 0.1, "90%", 220 | ifelse(`Pr(>|t|)` < 0.2, "80%","<80%"))))) %>% 221 | dplyr::select(Variable, Coefficient = Estimate, `Confidence Level`) 222 | }) 223 | 224 | # Create the model equation. We display it in two spots, so we have to make two outputs. 225 | # So... normally this isn't something that would have it's own reactive function, but it makes sense. 226 | get_model_equation <- reactive({ 227 | 228 | # Get coefficients and intercept 229 | ind_vars <- get_ind_vars() 230 | 231 | # Generate a string that introduces MathJax equation notation so that we have values that 232 | # can be pasted together and output as RMarkdown. This gets a little screwy, but it's 233 | # just so we can output a "normal" looking equation. 
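    # As a purely hypothetical illustration (the numbers here are made up), the finished
    # string ends up looking something like:
    #   "$$Sessions = 2000 + 150 \\times MON + 75 \\times TUE$$"
    # The double backslash is just R string escaping for the MathJax "\times" operator.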
234 | model_equation <- ind_vars %>% 235 | mutate(rmd = ifelse(Variable == "(Intercept)", round(Coefficient), 236 | paste0(round(Coefficient), " \\times ", Variable))) 237 | 238 | # Collapse that into the equation string 239 | model_equation <- model_equation$rmd %>% 240 | paste(collapse = " + ") %>% 241 | paste0("$$Sessions = ", ., "$$") %>% 242 | gsub("\\+\\ \\-", "\\-", .) 243 | }) 244 | 245 | ## Outputs 246 | 247 | # Output the base data table 248 | output$base_data_table <- renderDataTable({ 249 | get_ga_data() %>% 250 | datatable(colnames = c("Date", "Day of Week", "Sessions"), rownames = FALSE) 251 | }) 252 | 253 | # Output the base data plot 254 | output$base_data_plot <- renderPlotly({ 255 | 256 | # Get the data to plot 257 | ga_data <- get_ga_data() 258 | 259 | # Build the plot 260 | gg <- ggplot(ga_data, aes(x = date, y = sessions)) + 261 | geom_line(color = "#00a2b1") + 262 | scale_y_continuous(expand = c(0,0), limits=c(0, max(ga_data$sessions)*1.05), label = comma) + 263 | labs(title = "Sessions by Day", 264 | x = "Date") + 265 | theme_base 266 | 267 | ggplotly(gg) %>% layout(autosize=TRUE) 268 | }) 269 | 270 | # Output the boxplot 271 | output$box_plot <- renderPlot({ 272 | 273 | # Get the data to plot 274 | ga_data <- get_ga_data() 275 | 276 | # Make a boxplot where we combine all of the data points for each day of week together 277 | ggplot(ga_data, aes(x = weekday, y = sessions)) + 278 | geom_boxplot(fill = "gray90", color = "gray40", outlier.colour = "#00a2b1") + 279 | scale_y_continuous(expand = c(0,0), limits=c(0, max(ga_data$sessions)*1.05), label = comma) + 280 | labs(title = "Sessions Variation by Day of Week", y = "Sessions") + 281 | theme_base + 282 | theme(plot.title = element_text(size = 18, hjust = 0.5, face = "bold"), 283 | axis.ticks = element_blank(), 284 | axis.title.x = element_blank(), 285 | axis.title.y = element_text(size = 14, face = "bold"), 286 | axis.line.x = element_line(colour="gray20"), 287 | axis.text = element_text(size = 14)) 288 | }) 289 | 290 | # Output the data with dummy variables 291 | output$data_dummies_table <- renderDataTable({ 292 | get_dummies() %>% 293 | datatable(rownames = FALSE) 294 | }) 295 | 296 | # Output the results of a stepwise regression 297 | output$model_summary <- renderPrint({ 298 | 299 | # Get the model summary 300 | model_summary <- get_model_summary() 301 | 302 | # Output that summary 303 | print(model_summary) 304 | }) 305 | 306 | # Output the p-value and interpretation 307 | output$p_value <- renderText({ 308 | 309 | # Get the model summary 310 | model_summary <- get_model_summary() 311 | 312 | # Get the F-statistic 313 | f_statistic <- model_summary$fstatistic 314 | 315 | # Get the p_value 316 | p_value <- pf(f_statistic[1], f_statistic[2], f_statistic[3],lower.tail=F) 317 | attributes(p_value) <- NULL 318 | 319 | # Format it to be more readable 320 | p_value <- format(round(p_value,4), nsmall = 4) 321 | 322 | # Determine the confidence level 323 | confidence <- ifelse(p_value < 0.01, "99%", 324 | ifelse(p_value < 0.05, "95%", 325 | ifelse(p_value < 0.1, "90%", 326 | ifelse(p_value < 0.2, "80%","<80%")))) 327 | 328 | # Build the statement 329 | confidence <- ifelse(confidence == "<80%", "not statistically significant.", 330 | paste("statistically significant at a", confidence,"confidence level.")) 331 | 332 | result <- paste0("The model has a p-value of ", p_value, ", which means it is ", confidence) 333 | 334 | }) 335 | 336 | # Output the Adjusted R-Squared 337 | output$adj_r_sq <- renderText({ 338 | 339 | # Get 
the model summary 340 | model_summary <- get_model_summary() 341 | 342 | # Get the adjusted R-squared 343 | adj_r_sq <- model_summary$adj.r.squared 344 | 345 | result <- paste0("The model has an adjusted R-squared of ", format(adj_r_sq, digits = 3, nsmall=2), 346 | ", which means that ", format(adj_r_sq*100, digits = 3, nsmall=0), 347 | "% of the variation in sessions is explained by the model.") 348 | }) 349 | 350 | # Output the equation. See: http://shiny.rstudio.com/gallery/mathjax.html 351 | output$model_equation <- renderUI({ 352 | model_equation <- get_model_equation() 353 | # Output the equation 354 | withMathJax(helpText(model_equation)) 355 | }) 356 | 357 | # We need a second version of the same equqtion... 358 | output$model_equation_2 <- renderUI({ 359 | model_equation <- get_model_equation() 360 | # Output the equation 361 | withMathJax(helpText(model_equation)) 362 | }) 363 | 364 | # Output the table of coefficients 365 | output$ind_vars <- renderDataTable({ 366 | 367 | # Get the coefficients table 368 | ind_vars <- get_ind_vars() 369 | 370 | # Round the coefficients 371 | ind_vars <- ind_vars %>% 372 | mutate(Coefficient = round(Coefficient)) 373 | 374 | # Get the intercept and coefficients 375 | datatable(ind_vars, rownames = FALSE, options = list(dom="t")) 376 | }) 377 | 378 | # Output a plot showing the actual vs. predictions 379 | output$final_plot <- renderPlotly({ 380 | 381 | step_model <- get_step_model() 382 | ga_data_dummies <- get_dummies() 383 | ind_vars <- get_ind_vars() 384 | 385 | # Predict the results using the data -- to get a visualization of the results (basically, 386 | # visualizing the residuals). This is just a vector of predicted sessions. 387 | predict_vs_actual <- predict(step_model, ga_data_dummies) 388 | 389 | # Get just the intercept (for a horizontal line we'll add) 390 | y_intercept <- ind_vars %>% filter(Variable == "(Intercept)") %>% 391 | dplyr::select(Coefficient) %>% as.numeric() 392 | 393 | # Add those predictions to a data frame that shows the actuals. We'll hold onto 394 | # this so we can preview it in the output. 395 | predict_vs_actual_df <- ga_data_dummies %>% 396 | cbind(data.frame(`Predicted Sessions` = predict_vs_actual)) 397 | 398 | # Add that y-intercept as a constant to the data to be plotted. geom_hline() 399 | # would be better for this, but I gave up fighting to get the legend I was 400 | # wanting, so hacking around it this way 401 | predict_vs_actual_df <- predict_vs_actual_df %>% 402 | mutate(Intercept = y_intercept) 403 | 404 | # Rename "Sessions" to "Actual Sessions" for clarity 405 | names(predict_vs_actual_df) <- gsub("sessions", "Actual Sessions", names(predict_vs_actual_df)) %>% 406 | gsub("Predicted.Sessions", "Predicted Sessions", .) 407 | 408 | # For cleaner plotting, convert that to a tidy format and then turn it into a ggplot 409 | predict_vs_actual_for_plot <- predict_vs_actual_df %>% 410 | dplyr::select(Date = date, `Actual Sessions`, `Predicted Sessions`, Intercept) %>% 411 | gather(key = metric, value = value, -Date) %>% 412 | # Total hack to add better spacing in the legend 413 | mutate(metric = paste0(metric, " ")) 414 | 415 | 416 | # Get the max value so we can expand the limits 417 | y_lim <- max(predict_vs_actual_for_plot$value) * 1.1 418 | 419 | # Plot the actuals vs. predicted. 
Ideally, this would show up FIRST, but it comes out second 420 | gg_predict_vs_actual <- ggplot(data = predict_vs_actual_for_plot, 421 | mapping = aes(x = Date, y = value, color = metric, linetype = metric)) + 422 | geom_line(size = 0.5) + 423 | scale_color_manual(values=c("#00a2b1", "gray50", "#ed1c24")) + 424 | scale_linetype_manual(name = "limit", values = c("solid", "dashed", "dotted")) + 425 | scale_y_continuous(expand = c(0,0), limits = c(0,y_lim), label=number_format(accuracy=1, big.mark=",")) + 426 | labs(x = "Date", y = "Sessions", title = "Sessions by Day: Actual vs. Predicted") + 427 | theme_base 428 | 429 | # Output the (interactive) chart 430 | ggplotly(gg_predict_vs_actual) %>% layout(legend = list(orientation = "h", 431 | x=0.5, xanchor="center", 432 | y=1.05, yanchor="top")) 433 | 434 | }) 435 | 436 | 437 | 438 | # Output the table with dummy variables and actual vs. predicted 439 | output$predict_vs_actual <- renderDataTable({ 440 | 441 | step_model <- get_step_model() 442 | ga_data_dummies <- get_dummies() 443 | 444 | # Predict the results using the data -- to get a visualization of the results (basically, 445 | # visualizing the residuals). This is just a vector of predicted sessions. 446 | predict_vs_actual <- predict(step_model, ga_data_dummies) 447 | 448 | # Add those predictions to a data frame that shows the actuals. We'll hold onto 449 | # this so we can preview it in the output. 450 | predict_vs_actual_df <- ga_data_dummies %>% 451 | cbind(data.frame(`Predicted Sessions` = predict_vs_actual)) %>% 452 | mutate(`Predicted.Sessions` = round(`Predicted.Sessions`)) 453 | 454 | # Rename "Sessions" to "Actual Sessions" for clarity 455 | names(predict_vs_actual_df) <- gsub("sessions", "Actual Sessions", names(predict_vs_actual_df)) %>% 456 | gsub("Predicted.Sessions", "Predicted Sessions", .) 457 | 458 | # Return the data frame 459 | predict_vs_actual_df %>% 460 | datatable(rownames = FALSE) 461 | }) 462 | 463 | } 464 | 465 | # shinyApp(gar_shiny_ui(ui, login_ui = gar_shiny_login_ui), server) 466 | shinyApp(gar_shiny_ui(ui, login_ui = silent_auth), server) -------------------------------------------------------------------------------- /regression/regression.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Regression -- Quantifying the Impact of Day of Week" 3 | output: html_notebook 4 | --- 5 | 6 | ### General Note 7 | This example is part of a larger set of examples of using Google Analytics with R: 8 | 9 | * For additional examples, downloadable code, and explanations of the overall effort, see: https://github.com/SDITools/ga-and-r-examples 10 | * To see this specific example in Shiny (web/interactive) form, see: https://gilligan.shinyapps.io/regression/ 11 | 12 | ### Overview 13 | 14 | This example is an exploration of using linear regression with categorical/nominal variables. Specifically: with day of week. It will answer two questions: 15 | 16 | * **_What % of traffic (although it could be any metric) is explained simply by which day of the week it is?_** 17 | * **_What is the relative impact of each day of week on the overall traffic?_** 18 | 19 | This is a bit of an odd way to go about this, in that we would typically perform "time-series decomposition" and then look at the details for a "7-day season." In theory, that would return a pretty similar result. 
But, it's instructive to walk through a regression to get an understanding of how to perform regression with categorical (aka, nominal) variables, which "day of week" actually is. 20 | 21 | _Note that this approach -- unlike a time-series decomposition -- does **not** take into account overall trends in the data, such as a general trend of increasing traffic over time. Again, though, that's okay for shorter-term assessments, and for pedagogical purposes._ 22 | 23 | The broader application of this approach probably wouldn't be on "traffic," but, rather, on some form of conversion, and the model would give the expected impact that each session with a given set of characteristics (e.g., a mobile visitor from paid search) would have on that conversion (or other outcome -- such as revenue). 24 | 25 | We're only using "day of week" here because sessions is our dependent variable, and a session is a session. So, a mobile visit from paid search contributes one session just like a tablet visit from organic search contributes one session -- "characteristics of a session" don't impact "the fact that it's a session." "What day of the week it is," though, is an entirely independent variable, so it works to check the degree to which the day of week impacts the number of sessions. 26 | 27 | ### Setup/Config 28 | 29 | Start with the initial configuration of variables. 30 | 31 | ```{r config} 32 | # Load the necessary libraries. 33 | if (!require("pacman")) install.packages("pacman") 34 | pacman::p_load(googleAnalyticsR, # How we actually get the Google Analytics data 35 | tidyverse, # Includes dplyr, ggplot2, and others; very key! 36 | knitr, # Nicer looking tables 37 | kableExtra, # Even nicer looking tables 38 | plotly, # We're going to make the charts interactive 39 | scales, # Useful for some number formatting in the visualizations 40 | MASS) # For our stepwise regression 41 | 42 | # Authorize GA. Depending on if you've done this already and a .httr-oauth file has 43 | # been saved or not, this may pop you over to a browser to authenticate. 44 | ga_auth(token = ".httr-oauth") 45 | 46 | # Set the view ID and the date range. If you want to, you can swap out the Sys.getenv() 47 | # call and just replace that with a hardcoded value for the view ID. 48 | view_id <- Sys.getenv("GA_VIEW_ID") 49 | start_date <- Sys.Date() - 90 # The last 90 days 50 | end_date <- Sys.Date() - 1 # Yesterday 51 | 52 | start_date <- as.Date("2018-10-19") # Hardcoded start date that overrides the relative date above 53 | end_date <- as.Date("2019-01-16") # Hardcoded end date; comment these two lines out to use the relative dates 54 | 55 | # Define our theme -- this will be used later for visualizations 56 | theme_main <- theme_light() + 57 | theme(text = element_text(family="Nunito"), 58 | plot.title = element_text(hjust=0.5, size=18, face="bold"), 59 | legend.title = element_blank(), 60 | axis.text = element_text(size=16), 61 | axis.text.x = element_text(face = "bold", margin = margin(t = 5, unit = "pt")), 62 | axis.line.x = element_line(color = "gray20", size=0.5), 63 | axis.title.y = element_blank(), 64 | axis.title.x = element_blank(), 65 | legend.position = "top", 66 | panel.border = element_blank(), 67 | panel.grid.major.y = element_line(color = "gray80"), 68 | panel.grid.major.x = element_blank(), 69 | panel.grid.minor.x = element_blank()) 70 | 71 | ``` 72 | 73 | ### Pull the Data 74 | 75 | This is a simple data pull. We're just pulling data by day (because each day is a different day of the week) and sessions.
And, since Google Analytics has a "Day of Week Name" dimension, we'll go ahead and pull that, too (although we could have easily calculated this afterwards). 76 | 77 | ```{r data-pull, message=FALSE, warning=FALSE, fig.width=10, fig.height=5} 78 | 79 | # Pull the data. See ?google_analytics_4() for additional parameters. The anti_sample = TRUE 80 | # parameter will slow the query down a smidge and isn't strictly necessary, but it will 81 | # ensure you do not get sampled data. 82 | ga_data <- google_analytics(viewId = view_id, 83 | date_range = c(start_date, end_date), 84 | metrics = "sessions", 85 | dimensions = c("date", "dayOfWeekName"), 86 | anti_sample = TRUE) 87 | 88 | # Shorten the day of week values and make it a factor with and order for easier plotting later 89 | # Check the variability of data by weekday 90 | ga_data <- ga_data %>% 91 | mutate(Weekday = substring(dayOfWeekName, 1, 3)) %>% 92 | mutate(Weekday = toupper(Weekday)) %>% 93 | mutate(Weekday = factor(Weekday, 94 | levels = c("SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT"))) %>% 95 | dplyr::select(date, Weekday, sessions) 96 | 97 | names(ga_data) <- c("Date", "Weekday", "Sessions") 98 | 99 | # Show the head of the returned data 100 | head(ga_data, 12) %>% kable() %>% kable_styling(full_width = F, 101 | font_size = 24, 102 | bootstrap_options = "condensed") %>% 103 | column_spec(1, width = "8em") %>% 104 | column_spec(2, width = "6em") %>% 105 | column_spec(3, width = "6em") 106 | 107 | ``` 108 | 109 | 110 | ```{r base-data-viz, message=FALSE, warning=FALSE, fig.width=10, fig.height=4} 111 | 112 | # Go ahead and do a quick visualization of the resulting data 113 | gg <- ggplot(ga_data, aes(x = Date, y = Sessions)) + 114 | geom_line(color = "#9a258f", size=1) + 115 | scale_y_continuous(expand = c(0,0), limits=c(0, max(ga_data$Sessions)*1.05), label = comma) + 116 | labs(title = "Sessions by Day", 117 | x = "Date") + 118 | theme_main 119 | 120 | gg 121 | 122 | ``` 123 | 124 | This step is not really necessary to do the regression. It's just useful to realize that, essentially, we're looking at all of the dates, organized by weekday, and the median and variability, as well as how many outliers there are, helps with some intuition for later. 125 | 126 | ```{r week-data-viz, message=FALSE, warning=FALSE, fig.width=10, fig.height=5} 127 | 128 | # Make a boxplot where we combine all of the data points for each day of week together 129 | gg <- ggplot(ga_data, aes(x = Weekday, y = Sessions)) + 130 | geom_boxplot(fill = "gray90", color = "gray40", outlier.colour = "#9a258f", outlier.size=2.5) + 131 | scale_y_continuous(expand = c(0,0), limits=c(0, max(ga_data$Sessions)*1.05), label = comma) + 132 | labs(title = "Sessions Variation by Day of Week") + 133 | theme_main 134 | gg 135 | 136 | ``` 137 | 138 | 139 | ### Data Munging 140 | 141 | We need convert each day of week to a binary -- or "dummy" -- variable. This will make sense as we get into it. 142 | 143 | Note that the last step in this munging drops the "Saturday" column entirely. _Any_ day of the week could be excluded. This gets explained by saying, "we only need _k-1_ values," but, to maybe provide a bit more intuition, this really is pointing out that, if there are 7 possible days of the week, then we only need 6 variables to identify each observation (each row of data) by day. In the way this example is set up, we don't need Saturday, because we know that any row that is not Sunday, Monday, Tuesday, Wednesday, Thursday, or Friday _must_ be Saturday. 
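To make the _k-1_ idea a little more concrete, here is a tiny, throwaway sketch (it is not used anywhere else in this example) showing that base R's `model.matrix()` does this same style of encoding automatically -- it treats the first factor level as the reference and only creates columns for the other six days:

```{r dummy_sketch, message=FALSE, warning=FALSE}
# Purely illustrative: encode a toy weekday factor as dummy variables. model.matrix()
# returns an intercept plus k-1 columns -- the first level (SUN here) acts as the reference.
toy <- data.frame(Weekday = factor(c("SUN", "MON", "SAT"),
                                   levels = c("SUN", "MON", "TUE", "WED", "THU", "FRI", "SAT")))
model.matrix(~ Weekday, toy)
```

In the munging below, we build the dummies by hand with `spread()` instead, mainly because that makes the mechanics easier to see in a table.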
144 | 145 | It doesn't really matter _which_ variable you drop (you can update the code to drop a different day and see that, while the resulting model has different coefficients, it actually returns the exact same results). 146 | 147 | ```{r munging, message=FALSE, warning=FALSE} 148 | 149 | # Get the data set up where each day of the week has its own column. That means one of the 150 | # columns will be a "1" and the rest will be "0s" for each date 151 | ga_data_dummies <- ga_data %>% 152 | mutate(var = 1) %>% # Put a 1 in all reows of a column 153 | spread(key = Weekday, value = var, fill = 0) %>% # Create the dummy variables 154 | dplyr::select(Date, SUN, MON, TUE, WED, THU, FRI, Sessions) # Re-order and drop "SAT" 155 | 156 | # Display what the data now looks like 157 | kable(head(ga_data_dummies, 12)) %>% kable_styling(full_width = F, 158 | font_size = 24, 159 | bootstrap_options = "condensed") %>% 160 | column_spec(1, width = "10em") %>% 161 | column_spec(2:7, width = "5em") %>% 162 | column_spec(8, width = "8em") 163 | 164 | # When performing the regression, we don't actually need the date value, so make a data 165 | # set that has that removed 166 | analysis_data <- ga_data_dummies %>% dplyr::select(-Date) 167 | 168 | ``` 169 | 170 | ### Conduct a Stepwise Regression on ALL Data 171 | 172 | This is the "easy" part. We're just going to run a straight-up regression on all of the data. This has some problems, as there will be a tendency to overfit a model. We'll come back to that in a bit! 173 | 174 | This is a "stepwise regression," in that it's going to find the "best" subset of variables for fitting the model -- dropping ones that are not really helping the cause. There are _tons_ of articles and posts explaining the nuts and bolts of this, so Google away! Or, here's one: http://www.sthda.com/english/articles/37-model-selection-essentials-in-r/154-stepwise-regression-essentials-in-r/. 175 | 176 | ```{r stepwise_all, message=FALSE, warning=FALSE} 177 | 178 | # Fit the full model 179 | full_model <- lm(Sessions ~., data = analysis_data) 180 | 181 | # Now, see if stepwise can knock some things out (it shouldn't in this case, 182 | # but it doesn't hurt to try) 183 | step_model <- stepAIC(full_model, direction = "both", trace = FALSE) 184 | 185 | ``` 186 | 187 | ### Evaluate the Model 188 | 189 | Let's look a summary of the model: 190 | 191 | ```{r model_summary, message=FALSE, warning=FALSE} 192 | 193 | # Get just the overall model summary 194 | model_summary <- summary(step_model) 195 | 196 | # Output that summary 197 | model_summary 198 | 199 | ``` 200 | 201 | This _can_ be a bit confusing to read, so let's try to come up with something that, perhaps, is a bit easier to read. 202 | 203 | Start by getting the p-value and the adjusted R-squared for the model overall. 
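As a quick refresher (this is just context for the next chunk; nothing below depends on it), the model's p-value is the upper-tail probability of the F distribution, evaluated at the observed F-statistic with the model's two degrees-of-freedom values:

$$p = P\left(F_{df_1,\,df_2} > F_{observed}\right)$$

That is exactly what the `pf(..., lower.tail = FALSE)` call in the next chunk computes from `model_summary$fstatistic`.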
204 | 205 | ```{r model_stats, message=FALSE, warning=FALSE} 206 | 207 | # Get the p-value and the adjusted R-Squared for the model overall 208 | 209 | # Get the F-statistic 210 | f_statistic <- model_summary$fstatistic 211 | 212 | # Get the p_value 213 | p_value <- pf(f_statistic[1], f_statistic[2], f_statistic[3],lower.tail=F) 214 | attributes(p_value) <- NULL 215 | 216 | # Get the adjusted R-squared 217 | adj_r_sq <- model_summary$adj.r.squared 218 | 219 | ``` 220 | 221 | This shows us that the model overall: 222 | 223 | * Has a p-value of **`r format(round(p_value,4), nsmall = 4)`** 224 | * Has an adjusted R^2^ of **`r format(adj_r_sq, digits = 3, nsmall=2)`** (meaning `r format(adj_r_sq*100, digits = 3, nsmall=0)`% of the actual results are explained by the model). 225 | 226 | Now, let's dig into the actual individual independent variables and their coefficients. This is just a lightly cleaned up visualization of the "variables list" from the model output above. Note that not all days may appear in this table, as the nature of the stepwise regression is that it will add and subtract different features (variables) trying to find the "best" model. 227 | 228 | ```{r coefficients, message=FALSE, warning=FALSE} 229 | 230 | # Make a data frame with just the coefficients and their significance levels. This is converting 231 | # the p-values to confidence levels 232 | ind_vars <- model_summary$coefficients %>% as.data.frame() %>% 233 | mutate(Variable = rownames(.), 234 | Estimate = round(Estimate), 235 | `Confidence Level` = ifelse(`Pr(>|t|)` < 0.01, "99%", 236 | ifelse(`Pr(>|t|)` < 0.05, "95%", 237 | ifelse(`Pr(>|t|)` < 0.1, "90%", 238 | ifelse(`Pr(>|t|)` < 0.2, "80%","<80%"))))) %>% 239 | dplyr::select(Variable, Coefficient = Estimate, `Confidence Level`) 240 | 241 | # Output a table with the independent variables 242 | kable(ind_vars, align = c("l", "r", "c")) %>% 243 | kable_styling(full_width = F, bootstrap_options = "condensed", font_size=24) %>% 244 | column_spec(1, width = "6em") %>% 245 | column_spec(2, width = "6em") %>% 246 | column_spec(3, width = "10em") 247 | 248 | ``` 249 | 250 | It might be helpful to display the above in equation notation -- just to get it mapped back to our days with algebra. 251 | 252 | ```{r model_equation, message=FALSE, warning=FALSE} 253 | 254 | # Generate a string that introduces RMarkdown equation notation so that we have values that 255 | # can be pasted together and output as RMarkdown. This gets a little screwy, but it's 256 | # just so we can output a "normal" looking equation. 257 | model_equation <- ind_vars %>% 258 | mutate(rmd = ifelse(Variable == "(Intercept)", round(Coefficient, 0), 259 | paste0(round(Coefficient, 0), " \\times ", Variable))) 260 | 261 | model_equation <- model_equation$rmd %>% 262 | paste(collapse = " + ") %>% 263 | paste0("$$Sessions = ", ., "$$") 264 | 265 | ``` 266 | 267 | **`r model_equation`** 268 | 269 | But, wait. How do I multiply a number by a _day_?! This gets back to our creation of "dummy variables" earlier. Remember that the data we're working with created a variable for each day of the week and then assigned each row in the table (each day, each "observation") as a `1` (yes, it's that day) or a `0` (no, it's not that day). That means, in the equation above, _most_ of the values will be multipled by `0` for any given day, while one value will be multiplied by `1`. 
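For example -- and these coefficients are purely made up for illustration -- suppose the fitted model happened to be:

$$Sessions = 2000 + 150 \times MON - 300 \times SUN$$

For a Monday, every dummy except $MON$ is `0`, so the prediction is $2000 + 150 \times 1 = 2150$ sessions. For a Saturday -- the day we dropped -- all of the dummies are `0`, and the prediction is just the intercept: $2000$ sessions.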
270 | 271 | ```{r dummy_vars, message=FALSE, warning=FALSE} 272 | 273 | # Re-display what the data looks like with dummy variables 274 | kable(head(ga_data_dummies, 12)) %>% kable_styling(full_width = F, 275 | font_size = 24, 276 | bootstrap_options = "condensed") %>% 277 | column_spec(1, width = "10em") %>% 278 | column_spec(2:7, width = "5em") %>% 279 | column_spec(8, width = "8em") 280 | 281 | ``` 282 | 283 | ### Compare the Model Predictions to the Actuals 284 | 285 | So, what does our model _look like_ when comparing its predictions to the actual data? 286 | 287 | We can simply run the model on the original data set and return a plot that shows the actual vs. predicted values. This is a shortcut to "plotting the residuals" and other checks, but, hopefully, won't give anyone using this too much heartburn. 288 | 289 | Here's your chance to check the math from the equation above with some of the actual data in the new data frame we've created. 290 | 291 | ```{r predict_vs_actual, message=FALSE, warning=FALSE} 292 | 293 | # Predict the results using the data -- to get a visualization of the results (basically, 294 | # visualizing the residuals). This is just a vector of predicted sessions. 295 | predict_vs_actual <- predict(step_model, ga_data_dummies) 296 | 297 | # Add those predictions to a data frame that shows the actuals. We'll hold onto 298 | # this so we can preview it in the output. 299 | predict_vs_actual_df <- ga_data_dummies %>% 300 | cbind(data.frame(`Predicted Sessions` = predict_vs_actual)) %>% 301 | mutate(`Predicted.Sessions` = round(`Predicted.Sessions`)) 302 | 303 | # Rename "Sessions" to "Actual Sessions" for clarity 304 | names(predict_vs_actual_df) <- gsub("Sessions", "Actual Sessions", names(predict_vs_actual_df)) %>% 305 | gsub("Predicted.Actual.Sessions", "Predicted", .) 306 | 307 | # Output the head 308 | kable(head(predict_vs_actual_df, 12)) %>% 309 | kable_styling(full_width = F, bootstrap_options = "condensed", font_size=20) %>% 310 | column_spec(1, width = "60em") %>% 311 | column_spec(2:7, width = "6em") %>% 312 | column_spec(8, width = "80em") %>% 313 | column_spec(9, width = "40em") 314 | 315 | ``` 316 | 317 | For reference (if you want to do some math with the above): 318 | 319 | **`r model_equation`** 320 | 321 | ### Plot the Results 322 | 323 | Finally, let's plot the results showing three things: 324 | 325 | * **The Actual Sessions** 326 | * **The Predicted Sessions** -- what our regression model (which we determined explains `r format(adj_r_sq*100, digits = 3, nsmall=0)`% of the variation in the actual results) predicts 327 | * **The Intercept** -- this is the constant; think of it as the baseline from which each independent variable increases or decreases the prediction 328 | 329 | ```{r plot_result, message=FALSE, warning=FALSE, fig.width = 10, fig.height=4} 330 | 331 | # Get just the intercept (for a horizontal line we'll add) 332 | y_intercept <- ind_vars %>% filter(Variable == "(Intercept)") %>% 333 | dplyr::select(Coefficient) %>% as.numeric() 334 | 335 | # Add that y-intercept as a constant to the data to be plotted.
geom_hline() 336 | # would be better for this, but I gave up fighting to get the legend I was 337 | # wanting, so hacking around it this way 338 | predict_vs_actual_for_plot <- predict_vs_actual_df %>% 339 | mutate(Intercept = y_intercept) 340 | 341 | # For cleaner plotting, convert that to a tidy format and then turn it into a ggplot 342 | predict_vs_actual_for_plot <- predict_vs_actual_for_plot %>% 343 | dplyr::select(Date = Date, `Actual Sessions`, `Predicted`, Intercept) %>% 344 | gather(key = metric, value = value, -Date) %>% 345 | # Total hack to add better spacing in the legend 346 | mutate(metric = paste0(metric, " ")) 347 | 348 | # Get the max value so we can expand the limits 349 | y_lim <- max(predict_vs_actual_for_plot$value) * 1.1 350 | 351 | # Plot the actuals vs. predicted. 352 | gg_predict_vs_actual <- ggplot() + 353 | geom_line(data = predict_vs_actual_for_plot, 354 | mapping = aes(x = Date, y = value, color = metric, 355 | linetype = metric), size = 1) + 356 | # geom_hline(mapping = aes(linetype = "Intercept", yintercept = y_intercept), 357 | # color="#ed1c24") + 358 | scale_color_manual(values=c("#9a258f", "#ed1c24", "gray30" )) + 359 | scale_linetype_manual(values = c("solid", "dashed", "dotted" )) + 360 | scale_y_continuous(expand = c(0,0), limits = c(0,y_lim), label=number_format(accuracy=1, big.mark=",")) + 361 | labs(x = "Date", y = "Sessions", title = "Sessions by Day: Actual vs. Predicted") + 362 | # guides(fill=guide_legend(keywidth=0.8, keyheight=0.1, default.unit="inch")) + 363 | theme_main + 364 | theme(legend.key.width=unit(4,"line"), 365 | legend.text = element_text(size=16), 366 | axis.line.x = element_line(color = "gray20"), 367 | plot.title = element_blank()) 368 | 369 | # Output the chart 370 | gg_predict_vs_actual 371 | 372 | ``` 373 | 374 | So, yes, the **Predicted Sessions** follow a recurring pattern. That's because our categorical variable -- day of week -- only has seven values, so our model is destined to just repeat. 375 | 376 | Of course, if there was another variable thrown in (that had significance), this may not be the case, as this recurring pattern would then have some other factor (or set of factors) layered on top of it that added more variability to the prediction. 377 | 378 | ### A Note on Overfitting 379 | 380 | In this example, we had a relatively small number of variables, and we used stepwise regression to pick the subset (which may be "all") that were most predictive of the actual results. 381 | 382 | We also did not look at interaction effects, because days of week are quite independent of each other. 383 | 384 | Consider if, for instance, we had also used mobile device category and wanted to include the interactions between day of week _and_ device category as a predictor of a result like orders or revenue, then we would have had a _bunch_ more variables (either overtly or behind the scenes). In addition to each day of the week and each device category -- the "main effects," we would have a slew of "interaction effects" to consider as well: `Monday + Mobile`, `Monday + Tablet`, `Monday + Desktop`, `Tuesday + Mobile`, etc. 385 | 386 | Conceivably, we would get to a point pretty easily where we had as many variables as we had observations, which causes all sorts of issues! But, even if things did not get that extreme, having many more variables to consider runs a greater risk of "overfitting the model." 
One way to lessen the risk of this is to use _k_-fold cross validation, which is a mechanism by which the data set is iteratively split into "training" and "testing" groups, and then the different models are run on different sets of training data and then evaluated based on how well they then predict the results in the test data. 387 | 388 | We started to include that in this simple model... but, given the (relative) simplicity of this example, we determined it was not sufficiently illustrative to warrant including. 389 | 390 | -------------------------------------------------------------------------------- /shinyapps-deploy-script.R: -------------------------------------------------------------------------------- 1 | # Use this script -- or some tweaked version of it -- to push the code 2 | # to shinyapps.io. Comment out the deployApp() call that you do NOT 3 | # want to run. This script requires having rsconnect installed, as well 4 | # as having authenticated with a shinyapps.io 5 | 6 | library(rsconnect) 7 | 8 | # Deploy the apps. It's a bit clunky, in that we seem to need 9 | # to manually set the working directory to be the directory where the Shiny app and 10 | # supporting files exist before deploying. Because this R project has multiple 11 | # Shiny apps, and the app file name has to be app.R, each app is in its own directory. 12 | 13 | # Time-Normalized Pageviews 14 | setwd("time-normalized-pageviews") 15 | deployApp(appFiles = c("app.R", "ga-web-client.json", "gtm.js"), 16 | appName = "time-normalized", 17 | appTitle = "Google Analytics - Time-Normalized Pageviews", 18 | forceUpdate = TRUE) 19 | setwd("..") 20 | 21 | # Page Analysis with Two Metrics 22 | setwd("page-analysis-two-metrics") 23 | deployApp(appFiles = c("app.R", "ga-web-client.json", "gtm.js"), 24 | appName = "page-analysis", 25 | appTitle = "Google Analytics - Page Analysis with Two Metrics", 26 | forceUpdate = TRUE) 27 | setwd("..") 28 | 29 | # Site Search Analysis 30 | setwd("site-search-analysis") 31 | deployApp(appFiles = c("app.R", "ga-web-client.json", "gtm.js"), 32 | appName = "site-search", 33 | appTitle = "Google Analytics - Site Search Analysis", 34 | forceUpdate = TRUE) 35 | setwd("..") 36 | 37 | # Regression 38 | setwd("regression") 39 | deployApp(appFiles = c("app.R", "ga-web-client.json", "gtm.js"), 40 | appName = "regression", 41 | appTitle = "Google Analytics - Regression Exploration", 42 | forceUpdate = TRUE) 43 | setwd("..") 44 | 45 | # Forecasting / Anomaly Detection 46 | setwd("forecasting") 47 | deployApp(appFiles = c("app.R", "ga-web-client.json", "gtm.js"), 48 | appName = "forecasting", 49 | appTitle = "Google Analytics - Anomaly Detection with Holt-Winters Forecasting", 50 | forceUpdate = TRUE) 51 | setwd("..") 52 | -------------------------------------------------------------------------------- /site-search-analysis/.gitignore: -------------------------------------------------------------------------------- 1 | .httr-oauth 2 | -------------------------------------------------------------------------------- /site-search-analysis/app.R: -------------------------------------------------------------------------------- 1 | # Load the necessary libraries. 
2 | library(shiny) 3 | library(googleAuthR) # For authentication 4 | library(googleAnalyticsR) # How we actually get the Google Analytics data 5 | 6 | gar_set_client(web_json = "ga-web-client.json", 7 | scopes = "https://www.googleapis.com/auth/analytics.readonly") 8 | options(googleAuthR.redirect = "https://gilligan.shinyapps.io/site-search/") 9 | 10 | library(tidyverse) # Includes dplyr, ggplot2, and others; very key! 11 | library(gridExtra) # Grid / side-by-side plot layoutslibrary 12 | library(knitr) # Nicer looking tables 13 | library(DT) # Interactive tables 14 | library(tidytext) # Tidy text! 15 | library(SnowballC) # Mainly for stemming the search terms 16 | library(wordcloud) # Word cloud creation 17 | library(RColorBrewer) # Get some palettes to use with the word cloud 18 | library(topicmodels) # For the topic modeling using LDA 19 | 20 | ## ui.R 21 | ui <- fluidPage(title = "Site Search Analysis with Google Analytics", 22 | tags$head(includeScript("gtm.js")), 23 | tags$h2("Site Search Analysis with Google Analytics*"), 24 | tags$div(paste("This requires a site that has site search and has the typical configuration of the", 25 | "capture of search terms with Google Analytics. It's purely based on a search volume", 26 | "analysis. Hat tips to Sébastien Brodeur, Nancy Koons, and Julia Silge for their", 27 | "contributions of the ideas that are used here.")), 28 | tags$br(), 29 | sidebarLayout( 30 | sidebarPanel(tags$h4("Select Base Data Parameters"), 31 | # Account/Property/View Selection 32 | authDropdownUI("auth_menu", 33 | inColumns = FALSE), 34 | # Date Range Selection 35 | dateRangeInput("date_selection", 36 | label = "Select date range:", 37 | start = Sys.Date()-30, 38 | end = Sys.Date()-1), 39 | # Whether or not to enable anti-sampling 40 | checkboxInput("anti_sampling", 41 | label = "Include anti-sampling (slows down app a bit).", 42 | value = TRUE), 43 | # Action button. We want the user to control when the 44 | # underlying call to Google Analytics occurs. 45 | tags$div(style="text-align: center", 46 | actionButton("query_data", "Get/Refresh Data!", 47 | style="color: #fff; background-color: #337ab7; border-color: #2e6da4")), 48 | 49 | tags$hr(), 50 | tags$h4("Refine the Results"), 51 | 52 | # Stopword language selection. Allows selecting multiple languages 53 | selectInput("stopwords_lang", 54 | label = "Language(s) for stopwords:", 55 | choices = c("Danish" = "da", 56 | "English" = "en", 57 | "French" = "fr", 58 | "German" = "de", 59 | "Hungarian" = "hu", 60 | "Spanish" = "es"), 61 | multiple = TRUE, 62 | selected = "en"), 63 | 64 | # Additional words to exclude from the word clouds 65 | textInput("exclude_words", 66 | label = paste("Additional words to exclude (from the term-frequency", 67 | "table, word cloud, and topic analysis):", 68 | value = "")) 69 | ), 70 | 71 | mainPanel(tabsetPanel(type = "tabs", 72 | tabPanel("Raw Data", 73 | tags$br(), 74 | tags$div("This is the raw data returned from Google Analytics. It's what you", 75 | "should see if you pull up the standard search terms report in the", 76 | "Google Analytics interface."), 77 | tags$br(), 78 | dataTableOutput("ga_data")), 79 | tabPanel("Question Searches", 80 | tags$br(), 81 | tags$div("These are searches that seemed like straight-up questions (searches", 82 | "that started with 'who,' 'what,' 'why,' 'when,' 'where,' or 'how.' 
Credit", 83 | "to Nancy Koons for this tip!"), 84 | tags$br(), 85 | dataTableOutput("questions")), 86 | tabPanel("Term-Frequency Table", 87 | tags$br(), 88 | tags$div("These are the results after all searches were split into", 89 | "individual words, the words were stemmed and combined,", 90 | "stopwords were removed, and any additional exclusion words", 91 | "were removed. These are the words used to create the word cloud."), 92 | tags$br(), 93 | dataTableOutput("term_frequency")), 94 | tabPanel("Overall Word Cloud", 95 | tags$br(), 96 | tags$div("The word cloud based on the cleaned up data set. Credit to Sébastien", 97 | "Brodeur for the original idea to do this with search terms!"), 98 | tags$br(), 99 | fluidRow( 100 | # Number of topics to suss out of the data 101 | # The minimum word frequency to include in the wordcloud 102 | column(4, sliderInput("min_frequency", 103 | label = "Minimum # of searches to include:", 104 | min = 1, max = 50, value = 2, step = 1)), 105 | column(4, sliderInput("overall_min_size", 106 | label = "Minimum word size:", 107 | min = 1, max = 8, value = 2, step = 0.5)), 108 | column(4, sliderInput("overall_max_size", 109 | label = "Maximum word size:", 110 | min = 1, max = 8, value = 5.5, step = 0.5))), 111 | fluidRow(plotOutput("wordcloud", height = "800px"))), 112 | tabPanel("Topics Word Clouds", 113 | tags$br(), 114 | tags$div("This is a little bit of unsupervised text mining using Latent Dirichlet", 115 | "Allocation (LDA). Based on the number of topics you select below, the LDA", 116 | "model establishes a set of topics and then assigns a probability that each", 117 | "search term in the data set would appear in that topic. The word clouds", 118 | "show the terms that had the highest probability of showing up in each topic", 119 | "(and excludes terms that had less than a 0.1% chance of appearing in any topic."), 120 | tags$br(), 121 | fluidRow( 122 | # Number of topics to suss out of the data 123 | # column(2, selectInput("num_topics", 124 | # label = "# of topics to find:", 125 | # choices = c("2" = 2, "3" = 3, "4" = 4, 126 | # "5" = 5, "6" = 6), selected = 2)), 127 | column(2, sliderInput("num_topics", 128 | label = "# of topics to find:", 129 | min = 2, max = 6, value = 2, step = 1)), 130 | column(3, sliderInput("term_topic_probability", 131 | label = "Strength of term fit:", 132 | min = 0.001, max = 0.01, value = 0.001, step = 0.001)), 133 | column(3, sliderInput("topics_min_frequency", 134 | label = "Minimum # of searches to include:", 135 | min = 1, max = 50, value = 2, step = 1)), 136 | column(2, sliderInput("topics_min_size", 137 | label = "Min. word size:", 138 | min = 1, max = 8, value = 1, step = 0.5)), 139 | column(2, sliderInput("topics_max_size", 140 | label = "Max word size:", 141 | min = 1, max = 8, value = 3.5, step = 0.5))), 142 | fluidRow(plotOutput("topic_wordclouds", height = "600px")) 143 | )))), 144 | tags$hr(), 145 | tags$div("*This app is part of a larger set of apps that demonstrate some uses of R in conjunction", 146 | "with Google Analytics (and Twitter). 
For the code for this app, as well as an R Notebook", 147 | "that includes more details, see:", tags$a(href = "https://github.com/SDITools/ga-and-r-examples/", 148 | "https://github.com/SDITools/ga-and-r-examples/"),"."), 149 | tags$br() 150 | ) 151 | 152 | 153 | 154 | ## server.R 155 | server <- function(input, output, session){ 156 | 157 | # Create a non-reactive access token 158 | gar_shiny_auth(session) 159 | 160 | # Populate the Account/Property/View dropdowns and return whatever the 161 | # selected view ID is 162 | view_id <- callModule(authDropdown, "auth_menu", ga.table = ga_account_list) 163 | 164 | # Reactive function to pull the data. 165 | get_ga_data <- reactive({ 166 | 167 | # Only pull the data if the "Get Data" button is clicked 168 | input$query_data 169 | 170 | # Pull the data. 171 | isolate(google_analytics(viewId = view_id(), 172 | date_range = input$date_selection, 173 | metrics = "searchUniques", 174 | dimensions = "searchKeyword", 175 | anti_sample = input$anti_sampling)) 176 | }) 177 | 178 | # Reactive function to get the cleaned up search data 179 | get_search_data_clean <- reactive({ 180 | 181 | # Get the raw data 182 | ga_data <- get_ga_data() 183 | 184 | # Unnest it -- put each word on its own row and then collapse the individual 185 | # words. This will also make everything lowercase and strip punctuation! 186 | search_data_clean <- ga_data %>% 187 | unnest_tokens(search_term, searchKeyword) %>% 188 | group_by(search_term) %>% 189 | summarise(searches = sum(searchUniques)) %>% 190 | dplyr::select(search_term, searches) %>% 191 | ungroup() %>% 192 | arrange(-searches) 193 | 194 | # Remove the stop words. 1) get the stopwords, 2) remove 'em. There may be 195 | # multiple languages of stopwords selected, so looping through those. 196 | if(length(input$stopwords_lang > 0)){ 197 | for(lang in input$stopwords_lang){ 198 | # Get the stopwords for the language 199 | stop_words <- get_stopwords(language = lang) %>% dplyr::select(word) 200 | search_data_clean <- search_data_clean %>% 201 | anti_join(stop_words, by = c(search_term = "word")) 202 | } 203 | } 204 | 205 | # Convert UTF-8 to ASCII (needed because all hell starts to break loose if you 206 | # try to text-mine multibyte). So, we're going to try to convert everything to 207 | # ASCII. For some...this will fail and return NA. So, we'll then just remove 208 | # the NA rows 209 | search_data_clean <- search_data_clean %>% 210 | mutate(search_term = iconv(search_term, "UTF-8", "ASCII")) %>% 211 | filter(!is.na(search_term)) 212 | 213 | # Perform stemming. 214 | search_data_clean <- search_data_clean %>% 215 | mutate(search_term_stem = wordStem(search_term)) 216 | 217 | # Go ahead and find the most popular un-stemmed word for each stemmed word. 218 | # That will make the results look more "normal" to the casual viewer. We don't want 219 | # to have any ties, so we're going to somewhat arbitrarily break any ties by adding 220 | # the row number / 1000000 to each of the search counts first (We'll toss this later) 221 | search_data_clean_top_term <- search_data_clean %>% 222 | mutate(searches = searches + row_number()/1000000) %>% 223 | group_by(search_term_stem) %>% 224 | top_n(1, searches) %>% 225 | dplyr::select(-searches) 226 | 227 | # Join that back to search data after totalling the searches by the stemmed term. 
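# (Illustration with made-up terms: if "shipping" had 12 searches and "ships" had 5,
# both stem to "ship", so the pipeline below returns a single row with
# search_term_stem = "ship", search_term = "shipping" -- the more popular original
# form -- and searches = 17.)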
228 | search_data_clean <- search_data_clean %>% 229 | group_by(search_term_stem) %>% 230 | summarise(searches = sum(searches)) %>% 231 | left_join(search_data_clean_top_term) %>% 232 | ungroup() %>% 233 | dplyr::select(search_term_stem, search_term, searches) %>% 234 | arrange(-searches) 235 | 236 | # Convert the list of additional exclusion words to a vector. There may or may not be 237 | # spaces after the commas separating the terms. 238 | # Remove any of the exclusion terms that are entered. 239 | if(!is.null(input$exclude_words)){ 240 | # Take the comma-delimited list of terms and split them out to be a 241 | # character vector. The ", ?" regEx is so that this will work with 242 | # or without a space following the comma 243 | exclude_words <- unlist(strsplit(input$exclude_words,", ?")) 244 | 245 | # Remove any additional "remove words" specified 246 | search_data_clean <- search_data_clean %>% 247 | filter(!search_term %in% exclude_words) 248 | } 249 | }) 250 | 251 | # Reactive function to do the LDA topic modeling 252 | get_search_topics_and_terms <- reactive({ 253 | 254 | # Get the cleaned up search data 255 | search_data_clean <- get_search_data_clean() 256 | 257 | # Cast the term frequency matrix into a document term matrix. We're considering this all one 258 | # "document" so we're just hardcoding a "1" for that 259 | search_data_dtm <- search_data_clean %>% 260 | mutate(doc = 1) %>% 261 | cast_dtm(doc, search_term, searches) 262 | 263 | # Run LDA. Setting a seed for reproducibility 264 | search_lda <- LDA(search_data_dtm, k = input$num_topics, control = list(seed = 1120)) 265 | 266 | # Assign a probability of each term being in each of the topics 267 | search_topics <- tidy(search_lda, matrix = "beta") 268 | 269 | # For each term, assign it to the topic for which it has the highest beta. This diverges 270 | # from the approach described at tidytextmining.com, but it seems like a reasonably legit 271 | # thing to do. 272 | search_topics_and_terms <- search_topics %>% 273 | group_by(term) %>% 274 | top_n(1, beta) %>% 275 | ungroup() %>% 276 | arrange(topic, -beta) %>% 277 | left_join(search_data_clean, by = c(term = "search_term")) 278 | }) 279 | 280 | # Output the raw data 281 | output$ga_data <- DT::renderDataTable({ 282 | get_ga_data() %>% 283 | arrange(-searchUniques) %>% 284 | datatable(colnames = c("Search Term", "Unique Searches"), rownames = FALSE) 285 | }) 286 | 287 | # Output the questions 288 | output$questions <- DT::renderDataTable({ 289 | get_ga_data() %>% 290 | arrange(-searchUniques) %>% 291 | filter(grepl("^who|^what|^why|^what|^when|^where|^how.*", searchKeyword)) %>% 292 | datatable(colnames = c("Question", "Searches"), rownames = FALSE) 293 | }) 294 | 295 | # Output the term-frequency table 296 | output$term_frequency <- DT::renderDataTable({ 297 | get_search_data_clean() %>% 298 | dplyr::select(search_term, searches) %>% 299 | datatable(colnames = c("Search Term", "Searches"), 300 | rownames = FALSE) 301 | }) 302 | 303 | # Set a seed for reproducibility 304 | set.seed(1971) 305 | 306 | # Set a color palette 307 | color_palette <- rev(brewer.pal(8,"Spectral")) 308 | 309 | # Output the wordcloud 310 | output$wordcloud <- renderPlot({ 311 | 312 | # Get the search data 313 | search_data_clean <- get_search_data_clean() 314 | 315 | # Generate the word cloud! 
316 | wordcloud(words = search_data_clean$search_term, 317 | freq = search_data_clean$searches, 318 | scale = c(input$overall_max_size, input$overall_min_size), 319 | min.freq = input$min_frequency, 320 | max.words = 500, 321 | random.order = FALSE, 322 | rot.per = .0, 323 | colors = color_palette) 324 | }) 325 | 326 | # This gets a little janky, in that we can't really do recursive / variable 327 | # topic counts. So, instead, we're going to have an output for each of FIVE 328 | # topics, but then have those return empty results if fewer topics are actually 329 | # selected. There *may* be some inefficiencies here, but I couldn't get anything 330 | # moved out of this to avoid the repetition. 331 | 332 | # Topic #1 333 | wordcloud_1 <- reactive({ 334 | # Populate search topics and terms. 335 | topic_data <- get_search_topics_and_terms() %>% 336 | filter(topic == 1 & beta > input$term_topic_probability) 337 | # Generate the word cloud! 338 | wordcloud(words = topic_data$term, freq = topic_data$searches, 339 | scale=c(input$topics_max_size,input$topics_min_size), 340 | min.freq=input$topics_min_frequency, max.words=500, 341 | random.order=FALSE, rot.per=.0, colors=color_palette) 342 | }) 343 | 344 | # Topic #2 345 | wordcloud_2 <- reactive({ 346 | # Populate search topics and terms. 347 | topic_data <- get_search_topics_and_terms() %>% 348 | filter(topic == 2 & beta > input$term_topic_probability) 349 | # Generate the word cloud! 350 | wordcloud(words = topic_data$term, freq = topic_data$searches, 351 | scale=c(input$topics_max_size,input$topics_min_size), 352 | min.freq=input$topics_min_frequency, max.words=500, 353 | random.order=FALSE, rot.per=.0, colors=color_palette) 354 | }) 355 | 356 | # For 3-6, the slider might be set below them, so we have to check to see before trying 357 | # to generate a word cloud 358 | 359 | # Topic #3 360 | wordcloud_3 <- reactive({ 361 | if(input$num_topics >= 3){ 362 | # Populate search topics and terms. 363 | topic_data <- get_search_topics_and_terms() %>% 364 | filter(topic == 3 & beta > input$term_topic_probability) 365 | # Generate the word cloud! 366 | wordcloud(words = topic_data$term, freq = topic_data$searches, 367 | scale=c(input$topics_max_size,input$topics_min_size), 368 | min.freq=input$topics_min_frequency, max.words=500, 369 | random.order=FALSE, rot.per=.0, colors=color_palette) 370 | } else { 371 | NULL 372 | } 373 | }) 374 | 375 | # Topic #4 376 | wordcloud_4 <- reactive({ 377 | if(input$num_topics >= 4){ 378 | # Populate search topics and terms. 379 | topic_data <- get_search_topics_and_terms() %>% 380 | filter(topic == 4 & beta > input$term_topic_probability) 381 | # Generate the word cloud! 382 | wordcloud(words = topic_data$term, freq = topic_data$searches, 383 | scale=c(input$topics_max_size,input$topics_min_size), 384 | min.freq=input$topics_min_frequency, max.words=500, 385 | random.order=FALSE, rot.per=.0, colors=color_palette) 386 | } else { 387 | NULL 388 | } 389 | }) 390 | 391 | # Topic #5 392 | wordcloud_5 <- reactive({ 393 | if(input$num_topics >= 5){ 394 | # Populate search topics and terms. 395 | topic_data <- get_search_topics_and_terms() %>% 396 | filter(topic == 5 & beta > input$term_topic_probability) 397 | # Generate the word cloud! 
398 | wordcloud(words = topic_data$term, freq = topic_data$searches, 399 | scale=c(input$topics_max_size,input$topics_min_size), 400 | min.freq=input$topics_min_frequency, max.words=500, 401 | random.order=FALSE, rot.per=.0, colors=color_palette) 402 | } else { 403 | NULL 404 | } 405 | }) 406 | 407 | # Topic #6 408 | wordcloud_6 <- reactive({ 409 | if(input$num_topics >= 6){ 410 | # Populate search topics and terms. 411 | topic_data <- get_search_topics_and_terms() %>% 412 | filter(topic == 6 & beta > input$term_topic_probability) 413 | # Generate the word cloud! 414 | wordcloud(words = topic_data$term, freq = topic_data$searches, 415 | scale=c(input$topics_max_size,input$topics_min_size), 416 | min.freq=input$topics_min_frequency, max.words=500, 417 | random.order=FALSE, rot.per=.0, colors=color_palette) 418 | } else { 419 | NULL 420 | } 421 | }) 422 | 423 | # Output the grid of wordclouds 424 | output$topic_wordclouds <- renderPlot({ 425 | # Layout out a 2x3 grid with all of the wordclouds and return that as 426 | # the plot. 427 | par(mfrow=c(2,3)) # for 1 row, 2 cols 428 | wordcloud_1() 429 | wordcloud_2() 430 | wordcloud_3() 431 | wordcloud_4() 432 | wordcloud_5() 433 | wordcloud_6() 434 | }) 435 | } 436 | 437 | # shinyApp(gar_shiny_ui(ui, login_ui = gar_shiny_login_ui), server) 438 | shinyApp(gar_shiny_ui(ui, login_ui = silent_auth), server) 439 | -------------------------------------------------------------------------------- /site-search-analysis/site-search-analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Google Analytics - Site Search Exploration" 3 | output: html_document 4 | --- 5 | 6 | 11 | 12 | ### General Note 13 | This example is part of a larger set of examples of using Google Analytics with R: 14 | 15 | * For additional examples, downloadable code, and explanations of the overall effort, see: https://github.com/SDITools/ga-and-r-examples 16 | * To see this specific example in Shiny (web/interactive) form, see: https://gilligan.shinyapps.io/site-search/ 17 | 18 | ### Overview 19 | 20 | This example assumes that site search tracking is enabled on the site in Google Analytics, and it basically does three things: 21 | 22 | * Cleans up the site search data by unnesting terms, removing stopwords, pushing everything to lowercase, and performing stemming 23 | * Generates a term-frequency matrix and a word cloud 24 | * Breaks out the searches into "topics" using LDA (https://www.tidytextmining.com/topicmodeling.html) 25 | 26 | This entire example is, essentially, cobbling together ideas from three specific people, so a hat tip to all three: 27 | 28 | * **[Nancy Koons](https://www.linkedin.com/in/nkoons/)** -- see the last section in [this post from Nancy from 2014](https://nanalytics.wordpress.com/2014/07/14/who-what-where-when-why-how-harnessing-the-power-of-internal-site-search/) 29 | * **[Sébastien Brodeur](https://www.linkedin.com/in/brodseba/)** -- Sébastien presented his experiments of text mining on-site search terms at Superweek in 2017, and that was my introduction to the basics of stopword removal and stemming, which, as this example shows, makes for much more usable text data. 30 | * **[Julia Silge](https://www.linkedin.com/in/juliasilge/)** -- Julia is the co-auther of the [tidytext R package](https://www.tidytextmining.com/) which is wildly less confusing (to me) than dealing with the text mining world of corpora. 
The topic modeling using LDA in this example came directly from her tutorial on the subject. 31 | 32 | So, now, on to the code! 33 | 34 | ## Setup/Config 35 | 36 | ```{r config} 37 | 38 | # Load the necessary libraries. 39 | if (!require("pacman")) install.packages("pacman") 40 | pacman::p_load(googleAnalyticsR, # How we actually get the Google Analytics data 41 | tidyverse, # Includes dplyr, ggplot2, and others; very key! 42 | knitr, # Nicer looking tables 43 | tidytext, # Tidy text! 44 | SnowballC, # Mainly for stemming the search terms 45 | DT, # Make a nice data table 46 | wordcloud, # Word cloud creation 47 | RColorBrewer, # Get some palettes to use with the word cloud 48 | topicmodels) # For the topic modeling using LDA 49 | 50 | # Authorize GA. Depending on if you've done this already and a .ga-httr-oauth file has 51 | # been saved or not, this may pop you over to a browser to authenticate. 52 | ga_auth(token = ".httr-oauth") 53 | 54 | # Set the view ID and the date range. If you want to, you can swap out the Sys.getenv() 55 | # call and just replace that with a hardcoded value for the view ID. 56 | view_id <- Sys.getenv("GA_VIEW_ID_MCLS") 57 | start_date <- Sys.Date() - 30 # The last 30 days 58 | end_date <- Sys.Date() - 1 # Yesterday 59 | 60 | # Minimum # of searches for a term to include in the wordcloud 61 | min_frequency <- 20 62 | 63 | # Set the number of topics to include in the topic model 64 | num_topics <- 3 65 | 66 | # Set the stopwords language 67 | stopwords_lang <- "en" 68 | 69 | # Words to exclude (because they're too dominant to be interesting or are just 70 | # known glitches in the data). This will exclude them from the main word cloud AND 71 | # from the topic modeling 72 | exclude_words <- "" 73 | 74 | # Second set of exclude words for showing the "post exclude" word cloud 75 | exclude_words_2 <- c("frame", "box") 76 | 77 | ``` 78 | 79 | ## Get the Data and Clean It Up 80 | 81 | ```{r get_data, message=FALSE, warning=FALSE} 82 | 83 | # Pull the data 84 | ga_data <- google_analytics(viewId = view_id, 85 | date_range = c(start_date, end_date), 86 | metrics = "searchUniques", 87 | dimensions = "searchKeyword", 88 | order = order_type("searchUniques", sort_order = "DESCENDING", 89 | orderType = "VALUE"), 90 | anti_sample = TRUE) 91 | 92 | 93 | # Go ahead and remove "(other)" if cardinality limits were hit 94 | ga_data <- ga_data %>% filter(searchKeyword != "(other)") 95 | 96 | # Site glitch -- remove "redirect" 97 | ga_data <- ga_data %>% filter(searchKeyword != "redirect") 98 | 99 | ``` 100 | 101 | ## Raw Data Wordcloud 102 | 103 | ```{r wordcloud_raw, message=FALSE, warning=FALSE} 104 | 105 | # Set a seed for reproducibility 106 | set.seed(1971) 107 | 108 | # Set a color palette 109 | color_palette <- rev(brewer.pal(8,"Spectral")) 110 | 111 | # Generate the word cloud! 112 | wordcloud(words = ga_data$searchKeyword, 113 | freq = ga_data$searchUniques, 114 | scale=c(5.5,0.6), 115 | min.freq=min_frequency, 116 | max.words=500, 117 | random.order=FALSE, 118 | rot.per=.0, 119 | colors=color_palette) 120 | 121 | ``` 122 | 123 | ## Clean Up the Data 124 | 125 | This is where the data gets "cleansed:" 126 | 127 | * Unnesting the phrases so there is one word per row 128 | * Removing stopwords 129 | * Stemming 130 | 131 | ```{r clean_data, message=FALSE, warning=FALSE} 132 | 133 | # Unnest it -- put each word on its own row and then collapse the individual 134 | # words. This will also make everything lowercase and strip punctuation! 
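# (For example, with made-up rows: unnest_tokens() turns
#    searchKeyword = "blue widgets", searchUniques = 10
# into two rows -- search_term = "blue" and search_term = "widgets", each still
# carrying searchUniques = 10 -- and the group_by()/summarise() below then totals
# the searches for each individual word across all of the original phrases.)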
135 | search_data <- ga_data %>% 136 | unnest_tokens(search_term, searchKeyword) %>% 137 | group_by(search_term) %>% 138 | summarise(searches = sum(searchUniques)) %>% 139 | dplyr::select(search_term, searches) %>% 140 | ungroup() %>% 141 | arrange(-searches) 142 | 143 | # Remove the stop words. 1) get the stopwords, 2) remove 'em 144 | stop_words <- get_stopwords(language = stopwords_lang) %>% 145 | dplyr::select(word) 146 | 147 | search_data <- search_data %>% 148 | anti_join(stop_words, by = c(search_term = "word")) 149 | 150 | # Convert UTF-8 to ASCII (needed because all hell starts to break loose if you 151 | # try to text-mine multibyte). So, we're going to try to convert everything to 152 | # ASCII. For some...this will fail and return NA. So, we'll then just remove 153 | # the NA rows 154 | search_data <- search_data %>% 155 | mutate(search_term = iconv(search_term, "UTF-8", "ASCII")) %>% 156 | filter(!is.na(search_term)) 157 | 158 | # Perform stemming. 159 | search_data <- search_data %>% 160 | mutate(search_term_stem = wordStem(search_term)) 161 | 162 | # Go ahead and find the most popular un-stemmed word for each stemmed word. 163 | # That will make the results look more "normal" to the casual viewer. We don't want 164 | # to have any ties, so we're going to somewhat arbitrarily break any ties by adding 165 | # the row number / 1000000 to each of the search counts first (We'll toss this later) 166 | search_data_top_term <- search_data %>% 167 | mutate(searches = searches + row_number()/1000000) %>% 168 | group_by(search_term_stem) %>% 169 | top_n(1, searches) %>% 170 | dplyr::select(-searches) 171 | 172 | # Join that back to search data after totalling the searches by the stemmed term. 173 | search_data <- search_data %>% 174 | group_by(search_term_stem) %>% 175 | summarise(searches = sum(searches)) %>% 176 | left_join(search_data_top_term) %>% 177 | ungroup() %>% 178 | dplyr::select(search_term_stem, search_term, searches) %>% 179 | arrange(-searches) 180 | 181 | # Remove any search terms that are now either just a single character or that 182 | # are a numeric 183 | search_data <- search_data %>% 184 | filter(is.na(as.numeric(search_term))) %>% 185 | filter(!grepl("^.$", search_term)) 186 | 187 | # Remove any additional "remove words" specified 188 | search_data <- search_data %>% 189 | filter(!search_term_stem %in% exclude_words) 190 | 191 | # Get rid of the "top term" data frame 192 | rm(search_data_top_term) 193 | 194 | ``` 195 | 196 | ## Show the Original Searches 197 | 198 | ```{r datatable_raw, message=FALSE, warning=FALSE} 199 | 200 | ga_data %>% 201 | arrange(-searchUniques) %>% 202 | datatable(colnames = c("Search Term", "Searches"), rownames = FALSE) %>% 203 | formatCurrency('searchUniques',currency = "", interval = 3, mark = ",", digits=0) 204 | 205 | ``` 206 | 207 | ## Show "Question" Searches 208 | 209 | Searches that started with a question word: who, what why, when, where, how. 210 | 211 | ```{r datatable_questions, message=FALSE, warning=FALSE} 212 | 213 | ga_data %>% 214 | arrange(-searchUniques) %>% 215 | filter(grepl("(^who|^what|^why|^what|^when|^where|^how) .*", searchKeyword)) %>% 216 | datatable(colnames = c("Question", "Searches"), rownames = FALSE) %>% 217 | formatCurrency('searchUniques',currency = "", interval = 3, mark = ",", digits=0) 218 | 219 | ``` 220 | 221 | 222 | ## Make a Term-Frequency Matrix 223 | 224 | This looks similar to the report in Google Analytics, but it's been processed to be the individual words, stemmed, stopwords removed, etc. 
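As a quick illustration of what the stemming step does, here is `SnowballC::wordStem()` applied to a few made-up words (not actual search terms from this data):

```r
library(SnowballC)
wordStem(c("ships", "shipped", "shipping"))
# All three collapse to the same stem: "ship" "ship" "ship"
```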
225 | 226 | ```{r datatable_clean, message=FALSE, warning=FALSE} 227 | 228 | dplyr::select(search_data, search_term, searches) %>% 229 | datatable(colnames = c("Search Term", "Searches"), rownames = FALSE) %>% 230 | formatCurrency('searches',currency = "", interval = 3, mark = ",", digits=0) 231 | 232 | 233 | ``` 234 | 235 | 236 | ## Create a Word Cloud 237 | 238 | A wordcloud based on the cleaned up and unnested words. 239 | 240 | ```{r wordcloud, message=FALSE, warning=FALSE} 241 | 242 | # Generate the word cloud! 243 | wordcloud(words = search_data$search_term, 244 | freq = search_data$searches, 245 | scale=c(5.5,0.6), 246 | min.freq=min_frequency, 247 | max.words=500, 248 | random.order=FALSE, 249 | rot.per=.0, 250 | colors=color_palette) 251 | 252 | ``` 253 | 254 | Generate a word cloud that has removed some additional exclude words. 255 | 256 | ```{r wordcloud_exclude, message=FALSE, warning=FALSE} 257 | 258 | search_data_exclude <- search_data %>% 259 | filter(!search_term %in% exclude_words_2) 260 | 261 | # Generate the word cloud! 262 | wordcloud(words = search_data_exclude$search_term, 263 | freq = search_data_exclude$searches, 264 | scale=c(5.5,0.6), 265 | min.freq=min_frequency, 266 | max.words=500, 267 | random.order=FALSE, 268 | rot.per=.0, 269 | colors=color_palette) 270 | 271 | ``` 272 | 273 | 274 | ## Look for Topics! 275 | 276 | We're going to use Latent Dirichlet allocation (LDA) to try to break out these words into topics. This is basically just following the process outlined for LDA at: https://www.tidytextmining.com/topicmodeling.html. 277 | 278 | ```{r lda, message=FALSE, warning=FALSE} 279 | 280 | # Cast the term frequency matrix into a document term matrix. We're considering this all one 281 | # "document" so we're just hardcoding a "1" for that 282 | search_data_dtm <- search_data %>% 283 | mutate(doc = 1) %>% 284 | cast_dtm(doc, search_term, searches) 285 | 286 | # Run LDA. Setting a seed for reproducibility 287 | search_lda <- LDA(search_data_dtm, k = num_topics, control = list(seed = 1120)) 288 | 289 | # Assign a probability of each term being in each of the topics 290 | search_topics <- tidy(search_lda, matrix = "beta") 291 | 292 | # For each term, assign it to the topic for which it has the highest beta. This diverges 293 | # from the approach described at tidytextmining.com, but it seems like a reasonably legit 294 | # thing to do. 295 | search_topics_and_terms <- search_topics %>% 296 | group_by(term) %>% 297 | top_n(1, beta) %>% 298 | ungroup() %>% 299 | arrange(topic, -beta) %>% 300 | left_join(search_data, by = c(term = "search_term")) 301 | 302 | 303 | # Function to generate a word cloud based on the topic ID passed in 304 | generate_topic_wordcloud <- function(topic_id){ 305 | 306 | # Filter the data to be just the topic and to 307 | # knock out terms with a reallllly low beta 308 | topic_data <- search_topics_and_terms %>% 309 | filter(topic == topic_id & 310 | beta > 0.001) 311 | 312 | # Generate the word cloud! 
313 | wordcloud(words = topic_data$term, 314 | freq = topic_data$searches, 315 | scale=c(3.5,1), 316 | min.freq=min_frequency, 317 | max.words=500, 318 | random.order=FALSE, 319 | rot.per=.0, 320 | colors=color_palette) 321 | } 322 | 323 | # Call the function for each topic ID 324 | topic_wordclouds <- map(seq(1:num_topics), generate_topic_wordcloud) 325 | 326 | 327 | ``` 328 | 329 | -------------------------------------------------------------------------------- /time-normalized-pageviews/app.R: -------------------------------------------------------------------------------- 1 | # Load the necessary libraries. 2 | library(shiny) 3 | library(googleAuthR) # For authentication 4 | library(googleAnalyticsR) # How we actually get the Google Analytics data 5 | 6 | gar_set_client(web_json = "ga-web-client.json", 7 | scopes = "https://www.googleapis.com/auth/analytics.readonly") 8 | options(googleAuthR.redirect = "https://gilligan.shinyapps.io/time-normalized/") 9 | 10 | library(tidyverse) # Includes dplyr, ggplot2, and others; very key! 11 | library(knitr) # Nicer looking tables 12 | library(plotly) # We're going to make the charts interactive 13 | library(DT) # Interactive tables 14 | library(scales) # Useful for some number formatting in the visualizations 15 | 16 | ## ui.R 17 | ui <- fluidPage(title = "Time-Normalized Pageviews", 18 | tags$head(includeScript("gtm.js")), 19 | tags$h2("Time-Normalized Pageviews*"), 20 | sidebarLayout( 21 | sidebarPanel(tags$h4("Select Base Data Parameters"), 22 | # Account/Property/View Selection 23 | authDropdownUI("auth_menu", 24 | inColumns = FALSE), 25 | # Date Range Selection 26 | dateRangeInput("date_selection", 27 | label = "Select date range:", 28 | start = Sys.Date()-365, 29 | end = Sys.Date()-1), 30 | # Page filter (regEx) 31 | textInput("filter_regex", 32 | label = "Enter regEx to filter to the pages of interest:", 33 | value = ".*"), 34 | # Whether or not to enable anti-sampling 35 | checkboxInput("anti_sampling", 36 | label = "Include anti-sampling (slows down app a bit).", 37 | value = TRUE), 38 | # Action button. We want the user to control when the 39 | # underlying call to Google Analytics occurs. 40 | tags$div(style="text-align: center", 41 | actionButton("query_data", "Get/Refresh Data!", 42 | style="color: #fff; background-color: #337ab7; border-color: #2e6da4")), 43 | tags$hr(), 44 | tags$h4("Additional Settings"), 45 | # The minimum number of pageviews required for a page to be deemed "live" 46 | sliderInput("first_day_pageviews_min", 47 | label = "Minimum # of pageviews on a given day for the page to be deemed 'launched':", 48 | min = 5, 49 | max = 100, 50 | value = 10, 51 | step = 5), 52 | # The number of pages to include in the output 53 | sliderInput("total_pages_included", 54 | label = "Total pages to include in the output:", 55 | min = 2, 56 | max = 50, 57 | value = 20, 58 | step = 2), 59 | # The number of "days from launch" to assess 60 | sliderInput("days_live_range", 61 | label = "# of days post-launch to include:", 62 | min = 10, 63 | max = 120, 64 | value = 30, 65 | step = 10)), 66 | mainPanel(tags$h3("Results"), 67 | tags$hr(), 68 | plotlyOutput("upvs_by_day"), 69 | tags$hr(), 70 | plotlyOutput("upvs_by_day_cum"))), 71 | tags$hr(), 72 | tags$div("*This app is part of a larger set of apps that demonstrate some uses of R in conjunction", 73 | "with Google Analytics (and Twitter). 
For the code for this app, as well as an R Notebook", 74 | "that includes more details, see:", tags$a(href = "https://github.com/SDITools/ga-and-r-examples/", 75 | "https://github.com/SDITools/ga-and-r-examples/"),"."), 76 | tags$br() 77 | ) 78 | 79 | 80 | ## server.R 81 | server <- function(input, output, session){ 82 | 83 | # Create a non-reactive access token 84 | gar_shiny_auth(session) 85 | 86 | # Populate the Account/Property/View dropdowns and return whatever the 87 | # selected view ID is 88 | view_id <- callModule(authDropdown, "auth_menu", ga.table = ga_account_list) 89 | 90 | # Reactive function to build the page filter object 91 | page_filter <- reactive({ 92 | # Create a dimension filter object. See ?dim_filter() for details. 93 | page_filter_object <- dim_filter("pagePath", 94 | operator = "REGEXP", 95 | expressions = input$filter_regex) 96 | # Now, put that filter object into a filter clause. 97 | filter_clause_ga4(list(page_filter_object), 98 | operator = "AND") 99 | }) 100 | 101 | # Reactive function to pull the data. 102 | get_ga_data <- reactive({ 103 | 104 | # Only pull the data if the "Get Data" button is clicked 105 | input$query_data 106 | 107 | # Pull the data. 108 | isolate(google_analytics(viewId = view_id(), 109 | date_range = input$date_selection, 110 | metrics = "uniquePageviews", 111 | dimensions = c("date","pagePath"), 112 | dim_filters = page_filter(), 113 | anti_sample = input$anti_sampling)) 114 | }) 115 | 116 | # Function to do the data normalization 117 | ga_data_normalized <- reactive({ 118 | 119 | # Don't try to normalize anything until there is data to be worked with 120 | get_ga_data() 121 | 122 | # Get the data from GA 123 | ga_data <- get_ga_data() 124 | 125 | # Function to filter and then normalize a single page 126 | normalize_date_start <- function(page){ 127 | 128 | # Filter all the data to just be the page being processed 129 | ga_data_single_page <- ga_data %>% filter(pagePath == page) 130 | 131 | # Find the first value in the result that is greater than first_day_pageviews_min. 132 | first_live_row <- min(which(ga_data_single_page$uniquePageviews > input$first_day_pageviews_min)) 133 | 134 | # If the "first_live_row" is Inf, then none of the traffic for the page on any given day 135 | # exceeded the first_day_pageviews_min value, so we're going to go ahead and exit the function 136 | # with a NULL result 137 | if(first_live_row == Inf){return(NULL)} 138 | 139 | # Filter the data to start with that page 140 | ga_data_single_page <- ga_data_single_page[first_live_row:nrow(ga_data_single_page),] 141 | 142 | # As the content ages, there may be days that have ZERO traffic. Those days won't show up as 143 | # rows at all in our data. So, we actually need to create a data frame that includes 144 | # all dates in the range from the "launch" until the last day traffic was recorded. 145 | normalized_results <- data.frame(date = seq.Date(from = min(ga_data_single_page$date), 146 | to = max(ga_data_single_page$date), 147 | by = "day"), 148 | days_live = seq(min(ga_data_single_page$date): 149 | max(ga_data_single_page$date)), 150 | page = page, 151 | stringsAsFactors = FALSE) %>% 152 | 153 | # Join back to the original data to get the uniquePageviews 154 | left_join(ga_data_single_page) %>% 155 | 156 | # Replace the "NAs" (days in the range with no uniquePageviews) with 0s (because 157 | # that's exactly what happened on those days!) 
158 | mutate(uniquePageviews = ifelse(is.na(uniquePageviews), 0, uniquePageviews)) %>% 159 | 160 | # We're going to plot both the daily pageviews AND the cumulative total pageviews, 161 | # so let's add the cumulative total 162 | mutate(cumulative_uniquePageviews = cumsum(uniquePageviews)) %>% 163 | 164 | # Grab just the columns we need for our visualization! 165 | dplyr::select(page, days_live, uniquePageviews, cumulative_uniquePageviews) 166 | } 167 | 168 | # We want to run the function above on each page in our data set. 169 | pages_list <- ga_data %>% 170 | group_by(pagePath) %>% summarise(total_traffic = sum(uniquePageviews)) %>% 171 | top_n(input$total_pages_included) 172 | 173 | # The first little bit of magic can now occur. We'll run our normalize_date_start function on 174 | # each value in our list of pages and get a data frame back that has our time-normalized 175 | # traffic by page! 176 | ga_data_normalized <- map_dfr(pages_list$pagePath, normalize_date_start) 177 | 178 | # We specified earlier -- in the `days_live_range` object -- how many "days from launch" we 179 | # actually want to include, so let's do one final round of filtering to only include those rows. 180 | ga_data_normalized <- ga_data_normalized %>% filter(days_live <= input$days_live_range) 181 | 182 | return(ga_data_normalized) 183 | }) 184 | 185 | # The actual visualizations and their output 186 | 187 | # Unique Pageviews by Day 188 | output$upvs_by_day <- renderPlotly({ 189 | 190 | # Make sure we've got data to work with 191 | req(ga_data_normalized()) 192 | 193 | # Get the plot data 194 | ga_data_normalized <- ga_data_normalized() 195 | 196 | # Create the plot 197 | gg <- ggplot(ga_data_normalized, mapping=aes(x = days_live, y = uniquePageviews, color=page)) + 198 | geom_line() + # The main "plot" operation 199 | scale_y_continuous(labels=comma) + # Include commas in the y-axis numbers 200 | labs(title = "Unique Pageviews by Day from Launch", 201 | x = "# of Days Since Page Launched", 202 | y = "Unique Pageviews") + 203 | theme_light() + # Clean up the visualization a bit 204 | theme(panel.grid = element_blank(), 205 | panel.border = element_blank(), 206 | legend.position = "none", 207 | panel.grid.major.y = element_line(color = "gray80"), 208 | axis.ticks = element_blank()) 209 | 210 | # Output the plot. We're wrapping it in ggplotly so we will get some interactivity in the plot. 211 | ggplotly(gg) %>% layout(autosize=TRUE) 212 | }) 213 | 214 | # Unique Pageviews by Day - Cumulative from launch 215 | output$upvs_by_day_cum <- renderPlotly({ 216 | 217 | # Make sure we've got data to work with 218 | req(ga_data_normalized()) 219 | 220 | # Get the plot data 221 | ga_data_normalized <- ga_data_normalized() 222 | 223 | gg <- ggplot(ga_data_normalized, mapping=aes(x = days_live, y = cumulative_uniquePageviews, color=page)) + 224 | geom_line() + # The main "plot" operation 225 | scale_y_continuous(labels=comma) + # Include commas in the y-axis numbers 226 | labs(title = "Cumulative Unique Pageviews by Day from Launch", 227 | x = "# of Days Since Page Launched", 228 | y = "Cumulative Unique Pageviews") + 229 | theme_light() + # Clean up the visualization a bit 230 | theme(panel.grid = element_blank(), 231 | panel.border = element_blank(), 232 | legend.position = "none", 233 | panel.grid.major.y = element_line(color = "gray80"), 234 | axis.ticks = element_blank()) 235 | 236 | # Output the plot. We're wrapping it in ggplotly so we will get some interactivity in the plot. 
237 | ggplotly(gg) %>% layout(autosize=TRUE) 238 | }) 239 | } 240 | 241 | shinyApp(gar_shiny_ui(ui, login_ui = silent_auth), server) 242 | -------------------------------------------------------------------------------- /time-normalized-pageviews/time-normalized-pageviews.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Time-Normalized Pageviews" 3 | output: html_notebook 4 | --- 5 | 6 | ### General Note 7 | This example is part of a larger set of examples of using Google Analytics with R: 8 | 9 | * For additional examples, downloadable code, and explanations of the overall effort, see: https://github.com/SDITools/ga-and-r-examples 10 | * To see this specific example in Shiny (web/interactive) form, see: https://gilligan.shinyapps.io/time-normalized/ 11 | 12 | ### Overview 13 | 14 | This example is lifted straight from http://www.dartistics.com/googleanalytics/int-time-normalized.html, likely in incredibly poor form. But, it needed just enough tweaking that it seemed worth putting a slightly different version here. 15 | 16 | **Note: You need to follow the instructions on this page -- http://www.dartistics.com/googleanalytics/setup.html -- before proceeding in order for this code to work.** 17 | 18 | This example is best suited for a content-driven site or section of the site. For instance, if you push out a blog post once a week, then chances are that you see traffic to that post jump immediately after it is pushed out (as you promote it and as it pops up on the radar of fans and followers of your site/brand), and then traffic to the post tapers off at some rate. Suppose you have a hypothesis that some of your blog posts actually have greater "staying power" -- they may or may not have as great of an initial jump in traffic, but they "settle out" getting on-going traffic at a greater rate than other posts. 19 | 20 | So, this post pulls daily data for a bunch of pages, then tries to detect their launch date, time-normalizes the traffic for each page based on that presumed launch date, and then plots the daily traffic from "time 0" on out, as well as overall cumulative traffic. 21 | 22 | # Setup/Config 23 | 24 | ```{r config} 25 | 26 | # Load the necessary libraries. 27 | if (!require("pacman")) install.packages("pacman") 28 | pacman::p_load(googleAnalyticsR, # How we actually get the Google Analytics data 29 | tidyverse, # Includes dplyr, ggplot2, and others; very key! 30 | knitr, # Nicer looking tables 31 | kableExtra, # Still nicer looking tables! 32 | plotly, # We're going to make the charts interactive 33 | scales) # Useful for some number formatting in the visualizations 34 | 35 | # Authorize GA. Depending on if you've done this already and a .ga-httr-oauth file has 36 | # been saved or not, this may pop you over to a browser to authenticate. 37 | ga_auth(token = ".httr-oauth") 38 | 39 | # Set the view ID and the date range. If you want to, you can swap out the Sys.getenv() 40 | # call and just replace that with a hardcoded value for the view ID. 41 | view_id <- Sys.getenv("GA_VIEW_ID") 42 | start_date <- Sys.Date() - 365 # The last year 43 | end_date <- Sys.Date() - 1 # Yesterday 44 | 45 | # You likely won't want to do this for the *entire* site. If you do, then simply enter 46 | # ".*" for this value. Otherwise, enter regEx that filters to the pages of interest (you 47 | # can experiment/test your regEx by entering it in the Pages report in the web interface 48 | # for GA). 
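# (A couple of hypothetical patterns: "^/blog/.+" limits the pull to blog posts,
# while "^/(news|articles)/.+" would cover two content sections. The value below is
# just an example -- adjust it to your own site's URL structure.)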
49 | filter_regex <- "/blog/.+" 50 | 51 | # We're going to have R try to figure out when a page actually launched by finding the 52 | # first day (in the data that is pulled) where the page had at least X unique pageviews. 53 | # So, we're going to set "X" here. This may be something to fiddle around with for your 54 | # site (the larger the site, the larger this number can be). 55 | first_day_pageviews_min <- 5 56 | 57 | # Set how many pages to include. This will include the top X pages by total unique 58 | # pageviews 59 | total_pages_included <- 20 60 | 61 | # Finally, we want to set how many "days from launch" we want to include in our display. 62 | days_live_range <- 30 63 | ``` 64 | 65 | # Pull the Data 66 | 67 | This _will_ require a little bit of editing for your site, in that you will need to edit the filter definition to limit the data to the subset of pages on _your_ site that you want to compare. 68 | 69 | ```{r data-pull, message=FALSE, warning=FALSE} 70 | 71 | # Create a dimension filter object. See ?dim_filter() for details. 72 | page_filter_object <- dim_filter("pagePath", 73 | operator = "REGEXP", 74 | expressions = filter_regex) 75 | 76 | # Now, put that filter object into a filter clause. The "operator" argument is moot -- it 77 | # can be AND or OR...but you have to have it be something, even though it doesn't do anything 78 | # when there is only a single filter object. 79 | page_filter <- filter_clause_ga4(list(page_filter_object), 80 | operator = "AND") 81 | 82 | # Pull the data. See ?google_analytics_4() for additional parameters. The anti_sample = TRUE 83 | # parameter will slow the query down a smidge and isn't strictly necessary, but it will 84 | # ensure you do not get sampled data. 85 | ga_data <- google_analytics(viewId = view_id, 86 | date_range = c(start_date, end_date), 87 | metrics = "uniquePageviews", 88 | dimensions = c("date","pagePath"), 89 | dim_filters = page_filter, 90 | anti_sample = TRUE) 91 | 92 | # Go ahead and do a quick inspection of the data that was returned. This isn't required, 93 | # but it's a good check along the way. 94 | kable(head(ga_data)) %>% kable_styling() 95 | ``` 96 | 97 | # Data Munging 98 | 99 | Here's where we're going to have some fun. We're going to need to find the "first day of meaningful traffic" (the first day in the data set that each page has at least `first_day_pageviews_min` unique pageviews). 100 | 101 | ```{r munging, message=FALSE, warning=FALSE} 102 | 103 | # Find the first date for each. This is actually a little tricky, so we're going to write a 104 | # function that takes each page as an input, filters the data to just include those 105 | # pages, finds the first page, and then puts a "from day 1" count on that data and 106 | # returns it. 107 | normalize_date_start <- function(page){ 108 | 109 | # Filter all the data to just be the page being processed 110 | ga_data_single_page <- ga_data %>% filter(pagePath == page) 111 | 112 | # Find the first value in the result that is greater than first_day_pageviews_min. In many 113 | # cases, this will be the first row, but, if there has been testing/previews before it 114 | # actually goes live, some noise may sneak in where the page may have been live, technically, 115 | # but wasn't actually being considered live. 
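# (Toy example: if a page's daily uniquePageviews were c(1, 3, 12, 40, 35) and
# first_day_pageviews_min were 5, then which(... > 5) returns c(3, 4, 5) and min()
# of that is 3 -- so the third day in the data is treated as the launch day.)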
116 | first_live_row <- min(which(ga_data_single_page$uniquePageviews > first_day_pageviews_min)) 117 | 118 | # If the "first_live_row" is Inf, then none of the traffic for the page on any given day 119 | # exceeded the first_day_pageviews_min value, so we're going to go ahead and exit the function 120 | # with a NULL result 121 | if(first_live_row == Inf){return(NULL)} 122 | 123 | # Filter the data to start with that page 124 | ga_data_single_page <- ga_data_single_page[first_live_row:nrow(ga_data_single_page),] 125 | 126 | # As the content ages, there may be days that have ZERO traffic. Those days won't show up as 127 | # rows at all in our data. So, we actually need to create a data frame that includes 128 | # all dates in the range from the "launch" until the last day traffic was recorded. There's 129 | # a little trick here where we're going to make a column with a sequence of *dates* (date) and, 130 | # with a slightly different "seq," a "days_live" that corresponds with each date. 131 | normalized_results <- data.frame(date = seq.Date(from = min(ga_data_single_page$date), 132 | to = max(ga_data_single_page$date), 133 | by = "day"), 134 | days_live = seq(min(ga_data_single_page$date): 135 | max(ga_data_single_page$date)), 136 | page = page, 137 | stringsAsFactors = FALSE) %>% 138 | 139 | # Join back to the original data to get the uniquePageviews 140 | left_join(ga_data_single_page) %>% 141 | 142 | # Replace the "NAs" (days in the range with no uniquePageviews) with 0s (because 143 | # that's exactly what happened on those days!) 144 | mutate(uniquePageviews = ifelse(is.na(uniquePageviews), 0, uniquePageviews)) %>% 145 | 146 | # We're going to plot both the daily pageviews AND the cumulative total pageviews, 147 | # so let's add the cumulative total 148 | mutate(cumulative_uniquePageviews = cumsum(uniquePageviews)) %>% 149 | 150 | # Grab just the columns we need for our visualization! 151 | dplyr::select(page, days_live, uniquePageviews, cumulative_uniquePageviews) 152 | } 153 | 154 | # We want to run the function above on each page in our data set. So, we need to get a list 155 | # of those pages. We don't want to include pages with low traffic overall, which we set 156 | # earlier as the 'total_unique_pageviews_cutoff' value, so let's also filter our 157 | # list to only include the ones that exceed that cutoff. Alternatively, with a slight 158 | # adjustment to use `top_n()`, this could also be a spot where you simply select the 159 | # total number of pages to include in the visualization. 160 | pages_list <- ga_data %>% 161 | group_by(pagePath) %>% summarise(total_traffic = sum(uniquePageviews)) %>% 162 | top_n(total_pages_included) 163 | 164 | # The first little bit of magic can now occur. We'll run our normalize_date_start function on 165 | # each value in our list of pages and get a data frame back that has our time-normalized 166 | # traffic by page! 167 | ga_data_normalized <- map_dfr(pages_list$pagePath, normalize_date_start) 168 | 169 | # We specified earlier -- in the `days_live_range` object -- how many "days from launch" we 170 | # actually want to include, so let's do one final round of filtering to only include those 171 | # rows. 
172 | ga_data_normalized <- ga_data_normalized %>% filter(days_live <= days_live_range) 173 | 174 | # Check out the result of our handiwork 175 | kable(head(ga_data_normalized)) %>% kable_styling() 176 | 177 | ``` 178 | 179 | # Data Visualization 180 | 181 | We're going to do two visualizations here: 182 | 183 | * **Unique Pageviews by Day from Launch** -- typically, this will show a big jump for the first day or two, and then a precipitous dropoff afterwards. If that's not the case, well, good on ya'! That means you're getting sustained value from your content marketing! 184 | * **Cumulative Unique Pageviews by Day from Launch** -- this will be an easier chart to read, typically, and you can simply look from top to bottom to see which pages have generated the most traffic over time (in the first X days since they launched) 185 | 186 | **IMPORTANT:** There may be pages that actually launched before the start of the data pulled. Those pages are going to wind up with the first day in the overall data set treated as their "Day 1," so they likely won't show that initial spike (because it occurred so long ago that it's not included in the data). 187 | 188 | Because these will be somewhat messy line charts, we're also going to use the `plotly` package to make them interactive to that the user can mouse over a line and find out exactly what page it is. 189 | 190 | ## Plot Unique Pageviews by Day from Launch 191 | 192 | This is the visualization that simply plots *uniquePageviews* by day. It can be a little messy to digest (but it can also be eye-opening as to how quickly interest in a particular piece of content drops off). 193 | 194 | ```{r visualization_1, message=FALSE, warning=FALSE, fig.width=10, fig.height=5} 195 | 196 | # Create the plot 197 | gg <- ggplot(ga_data_normalized, mapping=aes(x = days_live, y = uniquePageviews, color=page)) + 198 | geom_line() + # The main "plot" operation 199 | scale_y_continuous(labels=comma) + # Include commas in the y-axis numbers 200 | labs(title = "Unique Pageviews by Day from Launch", 201 | x = "# of Days Since Page Launched", 202 | y = "Unique Pageviews") + 203 | theme_light() + # Clean up the visualization a bit 204 | theme(panel.grid = element_blank(), 205 | panel.border = element_blank(), 206 | legend.position = "none", 207 | panel.grid.major.y = element_line(color = "gray80"), 208 | axis.ticks = element_blank()) 209 | 210 | # Output the plot. We're wrapping it in ggplotly so we will get some interactivity in the plot. 211 | ggplotly(gg) 212 | 213 | ``` 214 | 215 | ## Plot Cumulative Unique Pageviews by Day from Launch 216 | 217 | This is the visualization that looks at the _cumulative_ *unique pageviews* for the first X days following the launch (or the first X days of the total evaluation period if the page launched before the start of the evaluation period). 
218 | 219 | ```{r visualization_2, message=FALSE, warning=FALSE, fig.width=10, fig.height=5} 220 | 221 | # Create the plot 222 | gg <- ggplot(ga_data_normalized, mapping=aes(x = days_live, y = cumulative_uniquePageviews, color=page)) + 223 | geom_line() + # The main "plot" operation 224 | scale_y_continuous(labels=comma) + # Include commas in the y-axis numbers 225 | labs(title = "Cumulative Unique Pageviews by Day from Launch", 226 | x = "# of Days Since Page Launched", 227 | y = "Cumulative Unique Pageviews") + 228 | theme_light() + # Clean up the visualization a bit 229 | theme(panel.grid = element_blank(), 230 | panel.border = element_blank(), 231 | legend.position = "none", 232 | panel.grid.major.y = element_line(color = "gray80"), 233 | axis.ticks = element_blank()) 234 | 235 | # Output the plot. We're wrapping it in ggplotly so we will get some interactivity in the plot. 236 | 237 | ggplotly(gg) 238 | 239 | ``` -------------------------------------------------------------------------------- /twitter-followers/app.R: -------------------------------------------------------------------------------- 1 | # THIS EXAMPLE DOES NOT CURRENTLY WORK. IT WILL BE UPDATED AS TIME ALLOWS, ALTHOUGH 2 | # IT IS ALWAYS GOING TO BE SOMEWHAT HAMPERED BY LIMITATIONS OF USING THE FREE 3 | # TWITTER API. 4 | 5 | # Load the necessary libraries. 6 | library(shiny) 7 | 8 | # Ideally, we would use rtweet, but rtweet does not play nice with Shiny, so twitteR it is! 9 | library(twitteR) # How we actually get the Twitter data 10 | library(tidyverse) # Includes dplyr, ggplot2, and others; very key! 11 | library(gridExtra) # Grid / side-by-side plot layoutslibrary 12 | library(knitr) # Nicer looking tables 13 | library(DT) # Interactive tables 14 | library(tidytext) # Tidy text! 15 | library(SnowballC) # Mainly for stemming the followers terms 16 | library(wordcloud) # Word cloud creation 17 | library(RColorBrewer) # Get some palettes to use with the word cloud 18 | library(topicmodels) # For the topic modeling using LDA 19 | 20 | 21 | ## ui.R 22 | ui <- fluidPage(title = "Twitter Follower Analysis", 23 | tags$head(includeScript("gtm.js")), 24 | tags$h2("Twitter Follower Analysis*"), 25 | tags$div("Enter a username and then text mine their followers' descriptions!"), 26 | tags$br(), 27 | sidebarLayout( 28 | sidebarPanel(tags$h4("Select Base Data Parameters"), 29 | # Get the username 30 | textInput("tw_account", 31 | label = "Enter a Twitter username to analyze:", 32 | value = "analyticshour"), 33 | # Action button. We want the user to control when the 34 | # underlying call to Twitter occurs. 35 | tags$div(style="text-align: center", 36 | actionButton("query_data", "Get/Refresh Data!", 37 | style="color: #fff; background-color: #337ab7; border-color: #2e6da4")), 38 | 39 | tags$hr(), 40 | tags$h4("Refine the Results"), 41 | 42 | # Stopword language selection. 
Allows selecting multiple languages.
43 |       selectInput("stopwords_lang",
44 |                   label = "Language(s) for stopwords:",
45 |                   choices = c("Danish" = "da",
46 |                               "English" = "en",
47 |                               "French" = "fr",
48 |                               "German" = "de",
49 |                               "Hungarian" = "hu",
50 |                               "Spanish" = "es"),
51 |                   multiple = TRUE,
52 |                   selected = "en"),
53 | 
54 |       # Additional words to exclude from the word clouds
55 |       textInput("exclude_words",
56 |                 label = paste("Additional words to exclude (from the term-frequency",
57 |                               "table, word cloud, and topic analysis):"),
58 |                 value = "")
59 |     ),
60 | 
61 |     mainPanel(tabsetPanel(type = "tabs",
62 |       tabPanel("Raw Data",
63 |                tags$br(),
64 |                tags$div("This is the raw data returned from Twitter."),
65 |                tags$br(),
66 |                dataTableOutput("followers_details")),
67 |       tabPanel("Term-Frequency Table",
68 |                tags$br(),
69 |                tags$div("These are the results after all descriptions were split into",
70 |                         "individual words, the words were stemmed and combined,",
71 |                         "stopwords were removed, and any additional exclusion words",
72 |                         "were removed. These are the words used to create the word cloud."),
73 |                tags$br(),
74 |                dataTableOutput("term_frequency")),
75 |       tabPanel("Overall Word Cloud",
76 |                tags$br(),
77 |                tags$div("The word cloud based on the cleaned up data set."),
78 |                tags$br(),
79 |                fluidRow(
80 |                  # Controls for the overall word cloud
81 |                  # The minimum word frequency to include in the wordcloud
82 |                  column(4, sliderInput("min_frequency",
83 |                                        label = "Minimum # of occurrences to include:",
84 |                                        min = 1, max = 50, value = 2, step = 1)),
85 |                  column(4, sliderInput("overall_min_size",
86 |                                        label = "Minimum word size:",
87 |                                        min = 1, max = 8, value = 2, step = 0.5)),
88 |                  column(4, sliderInput("overall_max_size",
89 |                                        label = "Maximum word size:",
90 |                                        min = 1, max = 8, value = 5.5, step = 0.5))),
91 |                fluidRow(plotOutput("wordcloud", height = "800px"))),
92 |       tabPanel("Topics Word Clouds",
93 |                tags$br(),
94 |                tags$div("This is a little bit of unsupervised text mining using Latent Dirichlet",
95 |                         "Allocation (LDA). Based on the number of topics you select below, the LDA",
96 |                         "model establishes a set of topics and then assigns a probability that each",
97 |                         "word in the data set would appear in that topic. The word clouds",
98 |                         "show the terms that had the highest probability of showing up in each topic",
99 |                         "(and exclude terms that had less than a 0.1% chance of appearing in any topic)."),
100 |                tags$br(),
101 |                fluidRow(
102 |                  column(2, sliderInput("num_topics",
103 |                                        label = "# of topics to find:",
104 |                                        min = 2, max = 6, value = 2, step = 1)),
105 |                  column(3, sliderInput("term_topic_probability",
106 |                                        label = "Strength of term fit:",
107 |                                        min = 0.001, max = 0.01, value = 0.001, step = 0.001)),
108 |                  column(3, sliderInput("topics_min_frequency",
109 |                                        label = "Minimum # of occurrences to include:",
110 |                                        min = 1, max = 50, value = 2, step = 1)),
111 |                  column(2, sliderInput("topics_min_size",
112 |                                        label = "Min. word size:",
113 |                                        min = 1, max = 8, value = 1, step = 0.5)),
114 |                  column(2, sliderInput("topics_max_size",
115 |                                        label = "Max word size:",
116 |                                        min = 1, max = 8, value = 3.5, step = 0.5))),
117 |                fluidRow(plotOutput("topic_wordclouds", height = "600px"))
118 |     )))),
119 |     tags$hr(),
120 |     tags$div("*This app is part of a larger set of apps that demonstrate some uses of R in conjunction",
121 |              "with Google Analytics (and Twitter). For the code for this app, as well as an R Notebook",
122 |              "that includes more details, see:", tags$a(href = "https://github.com/SDITools/ga-and-r-examples/",
123 |              "https://github.com/SDITools/ga-and-r-examples/"),"."),
124 |     tags$br()
125 | )
126 | 
127 | 
128 | ## server.R
129 | server <- function(input, output, session){
130 | 
131 |   # Authenticate. Values need to be added here!
132 |   setup_twitter_oauth(consumer_key = "",
133 |                       consumer_secret = "",
134 |                       access_token = "",
135 |                       access_secret = "")
136 | 
137 |   # Reactive function to pull the data.
138 |   get_followers_details <- reactive({
139 | 
140 |     # Only pull the data if the "Get Data" button is clicked
141 |     input$query_data
142 | 
143 |     # Get a list of all followers
144 |     user <- isolate(getUser(input$tw_account))
145 | 
146 |     # Get the details for those followers
147 |     followers_details <- isolate(user$getFollowers(n=100) %>% twListToDF())
148 | 
149 |   })
150 | 
151 |   # Reactive function to get the cleaned up description data
152 |   get_description_data_clean <- reactive({
153 | 
154 |     # Get the raw data
155 |     followers_details <- get_followers_details()
156 | 
157 |     # cat(names(followers_details, "/n"))
158 | 
159 |     # Unnest it -- put each word on its own row and then collapse the individual
160 |     # words. This will also make everything lowercase and strip punctuation!
161 |     description_data_clean <- followers_details %>% 
162 |       unnest_tokens(description, description) %>% 
163 |       group_by(description) %>%               # Unlike site search, each unnested row is a single occurrence...
164 |       summarise(occurrences = n()) %>%        # ...so the occurrence count is just the number of rows per term
165 |       ungroup() %>% 
166 |       arrange(-occurrences)
167 | 
168 |     # Remove the stop words. 1) get the stopwords, 2) remove 'em. There may be
169 |     # multiple languages of stopwords selected, so looping through those.
170 |     if(length(input$stopwords_lang) > 0){
171 |       for(lang in input$stopwords_lang){
172 |         # Get the stopwords for the language
173 |         stop_words <- get_stopwords(language = lang) %>% dplyr::select(word)
174 |         description_data_clean <- description_data_clean %>% 
175 |           anti_join(stop_words, by = c(description = "word"))
176 |       }
177 |     }
178 | 
179 |     # Convert UTF-8 to ASCII (needed because all hell starts to break loose if you
180 |     # try to text-mine multibyte). So, we're going to try to convert everything to
181 |     # ASCII. For some...this will fail and return NA. So, we'll then just remove
182 |     # the NA rows
183 |     description_data_clean <- description_data_clean %>% 
184 |       mutate(description = iconv(description, "UTF-8", "ASCII")) %>% 
185 |       filter(!is.na(description))
186 | 
187 |     # Perform stemming.
188 |     description_data_clean <- description_data_clean %>% 
189 |       mutate(description_stem = wordStem(description))
190 | 
191 |     # Go ahead and find the most popular un-stemmed word for each stemmed word.
192 |     # That will make the results look more "normal" to the casual viewer. We don't want
193 |     # to have any ties, so we're going to somewhat arbitrarily break any ties by adding
194 |     # the row number / 1000000 to each of the description counts first (We'll toss this later)
195 |     description_data_clean_top_term <- description_data_clean %>% 
196 |       mutate(occurrences = occurrences + row_number()/1000000) %>% 
197 |       group_by(description_stem) %>% 
198 |       top_n(1, occurrences) %>% 
199 |       dplyr::select(-occurrences)
200 | 
201 |     # Join that back to description data after totalling the occurrences by the stemmed term.
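    # Illustrative sketch of what the surrounding steps produce (hypothetical values,
    # not executed code): if the cleaned-up data contained "running" x 5, "runs" x 2, and
    # "run" x 1, wordStem() gives all three the same stem ("run"); the top_n(1, occurrences)
    # step above keeps "running" as the display form for that stem, and the
    # summarise()/left_join() just below rolls the three rows up into a single row:
    # description_stem = "run", description = "running", occurrences = 8.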
202 | description_data_clean <- description_data_clean %>% 203 | group_by(description_stem) %>% 204 | summarise(occurrences = sum(occurrences)) %>% 205 | left_join(description_data_clean_top_term) %>% 206 | ungroup() %>% 207 | dplyr::select(description_stem, description, occurrences) %>% 208 | arrange(-occurrences) 209 | 210 | # Convert the list of additional exclusion words to a vector. There may or may not be 211 | # spaces after the commas separating the terms. 212 | # Remove any of the exclusion terms that are entered. 213 | if(!is.null(input$exclude_words)){ 214 | # Take the comma-delimited list of terms and split them out to be a 215 | # character vector. The ", ?" regEx is so that this will work with 216 | # or without a space following the comma 217 | exclude_words <- unlist(strsplit(input$exclude_words,", ?")) 218 | 219 | # Remove any additional "remove words" specified 220 | description_data_clean <- description_data_clean %>% 221 | filter(!description %in% exclude_words) 222 | } 223 | }) 224 | 225 | # Reactive function to do the LDA topic modeling 226 | get_description_topics_and_terms <- reactive({ 227 | 228 | # Get the cleaned up description data 229 | description_data_clean <- get_description_data_clean() 230 | 231 | # Cast the term frequency matrix into a document term matrix. We're considering this all one 232 | # "document" so we're just hardcoding a "1" for that 233 | description_data_dtm <- description_data_clean %>% 234 | mutate(doc = 1) %>% 235 | cast_dtm(doc, description, occurrences) 236 | 237 | # Run LDA. Setting a seed for reproducibility 238 | description_lda <- LDA(description_data_dtm, k = input$num_topics, control = list(seed = 1120)) 239 | 240 | # Assign a probability of each term being in each of the topics 241 | description_topics <- tidy(description_lda, matrix = "beta") 242 | 243 | # For each term, assign it to the topic for which it has the highest beta. This diverges 244 | # from the approach described at tidytextmining.com, but it seems like a reasonably legit 245 | # thing to do. 246 | description_topics_and_terms <- description_topics %>% 247 | group_by(term) %>% 248 | top_n(1, beta) %>% 249 | ungroup() %>% 250 | arrange(topic, -beta) %>% 251 | left_join(description_data_clean, by = c(term = "description")) 252 | }) 253 | 254 | # Output the raw data 255 | output$followers_details <- DT::renderDataTable({ 256 | get_followers_details() %>% 257 | dplyr::select(screenName, description, followersCount) %>% 258 | arrange(-followersCount) %>% 259 | datatable(colnames = c("User","Description", "Followers"), rownames = FALSE) 260 | }) 261 | 262 | 263 | # Output the term-frequency table 264 | output$term_frequency <- DT::renderDataTable({ 265 | get_description_data_clean() %>% 266 | dplyr::select(description, occurrences) %>% 267 | datatable(colnames = c("Description Term", "Occurrences"), 268 | rownames = FALSE) 269 | }) 270 | 271 | # Set a seed for reproducibility 272 | set.seed(1971) 273 | 274 | # Set a color palette 275 | color_palette <- rev(brewer.pal(8,"Spectral")) 276 | 277 | # Output the wordcloud 278 | output$wordcloud <- renderPlot({ 279 | 280 | # Get the description data 281 | description_data_clean <- get_description_data_clean() 282 | 283 | # Generate the word cloud! 
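    # A note on the wordcloud() arguments below: "scale" expects c(largest, smallest) text
    # sizes, which is why the max-size slider is passed first; "min.freq" drops any term that
    # occurs fewer times than the chosen threshold; "random.order = FALSE" plots terms in
    # decreasing frequency so the most common words land near the center; and "rot.per = 0"
    # keeps every word horizontal.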
284 | wordcloud(words = description_data_clean$description, 285 | freq = description_data_clean$occurrences, 286 | scale = c(input$overall_max_size, input$overall_min_size), 287 | min.freq = input$min_frequency, 288 | max.words = 500, 289 | random.order = FALSE, 290 | rot.per = .0, 291 | colors = color_palette) 292 | }) 293 | 294 | # This gets a little janky, in that we can't really do recursive / variable 295 | # topic counts. So, instead, we're going to have an output for each of FIVE 296 | # topics, but then have those return empty results if fewer topics are actually 297 | # selected. There *may* be some inefficiencies here, but I couldn't get anything 298 | # moved out of this to avoid the repetition. 299 | 300 | # Topic #1 301 | wordcloud_1 <- reactive({ 302 | # Populate description topics and terms. 303 | topic_data <- get_description_topics_and_terms() %>% 304 | filter(topic == 1 & beta > input$term_topic_probability) 305 | # Generate the word cloud! 306 | wordcloud(words = topic_data$term, freq = topic_data$occurrences, 307 | scale=c(input$topics_max_size,input$topics_min_size), 308 | min.freq=input$topics_min_frequency, max.words=500, 309 | random.order=FALSE, rot.per=.0, colors=color_palette) 310 | }) 311 | 312 | # Topic #2 313 | wordcloud_2 <- reactive({ 314 | # Populate description topics and terms. 315 | topic_data <- get_description_topics_and_terms() %>% 316 | filter(topic == 2 & beta > input$term_topic_probability) 317 | # Generate the word cloud! 318 | wordcloud(words = topic_data$term, freq = topic_data$occurrences, 319 | scale=c(input$topics_max_size,input$topics_min_size), 320 | min.freq=input$topics_min_frequency, max.words=500, 321 | random.order=FALSE, rot.per=.0, colors=color_palette) 322 | }) 323 | 324 | # For 3-6, the slider might be set below them, so we have to check to see before trying 325 | # to generate a word cloud 326 | 327 | # Topic #3 328 | wordcloud_3 <- reactive({ 329 | if(input$num_topics >= 3){ 330 | # Populate description topics and terms. 331 | topic_data <- get_description_topics_and_terms() %>% 332 | filter(topic == 3 & beta > input$term_topic_probability) 333 | # Generate the word cloud! 334 | wordcloud(words = topic_data$term, freq = topic_data$occurrences, 335 | scale=c(input$topics_max_size,input$topics_min_size), 336 | min.freq=input$topics_min_frequency, max.words=500, 337 | random.order=FALSE, rot.per=.0, colors=color_palette) 338 | } else { 339 | NULL 340 | } 341 | }) 342 | 343 | # Topic #4 344 | wordcloud_4 <- reactive({ 345 | if(input$num_topics >= 4){ 346 | # Populate description topics and terms. 347 | topic_data <- get_description_topics_and_terms() %>% 348 | filter(topic == 4 & beta > input$term_topic_probability) 349 | # Generate the word cloud! 350 | wordcloud(words = topic_data$term, freq = topic_data$occurrences, 351 | scale=c(input$topics_max_size,input$topics_min_size), 352 | min.freq=input$topics_min_frequency, max.words=500, 353 | random.order=FALSE, rot.per=.0, colors=color_palette) 354 | } else { 355 | NULL 356 | } 357 | }) 358 | 359 | # Topic #5 360 | wordcloud_5 <- reactive({ 361 | if(input$num_topics >= 5){ 362 | # Populate description topics and terms. 363 | topic_data <- get_description_topics_and_terms() %>% 364 | filter(topic == 5 & beta > input$term_topic_probability) 365 | # Generate the word cloud! 
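    # (Like the other topic word cloud reactives, this draws with base graphics onto the
    # currently active device; renderPlot() further down calls these reactives one after
    # another following par(mfrow = c(2, 3)), so each non-NULL cloud fills the next cell of
    # the 2 x 3 grid, and topics beyond the selected count simply leave their cells empty.)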
366 | wordcloud(words = topic_data$term, freq = topic_data$occurrences, 367 | scale=c(input$topics_max_size,input$topics_min_size), 368 | min.freq=input$topics_min_frequency, max.words=500, 369 | random.order=FALSE, rot.per=.0, colors=color_palette) 370 | } else { 371 | NULL 372 | } 373 | }) 374 | 375 | # Topic #6 376 | wordcloud_6 <- reactive({ 377 | if(input$num_topics >= 6){ 378 | # Populate description topics and terms. 379 | topic_data <- get_description_topics_and_terms() %>% 380 | filter(topic == 6 & beta > input$term_topic_probability) 381 | # Generate the word cloud! 382 | wordcloud(words = topic_data$term, freq = topic_data$occurrences, 383 | scale=c(input$topics_max_size,input$topics_min_size), 384 | min.freq=input$topics_min_frequency, max.words=500, 385 | random.order=FALSE, rot.per=.0, colors=color_palette) 386 | } else { 387 | NULL 388 | } 389 | }) 390 | 391 | # Output the grid of wordclouds 392 | output$topic_wordclouds <- renderPlot({ 393 | # Layout out a 2x3 grid with all of the wordclouds and return that as 394 | # the plot. 395 | par(mfrow=c(2,3)) # for 1 row, 2 cols 396 | wordcloud_1() 397 | wordcloud_2() 398 | wordcloud_3() 399 | wordcloud_4() 400 | wordcloud_5() 401 | wordcloud_6() 402 | }) 403 | } 404 | 405 | # shinyApp(gar_shiny_ui(ui, login_ui = gar_shiny_login_ui), server) 406 | shinyApp(ui = ui, server = server) 407 | -------------------------------------------------------------------------------- /twitter-followers/twitter-followers.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Twitter Follower Mining" 3 | output: html_document 4 | --- 5 | 6 | ### General Note 7 | 8 | This example is part of a larger set of examples of using Google Analytics with R...even though this example uses Twitter. For additional examples, downloadable code, and explanations of the overall effort, see: https://github.com/SDITools/ga-and-r-examples. 9 | ### Overview 10 | 11 | 12 | This example pulls all of the followers for a given user and then does some text-mining on their descriptions: 13 | 14 | * Cleans up the descriptions by unnesting terms, removing stopwords, pushing everything to lowercase, and performing stemming 15 | * Generates a term-frequency matrix and a word cloud 16 | * Breaks out the searches into "topics" using LDA (https://www.tidytextmining.com/topicmodeling.html) 17 | 18 | ## Setup/Config 19 | 20 | ```{r config} 21 | 22 | # Set the base account 23 | tw_account <- "analyticshour" 24 | 25 | # Load the necessary libraries. 26 | if (!require("pacman")) install.packages("pacman") 27 | pacman::p_load(rtweet, # How we actually get the Twitter data 28 | tidyverse, # Includes dplyr, ggplot2, and others; very key! 29 | knitr, # Nicer looking tables 30 | tidytext, # Tidy text! 31 | SnowballC, # Mainly for stemming the search terms 32 | DT, # Make a nice data table 33 | wordcloud, # Word cloud creation 34 | RColorBrewer, # Get some palettes to use with the word cloud 35 | topicmodels) # For the topic modeling using LDA 36 | 37 | 38 | # Minimum # of occurrences for a term to include in the wordcloud 39 | min_frequency <- 2 40 | 41 | # Set the number of topics to include in the topic model 42 | num_topics <- 4 43 | 44 | # Set the stopwords language 45 | stopwords_lang <- "en" 46 | 47 | # Words to exclude (because they're too dominant to be interesting). 
This will exclude 48 | # them from the main word cloud AND from the topic modeling 49 | exclude_words <- c("https") 50 | 51 | # Label for what this is 52 | main_label <- paste0("Followers of @",tw_account) 53 | 54 | # Get app credentials 55 | 56 | # Name assigned to created app. You'll need to set these up in the Twitter 57 | # developer console and then either hardcode them here or put them in your .Renviron file 58 | tw_appname <- Sys.getenv("TWITTER_APPNAME") 59 | 60 | # Key and Secret 61 | tw_key <- Sys.getenv("TWITTER_KEY") 62 | tw_secret <- Sys.getenv("TWITTER_SECRET") 63 | tw_access_token = Sys.getenv("TWITTER_ACCESS_TOKEN") 64 | tw_access_secret = Sys.getenv("TWITTER_ACCESS_SECRET") 65 | 66 | 67 | # Create the token. 68 | tw_token <- create_token( 69 | app = tw_appname, 70 | consumer_key = tw_key, 71 | consumer_secret = tw_secret, 72 | access_token = tw_access_token, 73 | access_secret = tw_access_secret) 74 | 75 | ``` 76 | 77 | ## Get the Data and Clean It Up 78 | 79 | ```{r get_data, message=FALSE, warning=FALSE} 80 | 81 | # Get a list of all followers 82 | user_followers <- get_followers(tw_account, n=10000, token = tw_token) 83 | 84 | # # Split that vector up into a list with 15,000 users per (needed to get the details) 85 | # user_followers_split <- split(user_followers, rep(1:ceiling(nrow(user_followers)/15000), each=15000, 86 | # length.out = nrow(user_followers))) 87 | 88 | # Get the user details for all of those users 89 | followers_details <- lookup_users(user_followers$user_id, parse = TRUE, token = tw_token) 90 | 91 | # Output how many users are being analyzed 92 | cat(paste0("The following assessment covers the ", nrow(followers_details), " ", tolower(main_label), ".")) 93 | 94 | # Unnest it -- put each word on its own row and then collapse the individual 95 | # words. This will also make everything lowercase and strip punctuation! 96 | followers_data <- followers_details %>% 97 | unnest_tokens(description_term, description) %>% 98 | group_by(description_term) %>% 99 | summarise(occurrences = n()) %>% 100 | select(description_term, occurrences) %>% 101 | ungroup() %>% 102 | arrange(-occurrences) 103 | 104 | # Remove the stop words. 1) get the stopwords, 2) remove 'em 105 | stop_words <- get_stopwords(language = stopwords_lang) %>% 106 | select(word) 107 | 108 | followers_data <- followers_data %>% 109 | anti_join(stop_words, by = c(description_term = "word")) 110 | 111 | # Convert UTF-8 to ASCII (needed because all hell starts to break loose if you 112 | # try to text-mine multibyte). So, we're going to try to convert everything to 113 | # ASCII. For some...this will fail and return NA. So, we'll then just remove 114 | # the NA rows 115 | followers_data <- followers_data %>% 116 | mutate(description_term = iconv(description_term, "UTF-8", "ASCII")) %>% 117 | filter(!is.na(description_term)) 118 | 119 | # Perform stemming. 120 | followers_data <- followers_data %>% 121 | mutate(description_term_stem = wordStem(description_term)) 122 | 123 | # Go ahead and find the most popular un-stemmed word for each stemmed word. 124 | # That will make the results look more "normal" to the casual viewer. 
We don't want 125 | # to have any ties, so we're going to somewhat arbitrarily break any ties by adding 126 | # the row number / 1000000 to each of the search counts first (We'll toss this later) 127 | followers_data_top_term <- followers_data %>% 128 | mutate(occurrences = occurrences + row_number()/1000000) %>% 129 | group_by(description_term_stem) %>% 130 | top_n(1, occurrences) %>% 131 | select(-occurrences) 132 | 133 | # Join that back to search data after totalling the occurrences by the stemmed term. 134 | followers_data <- followers_data %>% 135 | group_by(description_term_stem) %>% 136 | summarise(occurrences = sum(occurrences)) %>% 137 | left_join(followers_data_top_term) %>% 138 | ungroup() %>% 139 | select(description_term_stem, description_term, occurrences) %>% 140 | arrange(-occurrences) 141 | 142 | # Remove any additional "remove words" specified 143 | followers_data <- followers_data %>% 144 | filter(!description_term_stem %in% exclude_words) 145 | 146 | # Get rid of the "top term" data frame 147 | rm(followers_data_top_term) 148 | 149 | ``` 150 | 151 | ## Show the Original Descriptions 152 | 153 | ```{r datatable_raw, message=FALSE, warning=FALSE} 154 | 155 | followers_details %>% 156 | arrange(-followers_count) %>% 157 | select(user_id, description, followers_count) %>% 158 | datatable(colnames = c("Username", "Description", "# of Followers"), rownames = FALSE) 159 | 160 | ``` 161 | 162 | 163 | ## Make a Term-Frequency Matrix 164 | 165 | This looks similar to the report in Google Analytics, but it's been processed to be the individual words, stemmed, stopwords removed, etc. 166 | 167 | ```{r datatable_clean, message=FALSE, warning=FALSE} 168 | 169 | select(followers_data, description_term, occurrences) %>% 170 | datatable(colnames = c("Description Term", "Occurrences"), 171 | rownames = FALSE) 172 | 173 | ``` 174 | 175 | 176 | ## Create a Word Cloud 177 | 178 | A wordcloud based on the cleaned up and unnested words. 179 | 180 | ```{r wordcloud, message=FALSE, warning=FALSE} 181 | 182 | # Set a seed for reproducibility 183 | set.seed(1971) 184 | 185 | # Set a color palette 186 | color_palette <- rev(brewer.pal(8,"Spectral")) 187 | 188 | # Generate the word cloud! 189 | wordcloud(words = followers_data$description_term, 190 | freq = followers_data$occurrences, 191 | scale=c(5.5,0.6), 192 | min.freq=min_frequency, 193 | max.words=500, 194 | random.order=FALSE, 195 | rot.per=.0, 196 | colors=color_palette) 197 | 198 | ``` 199 | 200 | ## Look for Topics! 201 | 202 | We're going to use Latent Dirichlet allocation (LDA) to try to break out these words into topics. This is basically just following the process outlined for LDA at: https://www.tidytextmining.com/topicmodeling.html. 203 | 204 | ```{r lda, message=FALSE, warning=FALSE} 205 | 206 | # Cast the term frequency matrix into a document term matrix. We're considering this all one 207 | # "document" so we're just hardcoding a "1" for that 208 | followers_data_dtm <- followers_data %>% 209 | mutate(doc = 1) %>% 210 | cast_dtm(doc, description_term, occurrences) 211 | 212 | # Run LDA. Setting a seed for reproducibility 213 | search_lda <- LDA(followers_data_dtm, k = num_topics, control = list(seed = 1120)) 214 | 215 | # Assign a probability of each term being in each of the topics 216 | search_topics <- tidy(search_lda, matrix = "beta") 217 | 218 | # For each term, assign it to the topic for which it has the highest beta. 
This diverges 219 | # from the approach described at tidytextmining.com, but it seems like a reasonably legit 220 | # thing to do. 221 | search_topics_and_terms <- search_topics %>% 222 | group_by(term) %>% 223 | top_n(1, beta) %>% 224 | ungroup() %>% 225 | arrange(topic, -beta) %>% 226 | left_join(followers_data, by = c(term = "description_term")) 227 | 228 | 229 | # Function to generate a word cloud based on the topic ID passed in 230 | generate_topic_wordcloud <- function(topic_id){ 231 | 232 | # Filter the data to be just the topic and to 233 | # knock out terms with a reallllly low beta 234 | topic_data <- search_topics_and_terms %>% 235 | filter(topic == topic_id & 236 | beta > 0.001) 237 | 238 | # Generate the word cloud! 239 | wordcloud(words = topic_data$term, 240 | freq = topic_data$occurrences, 241 | scale=c(3.5,1), 242 | min.freq=min_frequency, 243 | max.words=500, 244 | random.order=FALSE, 245 | rot.per=.0, 246 | colors=color_palette) 247 | } 248 | 249 | # Call the function for each topic ID 250 | topic_wordclouds <- map(seq(1:num_topics), generate_topic_wordcloud) 251 | 252 | 253 | ``` 254 | 255 | --------------------------------------------------------------------------------