├── syllabus
    └── ECBS5210 dv3 in 2024-2025pdf
├── 2-interactive-plots.Rmd
├── homework.Rmd
├── README.md
├── 1.R
└── 2.R


/syllabus/ECBS5210 dv3 in 2024-2025pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daroczig/CEU-DV2/HEAD/syllabus/ECBS5210 dv3 in 2024-2025pdf


--------------------------------------------------------------------------------
/2-interactive-plots.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | output: html_document
 3 | runtime: shiny
 4 | ---
 5 | 
 6 | Static ggplot:
 7 | 
 8 | ```{r}
 9 | library(ggplot2)
10 | (p <- ggplot(mtcars, aes(wt, qsec, color = factor(am))) + geom_point())
11 | ```
12 | 
13 | Plotly:
14 | 
15 | ```{r}
16 | library(plotly)
17 | ggplotly(p)
18 | ```
19 | 
20 | ggiraph labels:
21 | 
22 | ```{r}
23 | library(ggiraph)
24 | p <- ggplot(mtcars, aes(
25 |   x = wt,
26 |   y = qsec,
27 |   color = factor(am),
28 |   ## NOTE this newly added field
29 |   data_id = factor(gear),
30 |   tooltip = rownames(mtcars))) +
31 |   geom_point_interactive()
32 | girafe(ggobj = p, options = list(opts_hover(css = "fill:black;")))
33 | ```
34 | 
35 | Let's add a selector for `hp`:
36 | 
37 | ```{r}
38 | library(shiny)
39 | sliderInput( # or numericInput
40 |   'minhp', h4('Minimum horsepower'), 
41 |   min = min(mtcars$hp), max = max(mtcars$hp), 
42 |   value = 100)
43 | renderPlot({
44 |   ggplot(subset(mtcars, hp > input$minhp),
45 |          aes(wt, qsec, color = factor(am))) + 
46 |     geom_point()
47 | })
48 | ```


--------------------------------------------------------------------------------
/homework.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "DV3 homework #1"
  3 | author: "Gergely Daroczi, CEU"
  4 | date: "`r Sys.Date()`"
  5 | output:
  6 |   html_document:
  7 |     theme: journal
  8 | ---
  9 | 
 10 | Using the `journal` R Markdown theme 😎
 11 | 
 12 | > No need to replicate the paragraphs with a vertical line on the left (like this paragraph), as these are instructions/hints.
 13 | 
 14 | > Note that commands are NOT echoed in this example document, but you should always echo the R commands and keep all warnings etc.
 15 | 
 16 | ```{r, echo=FALSE}
 17 | knitr::opts_chunk$set(echo=FALSE)
 18 | ```
 19 | 
 20 | Load the regions and zones collected by the [Spare Cores](https://sparecores.com) project from the below URLs:
 21 | 
 22 | - https://keeper.sparecores.net/table/region
 23 | - https://keeper.sparecores.net/table/zone
 24 | 
 25 | > You can use the `fromJSON` function from the `jsonlite` package to parse these JSON files. Make sure to convert to `data.table` for easier filtering and aggregation later.
 26 | 
 27 | ```{r}
 28 | library(jsonlite)
 29 | library(data.table)
 30 | regions <- data.table(fromJSON("https://keeper.sparecores.net/table/region"))
 31 | zones <- data.table(fromJSON("https://keeper.sparecores.net/table/zone"))
 32 | ```
 33 | 
 34 | The loaded data look like:
 35 | 
 36 | ```{r, echo=TRUE}
 37 | str(regions)
 38 | str(zones)
 39 | ```
 40 | 
 41 | Let's count the number of regions per country, shown in desceding order:
 42 | 
 43 | ```{r}
 44 | regions[, .N, by = country_id][order(-N)]
 45 | ```
 46 | 
 47 | > You can pass any `data.frame` or similar tabular data to `pander::pander` to render as a HTML table instead of raw R console output.
 48 | 
 49 | A nicer table:
 50 | 
 51 | ```{r}
 52 | library(pander)
 53 | pander(regions[, .N, by = country_id][order(-N)])
 54 | ```
 55 | 
 56 | Let's show the distribution of the founding year of the regions:
 57 | 
 58 | > Make sure to replicate the axis titles, legend position, theme etc in the below and all future plots! Try to also replicate the tiny details of the plot as well, like axis labels, and grid design.
 59 | 
 60 | ```{r}
 61 | library(ggplot2)
 62 | ggplot(regions[!is.na(founding_year)], aes(founding_year)) + geom_bar() +
 63 |   xlab("") + ylab("") +
 64 |   ggtitle("Number of datacenters by founding year",
 65 |           subtitle = "Note that regions with unknown founding year were excluded from the plot.") +
 66 |   theme_bw() +
 67 |   scale_x_continuous(breaks=min(regions$founding_year, na.rm=TRUE):max(regions$founding_year, na.rm=TRUE)) +
 68 |   theme(panel.grid.major.x=element_blank(), panel.grid.minor.x=element_blank())
 69 | ```
 70 | 
 71 | Also showing the average founding year on the same plot:
 72 | 
 73 | > Look into `geom_vline`.
 74 | 
 75 | ```{r}
 76 | library(ggplot2)
 77 | ggplot(regions[!is.na(founding_year)], aes(founding_year)) + geom_bar() +
 78 |   xlab("") + ylab("") +
 79 |   ggtitle("Number of datacenters by founding year",
 80 |           subtitle = "Note that regions with unknown founding year were excluded from the plot.") +
 81 |   theme_bw() +
 82 |   scale_x_continuous(breaks=min(regions$founding_year, na.rm=TRUE):max(regions$founding_year, na.rm=TRUE)) +
 83 |   theme(panel.grid.major.x=element_blank(), panel.grid.minor.x=element_blank()) +
 84 |   geom_vline(xintercept = mean(regions$founding_year, na.rm=TRUE), color = "red", linewidth=1.5)
 85 | ```
 86 | 
 87 | 
 88 | Now let's filter for the regions in Europe!
 89 | 
 90 | > For this, you might need to lookup the continent of the provided country ids.
 91 | 
 92 | ```{r}
 93 | library(countrycode)
 94 | regions[, continent := countrycode(country_id, origin = 'iso2c', destination = 'continent'),
 95 |         by = country_id]
 96 | regions <- regions[continent == 'Europe']
 97 | ```
 98 | 
 99 | After filtering, there should be `r regions[, .N]` regions left.
100 | 
101 | > You can add inline R code chunks by using backquotes, followed by `r`, e.g. writing <code>&#96;r 2+2&#96;</code> will return `4`.
102 | 
103 | ```{r}
104 | regions <- merge(regions, zones[, .(zones = .N), by = region_id])
105 | ```
106 | 
107 | Let's count the number of zones per region, and merge it to the `regions` dataset. There are `r regions[zones==3, .N]` regions with 3 zones, and `r regions[zones==1, .N]` regions with a single zone. Showing this visually for each vendor:
108 | 
109 | ```{r}
110 | ggplot(regions, aes(vendor_id, fill = factor(zones))) + geom_bar() +
111 |     theme_bw() + xlab('') + ylab('Number of regions') +
112 |     scale_fill_discrete("Number of availability zones in the region") + theme(legend.position = 'top')
113 | 
114 | ```
115 | 
116 | Now let's load a GeoJSON file on the boundaries of the European countries. You can use [`leakyMirror/map-of-europe`'s `europe.geojson`](https://raw.githubusercontent.com/leakyMirror/map-of-europe/refs/heads/master/GeoJSON/europe.geojson).
117 | 
118 | ```{r}
119 | download.file(
120 |   'https://raw.githubusercontent.com/leakyMirror/map-of-europe/refs/heads/master/GeoJSON/europe.geojson',
121 |   'europe.json')
122 | library(sf)
123 | countries <- st_read('europe.json')
124 | ```
125 | 
126 | Plotting the downloaded shapes:
127 | 
128 | ```{r}
129 | ggplot() + geom_sf(data = countries) + theme_void()
130 | ```
131 | 
132 | Now let's get a background tile for this area!
133 | 
134 | > Note that we used `ggmap`'s `get_stamenmap` in the class, but I suggest switching to `maptiles`'s `get_tiles` (using the same API token), as it returns the tiles in the correct WGS84 projection that is easier to use with the other loaded datasets. You can pass the loaded GeoJSON object to `get_tiles`, use `zoom=4` and the API key. Experiment with the other parameters as well to replicate the below using `geom_spatraster_rgb` from the `tidyterra` package!
135 | 
136 | ```{r}
137 | library(maptiles)
138 | library(tidyterra)
139 | m <- get_tiles(
140 |   countries, crop = TRUE,
141 |   provider="Stadia.StamenTerrainBackground", zoom = 4,
142 |   apikey = "TODO")
143 | ggplot() + geom_spatraster_rgb(data = m) + theme_void()
144 | ```
145 | 
146 | Now let's put together all the loaded layers (background raster map, polygon on country borders, and location of regions weighted by the number of zones, using color to represent if the region is powered by green energy, and use the `shape` to also visualize the vendor)!
147 | 
148 | ```{r}
149 | ggplot() +
150 |   geom_spatraster_rgb(data = m) +
151 |   geom_sf(data = countries,
152 |           color = 'black',
153 |           fill = 'transparent',
154 |           linewidth = .25) +
155 |   geom_point(data = regions, aes(
156 |     lon, lat,
157 |     color = green_energy, size = zones, shape = vendor_id), alpha = 0.5) +
158 |   scale_color_manual(values = c("TRUE" = "darkgreen", "FALSE" = "darkred")) +
159 |   theme_void()
160 | ```
161 | 
162 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | This is the R script repository of the "[Data Visualization 3: Practical Data Visualization with R](https://courses.ceu.edu/courses/2022-2023/data-visualization-2-practical-data-visualization-r)" course in the 2023/2024 Winter term, part of the [MSc in Business Analytics](https://courses.ceu.edu/programs/ms/master-science-business-analytics) at CEU. For the previous editions, see [2019/2020 Spring](https://github.com/daroczig/CEU-DV2/tree/2019-2020), [2020/2021 Winter](https://github.com/daroczig/CEU-DV2/tree/2020-2021), [2021/2022 Winter](https://github.com/daroczig/CEU-DV2/tree/2021-2022), and [2022/2023 Winter](https://github.com/daroczig/CEU-DV2/tree/2022-2023).
  2 | 
  3 | ## Table of Contents
  4 | 
  5 | * [Schedule](https://github.com/daroczig/CEU-DV2#schedule)
  6 | * [Syllabus](https://github.com/daroczig/CEU-DV2#syllabus)
  7 | * [Technical Prerequisites](https://github.com/daroczig/CEU-DV2#technical-prerequisites)
  8 | * [Class Schedule](https://github.com/daroczig/CEU-DV2#class-schedule)
  9 | * [Contact](https://github.com/daroczig/CEU-DV2#contacts)
 10 | 
 11 | ## Schedule
 12 | 
 13 | 3 x 100 mins on Jan 22 and 31:
 14 | 
 15 | * 13:30 - 15:10 session 1
 16 | * 15:10 - 15:40 break
 17 | * 15:40 - 17:20 session 2
 18 | * 17:20 - 17:40 break
 19 | * 17:40 - 19:20 session 3
 20 | 
 21 | ## Location
 22 | 
 23 | In-person at the Vienna campus (QS B-421).
 24 | 
 25 | ## Syllabus
 26 | 
 27 | Please find in the `syllabus` folder of this repository.
 28 | 
 29 | ## Technical Prerequisites
 30 | 
 31 | Please bring your own laptop* and make sure to install the below items **before** attending the first class:
 32 | 
 33 | 0. Join the Teams channel dedicated to the class at `ba-dv3-2024` with the `671u734` team code
 34 | 1. Install `R` from https://cran.r-project.org
 35 | 2. Install `RStudio Desktop` (Open Source License) from https://posit.co/download/rstudio-desktop/
 36 | 3. Register an account at https://github.com
 37 | 4. Enter the following commands in the R console (bottom left panel of RStudio) and make sure you see a plot in the bottom right panel and no errors in the R console:
 38 | 
 39 | ```r
 40 | install.packages(c('ggplot2', 'gganimate', 'transformr', 'gifski'))
 41 | library(ggplot2)
 42 | library(gganimate)
 43 | ggplot(diamonds, aes(cut)) + geom_bar() +
 44 |     transition_states(color, state_length = 0.1)
 45 | ```
 46 | 
 47 | Optional steps I highly suggest to do as well before attending the class if you plan to use `git`:
 48 | 
 49 | 4. Bookmark, watch or star this repository so that you can easily find it later
 50 | 5. Install `git` from https://git-scm.com/
 51 | 6. Verify that in RStudio, you can see the path of the `git` executable binary in the Tools/Global Options menu's "Git/Svn" tab -- if not, then you might have to restart RStudio (if you installed git after starting RStudio) or installed git by not adding that to the PATH on Windows. Either way, browse the "git executable" manually (in some `bin` folder look for thee `git` executable file).
 52 | 7. Create an RSA key (optionally with a passphrase for increased security -- that you have to enter every time you push and pull to and from GitHub). Copy the public key and add that to you SSH keys on your GitHub profile.
 53 | 8. Create a new project choosing "version control", then "git" and paste the SSH version of the repo URL copied from GitHub in the pop-up -- now RStudio should be able to download the repo. If it asks you to accept GitHub's fingerprint, say "Yes".
 54 | 9. If RStudio/git is complaining that you have to set your identity, click on the "Git" tab in the top-right panel, then click on the Gear icon and then "Shell" -- here you can set your username and e-mail address in the command line, so that RStudio/git integration can work. Use the following commands:
 55 | 
 56 |     ```sh
 57 |     $ git config --global user.name "Your Name"
 58 |     $ git config --global user.email "Your e-mail address"
 59 |     ```
 60 |     Close this window, commit, push changes, all set.
 61 | 
 62 | Find more resources in Jenny Bryan's "[Happy Git and GitHub for the useR](http://happygitwithr.com/)" tutorial if in doubt or [contact me](#contact).
 63 | 
 64 | (*) If you may not be able to use your own laptop, there's a shared RStudio Server set up in AWS for you - including all the required R packages already installed for you. Look up the class Slack channel for how to access, or find below the steps how the service was configured:
 65 | 
 66 | <details><summary>💪 RStudio Server installation steps</summary>
 67 | 
 68 | ```
 69 | # most recent R builds
 70 | wget -q -O- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | sudo tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc
 71 | echo "deb [arch=amd64] https://cloud.r-project.org/bin/linux/ubuntu noble-cran40/" | sudo tee -a /etc/apt/sources.list.d/cran_r.list
 72 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 67C2D66C4B1D4339 51716619E084DAB9
 73 | sudo apt update && sudo apt upgrade
 74 | sudo apt install r-base
 75 | # apt builds of all CRAN packages
 76 | wget -q -O- https://eddelbuettel.github.io/r2u/assets/dirk_eddelbuettel_key.asc | sudo tee -a /etc/apt/trusted.gpg.d/cranapt_key.asc
 77 | echo "deb [arch=amd64] https://r2u.stat.illinois.edu/ubuntu noble main" | sudo tee -a /etc/apt/sources.list.d/cranapt.list
 78 | sudo apt update
 79 | # install some R packages
 80 | sudo apt install -y r-base gdebi-core r-cran-ggplot2 r-cran-gganimate \
 81 |   cargo libudunits2-dev libssl-dev libgdal-dev desktop-file-utils \
 82 |   r-cran-data.table r-cran-rcpp r-cran-dplyr r-cran-ggally r-cran-pander r-cran-readxl \
 83 |   r-cran-ggrepel r-cran-hexbin r-cran-animation r-cran-dendextend r-cran-nbclust \
 84 |   r-cran-ggmap r-cran-maps r-cran-devtools r-cran-ggraph r-cran-ggthemes \
 85 |   r-cran-leaflet r-cran-mapproj \
 86 |   r-cran-gtextras r-cran-datasaurus r-cran-psych r-cran-svglite \
 87 |   r-cran-tidygeocoder \
 88 |   r-cran-ggiraph r-cran-plotly r-cran-concaveman r-cran-gifski
 89 | # install RStudio IDE
 90 | wget https://download2.rstudio.org/server/jammy/amd64/rstudio-server-2024.12.0-467-amd64.deb
 91 | sudo gdebi rstudio-server-*.deb
 92 | # never do this in prod
 93 | echo "www-port=80" | sudo tee -a /etc/rstudio/rserver.conf
 94 | sudo rstudio-server restart
 95 | ```
 96 | 
 97 | </details>
 98 | 
 99 | <details><summary>💪 Creating users</summary>
100 | 
101 | ```r
102 | secret <- 'something super secret'
103 | users <- c('list', 'of', 'users')
104 | 
105 | library(logger)
106 | library(glue)
107 | for (user in users) {
108 | 
109 |   ## remove invalid character
110 |   user <- sub('@.*', '', user)
111 |   user <- sub('-', '_', user)
112 |   user <- sub('.', '_', user, fixed = TRUE)
113 |   user <- tolower(user)
114 | 
115 |   log_info('Creating {user}')
116 |   system(glue("sudo adduser --disabled-password --quiet --gecos '' {user}"))
117 | 
118 |   log_info('Setting password for {user}')
119 |   system(glue("echo '{user}:{secret}' | sudo chpasswd")) # note the single quotes + placement of sudo
120 | 
121 |   log_info('Adding {user} to sudo group')
122 |   system(glue('sudo adduser {user} sudo'))
123 | 
124 | }
125 | ```
126 | 
127 | </details>
128 | 
129 | ## Class Schedule
130 | 
131 | ### Week 1
132 | 
133 | 1. Warm-up exercise and security reminder: [1.R](1.R#L1)
134 | 2. Intro / recap on R and ggplot2 from previous courses by introducing MDS: [1.R](1.R#L67)
135 | 3. Geocoding: [1.R](1.R#L144)
136 | 4. Shapefiles: [1.R](1.R#L232)
137 | 5. Scaling / standardizing variables: [1.R](1.R#L332)
138 | 6. Simpson's paradox: [1.R](1.R#L398)
139 | 7. Anscombe's quartett [1.R](1.R#L459)
140 | 
141 | ### Week 2
142 | 
143 | 1. Review homework: [homework.Rmd](homework.Rmd)
144 | 3. Warm-up exercises on `ggplot2` calls and EDA [2.R](2.R#L1)
145 | 4. Extract points from a plot [2.R](2.R#L71)
146 | 5. Hierarchical clustering [2.R](2.R#L107)
147 | 6. Animations [2.R](2.R#L136)
148 | 7. Themes [2.R](2.R#L385)
149 | 8. Interactive plots [2.R](2.R#L460) and [2-interactive-plots.Rmd](2-interactive-plots.Rmd)
150 | 
151 | ## Homeworks
152 | 
153 | ### Homework 1
154 | 
155 | Replicate the following document using R markdown: https://rpubs.com/daroczig-ceu/dv3-2024-hw
156 | 
157 | Submission: prepare an R Markdown or Quartro document echoing all used R commands, and knit to HTML or PDF and upload to Moodle before Jan 31 noon (CET).
158 | 
159 | ### Final project
160 | 
161 | Use any publicly accessible dataset (preferably from the [TidyTuesday projects](https://github.com/rfordatascience/tidytuesday)), but if you don't feel creative, feel free to pick the [`palmerpenguins` dataset](https://allisonhorst.github.io/palmerpenguins/.) and demonstrate what you have learned in this class by generating different data visualizations that makes sense and are insightful, plus provide comments on those in plain English. This can totally be a continuation of your Intro to R submission.
162 | 
163 | Required items to include in your work for grade "B":
164 | - at least 5 plots (with at least 3 different `ggplot2` geoms) not presented yet in any of your previous CEU projects
165 | - a meaningful animation using `gganimate` (instead of presenting random stuff moving around, make sure to create an animation that makes sense for your use-case, e.g. showing how things changed over time)
166 | - either (1) register an account at stadiamap to fetch map tiles or (2) use shapefile(s) to present some geospatial data (e.g. points or polygons rendered on a background map)
167 | 
168 | For grade "A":
169 | - make sure to fine-tune your plots and make those pretty by always setting proper (axis) titles, scales, custom color palettes etc.
170 | - define a custom theme (e.g. background color, grid, font family and color) and use that on all (or at least on most) plots
171 | - include at least one interactive plot (e.g. via `plotly` or `ggigraph`)
172 | 
173 | Submission: prepare an R Markdown or Quartro document (printing both the R code and its output) and knit to HTML and upload to Moodle before Feb 14 midnight (CET). Make sure to submit HTML with R code chunks and plots!
174 | 
175 | ## Contact
176 | 
177 | File a [GitHub ticket](https://github.com/daroczig/CEU-DV2/issues).
178 | 


--------------------------------------------------------------------------------
/1.R:
--------------------------------------------------------------------------------
  1 | ## #############################################################################
  2 | ## warm-up exercise and security reminder
  3 | 
  4 | ## we learned at the "Intro to R" course that we should not do this:
  5 | source('http://bit.ly/CEU-R-shoes')
  6 | 
  7 | ## let's install a package instead!
  8 | install.packages('remotes')
  9 | remotes::install_github('daroczig/students')
 10 | 
 11 | library(students)
 12 | ?students
 13 | 
 14 | ## this is a dataset on students from a study group,
 15 | ## where we run a math test and found interesting association with the shoe size
 16 | ## TODO EDA
 17 | students
 18 | 
 19 | cor(students$shoe, students$math)
 20 | lm(math ~ shoe, students)
 21 | 
 22 | plot(students$shoe, students$math)
 23 | abline(lm(math ~ shoe, students), col = 'red')
 24 | 
 25 | library(ggplot2)
 26 | ggplot(students, aes(math, shoe)) + geom_point() + geom_smooth(method = 'lm')
 27 | 
 28 | ## EDA - everyone!
 29 | str(students)
 30 | summary(students)
 31 | plot(students)
 32 | 
 33 | library(ggplot2)
 34 | ggplot(students, aes(math, shoe, color = z)) + geom_point()
 35 | ggplot(students, aes(math, shoe, color = y)) + geom_point() # !!
 36 | 
 37 | library(GGally)
 38 | ggpairs(students)
 39 | 
 40 | ## https://datavizuniverse.substack.com/p/navigating-the-table-jungle
 41 | library(gtExtras)
 42 | gt_plt_summary(students)
 43 | 
 44 | ## partial correlation
 45 | residuals(lm(math ~ x, students))
 46 | residuals(lm(shoe ~ x, students))
 47 | cor(residuals(lm(math ~ x, students)), residuals(lm(shoe ~ x, students)))
 48 | 
 49 | library(psych)
 50 | partial.r(students, 1:2, 3)
 51 | 
 52 | plot(residuals(lm(math ~ x, students)), residuals(lm(shoe ~ x, students)))
 53 | abline(lm(residuals(lm(math ~ x, students)) ~ residuals(lm(shoe ~ x, students))))
 54 | 
 55 | plot(residuals(lm(math ~ x, students)), residuals(lm(shoe ~ x, students)))
 56 | abline(lm(residuals(lm(shoe ~ x, students)) ~ residuals(lm(math ~ x, students))))
 57 | 
 58 | ## had enough
 59 | rm(list = ls())
 60 | 
 61 | ## but wow!
 62 | students
 63 | .secret # "A warm hello from the Internet."
 64 | ## TODO look at the source code of the package!
 65 | ## TODO always install from trusted source
 66 | 
 67 | ## #############################################################################
 68 | ## intro / recap on R and ggplot2 from previous courses by introducing MDS
 69 | 
 70 | ## download data to a file in your temp folder
 71 | t <- tempfile()
 72 | t
 73 | t <- tempfile(fileext = '.xls')
 74 | t
 75 | 
 76 | ## or keep in the current working directory
 77 | t <- 'cities.xls'
 78 | 
 79 | download.file('https://bit.ly/de-cities-distance', t, mode = 'wb')
 80 | 
 81 | ## further checks on the downloaded file
 82 | file.info(t)
 83 | pander::openFileInOS(t)
 84 | 
 85 | ## read the downloaded file
 86 | library(readxl)
 87 | cities <- read_excel(t)
 88 | 
 89 | cities
 90 | ## tibble VS data.frame VS data.table
 91 | str(cities)
 92 | 
 93 | ## get rid of 1st column and last three rows (metadata)
 94 | cities <- cities[, -1]
 95 | cities <- cities[1:(nrow(cities) - 3), ]
 96 | str(cities)
 97 | 
 98 | mds <- cmdscale(as.dist(cities))
 99 | mds
100 | 
101 | plot(mds)
102 | text(mds[, 1], mds[, 2], names(cities))
103 | 
104 | ## TODO interpret what we see
105 | 
106 | ## flipping both x and y axis
107 | mds <- -mds
108 | plot(mds)
109 | text(mds[, 1], mds[, 2], names(cities))
110 | ## flipping only on y axis
111 | mds[, 1] <- -mds[, 1]
112 | plot(mds)
113 | text(mds[, 1], mds[, 2], names(cities))
114 | ## flipping only on x axis
115 | mds[, 2] <- -mds[, 2]
116 | plot(mds)
117 | text(mds[, 1], mds[, 2], names(cities))
118 | 
119 | ## TODO ggplot2 way
120 | mds <- as.data.frame(mds)
121 | mds$city <- rownames(mds)
122 | str(mds)
123 | 
124 | library(ggplot2)
125 | ggplot(mds, aes(V1, V2, label = city)) +
126 |     geom_text() + theme_bw()
127 | 
128 | ## flip one axis and grid
129 | ggplot(mds, aes(V1, -V2, label = city)) +
130 |     geom_text() + theme_void()
131 | 
132 | ## #############################################################################
133 | ## TODO visualize the distance between the European cities
134 | ## stored in the built-in dataframe:
135 | 
136 | ?eurodist
137 | 
138 | mds <- cmdscale(eurodist)
139 | mds <- as.data.frame(mds)
140 | mds$city <- rownames(mds)
141 | ggplot(mds, aes(V1, -V2, label = city)) +
142 |     geom_text() + theme_bw()
143 | 
144 | ## ############################################################################
145 | ## geocoding
146 | 
147 | ## test on https://nominatim.openstreetmap.org/ui/search.html
148 | ## e.g. "CEU, Budapest, Hungary" or "central european university, vienna"
149 | 
150 | library(ggmap)          # geocode one address
151 | library(tidygeocoder)   # geocode a data.frame
152 | 
153 | library(data.table)
154 | mds <- data.table(geocode(mds, 'city'))
155 | str(mds)
156 | 
157 | ## built-in polygons for the background
158 | ?maps::world
159 | world <- map_data('world')
160 | ggplot() +
161 |     geom_map(data = world, map = world, aes(long, lat, map_id = region)) +
162 |     coord_fixed(1.3)
163 | 
164 | ## adding the points
165 | ggplot() +
166 |     geom_map(data = world, map = world, aes(long, lat, map_id = region)) +
167 |     coord_fixed(1.3) +
168 |     geom_point(data = mds, aes(long, lat), color = 'orange')
169 | 
170 | ## wow, that's odd to see a few cities from EU in USA :O
171 | mds
172 | ## NOTE Lyons (plural due to English spelling) instead of Lyon
173 | 
174 | ggplot() +
175 |     geom_map(data = world, map = world, aes(long, lat, map_id = region)) +
176 |     coord_fixed(1.3) +
177 |     geom_point(data = mds, aes(long, lat, color = city))
178 | 
179 | ## make it a thematic map
180 | str(world)
181 | world$a <- grepl('^A', world$region)
182 | 
183 | ggplot() +
184 |   geom_map(data = world, map = world, aes(long, lat, map_id = region, fill = a)) +
185 |   coord_fixed(1.3) +
186 |   geom_point(data = mds, aes(long, lat), color = 'black') +
187 |   theme(legend.position = 'none')
188 | 
189 | library(countrycode)
190 | world <- data.table(world)
191 | world[, continent := countrycode(region, origin = 'country.name', destination = 'continent'), by = region]
192 | warnings()
193 | 
194 | ggplot() +
195 |   geom_map(data = world, map = world, aes(long, lat, map_id = region, fill = continent)) +
196 |   coord_fixed(1.3) +
197 |   geom_point(data = mds, aes(long, lat), color = 'black') +
198 |   theme(legend.position = 'none')
199 | 
200 | ## fancy background from http://maps.stamen.com -- unfortunately deprecated,
201 | ## so need to register a key from get_stadiamap
202 | library(ggmap)
203 | register_stadiamaps('YOUR-API-KEY')
204 | 
205 | map <- get_stadiamap(
206 |   c(
207 |     left = min(mds$long) * 0.995,
208 |     right = max(mds$long) * 1.001,
209 |     bottom = min(mds$lat) * 0.999,
210 |     top = max(mds$lat)) * 1.001,
211 |   maptype = 'stamen_toner',
212 |   zoom = 4)
213 | 
214 | ggmap(map) +
215 |   geom_point(data = mds, aes(long, lat), color = 'orange') +
216 |   theme_void() + theme(legend.position = 'none')
217 | 
218 | library(sf)
219 | geomds <- st_as_sf(x = mds, coords = c("long", "lat"))
220 | st_bbox(geomds)
221 | unname(st_bbox(geomds))
222 | 
223 | map <- get_stadiamap(
224 |     unname(st_bbox(geomds)),
225 |     maptype = 'stamen_terrain_background',
226 |     zoom = 4)
227 | 
228 | ggmap(map) +
229 |     geom_point(data = mds, aes(long, lat), color = 'black') +
230 |     theme_void() + theme(legend.position = 'none')
231 | 
232 | ## #############################################################################
233 | ## plotting shapefiles
234 | ## #############################################################################
235 | 
236 | ## search for "Austria shapefile"  University of Texas
237 | ## https://geodata.lib.utexas.edu/?f%5Bdc_format_s%5D%5B%5D=Shapefile&f%5Bdc_subject_sm%5D%5B%5D=Boundaries&f%5Bdct_spatial_sm%5D%5B%5D=Austria&f%5Blayer_geom_type_s%5D%5B%5D=Polygon&per_page=20
238 | download.file(
239 |     'https://stacks.stanford.edu/object/rc343vz5889',
240 |     'Austria_boundary.zip')
241 | download.file(
242 |     'https://stacks.stanford.edu/object/yv617vc9132',
243 |     'Austria_divisions.zip')
244 | unzip('Austria_boundary.zip')
245 | unzip('Austria_divisions.zip')
246 | 
247 | ## Look around what we have
248 | library(sf) # simple features
249 | st_layers('.')
250 | 
251 | ## Read a shapefile from the current working directory (refer to the filename without file extension)
252 | adm0 <- st_read('.', layer = 'AUT_adm0')
253 | str(adm0)
254 | 
255 | plot(adm0)
256 | 
257 | st_geometry(adm0)
258 | ggplot() + geom_sf(data = adm0)
259 | 
260 | ## Now let's load smaller administrative areas as well
261 | adm2 <- st_read('.', 'AUT_adm2')
262 | 
263 | ## And some points to be added to the map as well
264 | cities <- fread('https://simplemaps.com/static/data/country-cities/at/at.csv')
265 | 
266 | ## Party time
267 | library(ggplot2)
268 | ggplot() +
269 |     geom_sf(data = adm0, color = 'black', size = 1) +
270 |     geom_sf(data = adm2, color = 'gray', size = .2) +
271 |     geom_point(data = cities,
272 |                aes(lng, lat, size = population),
273 |                color = 'orange') +
274 |     theme_void() +
275 |     theme(legend.position = 'top')
276 | 
277 | ## how to access the original data elements?
278 | str(adm2)
279 | adm2$NAME_2
280 | 
281 | ## add label
282 | ggplot() +
283 |     geom_sf(data = adm0, color = 'black', size = 1) +
284 |     geom_sf(data = adm2, color = 'gray', size = .2) +
285 |     geom_sf_text(data = adm2, aes(label = NAME_2)) +
286 |     geom_point(data = cities,
287 |                aes(lng, lat, size = population),
288 |                color = 'orange') +
289 |     theme_void() +
290 |     theme(legend.position = 'top')
291 | 
292 | ## thematic maps: Statutory city VS
293 | ggplot() +
294 |     geom_sf(data = adm0, color = 'black', size = 1) +
295 |     geom_sf(data = adm2, color = 'gray', aes(fill = TYPE_2), size = .2) +
296 |     geom_sf_text(data = adm2, aes(label = NAME_2)) +
297 |     geom_point(data = cities,
298 |                aes(lng, lat, size = population),
299 |                color = 'orange') +
300 |     theme_void() +
301 |     theme(legend.position = 'top')
302 | 
303 | ## using geojson files
304 | download.file(
305 |     'https://raw.githubusercontent.com/ginseng666/GeoJSON-TopoJSON-Austria/master/2021/simplified-95/bezirke_95_topo.json',
306 |     'austria.geojson')
307 | 
308 | library(leaflet)
309 | 
310 | map <- st_read('austria.geojson')
311 | popup <- paste0('<strong>Name: </strong>', map$name)
312 | 
313 | leaflet(map) %>%
314 |     addPolygons(
315 |         weight = 1, smoothFactor = 0.5,
316 |         opacity = 1.0, fillOpacity = 0.5,
317 |         popup = popup,
318 |         label = ~name,
319 |         layerId = ~name,
320 |         labelOptions = labelOptions(noHide = TRUE),
321 |         highlightOptions = highlightOptions(
322 |             color = 'white', weight = 2,
323 |             bringToFront = TRUE))
324 | 
325 | library(ggplot2)
326 | ggplot() +
327 |     geom_sf(data = map, color = 'white', fill = 'darkgreen', size = .2) +
328 |     theme_void()
329 | 
330 | ## find more details at https://tmieno2.github.io/R-as-GIS-for-Economists/index.html
331 | 
332 | ## #############################################################################
333 | ## TODO non-geo example
334 | 
335 | ?mtcars
336 | str(mtcars)
337 | mtcars
338 | 
339 | mds <- cmdscale(dist(mtcars))
340 | plot(mds)
341 | text(mds[, 1], mds[, 2], rownames(mtcars))
342 | ## oh no, the overlaps!
343 | 
344 | mds <- as.data.frame(mds)
345 | mds$car <- rownames(mds)
346 | ggplot(mds, aes(V1, V2, label = car)) +
347 |     geom_text() + theme_bw()
348 | 
349 | library(ggrepel)
350 | ggplot(mds, aes(V1, V2, label = car)) +
351 |     geom_text_repel() + theme_bw()
352 | 
353 | ## #############################################################################
354 | ## QQ what does it mean that two cards are "close to each other"?
355 | ## NOTE think about why the above visualization is off
356 | 
357 | ## check actual distances eg for Camaro (or other sport cars)
358 | which(rownames(mtcars) == 'Camaro Z28')
359 | sort(as.matrix(dist(mtcars))[, 24])
360 | ## Mercedes sedans are closer?! than e.g. Ferrari Dino or Maserati Bora
361 | 
362 | mtcars
363 | 
364 | subset(mtcars, hp >= 245)
365 | 
366 | ?cmdscale
367 | ?dist
368 | 
369 | summary(mtcars)
370 | 
371 | ## need to standardize to give every variable equal weight!
372 | mtcars$hp - mean(mtcars$hp)
373 | mean(mtcars$hp - mean(mtcars$hp))
374 | 
375 | (x <- (mtcars$hp - mean(mtcars$hp)) / sd(mtcars$hp))
376 | mean(x)
377 | sd(x)
378 | hist(x)
379 | 
380 | x
381 | scale(mtcars$hp)
382 | plot(x, scale(mtcars$hp))
383 | x - scale(mtcars$hp)
384 | 
385 | plot(mtcars$hp, scale(mtcars$hp))
386 | 
387 | ?scale
388 | scale(mtcars)
389 | 
390 | mds <- cmdscale(dist(scale(mtcars)))
391 | mds <- as.data.frame(mds)
392 | mds$car <- rownames(mds)
393 | ggplot(mds, aes(V1, V2, label = car)) +
394 |     geom_text_repel() + theme_bw()
395 | 
396 | subset(mtcars, hp >= 200)
397 | 
398 | ## #############################################################################
399 | ## introduction to Simpson's paradox with the Berkeley example
400 | 
401 | ## then do the analysis in R
402 | UCBAdmissions
403 | plot(UCBAdmissions)
404 | 
405 | berkeley <- as.data.frame(UCBAdmissions)
406 | 
407 | ggplot(berkeley, aes(Gender, Freq, fill = Admit)) + geom_col()
408 | 
409 | p <- ggplot(berkeley, aes(Gender, Freq, fill = Admit)) + geom_col(position = 'fill')
410 | p
411 | 
412 | p + facet_wrap(~Dept)
413 | p + facet_wrap(~Dept) + scale_fill_manual(values = c('Admitted' = 'darkgreen', 'Rejected' = 'red'))
414 | # https://colorbrewer2.org
415 | p + facet_wrap(~Dept) + scale_fill_brewer(palette = 'Dark2')
416 | 
417 | ggplot(berkeley, aes(Gender, Freq, fill = Admit)) + geom_col() +
418 |     facet_wrap(~Dept) + scale_fill_brewer(palette = 'Dark2')
419 | 
420 | ## #############################################################################
421 | ## TODO exercise visualize a model on the association between
422 | ## the lengh and width of sepal in the iris dataset
423 | 
424 | ?iris
425 | str(iris)
426 | 
427 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point()
428 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() +
429 |     geom_smooth()
430 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() +
431 |     geom_smooth(method = 'lm', se = FALSE)
432 | ## note the change in the sign of the slope!
433 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) + geom_point() +
434 |     geom_smooth(method = 'lm', se = FALSE)
435 | 
436 | summary(lm(Sepal.Width ~ Sepal.Length, data = iris))
437 | summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris))
438 | 
439 | ## NOTE when checking the scatterplot colored by Species
440 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) + geom_point()
441 | 
442 | ## TODO add model
443 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) + geom_point() + geom_smooth(method = 'lm')
444 | 
445 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
446 |     geom_point(aes(color = Species)) +
447 |     geom_smooth(aes(color = Species), method = 'lm') +
448 |     geom_smooth(method = 'lm', color = 'black', linewidth = 2)
449 | 
450 | ## compare the overlap of groups with the MDS version:
451 | mds <- as.data.frame(cmdscale(dist(iris)))
452 | ggplot(mds, aes(V1, V2, color = iris$Species)) + geom_point()
453 | ggplot(mds, aes(V1, V2, color = iris$Species)) + geom_point() + geom_smooth(method = 'lm')
454 | 
455 | ## without the Species column
456 | mds <- as.data.frame(cmdscale(dist(iris[, -5])))
457 | ggplot(mds, aes(V1, V2, color = iris$Species)) + geom_point()
458 | 
459 | ## #############################################################################
460 | ## back to dataviz ... why? let's see Anscombe's quartett
461 | 
462 | ## dirty data
463 | anscombe
464 | 
465 | ## select 1st set
466 | anscombe[, c(1, 5)]
467 | 
468 | ## using base R to see the 1st set
469 | plot(anscombe[, c(1, 5)])
470 | 
471 | ## add linear model
472 | lm(anscombe[, c(5, 1)])
473 | ## note to change 5 and 1 (estimating y based on x)
474 | abline(lm(anscombe[, c(5, 1)]))
475 | 
476 | library(ggplot2)
477 | ggplot(anscombe[, c(1, 5)], aes(x1, y1)) + geom_point() + geom_smooth(method = 'lm', se = TRUE)
478 | ggplot(anscombe[, c(2, 6)], aes(x2, y2)) + geom_point() + geom_smooth(method = 'lm', se = TRUE)
479 | 
480 | ## TODO compute the mean of x1 and y1
481 | mean(anscombe[, 1])
482 | mean(anscombe[, 5])
483 | 
484 | ## TODO compute the mean of x2 and y2
485 | mean(anscombe[, 2])
486 | mean(anscombe[, 6])
487 | 
488 | ## intro to loops
489 | lapply(1:4, function(i) mean(anscombe[, c(i)]))
490 | lapply(1:4, function(i) sd(anscombe[, c(i)]))
491 | lapply(1:4, function(i) cor(anscombe[, c(i, i+4)]))
492 | 
493 | apply(anscombe, 2, mean)
494 | 
495 | ## loop to create separate & tidy dataset for each set
496 | lapply(1:4, function(i) anscombe[, c(i, i+4)])
497 | 
498 | ## use data.table to merge into one single data frame
499 | library(data.table)
500 | rbindlist(lapply(1:4, function(i) anscombe[, c(i, i+4)]))
501 | ## add a "set" column so that we know which set the row belongs to
502 | rbindlist(lapply(1:4, function(i) cbind(dataset = i, anscombe[, c(i, i+4)])))
503 | 
504 | ## save in a variable for future use
505 | df <- rbindlist(lapply(1:4, function(i) cbind(dataset = i, anscombe[, c(i, i+4)])))
506 | setnames(df, c('dataset', 'x', 'y'))
507 | 
508 | ## let's switch from base R to ggplot and show all 4 sets in subplots
509 | library(ggplot2)
510 | ggplot(df, aes(x, y)) + geom_point() + facet_wrap(~dataset)
511 | ggplot(df, aes(x, y)) + geom_point() + facet_wrap(~dataset) + geom_smooth()
512 | ggplot(df, aes(x, y)) + geom_point() + facet_wrap(~dataset) + geom_smooth(method = 'lm')
513 | 
514 | ## intro to https://www.research.autodesk.com/publications/same-stats-different-graphs/
515 | df <- datasauRus::datasaurus_dozen_wide
516 | dt <- rbindlist(lapply(1:13, function(i)
517 |   data.frame(
518 |     x = df[, 2 * i - 1, drop = TRUE],
519 |     y = df[, 2 * i, drop = TRUE],
520 |     dataset_id = i)))
521 | 
522 | ggplot(dino_df, aes(x, y)) + geom_point() +
523 |     geom_smooth(method = 'lm', se = FALSE) +
524 |     facet_wrap(~dataset) +
525 |     theme_bw()
526 | 


--------------------------------------------------------------------------------
/2.R:
--------------------------------------------------------------------------------
  1 | ## #############################################################################
  2 | ## EDA warmup - alternatives to boxplot
  3 | ## #############################################################################
  4 | 
  5 | library(data.table)
  6 | df <- fread('http://bit.ly/CEU-R-numbers-set')
  7 | str(df)
  8 | 
  9 | summary(df)
 10 | table(df$x)
 11 | 
 12 | summary(df)
 13 | lapply(df, summary)
 14 | lapply(unique(df$x), function(set) summary(df[x == set]))
 15 | 
 16 | ## data.table way
 17 | df[, as.list(summary(y)), by = x]
 18 | 
 19 | pairs(df)
 20 | library(GGally)
 21 | ggpairs(df)
 22 | 
 23 | library(ggplot2)
 24 | ggplot(df, aes(x, y)) + geom_point()
 25 | ggplot(df, aes(x, y)) + geom_point() + geom_smooth(method = 'lm')
 26 | ggplot(df, aes(x, y)) + geom_point(alpha = 0.1)
 27 | ggplot(df, aes(x, y)) + geom_jitter(alpha = 0.1)
 28 | 
 29 | ## hexbin
 30 | ggplot(df, aes(x, y)) + geom_hex()
 31 | 
 32 | ggplot(df, aes(factor(x), y)) + geom_boxplot()
 33 | ## NOTE jitter showed interesting patterns .. not visible
 34 | 
 35 | ggplot(df, aes(factor(x), y)) + geom_violin()
 36 | ggplot(df, aes(factor(x), y)) + geom_violin() + geom_jitter()
 37 | ggplot(df, aes(factor(x), y)) + geom_violin() +
 38 |   geom_jitter(width = 0.1, alpha = 0.1)
 39 | 
 40 | ggplot(df, aes(y)) + geom_histogram() + facet_wrap(~x)
 41 | 
 42 | ggplot(df, aes(y, fill = factor(x))) + geom_density()
 43 | ggplot(df, aes(y, fill = factor(x))) + geom_density(alpha = .25)
 44 | ggplot(df, aes(y, fill = factor(x))) +
 45 |   geom_density(alpha = .25) +
 46 |   theme(legend.position = 'top')
 47 | 
 48 | ## df <- rbind(
 49 | ##     data.table(x = 1, y = rbeta(1e3, 0.1, 0.1)),
 50 | ##     data.table(x = 2, y = rnorm(1e3, 0.5, 0.75)),
 51 | ##     data.table(x = 3, y = runif(1e3) * 2 - 0.5),
 52 | ##     data.table(x = 4, y = rnorm(1e3, 0.5, 0.75)))
 53 | 
 54 | ## TODO do similar exploratory data analysis on the below dataset
 55 | df <- fread('http://bit.ly/CEU-R-numbers')
 56 | ## generated at https://gist.github.com/daroczig/23d1323652a70c03b27cfaa6b080aa3c
 57 | 
 58 | ## TODO find interesting pattern in data?
 59 | 
 60 | ggplot(df, aes(x, y)) + geom_point() # slow?
 61 | ggplot(df, aes(x, y)) + geom_point(alpha = 0.05)
 62 | ggplot(df, aes(x, y)) + geom_point(size = 0.2, alpha = 0.1)
 63 | ggplot(df, aes(x, y)) + geom_hex(binwidth = 5)
 64 | ggplot(df, aes(x, y)) + geom_count()
 65 | 
 66 | df[, .N, by = list(x, y)]
 67 | ggplot(df[, .N, by = list(x, y)], aes(x, y, fill = N)) + geom_tile()
 68 | 
 69 | ## 3d plot density?
 70 | 
 71 | ## #############################################################################
 72 | ## manually extract data from plots
 73 | ## #############################################################################
 74 | 
 75 | ## using grid.locator
 76 | devtools::install_github('doehm/traceR')
 77 | 
 78 | library(traceR)
 79 | ## NOTE locator is not reliable in the RStudio IDE built-in graphics device
 80 | ## so we need to open a new device
 81 | grDevices::x11()
 82 | df2 <- trace_image()
 83 | df2
 84 | inspect_trace(df2)
 85 | 
 86 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_path()
 87 | library(ggforce)
 88 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline0()
 89 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline_closed0()
 90 | 
 91 | ## this fish doesn't have an eye!
 92 | df3 <- trace_image()
 93 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline_closed0() +
 94 |   geom_bspline_closed0(data=df3, fill = 'white')
 95 | 
 96 | ## oops
 97 | df3 <- trace_image(scale = FALSE)
 98 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline_closed0() +
 99 |   geom_bspline_closed0(data=df3, fill = 'white')
100 | 
101 | ggplot(df, aes(x, y)) + geom_hex(binwidth = 5)
102 | df2 <- trace_image(scale = FALSE)
103 | df3 <- trace_image(scale = FALSE)
104 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline_closed0() +
105 |   geom_bspline_closed0(data=df3, fill = 'white')
106 | 
107 | ## #############################################################################
108 | ## why the dataviz? edu purposes, eg showing how clustering works
109 | ## #############################################################################
110 | 
111 | ## hierarchical clustering
112 | ?hclust
113 | 
114 | ## doing this on iris instead of mtcars (re MDS) to be able to compare with Species
115 | dm <- dist(iris[, 1:4])
116 | str(dm)
117 | 
118 | hc <- hclust(dm)
119 | str(hc)
120 | 
121 | ## plot the dendogram
122 | plot(hc)
123 | rect.hclust(hc, k = 3)
124 | 
125 | ## see first 2 observations merged into a cluster:
126 | hc$merge[1, ]
127 | iris[c(102, 143), ]
128 | 
129 | ## poor man's animation
130 | for (i in 2:8) {
131 |   plot(hc)
132 |   rect.hclust(hc, k = i)
133 |   Sys.sleep(1)
134 | }
135 | 
136 | ## actual animation!
137 | library(animation)
138 | ani.options(interval = 1)
139 | ani.options(autobrowse = FALSE) # on server
140 | saveGIF({
141 |   for (i in 2:8) {
142 |     plot(hc)
143 |     rect.hclust(hc, k = i)
144 |     ## no need for sleep
145 |   }
146 | })
147 | ?saveGIF
148 | ?ani.options
149 | ## ani.width, ani.height = 480
150 | 
151 | ## ggplot for dendograms
152 | library(dendextend)
153 | d <- as.dendrogram(hc)
154 | d <- color_branches(d, k = 2)
155 | plot(d)
156 | 
157 | ggplot(d)
158 | ggplot(d, labels = FALSE)
159 | ggplot(d, labels = FALSE, horiz = TRUE)
160 | ggplot(d, labels = FALSE) + scale_y_reverse(expand = c(0.2, 0)) + coord_polar(theta="x")
161 | ## https://talgalili.github.io/dendextend/articles/dendextend.html
162 | 
163 | ## TODO create an animation with ggplot2 instead of the above plot/rect.hclust method
164 | for (i in 2:8) {
165 |   ## NOTE the need to print .. otherwise pretty boring "animation"
166 |   print(ggplot(color_branches(as.dendrogram(hc), k = i)))
167 |   Sys.sleep(1)
168 | }
169 | 
170 | saveGIF({
171 |   for (i in 2:8) {
172 |     print(ggplot(color_branches(as.dendrogram(hc), k = i)))
173 |   }
174 | }, ani.width = 960)
175 | 
176 | ## cluster membership
177 | clusters <- cutree(hc, 3)
178 | clusters
179 | 
180 | ## why 3?
181 | library(NbClust)
182 | NbClust(iris[, 1:4], method = 'complete', index = 'all')
183 | ## QQ elbow rule: diminishing returns are no longer worth the additional cost
184 | suggestions <- NbClust(iris[, 1:4], method = 'complete', index = 'all')
185 | str(suggestions)
186 | 
187 | library(ggplot2)
188 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species)) + geom_point()
189 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species)) + geom_point(size = 3)
190 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species, color = factor(clusters))) +
191 |   geom_point(size = 3)
192 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species, color = factor(clusters))) +
193 |   geom_point(size = 3) + theme_bw() + theme(legend.position = 'top')
194 | 
195 | ## add linear models by both species and cluster membership?
196 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species, color = factor(clusters))) +
197 |   geom_point(size = 3) + theme_bw() + theme(legend.position = 'top') +
198 |   geom_smooth(method = 'lm')
199 | ## hm, that's difficult to interpret .. e.g. why 2 green lines?
200 | ## let's differentiate line types as well?
201 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species, color = factor(clusters), linetype = Species)) +
202 |   geom_point(size = 3) + theme_bw() + theme(legend.position = 'top') +
203 |   geom_smooth(method = 'lm')
204 | ## still not happy
205 | 
206 | ## loop over Species? instead of faceting, our new dimension for visualization will be time
207 | library(data.table)
208 | ## need ~tidy data
209 | IRIS <- as.data.table(iris)
210 | IRIS$cluster <- factor(clusters)
211 | saveGIF({
212 |   for (species in unique(IRIS$Species)) {
213 |     print(ggplot(IRIS[Species == species],
214 |                  aes(Sepal.Length, Sepal.Width, color = cluster)) +
215 |             geom_point(size = 3) +
216 |             geom_smooth(method = 'lm') +
217 |             ggtitle(species)) # add later
218 |   }
219 | }, ani.width = 960)
220 | ## meh, we can do better
221 | 
222 | library(gganimate)
223 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(clusters))) +
224 |   geom_point(size = 3) +
225 |   geom_smooth(method = 'lm') +
226 |   transition_states(Species)
227 | ## NOTE check the scale for clusters ... also need to add title
228 | 
229 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(clusters))) +
230 |   geom_point(size = 3) +
231 |   geom_smooth(method = 'lm') +
232 |   transition_states(Species) +
233 |   labs(
234 |     title = paste("{closest_state}"), # Python's f-string, R's glue
235 |     subtitle = 'Number of flowers: {nrow(subset(iris, Species == closest_state))}')
236 | 
237 | ## see more fancy plot at https://twitter.com/daroczig/status/1201542038640111616
238 | 
239 | ## TODO this should have been a facet ...
240 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(clusters))) +
241 |   geom_point(size = 3) +
242 |   geom_smooth(method = 'lm') +
243 |   facet_wrap(~Species)
244 | 
245 | ## traditional cluster plots
246 | library(cluster)
247 | clusplot(iris, clusters, color = TRUE, shade = TRUE, labels = 2)
248 | 
249 | ## gg Hull plot
250 | ## "convex hull of a shape is the smallest convex set that contains it"
251 | library(ggforce)
252 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(clusters))) +
253 |   geom_point(size = 3) +
254 |   geom_mark_hull(aes(label = clusters))
255 | 
256 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
257 |   geom_point(size = 3) +
258 |   geom_mark_hull(aes(fill = factor(clusters), label = clusters),
259 |                  ## very concave
260 |                  concavity = 1)
261 | 
262 | ## ############################################################################
263 | ## back to datasaurus!
264 | ## ############################################################################
265 | 
266 | ## finish the datasaurus exercise from last week
267 | library(data.table)
268 | df <- datasauRus::datasaurus_dozen_wide
269 | dt <- rbindlist(lapply(1:13, function(i)
270 |   data.frame(
271 |     x = df[, 2 * i - 1, drop = TRUE],
272 |     y = df[, 2 * i, drop = TRUE],
273 |     dataset_id = i)))
274 | ## alternatively
275 | dt <- rbindlist(lapply(seq(1, 26, by = 2), function(i)
276 |   data.frame(
277 |     x = df[, c(i), drop = TRUE],
278 |     y = df[, c(i+1), drop = TRUE],
279 |     dataset_id = i)
280 | ))
281 | 
282 | ggplot(dt, aes(x, y)) + geom_point() +
283 |   geom_smooth(method = 'lm', se = FALSE) +
284 |   facet_wrap(~dataset_id) +
285 |   theme_bw()
286 | 
287 | ## human-friendly dataset ids
288 | dt <- rbindlist(lapply(seq(1, 26, by = 2), function(i) {
289 |   data.frame(
290 |     x = df[, c(i), drop = TRUE],
291 |     y = df[, c(i+1), drop = TRUE],
292 |     # sub('_x$', '', names(df)[i])
293 |     dataset_id = substr(names(df)[i], 1, nchar(names(df)[i])-2))
294 | }
295 | ))
296 | 
297 | ## reading more docs
298 | dt <- datasauRus::datasaurus_dozen
299 | 
300 | library(datasauRus)
301 | datasaurus_dozen
302 | 
303 | library(ggplot2)
304 | ggplot(datasaurus_dozen, aes(x, y)) +
305 |   geom_point() + facet_wrap(~dataset)
306 | 
307 | library(gganimate)
308 | ggplot(datasaurus_dozen, aes(x, y)) +
309 |   geom_point() + geom_smooth(method = 'lm') +
310 |   transition_states(dataset)
311 | 
312 | subtitle <- function(df, digits = 4) {
313 |   paste0(
314 |     'mean(x)=', round(mean(df$x), digits), ', ', 'sd(x)=', round(sd(df$x), digits), '\n',
315 |     'mean(y)=', round(mean(df$y), digits), ', ', 'sd(y)=', round(sd(df$y), digits), '\n',
316 |     'cor(x,y)=', round(cor(df$x, df$y), digits)
317 |   )
318 | }
319 | subtitle(datasaurus_dozen)
320 | 
321 | ggplot(datasaurus_dozen, aes(x, y)) +
322 |   geom_point() + geom_smooth(method = 'lm') +
323 |   transition_states(dataset) +
324 |   labs(
325 |     title = paste("{closest_state}"),
326 |     subtitle = '{subtitle(subset(datasaurus_dozen, dataset == closest_state))}')
327 | 
328 | ## TODO bug: overlapping title and subtitle?! not sure ... let's move it down to "x"
329 | ggplot(datasaurus_dozen, aes(x, y)) +
330 |   geom_point() + geom_smooth(method = 'lm') +
331 |   transition_states(dataset) +
332 |   labs(
333 |     x = "{closest_state}", y = "",
334 |     subtitle = '{subtitle(subset(datasaurus_dozen, dataset == closest_state))}') +
335 |   theme(plot.margin = ggplot2::margin(1, 0, 0, 0, "cm"))
336 | 
337 | 
338 | ## TODO racing bar chart on departure delay per origin
339 | library(nycflights13)
340 | library(data.table)
341 | dt <- data.table(flights)[, .(delay = mean(dep_delay, na.rm = TRUE)), by = .(origin, month)][order(month, delay)]
342 | ## NOTE need to record order within the dataset that we can map it to the plot
343 | dt[, order := order(delay), by = month]
344 | library(ggplot2)
345 | library(gganimate)
346 | g <- ggplot(dt, aes(order, delay)) +
347 |   geom_col(aes(fill = origin)) +
348 |   ## NOTE need to add text as X axis labels are for order
349 |   ## NOTE vjust/hjust https://stackoverflow.com/a/7267364/564164
350 |   geom_text(aes(label = origin), hjust = 1.2, vjust = 0.5, angle = 90, size = 14, color = 'white') +
351 |   xlab('') + ylab('Average departure delays (minutes)') +
352 |   theme_bw() +
353 |   theme(legend.position = 'none',
354 |         axis.ticks.x = element_blank(),
355 |         panel.grid.minor.x = element_blank(),
356 |         panel.grid.major.x = element_blank()) +
357 |   ## NOTE look up month based on closest state
358 |   ggtitle('2013/{month.name[as.numeric(closest_state)]}') +
359 |   transition_states(month) +
360 |   ## dynamic y axis for each month
361 |   view_follow(fixed_x = FALSE)
362 | animate(g, height = 600, width = 600)
363 | 
364 | ## TODO racing bar chart on arr delay per dest
365 | dt <- data.table(flights)[, .(delay = mean(arr_delay, na.rm = TRUE)), by = .(dest, month)][order(month, delay)]
366 | dt[, order := order(delay), by = month]
367 | g <- ggplot(dt, aes(order, delay)) +
368 |   geom_col(aes(fill = dest)) +
369 |   ## NOTE labels for negative values
370 |   geom_text(aes(label = dest, hjust = delay < 0), angle = 90, size = 2) +
371 |   xlab('') + ylab('Average arrival delays (minutes)') +
372 |   theme_bw() +
373 |   theme(legend.position = 'none',
374 |         axis.ticks.x = element_blank(),
375 |         panel.grid.minor.x = element_blank(),
376 |         panel.grid.major.x = element_blank()) +
377 |   ggtitle('2013/{month.name[as.numeric(closest_state)]}') +
378 |   transition_states(month) +
379 |   view_follow(fixed_x = FALSE)
380 | animate(g, height = 600, width = 1200, fps = 5)
381 | 
382 | ## quite too much .. maybe filter for top10
383 | dt[order<=10]
384 | 
385 | ## ############################################################################
386 | ## tweaking ggplot2 themes
387 | ## ############################################################################
388 | 
389 | ## plot weight and acceleration of cars on a scatterplot
390 | ## colored by transmission
391 | ggplot(mtcars, aes(wt, qsec, color = factor(am))) + geom_point()
392 | p <- last_plot()
393 | 
394 | ## themes
395 | library(ggthemes)
396 | p + theme_economist() + scale_fill_economist()
397 | p + theme_stata() + scale_fill_stata()
398 | p + theme_excel() + scale_fill_excel()
399 | p + theme_gdocs() + scale_fill_gdocs()
400 | 
401 | ## create a custom theme for future usage
402 | ?theme
403 | theme_custom <- function() {
404 |   theme(
405 |     axis.text = element_text(
406 |       family = 'Times New Roman',
407 |       color  = "orange",
408 |       size   = 12),
409 |     axis.title = element_text(
410 |       family = 'Times New Roman',
411 |       color  = "orange",
412 |       size   = 16,
413 |       face   = "bold"),
414 |     axis.text.y = element_text(angle = 90, hjust = 0.5),
415 |     panel.background = element_rect(
416 |       fill = "orange",
417 |       color = "white",
418 |       linewidth = 2)
419 |   )
420 | }
421 | p + theme_custom()
422 | ?theme
423 | 
424 | remotes::install_github('cttobin/ggthemr')
425 | library(ggthemr)
426 | ?ggthemr
427 | ## https://github.com/cttobin/ggthemr
428 | 
429 | ggthemr('pale', layout = 'scientific', spacing = 2, type = 'inner')
430 | p
431 | 
432 | ggthemr('grass', layout = 'clear', spacing = 2, type = 'outer')
433 | p
434 | 
435 | ## old-school palette-definition without helper functions
436 | ggthemr(
437 |   palette = structure(
438 |     list(
439 |       background = 'papayawhip',
440 |       text = c(inner = 'orange', outer = 'black'),
441 |       line = c(inner = 'orange', outer = 'black'),
442 |       gridline = 'white',
443 |       swatch = structure(RColorBrewer::brewer.pal(8, 'Dark2'),
444 |                          class = 'ggthemr_swatch'),
445 |       gradient = c(low = 'white', high = 'red')),
446 |     class = 'ggthemr_palette'),
447 |   layout = structure(
448 |     list(
449 |       panel.grid.major = function(...) element_line(...),
450 |       panel.grid.minor = function(...) element_line(linewidth = 0.25, ...)
451 |     ), class = 'ggthemr_layout'),
452 |   text_size = 12, spacing = 0.5)
453 | p
454 | 
455 | ggplot(iris, aes(Sepal.Length, fill = Species)) + geom_density(alpha = .5)
456 | 
457 | ## drop back to the default colors and layout
458 | ggthemr_reset()
459 | 
460 | ## ############################################################################
461 | ## interactive plots
462 | ## ############################################################################
463 | 
464 | ## plot weight and acceleration of cars on a scatterplot
465 | ## colored by transmission
466 | (p <- ggplot(mtcars, aes(wt, qsec, color = factor(am))) + geom_point())
467 | 
468 | ## much easier approach, although less control
469 | library(plotly)
470 | ggplotly(p)
471 | 
472 | ## even preserves the set ggplot theme:
473 | ggthemr('grass', layout = 'clear', spacing = 2, type = 'outer')
474 | ggplotly(p)
475 | ggthemr_reset()
476 | 
477 | 
478 | library(ggiraph)
479 | ?girafe
480 | girafe(ggobj = p)
481 | ## not much change, only an extra export feature?
482 | ## although can highlight, also shows up in the Viewer (where the anims went as well)
483 | 
484 | p <- ggplot(mtcars, aes(
485 |   x = wt,
486 |   y = qsec,
487 |   color = factor(am),
488 |   ## paste('Number of gears:', gear))
489 |   tooltip = rownames(mtcars))) +
490 |   geom_point_interactive()
491 | girafe(ggobj = p)
492 | 
493 | p <- ggplot(mtcars, aes(
494 |   x = wt,
495 |   y = qsec,
496 |   color = factor(am),
497 |   ## NOTE this newly added field
498 |   data_id = factor(gear),
499 |   tooltip = rownames(mtcars))) +
500 |   geom_point_interactive()
501 | girafe(ggobj = p)
502 | girafe(ggobj = p, options = list(opts_hover(css = "fill:black;")))
503 | 
504 | girafe(ggobj = p, options = list(
505 |   opts_hover(css = "fill:black;"),
506 |   opts_zoom(max = 2)
507 | ))
508 | 


--------------------------------------------------------------------------------