├── syllabus └── ECBS5210 dv3 in 2024-2025pdf ├── 2-interactive-plots.Rmd ├── homework.Rmd ├── README.md ├── 1.R └── 2.R /syllabus/ECBS5210 dv3 in 2024-2025pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daroczig/CEU-DV2/HEAD/syllabus/ECBS5210 dv3 in 2024-2025pdf -------------------------------------------------------------------------------- /2-interactive-plots.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: html_document 3 | runtime: shiny 4 | --- 5 | 6 | Static ggplot: 7 | 8 | ```{r} 9 | library(ggplot2) 10 | (p <- ggplot(mtcars, aes(wt, qsec, color = factor(am))) + geom_point()) 11 | ``` 12 | 13 | Plotly: 14 | 15 | ```{r} 16 | library(plotly) 17 | ggplotly(p) 18 | ``` 19 | 20 | ggiraph labels: 21 | 22 | ```{r} 23 | library(ggiraph) 24 | p <- ggplot(mtcars, aes( 25 | x = wt, 26 | y = qsec, 27 | color = factor(am), 28 | ## NOTE this newly added field 29 | data_id = factor(gear), 30 | tooltip = rownames(mtcars))) + 31 | geom_point_interactive() 32 | girafe(ggobj = p, options = list(opts_hover(css = "fill:black;"))) 33 | ``` 34 | 35 | Let's add a selector for `hp`: 36 | 37 | ```{r} 38 | library(shiny) 39 | sliderInput( # or numericInput 40 | 'minhp', h4('Minimum horsepower'), 41 | min = min(mtcars$hp), max = max(mtcars$hp), 42 | value = 100) 43 | renderPlot({ 44 | ggplot(subset(mtcars, hp > input$minhp), 45 | aes(wt, qsec, color = factor(am))) + 46 | geom_point() 47 | }) 48 | ``` -------------------------------------------------------------------------------- /homework.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "DV3 homework #1" 3 | author: "Gergely Daroczi, CEU" 4 | date: "`r Sys.Date()`" 5 | output: 6 | html_document: 7 | theme: journal 8 | --- 9 | 10 | Using the `journal` R Markdown theme 😎 11 | 12 | > No need to replicate the paragraphs with a vertical line on the left (like this paragraph), as these are instructions/hints. 13 | 14 | > Note that commands are NOT echoed in this example document, but you should always echo the R commands and keep all warnings etc. 15 | 16 | ```{r, echo=FALSE} 17 | knitr::opts_chunk$set(echo=FALSE) 18 | ``` 19 | 20 | Load the regions and zones collected by the [Spare Cores](https://sparecores.com) project from the below URLs: 21 | 22 | - https://keeper.sparecores.net/table/region 23 | - https://keeper.sparecores.net/table/zone 24 | 25 | > You can use the `fromJSON` function from the `jsonlite` package to parse these JSON files. Make sure to convert to `data.table` for easier filtering and aggregation later. 26 | 27 | ```{r} 28 | library(jsonlite) 29 | library(data.table) 30 | regions <- data.table(fromJSON("https://keeper.sparecores.net/table/region")) 31 | zones <- data.table(fromJSON("https://keeper.sparecores.net/table/zone")) 32 | ``` 33 | 34 | The loaded data look like: 35 | 36 | ```{r, echo=TRUE} 37 | str(regions) 38 | str(zones) 39 | ``` 40 | 41 | Let's count the number of regions per country, shown in desceding order: 42 | 43 | ```{r} 44 | regions[, .N, by = country_id][order(-N)] 45 | ``` 46 | 47 | > You can pass any `data.frame` or similar tabular data to `pander::pander` to render as a HTML table instead of raw R console output. 48 | 49 | A nicer table: 50 | 51 | ```{r} 52 | library(pander) 53 | pander(regions[, .N, by = country_id][order(-N)]) 54 | ``` 55 | 56 | Let's show the distribution of the founding year of the regions: 57 | 58 | > Make sure to replicate the axis titles, legend position, theme etc in the below and all future plots! Try to also replicate the tiny details of the plot as well, like axis labels, and grid design. 59 | 60 | ```{r} 61 | library(ggplot2) 62 | ggplot(regions[!is.na(founding_year)], aes(founding_year)) + geom_bar() + 63 | xlab("") + ylab("") + 64 | ggtitle("Number of datacenters by founding year", 65 | subtitle = "Note that regions with unknown founding year were excluded from the plot.") + 66 | theme_bw() + 67 | scale_x_continuous(breaks=min(regions$founding_year, na.rm=TRUE):max(regions$founding_year, na.rm=TRUE)) + 68 | theme(panel.grid.major.x=element_blank(), panel.grid.minor.x=element_blank()) 69 | ``` 70 | 71 | Also showing the average founding year on the same plot: 72 | 73 | > Look into `geom_vline`. 74 | 75 | ```{r} 76 | library(ggplot2) 77 | ggplot(regions[!is.na(founding_year)], aes(founding_year)) + geom_bar() + 78 | xlab("") + ylab("") + 79 | ggtitle("Number of datacenters by founding year", 80 | subtitle = "Note that regions with unknown founding year were excluded from the plot.") + 81 | theme_bw() + 82 | scale_x_continuous(breaks=min(regions$founding_year, na.rm=TRUE):max(regions$founding_year, na.rm=TRUE)) + 83 | theme(panel.grid.major.x=element_blank(), panel.grid.minor.x=element_blank()) + 84 | geom_vline(xintercept = mean(regions$founding_year, na.rm=TRUE), color = "red", linewidth=1.5) 85 | ``` 86 | 87 | 88 | Now let's filter for the regions in Europe! 89 | 90 | > For this, you might need to lookup the continent of the provided country ids. 91 | 92 | ```{r} 93 | library(countrycode) 94 | regions[, continent := countrycode(country_id, origin = 'iso2c', destination = 'continent'), 95 | by = country_id] 96 | regions <- regions[continent == 'Europe'] 97 | ``` 98 | 99 | After filtering, there should be `r regions[, .N]` regions left. 100 | 101 | > You can add inline R code chunks by using backquotes, followed by `r`, e.g. writing `r 2+2` will return `4`. 102 | 103 | ```{r} 104 | regions <- merge(regions, zones[, .(zones = .N), by = region_id]) 105 | ``` 106 | 107 | Let's count the number of zones per region, and merge it to the `regions` dataset. There are `r regions[zones==3, .N]` regions with 3 zones, and `r regions[zones==1, .N]` regions with a single zone. Showing this visually for each vendor: 108 | 109 | ```{r} 110 | ggplot(regions, aes(vendor_id, fill = factor(zones))) + geom_bar() + 111 | theme_bw() + xlab('') + ylab('Number of regions') + 112 | scale_fill_discrete("Number of availability zones in the region") + theme(legend.position = 'top') 113 | 114 | ``` 115 | 116 | Now let's load a GeoJSON file on the boundaries of the European countries. You can use [`leakyMirror/map-of-europe`'s `europe.geojson`](https://raw.githubusercontent.com/leakyMirror/map-of-europe/refs/heads/master/GeoJSON/europe.geojson). 117 | 118 | ```{r} 119 | download.file( 120 | 'https://raw.githubusercontent.com/leakyMirror/map-of-europe/refs/heads/master/GeoJSON/europe.geojson', 121 | 'europe.json') 122 | library(sf) 123 | countries <- st_read('europe.json') 124 | ``` 125 | 126 | Plotting the downloaded shapes: 127 | 128 | ```{r} 129 | ggplot() + geom_sf(data = countries) + theme_void() 130 | ``` 131 | 132 | Now let's get a background tile for this area! 133 | 134 | > Note that we used `ggmap`'s `get_stamenmap` in the class, but I suggest switching to `maptiles`'s `get_tiles` (using the same API token), as it returns the tiles in the correct WGS84 projection that is easier to use with the other loaded datasets. You can pass the loaded GeoJSON object to `get_tiles`, use `zoom=4` and the API key. Experiment with the other parameters as well to replicate the below using `geom_spatraster_rgb` from the `tidyterra` package! 135 | 136 | ```{r} 137 | library(maptiles) 138 | library(tidyterra) 139 | m <- get_tiles( 140 | countries, crop = TRUE, 141 | provider="Stadia.StamenTerrainBackground", zoom = 4, 142 | apikey = "TODO") 143 | ggplot() + geom_spatraster_rgb(data = m) + theme_void() 144 | ``` 145 | 146 | Now let's put together all the loaded layers (background raster map, polygon on country borders, and location of regions weighted by the number of zones, using color to represent if the region is powered by green energy, and use the `shape` to also visualize the vendor)! 147 | 148 | ```{r} 149 | ggplot() + 150 | geom_spatraster_rgb(data = m) + 151 | geom_sf(data = countries, 152 | color = 'black', 153 | fill = 'transparent', 154 | linewidth = .25) + 155 | geom_point(data = regions, aes( 156 | lon, lat, 157 | color = green_energy, size = zones, shape = vendor_id), alpha = 0.5) + 158 | scale_color_manual(values = c("TRUE" = "darkgreen", "FALSE" = "darkred")) + 159 | theme_void() 160 | ``` 161 | 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is the R script repository of the "[Data Visualization 3: Practical Data Visualization with R](https://courses.ceu.edu/courses/2022-2023/data-visualization-2-practical-data-visualization-r)" course in the 2023/2024 Winter term, part of the [MSc in Business Analytics](https://courses.ceu.edu/programs/ms/master-science-business-analytics) at CEU. For the previous editions, see [2019/2020 Spring](https://github.com/daroczig/CEU-DV2/tree/2019-2020), [2020/2021 Winter](https://github.com/daroczig/CEU-DV2/tree/2020-2021), [2021/2022 Winter](https://github.com/daroczig/CEU-DV2/tree/2021-2022), and [2022/2023 Winter](https://github.com/daroczig/CEU-DV2/tree/2022-2023). 2 | 3 | ## Table of Contents 4 | 5 | * [Schedule](https://github.com/daroczig/CEU-DV2#schedule) 6 | * [Syllabus](https://github.com/daroczig/CEU-DV2#syllabus) 7 | * [Technical Prerequisites](https://github.com/daroczig/CEU-DV2#technical-prerequisites) 8 | * [Class Schedule](https://github.com/daroczig/CEU-DV2#class-schedule) 9 | * [Contact](https://github.com/daroczig/CEU-DV2#contacts) 10 | 11 | ## Schedule 12 | 13 | 3 x 100 mins on Jan 22 and 31: 14 | 15 | * 13:30 - 15:10 session 1 16 | * 15:10 - 15:40 break 17 | * 15:40 - 17:20 session 2 18 | * 17:20 - 17:40 break 19 | * 17:40 - 19:20 session 3 20 | 21 | ## Location 22 | 23 | In-person at the Vienna campus (QS B-421). 24 | 25 | ## Syllabus 26 | 27 | Please find in the `syllabus` folder of this repository. 28 | 29 | ## Technical Prerequisites 30 | 31 | Please bring your own laptop* and make sure to install the below items **before** attending the first class: 32 | 33 | 0. Join the Teams channel dedicated to the class at `ba-dv3-2024` with the `671u734` team code 34 | 1. Install `R` from https://cran.r-project.org 35 | 2. Install `RStudio Desktop` (Open Source License) from https://posit.co/download/rstudio-desktop/ 36 | 3. Register an account at https://github.com 37 | 4. Enter the following commands in the R console (bottom left panel of RStudio) and make sure you see a plot in the bottom right panel and no errors in the R console: 38 | 39 | ```r 40 | install.packages(c('ggplot2', 'gganimate', 'transformr', 'gifski')) 41 | library(ggplot2) 42 | library(gganimate) 43 | ggplot(diamonds, aes(cut)) + geom_bar() + 44 | transition_states(color, state_length = 0.1) 45 | ``` 46 | 47 | Optional steps I highly suggest to do as well before attending the class if you plan to use `git`: 48 | 49 | 4. Bookmark, watch or star this repository so that you can easily find it later 50 | 5. Install `git` from https://git-scm.com/ 51 | 6. Verify that in RStudio, you can see the path of the `git` executable binary in the Tools/Global Options menu's "Git/Svn" tab -- if not, then you might have to restart RStudio (if you installed git after starting RStudio) or installed git by not adding that to the PATH on Windows. Either way, browse the "git executable" manually (in some `bin` folder look for thee `git` executable file). 52 | 7. Create an RSA key (optionally with a passphrase for increased security -- that you have to enter every time you push and pull to and from GitHub). Copy the public key and add that to you SSH keys on your GitHub profile. 53 | 8. Create a new project choosing "version control", then "git" and paste the SSH version of the repo URL copied from GitHub in the pop-up -- now RStudio should be able to download the repo. If it asks you to accept GitHub's fingerprint, say "Yes". 54 | 9. If RStudio/git is complaining that you have to set your identity, click on the "Git" tab in the top-right panel, then click on the Gear icon and then "Shell" -- here you can set your username and e-mail address in the command line, so that RStudio/git integration can work. Use the following commands: 55 | 56 | ```sh 57 | $ git config --global user.name "Your Name" 58 | $ git config --global user.email "Your e-mail address" 59 | ``` 60 | Close this window, commit, push changes, all set. 61 | 62 | Find more resources in Jenny Bryan's "[Happy Git and GitHub for the useR](http://happygitwithr.com/)" tutorial if in doubt or [contact me](#contact). 63 | 64 | (*) If you may not be able to use your own laptop, there's a shared RStudio Server set up in AWS for you - including all the required R packages already installed for you. Look up the class Slack channel for how to access, or find below the steps how the service was configured: 65 | 66 |
💪 RStudio Server installation steps 67 | 68 | ``` 69 | # most recent R builds 70 | wget -q -O- https://cloud.r-project.org/bin/linux/ubuntu/marutter_pubkey.asc | sudo tee -a /etc/apt/trusted.gpg.d/cran_ubuntu_key.asc 71 | echo "deb [arch=amd64] https://cloud.r-project.org/bin/linux/ubuntu noble-cran40/" | sudo tee -a /etc/apt/sources.list.d/cran_r.list 72 | sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 67C2D66C4B1D4339 51716619E084DAB9 73 | sudo apt update && sudo apt upgrade 74 | sudo apt install r-base 75 | # apt builds of all CRAN packages 76 | wget -q -O- https://eddelbuettel.github.io/r2u/assets/dirk_eddelbuettel_key.asc | sudo tee -a /etc/apt/trusted.gpg.d/cranapt_key.asc 77 | echo "deb [arch=amd64] https://r2u.stat.illinois.edu/ubuntu noble main" | sudo tee -a /etc/apt/sources.list.d/cranapt.list 78 | sudo apt update 79 | # install some R packages 80 | sudo apt install -y r-base gdebi-core r-cran-ggplot2 r-cran-gganimate \ 81 | cargo libudunits2-dev libssl-dev libgdal-dev desktop-file-utils \ 82 | r-cran-data.table r-cran-rcpp r-cran-dplyr r-cran-ggally r-cran-pander r-cran-readxl \ 83 | r-cran-ggrepel r-cran-hexbin r-cran-animation r-cran-dendextend r-cran-nbclust \ 84 | r-cran-ggmap r-cran-maps r-cran-devtools r-cran-ggraph r-cran-ggthemes \ 85 | r-cran-leaflet r-cran-mapproj \ 86 | r-cran-gtextras r-cran-datasaurus r-cran-psych r-cran-svglite \ 87 | r-cran-tidygeocoder \ 88 | r-cran-ggiraph r-cran-plotly r-cran-concaveman r-cran-gifski 89 | # install RStudio IDE 90 | wget https://download2.rstudio.org/server/jammy/amd64/rstudio-server-2024.12.0-467-amd64.deb 91 | sudo gdebi rstudio-server-*.deb 92 | # never do this in prod 93 | echo "www-port=80" | sudo tee -a /etc/rstudio/rserver.conf 94 | sudo rstudio-server restart 95 | ``` 96 | 97 |
98 | 99 |
💪 Creating users 100 | 101 | ```r 102 | secret <- 'something super secret' 103 | users <- c('list', 'of', 'users') 104 | 105 | library(logger) 106 | library(glue) 107 | for (user in users) { 108 | 109 | ## remove invalid character 110 | user <- sub('@.*', '', user) 111 | user <- sub('-', '_', user) 112 | user <- sub('.', '_', user, fixed = TRUE) 113 | user <- tolower(user) 114 | 115 | log_info('Creating {user}') 116 | system(glue("sudo adduser --disabled-password --quiet --gecos '' {user}")) 117 | 118 | log_info('Setting password for {user}') 119 | system(glue("echo '{user}:{secret}' | sudo chpasswd")) # note the single quotes + placement of sudo 120 | 121 | log_info('Adding {user} to sudo group') 122 | system(glue('sudo adduser {user} sudo')) 123 | 124 | } 125 | ``` 126 | 127 |
128 | 129 | ## Class Schedule 130 | 131 | ### Week 1 132 | 133 | 1. Warm-up exercise and security reminder: [1.R](1.R#L1) 134 | 2. Intro / recap on R and ggplot2 from previous courses by introducing MDS: [1.R](1.R#L67) 135 | 3. Geocoding: [1.R](1.R#L144) 136 | 4. Shapefiles: [1.R](1.R#L232) 137 | 5. Scaling / standardizing variables: [1.R](1.R#L332) 138 | 6. Simpson's paradox: [1.R](1.R#L398) 139 | 7. Anscombe's quartett [1.R](1.R#L459) 140 | 141 | ### Week 2 142 | 143 | 1. Review homework: [homework.Rmd](homework.Rmd) 144 | 3. Warm-up exercises on `ggplot2` calls and EDA [2.R](2.R#L1) 145 | 4. Extract points from a plot [2.R](2.R#L71) 146 | 5. Hierarchical clustering [2.R](2.R#L107) 147 | 6. Animations [2.R](2.R#L136) 148 | 7. Themes [2.R](2.R#L385) 149 | 8. Interactive plots [2.R](2.R#L460) and [2-interactive-plots.Rmd](2-interactive-plots.Rmd) 150 | 151 | ## Homeworks 152 | 153 | ### Homework 1 154 | 155 | Replicate the following document using R markdown: https://rpubs.com/daroczig-ceu/dv3-2024-hw 156 | 157 | Submission: prepare an R Markdown or Quartro document echoing all used R commands, and knit to HTML or PDF and upload to Moodle before Jan 31 noon (CET). 158 | 159 | ### Final project 160 | 161 | Use any publicly accessible dataset (preferably from the [TidyTuesday projects](https://github.com/rfordatascience/tidytuesday)), but if you don't feel creative, feel free to pick the [`palmerpenguins` dataset](https://allisonhorst.github.io/palmerpenguins/.) and demonstrate what you have learned in this class by generating different data visualizations that makes sense and are insightful, plus provide comments on those in plain English. This can totally be a continuation of your Intro to R submission. 162 | 163 | Required items to include in your work for grade "B": 164 | - at least 5 plots (with at least 3 different `ggplot2` geoms) not presented yet in any of your previous CEU projects 165 | - a meaningful animation using `gganimate` (instead of presenting random stuff moving around, make sure to create an animation that makes sense for your use-case, e.g. showing how things changed over time) 166 | - either (1) register an account at stadiamap to fetch map tiles or (2) use shapefile(s) to present some geospatial data (e.g. points or polygons rendered on a background map) 167 | 168 | For grade "A": 169 | - make sure to fine-tune your plots and make those pretty by always setting proper (axis) titles, scales, custom color palettes etc. 170 | - define a custom theme (e.g. background color, grid, font family and color) and use that on all (or at least on most) plots 171 | - include at least one interactive plot (e.g. via `plotly` or `ggigraph`) 172 | 173 | Submission: prepare an R Markdown or Quartro document (printing both the R code and its output) and knit to HTML and upload to Moodle before Feb 14 midnight (CET). Make sure to submit HTML with R code chunks and plots! 174 | 175 | ## Contact 176 | 177 | File a [GitHub ticket](https://github.com/daroczig/CEU-DV2/issues). 178 | -------------------------------------------------------------------------------- /1.R: -------------------------------------------------------------------------------- 1 | ## ############################################################################# 2 | ## warm-up exercise and security reminder 3 | 4 | ## we learned at the "Intro to R" course that we should not do this: 5 | source('http://bit.ly/CEU-R-shoes') 6 | 7 | ## let's install a package instead! 8 | install.packages('remotes') 9 | remotes::install_github('daroczig/students') 10 | 11 | library(students) 12 | ?students 13 | 14 | ## this is a dataset on students from a study group, 15 | ## where we run a math test and found interesting association with the shoe size 16 | ## TODO EDA 17 | students 18 | 19 | cor(students$shoe, students$math) 20 | lm(math ~ shoe, students) 21 | 22 | plot(students$shoe, students$math) 23 | abline(lm(math ~ shoe, students), col = 'red') 24 | 25 | library(ggplot2) 26 | ggplot(students, aes(math, shoe)) + geom_point() + geom_smooth(method = 'lm') 27 | 28 | ## EDA - everyone! 29 | str(students) 30 | summary(students) 31 | plot(students) 32 | 33 | library(ggplot2) 34 | ggplot(students, aes(math, shoe, color = z)) + geom_point() 35 | ggplot(students, aes(math, shoe, color = y)) + geom_point() # !! 36 | 37 | library(GGally) 38 | ggpairs(students) 39 | 40 | ## https://datavizuniverse.substack.com/p/navigating-the-table-jungle 41 | library(gtExtras) 42 | gt_plt_summary(students) 43 | 44 | ## partial correlation 45 | residuals(lm(math ~ x, students)) 46 | residuals(lm(shoe ~ x, students)) 47 | cor(residuals(lm(math ~ x, students)), residuals(lm(shoe ~ x, students))) 48 | 49 | library(psych) 50 | partial.r(students, 1:2, 3) 51 | 52 | plot(residuals(lm(math ~ x, students)), residuals(lm(shoe ~ x, students))) 53 | abline(lm(residuals(lm(math ~ x, students)) ~ residuals(lm(shoe ~ x, students)))) 54 | 55 | plot(residuals(lm(math ~ x, students)), residuals(lm(shoe ~ x, students))) 56 | abline(lm(residuals(lm(shoe ~ x, students)) ~ residuals(lm(math ~ x, students)))) 57 | 58 | ## had enough 59 | rm(list = ls()) 60 | 61 | ## but wow! 62 | students 63 | .secret # "A warm hello from the Internet." 64 | ## TODO look at the source code of the package! 65 | ## TODO always install from trusted source 66 | 67 | ## ############################################################################# 68 | ## intro / recap on R and ggplot2 from previous courses by introducing MDS 69 | 70 | ## download data to a file in your temp folder 71 | t <- tempfile() 72 | t 73 | t <- tempfile(fileext = '.xls') 74 | t 75 | 76 | ## or keep in the current working directory 77 | t <- 'cities.xls' 78 | 79 | download.file('https://bit.ly/de-cities-distance', t, mode = 'wb') 80 | 81 | ## further checks on the downloaded file 82 | file.info(t) 83 | pander::openFileInOS(t) 84 | 85 | ## read the downloaded file 86 | library(readxl) 87 | cities <- read_excel(t) 88 | 89 | cities 90 | ## tibble VS data.frame VS data.table 91 | str(cities) 92 | 93 | ## get rid of 1st column and last three rows (metadata) 94 | cities <- cities[, -1] 95 | cities <- cities[1:(nrow(cities) - 3), ] 96 | str(cities) 97 | 98 | mds <- cmdscale(as.dist(cities)) 99 | mds 100 | 101 | plot(mds) 102 | text(mds[, 1], mds[, 2], names(cities)) 103 | 104 | ## TODO interpret what we see 105 | 106 | ## flipping both x and y axis 107 | mds <- -mds 108 | plot(mds) 109 | text(mds[, 1], mds[, 2], names(cities)) 110 | ## flipping only on y axis 111 | mds[, 1] <- -mds[, 1] 112 | plot(mds) 113 | text(mds[, 1], mds[, 2], names(cities)) 114 | ## flipping only on x axis 115 | mds[, 2] <- -mds[, 2] 116 | plot(mds) 117 | text(mds[, 1], mds[, 2], names(cities)) 118 | 119 | ## TODO ggplot2 way 120 | mds <- as.data.frame(mds) 121 | mds$city <- rownames(mds) 122 | str(mds) 123 | 124 | library(ggplot2) 125 | ggplot(mds, aes(V1, V2, label = city)) + 126 | geom_text() + theme_bw() 127 | 128 | ## flip one axis and grid 129 | ggplot(mds, aes(V1, -V2, label = city)) + 130 | geom_text() + theme_void() 131 | 132 | ## ############################################################################# 133 | ## TODO visualize the distance between the European cities 134 | ## stored in the built-in dataframe: 135 | 136 | ?eurodist 137 | 138 | mds <- cmdscale(eurodist) 139 | mds <- as.data.frame(mds) 140 | mds$city <- rownames(mds) 141 | ggplot(mds, aes(V1, -V2, label = city)) + 142 | geom_text() + theme_bw() 143 | 144 | ## ############################################################################ 145 | ## geocoding 146 | 147 | ## test on https://nominatim.openstreetmap.org/ui/search.html 148 | ## e.g. "CEU, Budapest, Hungary" or "central european university, vienna" 149 | 150 | library(ggmap) # geocode one address 151 | library(tidygeocoder) # geocode a data.frame 152 | 153 | library(data.table) 154 | mds <- data.table(geocode(mds, 'city')) 155 | str(mds) 156 | 157 | ## built-in polygons for the background 158 | ?maps::world 159 | world <- map_data('world') 160 | ggplot() + 161 | geom_map(data = world, map = world, aes(long, lat, map_id = region)) + 162 | coord_fixed(1.3) 163 | 164 | ## adding the points 165 | ggplot() + 166 | geom_map(data = world, map = world, aes(long, lat, map_id = region)) + 167 | coord_fixed(1.3) + 168 | geom_point(data = mds, aes(long, lat), color = 'orange') 169 | 170 | ## wow, that's odd to see a few cities from EU in USA :O 171 | mds 172 | ## NOTE Lyons (plural due to English spelling) instead of Lyon 173 | 174 | ggplot() + 175 | geom_map(data = world, map = world, aes(long, lat, map_id = region)) + 176 | coord_fixed(1.3) + 177 | geom_point(data = mds, aes(long, lat, color = city)) 178 | 179 | ## make it a thematic map 180 | str(world) 181 | world$a <- grepl('^A', world$region) 182 | 183 | ggplot() + 184 | geom_map(data = world, map = world, aes(long, lat, map_id = region, fill = a)) + 185 | coord_fixed(1.3) + 186 | geom_point(data = mds, aes(long, lat), color = 'black') + 187 | theme(legend.position = 'none') 188 | 189 | library(countrycode) 190 | world <- data.table(world) 191 | world[, continent := countrycode(region, origin = 'country.name', destination = 'continent'), by = region] 192 | warnings() 193 | 194 | ggplot() + 195 | geom_map(data = world, map = world, aes(long, lat, map_id = region, fill = continent)) + 196 | coord_fixed(1.3) + 197 | geom_point(data = mds, aes(long, lat), color = 'black') + 198 | theme(legend.position = 'none') 199 | 200 | ## fancy background from http://maps.stamen.com -- unfortunately deprecated, 201 | ## so need to register a key from get_stadiamap 202 | library(ggmap) 203 | register_stadiamaps('YOUR-API-KEY') 204 | 205 | map <- get_stadiamap( 206 | c( 207 | left = min(mds$long) * 0.995, 208 | right = max(mds$long) * 1.001, 209 | bottom = min(mds$lat) * 0.999, 210 | top = max(mds$lat)) * 1.001, 211 | maptype = 'stamen_toner', 212 | zoom = 4) 213 | 214 | ggmap(map) + 215 | geom_point(data = mds, aes(long, lat), color = 'orange') + 216 | theme_void() + theme(legend.position = 'none') 217 | 218 | library(sf) 219 | geomds <- st_as_sf(x = mds, coords = c("long", "lat")) 220 | st_bbox(geomds) 221 | unname(st_bbox(geomds)) 222 | 223 | map <- get_stadiamap( 224 | unname(st_bbox(geomds)), 225 | maptype = 'stamen_terrain_background', 226 | zoom = 4) 227 | 228 | ggmap(map) + 229 | geom_point(data = mds, aes(long, lat), color = 'black') + 230 | theme_void() + theme(legend.position = 'none') 231 | 232 | ## ############################################################################# 233 | ## plotting shapefiles 234 | ## ############################################################################# 235 | 236 | ## search for "Austria shapefile" University of Texas 237 | ## https://geodata.lib.utexas.edu/?f%5Bdc_format_s%5D%5B%5D=Shapefile&f%5Bdc_subject_sm%5D%5B%5D=Boundaries&f%5Bdct_spatial_sm%5D%5B%5D=Austria&f%5Blayer_geom_type_s%5D%5B%5D=Polygon&per_page=20 238 | download.file( 239 | 'https://stacks.stanford.edu/object/rc343vz5889', 240 | 'Austria_boundary.zip') 241 | download.file( 242 | 'https://stacks.stanford.edu/object/yv617vc9132', 243 | 'Austria_divisions.zip') 244 | unzip('Austria_boundary.zip') 245 | unzip('Austria_divisions.zip') 246 | 247 | ## Look around what we have 248 | library(sf) # simple features 249 | st_layers('.') 250 | 251 | ## Read a shapefile from the current working directory (refer to the filename without file extension) 252 | adm0 <- st_read('.', layer = 'AUT_adm0') 253 | str(adm0) 254 | 255 | plot(adm0) 256 | 257 | st_geometry(adm0) 258 | ggplot() + geom_sf(data = adm0) 259 | 260 | ## Now let's load smaller administrative areas as well 261 | adm2 <- st_read('.', 'AUT_adm2') 262 | 263 | ## And some points to be added to the map as well 264 | cities <- fread('https://simplemaps.com/static/data/country-cities/at/at.csv') 265 | 266 | ## Party time 267 | library(ggplot2) 268 | ggplot() + 269 | geom_sf(data = adm0, color = 'black', size = 1) + 270 | geom_sf(data = adm2, color = 'gray', size = .2) + 271 | geom_point(data = cities, 272 | aes(lng, lat, size = population), 273 | color = 'orange') + 274 | theme_void() + 275 | theme(legend.position = 'top') 276 | 277 | ## how to access the original data elements? 278 | str(adm2) 279 | adm2$NAME_2 280 | 281 | ## add label 282 | ggplot() + 283 | geom_sf(data = adm0, color = 'black', size = 1) + 284 | geom_sf(data = adm2, color = 'gray', size = .2) + 285 | geom_sf_text(data = adm2, aes(label = NAME_2)) + 286 | geom_point(data = cities, 287 | aes(lng, lat, size = population), 288 | color = 'orange') + 289 | theme_void() + 290 | theme(legend.position = 'top') 291 | 292 | ## thematic maps: Statutory city VS 293 | ggplot() + 294 | geom_sf(data = adm0, color = 'black', size = 1) + 295 | geom_sf(data = adm2, color = 'gray', aes(fill = TYPE_2), size = .2) + 296 | geom_sf_text(data = adm2, aes(label = NAME_2)) + 297 | geom_point(data = cities, 298 | aes(lng, lat, size = population), 299 | color = 'orange') + 300 | theme_void() + 301 | theme(legend.position = 'top') 302 | 303 | ## using geojson files 304 | download.file( 305 | 'https://raw.githubusercontent.com/ginseng666/GeoJSON-TopoJSON-Austria/master/2021/simplified-95/bezirke_95_topo.json', 306 | 'austria.geojson') 307 | 308 | library(leaflet) 309 | 310 | map <- st_read('austria.geojson') 311 | popup <- paste0('Name: ', map$name) 312 | 313 | leaflet(map) %>% 314 | addPolygons( 315 | weight = 1, smoothFactor = 0.5, 316 | opacity = 1.0, fillOpacity = 0.5, 317 | popup = popup, 318 | label = ~name, 319 | layerId = ~name, 320 | labelOptions = labelOptions(noHide = TRUE), 321 | highlightOptions = highlightOptions( 322 | color = 'white', weight = 2, 323 | bringToFront = TRUE)) 324 | 325 | library(ggplot2) 326 | ggplot() + 327 | geom_sf(data = map, color = 'white', fill = 'darkgreen', size = .2) + 328 | theme_void() 329 | 330 | ## find more details at https://tmieno2.github.io/R-as-GIS-for-Economists/index.html 331 | 332 | ## ############################################################################# 333 | ## TODO non-geo example 334 | 335 | ?mtcars 336 | str(mtcars) 337 | mtcars 338 | 339 | mds <- cmdscale(dist(mtcars)) 340 | plot(mds) 341 | text(mds[, 1], mds[, 2], rownames(mtcars)) 342 | ## oh no, the overlaps! 343 | 344 | mds <- as.data.frame(mds) 345 | mds$car <- rownames(mds) 346 | ggplot(mds, aes(V1, V2, label = car)) + 347 | geom_text() + theme_bw() 348 | 349 | library(ggrepel) 350 | ggplot(mds, aes(V1, V2, label = car)) + 351 | geom_text_repel() + theme_bw() 352 | 353 | ## ############################################################################# 354 | ## QQ what does it mean that two cards are "close to each other"? 355 | ## NOTE think about why the above visualization is off 356 | 357 | ## check actual distances eg for Camaro (or other sport cars) 358 | which(rownames(mtcars) == 'Camaro Z28') 359 | sort(as.matrix(dist(mtcars))[, 24]) 360 | ## Mercedes sedans are closer?! than e.g. Ferrari Dino or Maserati Bora 361 | 362 | mtcars 363 | 364 | subset(mtcars, hp >= 245) 365 | 366 | ?cmdscale 367 | ?dist 368 | 369 | summary(mtcars) 370 | 371 | ## need to standardize to give every variable equal weight! 372 | mtcars$hp - mean(mtcars$hp) 373 | mean(mtcars$hp - mean(mtcars$hp)) 374 | 375 | (x <- (mtcars$hp - mean(mtcars$hp)) / sd(mtcars$hp)) 376 | mean(x) 377 | sd(x) 378 | hist(x) 379 | 380 | x 381 | scale(mtcars$hp) 382 | plot(x, scale(mtcars$hp)) 383 | x - scale(mtcars$hp) 384 | 385 | plot(mtcars$hp, scale(mtcars$hp)) 386 | 387 | ?scale 388 | scale(mtcars) 389 | 390 | mds <- cmdscale(dist(scale(mtcars))) 391 | mds <- as.data.frame(mds) 392 | mds$car <- rownames(mds) 393 | ggplot(mds, aes(V1, V2, label = car)) + 394 | geom_text_repel() + theme_bw() 395 | 396 | subset(mtcars, hp >= 200) 397 | 398 | ## ############################################################################# 399 | ## introduction to Simpson's paradox with the Berkeley example 400 | 401 | ## then do the analysis in R 402 | UCBAdmissions 403 | plot(UCBAdmissions) 404 | 405 | berkeley <- as.data.frame(UCBAdmissions) 406 | 407 | ggplot(berkeley, aes(Gender, Freq, fill = Admit)) + geom_col() 408 | 409 | p <- ggplot(berkeley, aes(Gender, Freq, fill = Admit)) + geom_col(position = 'fill') 410 | p 411 | 412 | p + facet_wrap(~Dept) 413 | p + facet_wrap(~Dept) + scale_fill_manual(values = c('Admitted' = 'darkgreen', 'Rejected' = 'red')) 414 | # https://colorbrewer2.org 415 | p + facet_wrap(~Dept) + scale_fill_brewer(palette = 'Dark2') 416 | 417 | ggplot(berkeley, aes(Gender, Freq, fill = Admit)) + geom_col() + 418 | facet_wrap(~Dept) + scale_fill_brewer(palette = 'Dark2') 419 | 420 | ## ############################################################################# 421 | ## TODO exercise visualize a model on the association between 422 | ## the lengh and width of sepal in the iris dataset 423 | 424 | ?iris 425 | str(iris) 426 | 427 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() 428 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() + 429 | geom_smooth() 430 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) + geom_point() + 431 | geom_smooth(method = 'lm', se = FALSE) 432 | ## note the change in the sign of the slope! 433 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) + geom_point() + 434 | geom_smooth(method = 'lm', se = FALSE) 435 | 436 | summary(lm(Sepal.Width ~ Sepal.Length, data = iris)) 437 | summary(lm(Sepal.Width ~ Sepal.Length + Species, data = iris)) 438 | 439 | ## NOTE when checking the scatterplot colored by Species 440 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) + geom_point() 441 | 442 | ## TODO add model 443 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = Species)) + geom_point() + geom_smooth(method = 'lm') 444 | 445 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) + 446 | geom_point(aes(color = Species)) + 447 | geom_smooth(aes(color = Species), method = 'lm') + 448 | geom_smooth(method = 'lm', color = 'black', linewidth = 2) 449 | 450 | ## compare the overlap of groups with the MDS version: 451 | mds <- as.data.frame(cmdscale(dist(iris))) 452 | ggplot(mds, aes(V1, V2, color = iris$Species)) + geom_point() 453 | ggplot(mds, aes(V1, V2, color = iris$Species)) + geom_point() + geom_smooth(method = 'lm') 454 | 455 | ## without the Species column 456 | mds <- as.data.frame(cmdscale(dist(iris[, -5]))) 457 | ggplot(mds, aes(V1, V2, color = iris$Species)) + geom_point() 458 | 459 | ## ############################################################################# 460 | ## back to dataviz ... why? let's see Anscombe's quartett 461 | 462 | ## dirty data 463 | anscombe 464 | 465 | ## select 1st set 466 | anscombe[, c(1, 5)] 467 | 468 | ## using base R to see the 1st set 469 | plot(anscombe[, c(1, 5)]) 470 | 471 | ## add linear model 472 | lm(anscombe[, c(5, 1)]) 473 | ## note to change 5 and 1 (estimating y based on x) 474 | abline(lm(anscombe[, c(5, 1)])) 475 | 476 | library(ggplot2) 477 | ggplot(anscombe[, c(1, 5)], aes(x1, y1)) + geom_point() + geom_smooth(method = 'lm', se = TRUE) 478 | ggplot(anscombe[, c(2, 6)], aes(x2, y2)) + geom_point() + geom_smooth(method = 'lm', se = TRUE) 479 | 480 | ## TODO compute the mean of x1 and y1 481 | mean(anscombe[, 1]) 482 | mean(anscombe[, 5]) 483 | 484 | ## TODO compute the mean of x2 and y2 485 | mean(anscombe[, 2]) 486 | mean(anscombe[, 6]) 487 | 488 | ## intro to loops 489 | lapply(1:4, function(i) mean(anscombe[, c(i)])) 490 | lapply(1:4, function(i) sd(anscombe[, c(i)])) 491 | lapply(1:4, function(i) cor(anscombe[, c(i, i+4)])) 492 | 493 | apply(anscombe, 2, mean) 494 | 495 | ## loop to create separate & tidy dataset for each set 496 | lapply(1:4, function(i) anscombe[, c(i, i+4)]) 497 | 498 | ## use data.table to merge into one single data frame 499 | library(data.table) 500 | rbindlist(lapply(1:4, function(i) anscombe[, c(i, i+4)])) 501 | ## add a "set" column so that we know which set the row belongs to 502 | rbindlist(lapply(1:4, function(i) cbind(dataset = i, anscombe[, c(i, i+4)]))) 503 | 504 | ## save in a variable for future use 505 | df <- rbindlist(lapply(1:4, function(i) cbind(dataset = i, anscombe[, c(i, i+4)]))) 506 | setnames(df, c('dataset', 'x', 'y')) 507 | 508 | ## let's switch from base R to ggplot and show all 4 sets in subplots 509 | library(ggplot2) 510 | ggplot(df, aes(x, y)) + geom_point() + facet_wrap(~dataset) 511 | ggplot(df, aes(x, y)) + geom_point() + facet_wrap(~dataset) + geom_smooth() 512 | ggplot(df, aes(x, y)) + geom_point() + facet_wrap(~dataset) + geom_smooth(method = 'lm') 513 | 514 | ## intro to https://www.research.autodesk.com/publications/same-stats-different-graphs/ 515 | df <- datasauRus::datasaurus_dozen_wide 516 | dt <- rbindlist(lapply(1:13, function(i) 517 | data.frame( 518 | x = df[, 2 * i - 1, drop = TRUE], 519 | y = df[, 2 * i, drop = TRUE], 520 | dataset_id = i))) 521 | 522 | ggplot(dino_df, aes(x, y)) + geom_point() + 523 | geom_smooth(method = 'lm', se = FALSE) + 524 | facet_wrap(~dataset) + 525 | theme_bw() 526 | -------------------------------------------------------------------------------- /2.R: -------------------------------------------------------------------------------- 1 | ## ############################################################################# 2 | ## EDA warmup - alternatives to boxplot 3 | ## ############################################################################# 4 | 5 | library(data.table) 6 | df <- fread('http://bit.ly/CEU-R-numbers-set') 7 | str(df) 8 | 9 | summary(df) 10 | table(df$x) 11 | 12 | summary(df) 13 | lapply(df, summary) 14 | lapply(unique(df$x), function(set) summary(df[x == set])) 15 | 16 | ## data.table way 17 | df[, as.list(summary(y)), by = x] 18 | 19 | pairs(df) 20 | library(GGally) 21 | ggpairs(df) 22 | 23 | library(ggplot2) 24 | ggplot(df, aes(x, y)) + geom_point() 25 | ggplot(df, aes(x, y)) + geom_point() + geom_smooth(method = 'lm') 26 | ggplot(df, aes(x, y)) + geom_point(alpha = 0.1) 27 | ggplot(df, aes(x, y)) + geom_jitter(alpha = 0.1) 28 | 29 | ## hexbin 30 | ggplot(df, aes(x, y)) + geom_hex() 31 | 32 | ggplot(df, aes(factor(x), y)) + geom_boxplot() 33 | ## NOTE jitter showed interesting patterns .. not visible 34 | 35 | ggplot(df, aes(factor(x), y)) + geom_violin() 36 | ggplot(df, aes(factor(x), y)) + geom_violin() + geom_jitter() 37 | ggplot(df, aes(factor(x), y)) + geom_violin() + 38 | geom_jitter(width = 0.1, alpha = 0.1) 39 | 40 | ggplot(df, aes(y)) + geom_histogram() + facet_wrap(~x) 41 | 42 | ggplot(df, aes(y, fill = factor(x))) + geom_density() 43 | ggplot(df, aes(y, fill = factor(x))) + geom_density(alpha = .25) 44 | ggplot(df, aes(y, fill = factor(x))) + 45 | geom_density(alpha = .25) + 46 | theme(legend.position = 'top') 47 | 48 | ## df <- rbind( 49 | ## data.table(x = 1, y = rbeta(1e3, 0.1, 0.1)), 50 | ## data.table(x = 2, y = rnorm(1e3, 0.5, 0.75)), 51 | ## data.table(x = 3, y = runif(1e3) * 2 - 0.5), 52 | ## data.table(x = 4, y = rnorm(1e3, 0.5, 0.75))) 53 | 54 | ## TODO do similar exploratory data analysis on the below dataset 55 | df <- fread('http://bit.ly/CEU-R-numbers') 56 | ## generated at https://gist.github.com/daroczig/23d1323652a70c03b27cfaa6b080aa3c 57 | 58 | ## TODO find interesting pattern in data? 59 | 60 | ggplot(df, aes(x, y)) + geom_point() # slow? 61 | ggplot(df, aes(x, y)) + geom_point(alpha = 0.05) 62 | ggplot(df, aes(x, y)) + geom_point(size = 0.2, alpha = 0.1) 63 | ggplot(df, aes(x, y)) + geom_hex(binwidth = 5) 64 | ggplot(df, aes(x, y)) + geom_count() 65 | 66 | df[, .N, by = list(x, y)] 67 | ggplot(df[, .N, by = list(x, y)], aes(x, y, fill = N)) + geom_tile() 68 | 69 | ## 3d plot density? 70 | 71 | ## ############################################################################# 72 | ## manually extract data from plots 73 | ## ############################################################################# 74 | 75 | ## using grid.locator 76 | devtools::install_github('doehm/traceR') 77 | 78 | library(traceR) 79 | ## NOTE locator is not reliable in the RStudio IDE built-in graphics device 80 | ## so we need to open a new device 81 | grDevices::x11() 82 | df2 <- trace_image() 83 | df2 84 | inspect_trace(df2) 85 | 86 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_path() 87 | library(ggforce) 88 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline0() 89 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline_closed0() 90 | 91 | ## this fish doesn't have an eye! 92 | df3 <- trace_image() 93 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline_closed0() + 94 | geom_bspline_closed0(data=df3, fill = 'white') 95 | 96 | ## oops 97 | df3 <- trace_image(scale = FALSE) 98 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline_closed0() + 99 | geom_bspline_closed0(data=df3, fill = 'white') 100 | 101 | ggplot(df, aes(x, y)) + geom_hex(binwidth = 5) 102 | df2 <- trace_image(scale = FALSE) 103 | df3 <- trace_image(scale = FALSE) 104 | ggplot(df2, aes(x, y)) + geom_point(color = 'red') + geom_bspline_closed0() + 105 | geom_bspline_closed0(data=df3, fill = 'white') 106 | 107 | ## ############################################################################# 108 | ## why the dataviz? edu purposes, eg showing how clustering works 109 | ## ############################################################################# 110 | 111 | ## hierarchical clustering 112 | ?hclust 113 | 114 | ## doing this on iris instead of mtcars (re MDS) to be able to compare with Species 115 | dm <- dist(iris[, 1:4]) 116 | str(dm) 117 | 118 | hc <- hclust(dm) 119 | str(hc) 120 | 121 | ## plot the dendogram 122 | plot(hc) 123 | rect.hclust(hc, k = 3) 124 | 125 | ## see first 2 observations merged into a cluster: 126 | hc$merge[1, ] 127 | iris[c(102, 143), ] 128 | 129 | ## poor man's animation 130 | for (i in 2:8) { 131 | plot(hc) 132 | rect.hclust(hc, k = i) 133 | Sys.sleep(1) 134 | } 135 | 136 | ## actual animation! 137 | library(animation) 138 | ani.options(interval = 1) 139 | ani.options(autobrowse = FALSE) # on server 140 | saveGIF({ 141 | for (i in 2:8) { 142 | plot(hc) 143 | rect.hclust(hc, k = i) 144 | ## no need for sleep 145 | } 146 | }) 147 | ?saveGIF 148 | ?ani.options 149 | ## ani.width, ani.height = 480 150 | 151 | ## ggplot for dendograms 152 | library(dendextend) 153 | d <- as.dendrogram(hc) 154 | d <- color_branches(d, k = 2) 155 | plot(d) 156 | 157 | ggplot(d) 158 | ggplot(d, labels = FALSE) 159 | ggplot(d, labels = FALSE, horiz = TRUE) 160 | ggplot(d, labels = FALSE) + scale_y_reverse(expand = c(0.2, 0)) + coord_polar(theta="x") 161 | ## https://talgalili.github.io/dendextend/articles/dendextend.html 162 | 163 | ## TODO create an animation with ggplot2 instead of the above plot/rect.hclust method 164 | for (i in 2:8) { 165 | ## NOTE the need to print .. otherwise pretty boring "animation" 166 | print(ggplot(color_branches(as.dendrogram(hc), k = i))) 167 | Sys.sleep(1) 168 | } 169 | 170 | saveGIF({ 171 | for (i in 2:8) { 172 | print(ggplot(color_branches(as.dendrogram(hc), k = i))) 173 | } 174 | }, ani.width = 960) 175 | 176 | ## cluster membership 177 | clusters <- cutree(hc, 3) 178 | clusters 179 | 180 | ## why 3? 181 | library(NbClust) 182 | NbClust(iris[, 1:4], method = 'complete', index = 'all') 183 | ## QQ elbow rule: diminishing returns are no longer worth the additional cost 184 | suggestions <- NbClust(iris[, 1:4], method = 'complete', index = 'all') 185 | str(suggestions) 186 | 187 | library(ggplot2) 188 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species)) + geom_point() 189 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species)) + geom_point(size = 3) 190 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species, color = factor(clusters))) + 191 | geom_point(size = 3) 192 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species, color = factor(clusters))) + 193 | geom_point(size = 3) + theme_bw() + theme(legend.position = 'top') 194 | 195 | ## add linear models by both species and cluster membership? 196 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species, color = factor(clusters))) + 197 | geom_point(size = 3) + theme_bw() + theme(legend.position = 'top') + 198 | geom_smooth(method = 'lm') 199 | ## hm, that's difficult to interpret .. e.g. why 2 green lines? 200 | ## let's differentiate line types as well? 201 | ggplot(iris, aes(Sepal.Length, Sepal.Width, shape = Species, color = factor(clusters), linetype = Species)) + 202 | geom_point(size = 3) + theme_bw() + theme(legend.position = 'top') + 203 | geom_smooth(method = 'lm') 204 | ## still not happy 205 | 206 | ## loop over Species? instead of faceting, our new dimension for visualization will be time 207 | library(data.table) 208 | ## need ~tidy data 209 | IRIS <- as.data.table(iris) 210 | IRIS$cluster <- factor(clusters) 211 | saveGIF({ 212 | for (species in unique(IRIS$Species)) { 213 | print(ggplot(IRIS[Species == species], 214 | aes(Sepal.Length, Sepal.Width, color = cluster)) + 215 | geom_point(size = 3) + 216 | geom_smooth(method = 'lm') + 217 | ggtitle(species)) # add later 218 | } 219 | }, ani.width = 960) 220 | ## meh, we can do better 221 | 222 | library(gganimate) 223 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(clusters))) + 224 | geom_point(size = 3) + 225 | geom_smooth(method = 'lm') + 226 | transition_states(Species) 227 | ## NOTE check the scale for clusters ... also need to add title 228 | 229 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(clusters))) + 230 | geom_point(size = 3) + 231 | geom_smooth(method = 'lm') + 232 | transition_states(Species) + 233 | labs( 234 | title = paste("{closest_state}"), # Python's f-string, R's glue 235 | subtitle = 'Number of flowers: {nrow(subset(iris, Species == closest_state))}') 236 | 237 | ## see more fancy plot at https://twitter.com/daroczig/status/1201542038640111616 238 | 239 | ## TODO this should have been a facet ... 240 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(clusters))) + 241 | geom_point(size = 3) + 242 | geom_smooth(method = 'lm') + 243 | facet_wrap(~Species) 244 | 245 | ## traditional cluster plots 246 | library(cluster) 247 | clusplot(iris, clusters, color = TRUE, shade = TRUE, labels = 2) 248 | 249 | ## gg Hull plot 250 | ## "convex hull of a shape is the smallest convex set that contains it" 251 | library(ggforce) 252 | ggplot(iris, aes(Sepal.Length, Sepal.Width, color = factor(clusters))) + 253 | geom_point(size = 3) + 254 | geom_mark_hull(aes(label = clusters)) 255 | 256 | ggplot(iris, aes(Sepal.Length, Sepal.Width)) + 257 | geom_point(size = 3) + 258 | geom_mark_hull(aes(fill = factor(clusters), label = clusters), 259 | ## very concave 260 | concavity = 1) 261 | 262 | ## ############################################################################ 263 | ## back to datasaurus! 264 | ## ############################################################################ 265 | 266 | ## finish the datasaurus exercise from last week 267 | library(data.table) 268 | df <- datasauRus::datasaurus_dozen_wide 269 | dt <- rbindlist(lapply(1:13, function(i) 270 | data.frame( 271 | x = df[, 2 * i - 1, drop = TRUE], 272 | y = df[, 2 * i, drop = TRUE], 273 | dataset_id = i))) 274 | ## alternatively 275 | dt <- rbindlist(lapply(seq(1, 26, by = 2), function(i) 276 | data.frame( 277 | x = df[, c(i), drop = TRUE], 278 | y = df[, c(i+1), drop = TRUE], 279 | dataset_id = i) 280 | )) 281 | 282 | ggplot(dt, aes(x, y)) + geom_point() + 283 | geom_smooth(method = 'lm', se = FALSE) + 284 | facet_wrap(~dataset_id) + 285 | theme_bw() 286 | 287 | ## human-friendly dataset ids 288 | dt <- rbindlist(lapply(seq(1, 26, by = 2), function(i) { 289 | data.frame( 290 | x = df[, c(i), drop = TRUE], 291 | y = df[, c(i+1), drop = TRUE], 292 | # sub('_x$', '', names(df)[i]) 293 | dataset_id = substr(names(df)[i], 1, nchar(names(df)[i])-2)) 294 | } 295 | )) 296 | 297 | ## reading more docs 298 | dt <- datasauRus::datasaurus_dozen 299 | 300 | library(datasauRus) 301 | datasaurus_dozen 302 | 303 | library(ggplot2) 304 | ggplot(datasaurus_dozen, aes(x, y)) + 305 | geom_point() + facet_wrap(~dataset) 306 | 307 | library(gganimate) 308 | ggplot(datasaurus_dozen, aes(x, y)) + 309 | geom_point() + geom_smooth(method = 'lm') + 310 | transition_states(dataset) 311 | 312 | subtitle <- function(df, digits = 4) { 313 | paste0( 314 | 'mean(x)=', round(mean(df$x), digits), ', ', 'sd(x)=', round(sd(df$x), digits), '\n', 315 | 'mean(y)=', round(mean(df$y), digits), ', ', 'sd(y)=', round(sd(df$y), digits), '\n', 316 | 'cor(x,y)=', round(cor(df$x, df$y), digits) 317 | ) 318 | } 319 | subtitle(datasaurus_dozen) 320 | 321 | ggplot(datasaurus_dozen, aes(x, y)) + 322 | geom_point() + geom_smooth(method = 'lm') + 323 | transition_states(dataset) + 324 | labs( 325 | title = paste("{closest_state}"), 326 | subtitle = '{subtitle(subset(datasaurus_dozen, dataset == closest_state))}') 327 | 328 | ## TODO bug: overlapping title and subtitle?! not sure ... let's move it down to "x" 329 | ggplot(datasaurus_dozen, aes(x, y)) + 330 | geom_point() + geom_smooth(method = 'lm') + 331 | transition_states(dataset) + 332 | labs( 333 | x = "{closest_state}", y = "", 334 | subtitle = '{subtitle(subset(datasaurus_dozen, dataset == closest_state))}') + 335 | theme(plot.margin = ggplot2::margin(1, 0, 0, 0, "cm")) 336 | 337 | 338 | ## TODO racing bar chart on departure delay per origin 339 | library(nycflights13) 340 | library(data.table) 341 | dt <- data.table(flights)[, .(delay = mean(dep_delay, na.rm = TRUE)), by = .(origin, month)][order(month, delay)] 342 | ## NOTE need to record order within the dataset that we can map it to the plot 343 | dt[, order := order(delay), by = month] 344 | library(ggplot2) 345 | library(gganimate) 346 | g <- ggplot(dt, aes(order, delay)) + 347 | geom_col(aes(fill = origin)) + 348 | ## NOTE need to add text as X axis labels are for order 349 | ## NOTE vjust/hjust https://stackoverflow.com/a/7267364/564164 350 | geom_text(aes(label = origin), hjust = 1.2, vjust = 0.5, angle = 90, size = 14, color = 'white') + 351 | xlab('') + ylab('Average departure delays (minutes)') + 352 | theme_bw() + 353 | theme(legend.position = 'none', 354 | axis.ticks.x = element_blank(), 355 | panel.grid.minor.x = element_blank(), 356 | panel.grid.major.x = element_blank()) + 357 | ## NOTE look up month based on closest state 358 | ggtitle('2013/{month.name[as.numeric(closest_state)]}') + 359 | transition_states(month) + 360 | ## dynamic y axis for each month 361 | view_follow(fixed_x = FALSE) 362 | animate(g, height = 600, width = 600) 363 | 364 | ## TODO racing bar chart on arr delay per dest 365 | dt <- data.table(flights)[, .(delay = mean(arr_delay, na.rm = TRUE)), by = .(dest, month)][order(month, delay)] 366 | dt[, order := order(delay), by = month] 367 | g <- ggplot(dt, aes(order, delay)) + 368 | geom_col(aes(fill = dest)) + 369 | ## NOTE labels for negative values 370 | geom_text(aes(label = dest, hjust = delay < 0), angle = 90, size = 2) + 371 | xlab('') + ylab('Average arrival delays (minutes)') + 372 | theme_bw() + 373 | theme(legend.position = 'none', 374 | axis.ticks.x = element_blank(), 375 | panel.grid.minor.x = element_blank(), 376 | panel.grid.major.x = element_blank()) + 377 | ggtitle('2013/{month.name[as.numeric(closest_state)]}') + 378 | transition_states(month) + 379 | view_follow(fixed_x = FALSE) 380 | animate(g, height = 600, width = 1200, fps = 5) 381 | 382 | ## quite too much .. maybe filter for top10 383 | dt[order<=10] 384 | 385 | ## ############################################################################ 386 | ## tweaking ggplot2 themes 387 | ## ############################################################################ 388 | 389 | ## plot weight and acceleration of cars on a scatterplot 390 | ## colored by transmission 391 | ggplot(mtcars, aes(wt, qsec, color = factor(am))) + geom_point() 392 | p <- last_plot() 393 | 394 | ## themes 395 | library(ggthemes) 396 | p + theme_economist() + scale_fill_economist() 397 | p + theme_stata() + scale_fill_stata() 398 | p + theme_excel() + scale_fill_excel() 399 | p + theme_gdocs() + scale_fill_gdocs() 400 | 401 | ## create a custom theme for future usage 402 | ?theme 403 | theme_custom <- function() { 404 | theme( 405 | axis.text = element_text( 406 | family = 'Times New Roman', 407 | color = "orange", 408 | size = 12), 409 | axis.title = element_text( 410 | family = 'Times New Roman', 411 | color = "orange", 412 | size = 16, 413 | face = "bold"), 414 | axis.text.y = element_text(angle = 90, hjust = 0.5), 415 | panel.background = element_rect( 416 | fill = "orange", 417 | color = "white", 418 | linewidth = 2) 419 | ) 420 | } 421 | p + theme_custom() 422 | ?theme 423 | 424 | remotes::install_github('cttobin/ggthemr') 425 | library(ggthemr) 426 | ?ggthemr 427 | ## https://github.com/cttobin/ggthemr 428 | 429 | ggthemr('pale', layout = 'scientific', spacing = 2, type = 'inner') 430 | p 431 | 432 | ggthemr('grass', layout = 'clear', spacing = 2, type = 'outer') 433 | p 434 | 435 | ## old-school palette-definition without helper functions 436 | ggthemr( 437 | palette = structure( 438 | list( 439 | background = 'papayawhip', 440 | text = c(inner = 'orange', outer = 'black'), 441 | line = c(inner = 'orange', outer = 'black'), 442 | gridline = 'white', 443 | swatch = structure(RColorBrewer::brewer.pal(8, 'Dark2'), 444 | class = 'ggthemr_swatch'), 445 | gradient = c(low = 'white', high = 'red')), 446 | class = 'ggthemr_palette'), 447 | layout = structure( 448 | list( 449 | panel.grid.major = function(...) element_line(...), 450 | panel.grid.minor = function(...) element_line(linewidth = 0.25, ...) 451 | ), class = 'ggthemr_layout'), 452 | text_size = 12, spacing = 0.5) 453 | p 454 | 455 | ggplot(iris, aes(Sepal.Length, fill = Species)) + geom_density(alpha = .5) 456 | 457 | ## drop back to the default colors and layout 458 | ggthemr_reset() 459 | 460 | ## ############################################################################ 461 | ## interactive plots 462 | ## ############################################################################ 463 | 464 | ## plot weight and acceleration of cars on a scatterplot 465 | ## colored by transmission 466 | (p <- ggplot(mtcars, aes(wt, qsec, color = factor(am))) + geom_point()) 467 | 468 | ## much easier approach, although less control 469 | library(plotly) 470 | ggplotly(p) 471 | 472 | ## even preserves the set ggplot theme: 473 | ggthemr('grass', layout = 'clear', spacing = 2, type = 'outer') 474 | ggplotly(p) 475 | ggthemr_reset() 476 | 477 | 478 | library(ggiraph) 479 | ?girafe 480 | girafe(ggobj = p) 481 | ## not much change, only an extra export feature? 482 | ## although can highlight, also shows up in the Viewer (where the anims went as well) 483 | 484 | p <- ggplot(mtcars, aes( 485 | x = wt, 486 | y = qsec, 487 | color = factor(am), 488 | ## paste('Number of gears:', gear)) 489 | tooltip = rownames(mtcars))) + 490 | geom_point_interactive() 491 | girafe(ggobj = p) 492 | 493 | p <- ggplot(mtcars, aes( 494 | x = wt, 495 | y = qsec, 496 | color = factor(am), 497 | ## NOTE this newly added field 498 | data_id = factor(gear), 499 | tooltip = rownames(mtcars))) + 500 | geom_point_interactive() 501 | girafe(ggobj = p) 502 | girafe(ggobj = p, options = list(opts_hover(css = "fill:black;"))) 503 | 504 | girafe(ggobj = p, options = list( 505 | opts_hover(css = "fill:black;"), 506 | opts_zoom(max = 2) 507 | )) 508 | --------------------------------------------------------------------------------