├── .gitignore ├── 2020_07_28_penguins.Rmd ├── 2020_08_04_europe_energy.Rmd ├── 2020_08_18_extinct_plants.Rmd ├── 2020_08_25_chopped.Rmd ├── 2020_09_01_crop_yields.Rmd ├── 2020_09_08_friends.Rmd ├── 2020_09_15_government_spending_kids.Rmd ├── 2020_09_22_himalayan_climbers.Rmd ├── 2020_09_29_taylor_swift_beyonce.Rmd ├── 2020_10_06_ncaa_womens_basketball.Rmd ├── 2020_10_20_beer_awards.Rmd ├── 2020_11_03_ikea.Rmd ├── 2020_11_10_phone_history.Rmd ├── README.md ├── african-american-achievements.Rmd ├── african-american-history.Rmd ├── animal-crossing.Rmd ├── australian-animal-outcomes.Rmd ├── baltimore_bridges.Rmd ├── beach-volleyball.Rmd ├── beer-production.Rmd ├── bike_traffic.Rmd ├── bird-collisions.Rmd ├── board-games.Rmd ├── bob-ross.Rmd ├── broadway-shinybones ├── .gitignore ├── README.md ├── _site.yml ├── app.R ├── broadway-shinybones.Rproj ├── components │ └── README.md ├── pages │ └── README.md └── show_metrics.rds ├── broadway.Rmd ├── car-economy.Rmd ├── caribou-locations.Rmd ├── cetaceans.Rmd ├── cocktails.Rmd ├── coffee-ratings.Rmd ├── college-majors.Rmd ├── cord-19.Rmd ├── cran-code.Rmd ├── crop-yields-shiny ├── app.Rmd └── yields_tidy.rds ├── data-screencasts.Rproj ├── french-trains.Rmd ├── gdpr.Rmd ├── golden-age-tv.Rmd ├── grand-slams.Rmd ├── honeycomb-puzzle.Rmd ├── horror-movie-ratings.Rmd ├── malaria.Rmd ├── media-franchises.Rmd ├── medium-datasci.Rmd ├── movie-profit.Rmd ├── nobel-prize.Rmd ├── nyc-pizza.Rmd ├── nyc-restaurants.Rmd ├── nyc-squirrels-app └── app.R ├── nyc-squirrels.Rmd ├── office-transcripts.Rmd ├── pascals-triangle.Rmd ├── plastic-waste.Rmd ├── r-downloads.Rmd ├── ramen-ratings.Rmd ├── riddler-die-reroll.Rmd ├── riddler-die-roll-low.Rmd ├── riddler-prisoner-coin-flip.Rmd ├── riddler-spam-comments.Rmd ├── screencast-annotations └── README.md ├── seattle-pets.Rmd ├── simpsons-guests.Rmd ├── space-launches.Rmd ├── student-teacher-ratios.Rmd ├── thanksgiving.Rmd ├── tidytuesday-tweets.Rmd ├── tour-de-france.Rmd ├── trees.Rmd ├── umbrella-week.Rmd ├── uncanny-xmen.Rmd ├── us-dairy.Rmd ├── us-wind.Rmd ├── us_phds.Rmd ├── volcano-eruptions.Rmd ├── wine-ratings.Rmd ├── women-workplace-app └── app.R ├── women-workplace.Rmd └── womens-world-cup.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /2020_09_01_crop_yields.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "TidyTemplate" 3 | date: 2020-09-01 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | library(tidyverse) 12 | library(tidytuesdayR) 13 | library(scales) 14 | library(janitor) 15 | theme_set(theme_light()) 16 | 17 | ``` 18 | 19 | # Load the weekly Data 20 | 21 | Download the weekly data and make it available in the `tt` object.
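If `tt_load()` is unavailable (e.g. GitHub rate limits), the week's files can also be read straight from the tidytuesday repository. A sketch, assuming the standard raw-GitHub file layout (the URL pattern matches the one used elsewhere in this repo, e.g. in animal-crossing.Rmd, and the file name is inferred from the `tt$key_crop_yields` accessor below):

```{r eval = FALSE}
# Fallback sketch: read one of the week's files directly (URL assumed)
key_crop_yields <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-01/key_crop_yields.csv")
```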
22 | 23 | ```{r Load} 24 | tt <- tt_load("2020-09-01") 25 | ``` 26 | 27 | ```{r} 28 | tt$key_crop_yields %>% 29 | View() 30 | 31 | yields <- tt$key_crop_yields %>% 32 | clean_names() %>% 33 | rename_all(str_remove, "_tonnes.*") 34 | 35 | arable_land <- tt$arable_land_pin %>% 36 | clean_names() %>% 37 | rename(arable_land_needed = 4) 38 | 39 | arable_land %>% 40 | filter(entity == "Afghanistan") %>% 41 | ggplot(aes(year, arable_land_needed)) + 42 | geom_line() 43 | 44 | fertilizer <- tt$cereal_crop_yield_vs_fertilizer_application %>% 45 | janitor::clean_names() %>% 46 | rename(yield = 4, 47 | fertilizer_use = 5) 48 | ``` 49 | 50 | ```{r} 51 | yields_tidy <- yields %>% 52 | pivot_longer(wheat:bananas, names_to = "crop", values_to = "yield") %>% 53 | filter(!is.na(yield)) %>% 54 | mutate(crop = str_replace_all(crop, "_", " "), 55 | crop = str_to_title(crop)) 56 | 57 | yields_tidy %>% 58 | write_rds("crop-yields-shiny/yields_tidy.rds") 59 | ``` 60 | 61 | ```{r} 62 | yields_tidy %>% 63 | filter(code == "USA") %>% 64 | mutate(crop = fct_reorder(crop, -yield)) %>% 65 | ggplot(aes(year, yield)) + 66 | geom_line() + 67 | facet_wrap(~ crop) 68 | 69 | yields_tidy %>% 70 | filter(code == "USA") %>% 71 | mutate(crop = fct_reorder(crop, -yield)) %>% 72 | ggplot(aes(year, yield, color = crop)) + 73 | geom_line() + 74 | labs(x = "Year", 75 | y = "Yield (tonnes per hectare)", 76 | title = "Crop yields in the US over time", 77 | color = "Crop") 78 | 79 | yields_tidy %>% 80 | filter(entity == "India") %>% 81 | mutate(crop = fct_reorder(crop, -yield)) %>% 82 | ggplot(aes(year, yield, color = crop)) + 83 | geom_line() + 84 | labs(x = "Year", 85 | y = "Yield (tonnes per hectare)", 86 | title = "Crop yields in India over time", 87 | color = "Crop") 88 | ``` 89 | 90 | ```{r} 91 | yields_tidy %>% 92 | filter(crop == "Wheat") %>% 93 | add_count(entity) %>% 94 | filter(n == max(n)) %>% 95 | filter(entity %in% sample(unique(entity), 25)) %>% 96 | ggplot(aes(year, yield)) + 97 | geom_line() + 98 | facet_wrap(~ entity) 99 | 100 | crop_yields_50_years <- yields_tidy %>% 101 | arrange(entity, year) %>% 102 | filter(year >= 1968) %>% 103 | group_by(entity, code, crop) %>% 104 | summarize(year_start = min(year), 105 | year_end = max(year), 106 | yield_start = first(yield), 107 | yield_end = last(yield)) %>% 108 | ungroup() %>% 109 | filter(year_start == 1968) %>% 110 | mutate(yield_ratio = yield_end / yield_start) 111 | 112 | crop_yields_50_years %>% 113 | filter(!is.na(code)) %>% 114 | ggplot(aes(yield_start, yield_end)) + 115 | geom_abline(color = "red") + 116 | geom_point() + 117 | facet_wrap(~ crop, scales = "free") 118 | 119 | crop_yields_50_years %>% 120 | mutate(crop = fct_reorder(crop, yield_ratio)) %>% 121 | ggplot(aes(yield_ratio, crop)) + 122 | geom_boxplot() + 123 | scale_x_log10() 124 | 125 | crop_yields_50_years %>% 126 | group_by(crop) %>% 127 | summarize(median_yield_ratio = median(yield_ratio)) %>% 128 | mutate(crop = fct_reorder(crop, median_yield_ratio)) %>% 129 | ggplot(aes(median_yield_ratio, crop)) + 130 | geom_col() + 131 | labs(title = "How much has the average country improved at producing this crop?", 132 | x = "(2018 yield) / (1968 yield)", 133 | y = "") 134 | 135 | crop_yields_50_years %>% 136 | filter(is.na(code)) %>% 137 | filter(entity %in% c("Africa", "Asia", "Northern America", "South America", "Oceania")) %>% 138 | ggplot(aes(yield_start, yield_end, color = entity)) + 139 | geom_abline(color = "red") + 140 | geom_point() + 141 | expand_limits(y = 0, x = 0) + 142 | facet_wrap(~ crop, 
scales = "free") + 143 | labs(x = "Tonnes per hectare in 1968", 144 | y = "Tonnes per hectare in 2018", 145 | color = "Continent") 146 | ``` 147 | 148 | ```{r} 149 | library(ggrepel) 150 | library(countrycode) 151 | 152 | crop_yields_50_years %>% 153 | filter(crop == "Wheat", 154 | !is.na(code)) %>% 155 | mutate(continent = countrycode(code, "iso3c", "continent")) %>% 156 | filter(!is.na(continent)) %>% 157 | ggplot(aes(yield_start, yield_ratio)) + 158 | geom_point(aes(color = continent)) + 159 | scale_x_log10() + 160 | scale_y_log10(breaks = c(.25, .5, 1, 2, 4), 161 | labels = c("1/4X", "1/2X", "Same", "2X", "4X")) + 162 | geom_hline(yintercept = 1, linetype = "dotted") + 163 | geom_text_repel(aes(label = entity), force = .1, 164 | size = 2.5) + 165 | labs(x = "1968 yield (tonnes per hectare), log scale", 166 | y = "(2018 yield) / (1968 yield), log scale", 167 | color = "Continent", 168 | title = "How has wheat efficiency changed across countries?") 169 | 170 | countrycode("USA", "iso3c", "continent") 171 | ``` 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code from screencasts of #tidytuesday 2 | 3 | These are the R Markdown documents produced during live analyses of [#tidytuesday data](https://github.com/rfordatascience/tidytuesday). 4 | -------------------------------------------------------------------------------- /african-american-achievements.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "African-American Achievements" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | tuesdata <- tidytuesdayR::tt_load('2020-06-09') 15 | 16 | science <- tuesdata$science 17 | ``` 18 | 19 | ```{r} 20 | tuesdata$firsts %>% 21 | View() 22 | 23 | tuesdata$science %>% 24 | View() 25 | ``` 26 | 27 | ```{r} 28 | firsts <- tuesdata$firsts 29 | 30 | tuesdata$firsts %>% 31 | ggplot(aes(year)) + 32 | geom_histogram() 33 | 34 | tuesdata$firsts %>% 35 | count(category, sort = TRUE) %>% 36 | mutate(category = fct_reorder(category, n)) %>% 37 | ggplot(aes(n, category)) + 38 | geom_col() 39 | ``` 40 | 41 | ```{r} 42 | firsts <- tuesdata$firsts %>% 43 | mutate(person = str_remove(person, "[\\[\\(].*"), 44 | person = str_trim(person)) 45 | ``` 46 | 47 | ```{r} 48 | library(plotly) 49 | library(glue) 50 | 51 | g <- firsts %>% 52 | ggplot(aes(year, 53 | category, 54 | color = category, 55 | text = glue("{ year }: { accomplishment }\n{ person }"))) + 56 | geom_point() + 57 | theme(axis.text.y = element_blank(), 58 | axis.ticks.y = element_blank(), 59 | panel.grid.major.y = element_blank(), 60 | panel.grid.minor.y = element_blank(), 61 | legend.position = "none") + 62 | labs(title = "Timeline of some notable African-American achievements", 63 | subtitle = "Source: https://en.wikipedia.org/wiki/List_of_African-American_firsts", 64 | y = "Category", 65 | x = "Year") 66 | 67 | ggplotly(g, tooltip = "text") 68 | ``` 69 | 70 | ### Science 71 | 72 | ```{r} 73 | tuesdata$science %>% 74 | ggplot(aes(birth)) + 75 | geom_histogram() 76 | 77 | tuesdata$science %>% 78 | separate_rows(occupation_s, sep = "; ") %>% 79 | mutate(occupation = str_to_title(occupation_s)) %>% 80 | count(occupation, sort = TRUE) 81 | 82 | science %>% 83 | filter(str_detect(occupation_s, 
regex("istician", ignore_case = TRUE))) %>% 84 | pull(name) 85 | 86 | science %>% 87 | filter(str_detect(occupation_s, "statistician")) %>% 88 | View() 89 | ``` 90 | 91 | ```{r} 92 | library(rvest) 93 | 94 | science_html <- science %>% 95 | mutate(html = map(links, possibly(read_html, NULL, quiet = FALSE))) 96 | ``` 97 | 98 | ```{r} 99 | extract_infobox <- . %>% 100 | html_node(".vcard") %>% 101 | html_table(header = FALSE) %>% 102 | as_tibble() 103 | 104 | infoboxes <- science_html %>% 105 | filter(!map_lgl(html, is.null)) %>% 106 | mutate(infobox = map(html, possibly(extract_infobox, NULL))) %>% 107 | select(link = links, infobox) %>% 108 | unnest(infobox) %>% 109 | filter(X1 != "" | X2 != "", X1 != "Scientific career") %>% 110 | rename(key = X1, value = X2) 111 | 112 | science_infoboxes <- infoboxes %>% 113 | group_by(link) %>% 114 | mutate(name = first(key)) %>% 115 | group_by(key) %>% 116 | filter(n() >= 10) %>% 117 | ungroup() %>% 118 | distinct(name, key, .keep_all = TRUE) %>% 119 | spread(key, value) %>% 120 | janitor::clean_names() 121 | ``` 122 | 123 | ```{r} 124 | science_infoboxes %>% 125 | count(nationality, sort = TRUE) 126 | ``` 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /african-american-history.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "African-American History" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(tidytuesdayR) 13 | library(scales) 14 | theme_set(theme_light()) 15 | 16 | tuesdata <- tt_load('2020-06-16') 17 | ``` 18 | 19 | ```{r} 20 | slave_routes <- tuesdata$slave_routes 21 | 22 | slave_routes %>% 23 | summarize(sum(n_slaves_arrived, na.rm = TRUE)) 24 | 25 | slave_routes %>% 26 | count(ship_name, sort = TRUE) 27 | 28 | slave_routes %>% 29 | ggplot(aes(n_slaves_arrived)) + 30 | geom_histogram() 31 | ``` 32 | 33 | ```{r} 34 | slave_routes %>% 35 | filter(!is.na(ship_name), !is.na(n_slaves_arrived)) %>% 36 | filter(fct_lump(ship_name, 12) != "Other") %>% 37 | mutate(ship_name = fct_reorder(ship_name, n_slaves_arrived)) %>% 38 | ggplot(aes(n_slaves_arrived, ship_name)) + 39 | geom_boxplot() 40 | ``` 41 | 42 | ```{r} 43 | slave_routes %>% 44 | count(port_arrival, sort = TRUE) 45 | 46 | slave_routes %>% 47 | count(port_origin, sort = TRUE) 48 | 49 | 50 | slave_routes %>% 51 | ggplot(aes(year_arrival)) + 52 | geom_histogram() + 53 | geom_vline(xintercept = c(1776, 1865), lty = 2) + 54 | labs(title = "# of slave ships over time") 55 | ``` 56 | 57 | ```{r} 58 | slave_routes %>% 59 | filter(!is.na(port_origin)) %>% 60 | mutate(port_origin = str_trunc(port_origin, 25), 61 | port_origin = fct_lump(port_origin, 24), 62 | port_origin = fct_reorder(port_origin, year_arrival)) %>% 63 | count(port_origin, 64 | decade = 10 * (year_arrival %/% 10)) %>% 65 | ggplot(aes(decade, n)) + 66 | geom_line() + 67 | facet_wrap(~ port_origin) + 68 | geom_vline(xintercept = c(1807), lty = 2) + 69 | theme(strip.text = element_text(size = 6)) 70 | 71 | slave_routes %>% 72 | filter(port_origin == "New York") %>% 73 | arrange(desc(year_arrival)) %>% 74 | View() 75 | ``` 76 | 77 | ```{r} 78 | slave_routes %>% 79 | filter(fct_lump(ship_name, 12) != "Other") %>% 80 | count(ship_name, port_origin, port_arrival, sort = TRUE) %>% 81 | group_by(ship_name) %>% 82 | mutate(percent = n / sum(n)) %>% 83 | top_n(1, percent) %>% 84 | 
arrange(desc(percent)) 85 | ``` 86 | 87 | ```{r} 88 | library(grid) 89 | library(ggraph) 90 | 91 | set.seed(2020) 92 | 93 | slave_routes %>% 94 | count(port_origin, port_arrival, sort = TRUE) %>% 95 | filter(port_origin != port_arrival) %>% 96 | head(40) %>% 97 | ggraph(layout = "fr") + 98 | geom_edge_link(aes(width = n), 99 | arrow = arrow(type = "closed", 100 | length = unit(.1, "inch"))) + 101 | geom_node_point() + 102 | geom_node_text(aes(label = name), repel = TRUE) + 103 | scale_edge_width_continuous(range = c(.5, 4)) + 104 | labs(title = "Common slave routes", 105 | edge_width = "# of ships") 106 | ``` 107 | 108 | ```{r} 109 | slave_routes %>% 110 | group_by(port_origin) %>% 111 | summarize(n_ships = n(), 112 | total_recorded = sum(n_slaves_arrived, na.rm = TRUE), 113 | pct_missing = mean(is.na(n_slaves_arrived)), 114 | estimated_total = mean(n_slaves_arrived, na.rm = TRUE) * n()) %>% 115 | arrange(desc(n_ships)) 116 | ``` 117 | 118 | ```{r} 119 | census <- tuesdata$census 120 | 121 | census_gathered <- census %>% 122 | mutate(other = total - white - black) %>% 123 | arrange(year) %>% 124 | gather(racial_category, population, white, black_free, black_slaves, other) 125 | 126 | census_gathered %>% 127 | filter(region == "USA Total") %>% 128 | ggplot(aes(year, population, fill = racial_category)) + 129 | geom_col() + 130 | scale_y_continuous(labels = comma) + 131 | labs(x = "Year", 132 | y = "Census Population", 133 | fill = "Racial category", 134 | title = "Census racial makeup of US, 1790-1870", 135 | subtitle = "No 'other' category existed before 1860") 136 | ``` 137 | 138 | ```{r} 139 | census_gathered %>% 140 | filter(region != "USA Total", !is.na(division)) %>% 141 | mutate(division = fct_reorder(division, -population, sum)) %>% 142 | ggplot(aes(year, population, fill = racial_category)) + 143 | geom_col() + 144 | scale_y_continuous(labels = comma) + 145 | facet_wrap(~ division) + 146 | labs(x = "Year", 147 | y = "Census Population", 148 | fill = "Racial category", 149 | title = "Census racial makeup of US, 1790-1870", 150 | subtitle = "No 'other' category existed before 1860") 151 | ``` 152 | 153 | ```{r} 154 | census_gathered %>% 155 | filter(region != "USA Total", !is.na(division)) %>% 156 | mutate(division = fct_reorder(division, -population, sum)) %>% 157 | group_by(division, year) %>% 158 | mutate(percent = population / sum(population)) %>% 159 | ggplot(aes(year, percent, fill = racial_category)) + 160 | geom_col() + 161 | scale_y_continuous(labels = percent) + 162 | facet_wrap(~ division) + 163 | labs(x = "Year", 164 | y = "% of Census Population", 165 | fill = "Racial category", 166 | title = "Census racial makeup of US, 1790-1870", 167 | subtitle = "No 'other' category existed before 1860") 168 | ``` 169 | 170 | ```{r} 171 | library(ggwordcloud) 172 | 173 | name_counts <- tuesdata$african_names %>% 174 | count(name, gender, sort = TRUE) 175 | 176 | wordcloud::wordcloud(name_counts$name, name_counts$n) 177 | 178 | name_counts %>% 179 | head(100) %>% 180 | ggplot(aes(label = name, size = n, color = gender)) + 181 | geom_text_wordcloud() 182 | ``` 183 | 184 | ```{r} 185 | library(tidytext) 186 | 187 | tuesdata$african_names %>% 188 | filter(!is.na(gender)) %>% 189 | mutate(gender = fct_recode(gender, Man = "Boy", Woman = "Girl")) %>% 190 | count(name, gender, sort = TRUE) %>% 191 | group_by(gender) %>% 192 | top_n(20, n) %>% 193 | ungroup() %>% 194 | mutate(name = reorder_within(name, n, gender)) %>% 195 | ggplot(aes(n, name)) + 196 | geom_col() + 197 | scale_y_reordered() + 198 | 
facet_wrap(~ gender, scales = "free") 199 | ``` 200 | 201 | ```{r} 202 | tuesdata$african_names %>% 203 | count(country_origin, sort = TRUE) 204 | ``` 205 | 206 | 207 | -------------------------------------------------------------------------------- /animal-crossing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Animal Crossing" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | critic <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/critic.tsv') 15 | user_reviews <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/user_reviews.tsv') 16 | items <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/items.csv') 17 | villagers <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/villagers.csv') 18 | ``` 19 | 20 | ```{r} 21 | villagers 22 | items %>% 23 | View() 24 | ``` 25 | 26 | ### Text Analysis 27 | 28 | ```{r} 29 | critic %>% 30 | ggplot(aes(grade)) + 31 | geom_histogram() 32 | 33 | user_reviews %>% 34 | ggplot(aes(grade)) + 35 | geom_histogram() 36 | 37 | head(user_reviews$text) 38 | ``` 39 | 40 | ```{r} 41 | library(tidytext) 42 | library(lubridate) 43 | 44 | user_review_words <- user_reviews %>% 45 | unnest_tokens(word, text) %>% 46 | anti_join(stop_words, by = "word") %>% 47 | count(user_name, date, grade, word) 48 | ``` 49 | 50 | ```{r} 51 | by_week <- user_reviews %>% 52 | group_by(week = floor_date(date, "week", week_start = 1)) %>% 53 | summarize(nb_reviews = n(), 54 | avg_grade = mean(grade), 55 | pct_zero = mean(grade == 0), 56 | pct_ten = mean(grade == 10)) 57 | 58 | by_week %>% 59 | ggplot(aes(week, avg_grade)) + 60 | geom_line() + 61 | geom_point(aes(size = nb_reviews)) + 62 | expand_limits(y = 0) + 63 | labs(x = "Time", 64 | y = "Average grade", 65 | size = "# of reviews") 66 | 67 | user_reviews %>% 68 | ggplot(aes(grade)) + 69 | geom_histogram() + 70 | labs(title = "Most reviews were very low or very high") 71 | 72 | by_week %>% 73 | gather(type, value, contains("pct")) %>% 74 | mutate(type = ifelse(type == "pct_zero", "% rated 0", "% rated 10")) %>% 75 | ggplot(aes(week, value, color = type)) + 76 | geom_line() + 77 | geom_point(aes(size = nb_reviews)) + 78 | scale_y_continuous(labels = scales::percent) + 79 | expand_limits(y = 0) + 80 | labs(x = "Time", 81 | y = "% of reviews", 82 | size = "Total reviews in week", 83 | title = "Reviews got more polarizing in middle of game") 84 | ``` 85 | 86 | 87 | ```{r} 88 | # user_reviews %>% 89 | # mutate(start_text = str_sub(text, 1, pmin(30, str_length(text))), 90 | # regex = paste0(".", start_text, ".*")) %>% 91 | # mutate(new_text = map2_chr(text, regex, str_remove)) %>% 92 | # filter(str_length(new_text) != str_length(text)) 93 | 94 | by_word <- user_review_words %>% 95 | group_by(word) %>% 96 | summarize(avg_grade = mean(grade), 97 | nb_reviews = n()) %>% 98 | arrange(desc(nb_reviews)) %>% 99 | filter(nb_reviews >= 25) %>% 100 | arrange(desc(avg_grade)) 101 | 102 | by_word %>% 103 | filter(nb_reviews >= 75) %>% 104 | ggplot(aes(nb_reviews, avg_grade)) + 105 | geom_point() + 106 | geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) + 107 | scale_x_log10() 108 | ``` 109 | 110 | 
```{r} 111 | by_word %>% 112 | top_n(20, -avg_grade) %>% 113 | ggplot(aes(nb_reviews, avg_grade)) + 114 | geom_point() + 115 | geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) + 116 | scale_x_log10() + 117 | labs(title = "What words were associated with low-grade reviews?", 118 | subtitle = "20 most negative words; only words in at least 25 reviews") 119 | ``` 120 | 121 | ```{r} 122 | library(widyr) 123 | library(stm) 124 | 125 | review_matrix <- user_review_words %>% 126 | group_by(word) %>% 127 | filter(n() >= 25) %>% 128 | cast_sparse(user_name, word, n) 129 | 130 | topic_model_6 <- stm(review_matrix, 131 | K = 6, 132 | verbose = TRUE, 133 | init.type = "Spectral", 134 | emtol = 5e-5) 135 | ``` 136 | 137 | ```{r} 138 | tidy(topic_model_6) %>% 139 | group_by(topic) %>% 140 | top_n(12, beta) %>% 141 | mutate(term = reorder_within(term, beta, topic)) %>% 142 | ggplot(aes(beta, term, fill = factor(topic))) + 143 | geom_col(show.legend = FALSE) + 144 | scale_y_reordered() + 145 | facet_wrap(~ topic, scales = "free_y") 146 | 147 | tidy(topic_model_6) %>% 148 | filter(term == "progress") 149 | 150 | topic_model_gamma <- tidy(topic_model_6, matrix = "gamma") %>% 151 | mutate(user_name = rownames(review_matrix)[document]) %>% 152 | inner_join(user_reviews, by = "user_name") 153 | 154 | topic_model_gamma %>% 155 | group_by(topic) %>% 156 | top_n(1, gamma) 157 | 158 | topic_model_gamma %>% 159 | group_by(topic) %>% 160 | summarize(correlation = cor(gamma, grade), 161 | spearman_correlation = cor(gamma, grade, method = "spearman")) 162 | 163 | topic_model_gamma %>% 164 | group_by(week = floor_date(date, "week", week_start = 1), 165 | topic) %>% 166 | summarize(avg_gamma = mean(gamma)) %>% 167 | ggplot(aes(week, avg_gamma, color = factor(topic))) + 168 | geom_line() + 169 | expand_limits(y = 0) + 170 | scale_y_continuous(labels = scales::percent) + 171 | labs(x = "Time", 172 | y = "Average gamma (document-topic association)") 173 | ``` 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /beach-volleyball.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Beach Volleyball" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | library(lubridate) 14 | theme_set(theme_light()) 15 | 16 | tuesdata <- tidytuesdayR::tt_load('2020-05-19') 17 | 18 | vb_matches <- tuesdata$vb_matches %>% 19 | mutate(match_id = row_number()) 20 | ``` 21 | 22 | ```{r} 23 | vb_matches %>% 24 | count(circuit, tournament, date, sort = TRUE) 25 | 26 | vb_matches %>% 27 | count(gender, sort = TRUE) 28 | 29 | vb_matches %>% 30 | count(year) 31 | 32 | ``` 33 | 34 | ```{r} 35 | vb_long <- vb_matches %>% 36 | rename(w_p1_name = w_player1, w_p2_name = w_player2, 37 | l_p1_name = l_player1, l_p2_name = l_player2, 38 | w_team_rank = w_rank, l_team_rank = l_rank) %>% 39 | mutate_at(vars(starts_with("w_"), starts_with("l_")), as.character) %>% 40 | pivot_longer(cols = c(starts_with("w_"), starts_with("l_"))) %>% 41 | separate(name, c("winner_loser", "player", "name"), 42 | sep = "_", 43 | extra = "merge", 44 | fill = "right") %>% 45 | mutate(winner_loser = str_to_upper(winner_loser)) 46 | 47 | vb_player_matches <- vb_long %>% 48 | filter(name != "rank") %>% 49 | spread(name, value) %>% 50 | type_convert() 51 | ``` 52 | 53 | ```{r} 54 | vb_sets <- vb_matches 
%>% 55 | select(match_id, circuit:match_num, score) %>% 56 | separate_rows(score, sep = ", ") %>% 57 | mutate(score = str_remove(score, " retired")) %>% 58 | mutate(score = na_if(score, "Forfeit or other")) %>% 59 | separate(score, c("w_score", "l_score"), convert = TRUE) 60 | ``` 61 | 62 | ## Look at players 63 | 64 | ```{r} 65 | by_player <- vb_player_matches %>% 66 | group_by(name, gender) %>% 67 | summarize(n_matches = n(), 68 | pct_winner = mean(winner_loser == "W"), 69 | first_game = min(date), 70 | last_game = max(date)) %>% 71 | arrange(desc(n_matches)) %>% 72 | ungroup() 73 | 74 | by_player %>% 75 | filter(n_matches >= 200) %>% 76 | ggplot(aes(n_matches, pct_winner, color = gender)) + 77 | geom_point() + 78 | scale_x_log10() + 79 | scale_y_continuous(labels = percent) + 80 | labs(x = "# of matches since 2000 (log scale)", 81 | y = "% of matches won") 82 | 83 | by_player %>% 84 | filter(n_matches >= 200) %>% 85 | arrange(desc(pct_winner)) 86 | ``` 87 | 88 | ```{r} 89 | vb_player_matches %>% 90 | summarize_all(~ mean(!is.na(.))) %>% 91 | gather() %>% 92 | View() 93 | ``` 94 | 95 | ```{r} 96 | vb_player_matches %>% 97 | group_by(tournament) %>% 98 | summarize(pct_has_attacks = mean(!is.na(tot_attacks)), 99 | n = n()) %>% 100 | arrange(desc(n)) 101 | ``` 102 | 103 | ### How would we predict if a player will win in 2019? 104 | 105 | ```{r} 106 | summarize_players <- . %>% 107 | summarize(n_matches = n(), 108 | pct_winner = mean(winner_loser == "W"), 109 | avg_attacks = mean(tot_attacks, na.rm = TRUE), 110 | avg_errors = mean(tot_errors, na.rm = TRUE), 111 | avg_serve_errors = mean(tot_serve_errors, na.rm = TRUE), 112 | avg_kills = mean(tot_kills, na.rm = TRUE), 113 | avg_aces = mean(tot_aces, na.rm = TRUE), 114 | n_with_data = sum(!is.na(tot_attacks))) %>% 115 | ungroup() %>% 116 | arrange(desc(n_matches)) 117 | 118 | players_before_2019 <- vb_player_matches %>% 119 | filter(year < 2019) %>% 120 | group_by(name, gender, hgt, birthdate, country) %>% 121 | summarize_players() %>% 122 | filter(!is.na(avg_attacks)) 123 | 124 | players_2019 <- vb_player_matches %>% 125 | filter(year == 2019) %>% 126 | group_by(name, gender, hgt, birthdate, country, year, 127 | age = year - year(birthdate)) %>% 128 | summarize_players() 129 | ``` 130 | 131 | ```{r} 132 | performance_joined <- players_before_2019 %>% 133 | inner_join(players_2019 %>% 134 | select(name, n_matches, pct_winner), 135 | by = "name", 136 | suffix = c("", "_2019")) 137 | 138 | performance_joined %>% 139 | filter(n_matches >= 10, 140 | n_matches_2019 >= 5) %>% 141 | ggplot(aes(pct_winner, pct_winner_2019)) + 142 | geom_point() + 143 | geom_abline(color = "red") + 144 | geom_smooth(method = "lm") 145 | 146 | performance_joined %>% 147 | mutate(n_wins_2019 = n_matches_2019 * pct_winner_2019, 148 | country = fct_lump(country, 3)) %>% 149 | glm(cbind(n_wins_2019, n_matches_2019 - n_wins_2019) ~ 150 | pct_winner + avg_errors + avg_serve_errors, 151 | data = ., 152 | family = "binomial") %>% 153 | summary() 154 | ``` 155 | 156 | 157 | ```{r} 158 | players_before_2019 %>% 159 | filter(n_with_data >= 20) %>% 160 | ggplot(aes(avg_serve_errors, avg_aces, size = n_with_data)) + 161 | geom_point() + 162 | labs(size = "Matches", 163 | title = "DON'T TRUST THIS") 164 | ``` 165 | 166 | ```{r} 167 | 168 | ``` 169 | 170 | 171 | 172 | ```{r} 173 | 174 | ``` 175 | 176 | 177 | ```{r} 178 | player_first_year_summarized %>% 179 | filter(!is.nan(avg_attacks)) 180 | ``` 181 | 182 | 183 | 184 | 
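The last chunk above references `player_first_year_summarized`, which is never defined in this document. A minimal sketch of how it could be built from the objects above, assuming the intent was to apply the `summarize_players()` helper to each player's first year on tour:

```{r}
# Sketch (assumed intent): per-player stats from their first year on tour
player_first_year_summarized <- vb_player_matches %>%
  group_by(name) %>%
  filter(year == min(year)) %>%
  group_by(name, gender, year) %>%
  summarize_players()
```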
-------------------------------------------------------------------------------- /bike_traffic.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bike Traffic" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(lubridate) 13 | library(scales) 14 | theme_set(theme_light()) 15 | 16 | bike_traffic_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-02/bike_traffic.csv") 17 | 18 | bike_traffic <- bike_traffic_raw %>% 19 | mutate(date = mdy_hms(date)) %>% 20 | filter(bike_count < 2000) %>% 21 | select(-ped_count) 22 | ``` 23 | 24 | ```{r} 25 | bike_traffic %>% 26 | count(crossing, direction) 27 | ``` 28 | 29 | ```{r} 30 | bike_traffic %>% 31 | ggplot(aes(date, fill = is.na(bike_count))) + 32 | geom_histogram() + 33 | facet_grid(crossing ~ direction) 34 | ``` 35 | 36 | ### When in the day do we see bikers? 37 | 38 | ```{r} 39 | bike_traffic %>% 40 | group_by(crossing, 41 | hour = hour(date)) %>% 42 | summarize(bike_count = sum(bike_count, na.rm = TRUE)) %>% 43 | mutate(pct_bike = bike_count / sum(bike_count)) %>% 44 | ggplot(aes(hour, pct_bike, color = crossing)) + 45 | geom_line() + 46 | geom_point() + 47 | scale_y_continuous(labels = percent_format()) + 48 | labs(title = "When in the day do people bike through these Seattle crossings?", 49 | subtitle = "Based on crossings from 2014-February 2019", 50 | color = "Crossing", 51 | x = "Time of day (local time)", 52 | y = "% of bike crossings that happen in this hour") 53 | ``` 54 | 55 | ```{r} 56 | bike_by_time_window <- bike_traffic %>% 57 | mutate(hour = hour(date)) %>% 58 | mutate(time_window = case_when( 59 | between(hour, 7, 10) ~ "Morning Commute", 60 | between(hour, 11, 15) ~ "Midday", 61 | between(hour, 16, 18) ~ "Evening Commute", 62 | TRUE ~ "Night" 63 | )) %>% 64 | group_by(crossing, 65 | time_window) %>% 66 | summarize(number_missing = sum(is.na(bike_count)), 67 | bike_count = sum(bike_count, na.rm = TRUE)) %>% 68 | mutate(pct_bike = bike_count / sum(bike_count)) 69 | 70 | bike_by_time_window %>% 71 | select(-number_missing, -bike_count) %>% 72 | spread(time_window, pct_bike) %>% 73 | mutate(TotalCommute = `Evening Commute` + `Morning Commute`) %>% 74 | arrange(desc(TotalCommute)) 75 | 76 | bike_by_time_window %>% 77 | ggplot(aes(time_window, pct_bike)) + 78 | geom_col() + 79 | coord_flip() + 80 | facet_wrap(~ crossing) 81 | 82 | bike_by_time_window %>% 83 | group_by(crossing) %>% 84 | summarize(total_bikes = sum(bike_count), 85 | pct_commute = sum(bike_count[str_detect(time_window, "Commute")]) / total_bikes) %>% 86 | ggplot(aes(total_bikes, pct_commute)) + 87 | geom_point() + 88 | scale_x_log10() 89 | ``` 90 | 91 | ```{r} 92 | bike_traffic %>% 93 | group_by(crossing, 94 | weekday = wday(date, label = TRUE), 95 | hour = hour(date)) %>% 96 | summarize(total_bikes = sum(bike_count, na.rm = TRUE)) %>% 97 | group_by(crossing) %>% 98 | mutate(pct_bike = total_bikes / sum(total_bikes)) %>% 99 | ggplot(aes(hour, pct_bike, color = crossing)) + 100 | geom_line(show.legend = FALSE) + 101 | facet_grid(crossing ~ weekday) + 102 | scale_y_continuous(labels = percent_format()) + 103 | labs(x = "Time of week", 104 | y = "% of bike crossings happening in this hour", 105 | title = "When in the week do people in Seattle bike?", 106 | subtitle = "Based on crossings from 2014-February 2019") 107 | ``` 108 | 109 | ```{r} 110 | 
bike_traffic %>% 111 | filter(date < "2018-01-01") %>% 112 | group_by(crossing, 113 | month = fct_relevel(month.name[month(date)], month.name)) %>% 114 | summarize(total_bikes = sum(bike_count, na.rm = TRUE)) %>% 115 | mutate(pct_bike = total_bikes / sum(total_bikes)) %>% 116 | ggplot(aes(month, pct_bike, color = crossing, group = crossing)) + 117 | geom_line() + 118 | expand_limits(y = 0) + 119 | scale_y_continuous(labels = percent_format()) + 120 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 121 | labs(title = "What time of year do people bike?", 122 | subtitle = "Based on 2014-2017 bike crossings", 123 | y = "% of yearly trips in this month", 124 | x = "") 125 | ``` 126 | 127 | ### What directions do people commute by bike? 128 | 129 | ```{r} 130 | bike_by_direction_hour_crossing <- bike_traffic %>% 131 | filter(crossing != "MTS Trail", 132 | !wday(date, label = TRUE) %in% c("Sat", "Sun"), 133 | direction %in% c("North", "South")) %>% 134 | mutate(hour = hour(date)) %>% 135 | group_by(crossing, 136 | direction, 137 | hour) %>% 138 | summarize(bike_count = sum(bike_count, na.rm = TRUE)) %>% 139 | mutate(pct_bike = bike_count / sum(bike_count)) 140 | 141 | bike_by_direction_hour_crossing %>% 142 | group_by(crossing) %>% 143 | mutate(average_hour = sum((hour * pct_bike)[direction == "North"])) %>% 144 | ungroup() %>% 145 | mutate(crossing = fct_reorder(crossing, average_hour)) %>% 146 | ggplot(aes(hour, pct_bike, color = direction)) + 147 | geom_line() + 148 | facet_grid(crossing ~ .) + 149 | scale_y_continuous(labels = percent_format()) + 150 | labs(x = "Time of day", 151 | y = "% of bike crossings happening in this hour", 152 | title = "In which directions do people commute by bike?", 153 | subtitle = "Based on weekday crossings at six Seattle locations from 2014-February 2019", 154 | color = "Direction") 155 | ``` 156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /bird-collisions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bird Collisions" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ## R Markdown 11 | 12 | ```{r} 13 | library(tidyverse) 14 | theme_set(theme_light()) 15 | 16 | mp_light <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-30/mp_light.csv") 17 | 18 | bird_collisions <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-30/bird_collisions.csv") %>% 19 | left_join(mp_light, by = "date") 20 | ``` 21 | 22 | ```{r} 23 | bird_collisions %>% 24 | ggplot(aes(date, fill = locality)) + 25 | geom_histogram() 26 | ``` 27 | 28 | ```{r} 29 | bird_collisions %>% 30 | gather(category, value, -date, -light_score) %>% 31 | count(category, value, light_score_missing = is.na(light_score)) %>% 32 | group_by(category) %>% 33 | top_n(16, n) %>% 34 | ungroup() %>% 35 | mutate(value = fct_reorder(value, n, sum), 36 | category = fct_reorder(category, n, length)) %>% 37 | ggplot(aes(value, n, fill = light_score_missing)) + 38 | geom_col() + 39 | facet_wrap(~ category, scales = "free_y") + 40 | coord_flip() + 41 | labs(x = "# of collisions", 42 | y = "", 43 | fill = "Light score missing", 44 | title = "Category breakdowns of collisions") 45 | ``` 46 | 47 | ```{r} 48 | bird_collisions %>% 49 | filter(!is.na(light_score)) %>% 50 | count(date, locality) %>% 51 | 
ggplot(aes(n, color = locality)) + 52 | geom_density() + 53 | scale_x_log10() + 54 | labs(x = "# of collisions per night") 55 | 56 | bird_collisions %>% 57 | filter(!is.na(light_score)) %>% 58 | distinct(date, light_score) %>% 59 | ggplot(aes(light_score)) + 60 | geom_histogram() 61 | 62 | geom_mean <- function(x) { 63 | exp(mean(log(x + 1))) - 1 64 | } 65 | 66 | by_day_mp <- bird_collisions %>% 67 | filter(!is.na(light_score)) %>% 68 | group_by(date, locality) %>% 69 | summarize(collisions = n()) %>% 70 | ungroup() %>% 71 | complete(date, locality, fill = list(collisions = 0)) %>% 72 | right_join(mp_light %>% crossing(locality = c("CHI", "MP")), by = c("date", "locality")) %>% 73 | filter(date <= "2016-11-13") %>% 74 | replace_na(list(collisions = 0)) %>% 75 | mutate(locality = ifelse(locality == "CHI", "Greater Chicago", "McCormick Place")) 76 | 77 | bootstrap_cis <- by_day_mp %>% 78 | rsample::bootstraps(times = 1000) %>% 79 | unnest(map(splits, as.data.frame)) %>% 80 | group_by(light_score, locality, id) %>% 81 | summarize(avg_collisions = geom_mean(collisions)) %>% 82 | summarize(bootstrap_low = quantile(avg_collisions, .025), 83 | bootstrap_high = quantile(avg_collisions, .975)) 84 | 85 | by_day_mp %>% 86 | group_by(light_score, locality) %>% 87 | summarize(avg_collisions = geom_mean(collisions), 88 | nights = n()) %>% 89 | ggplot(aes(light_score, color = locality)) + 90 | geom_line(aes(y = avg_collisions)) + 91 | geom_ribbon(aes(ymin = bootstrap_low, ymax = bootstrap_high), 92 | data = bootstrap_cis, 93 | alpha = .25) + 94 | expand_limits(y = 0) + 95 | labs(x = "Light score at McCormick place (higher means more lights on)", 96 | y = "Geometric mean of the number of collisions", 97 | title = "Brighter lights at McCormick place correlate with more bird collisions there, and not with Chicago overall", 98 | subtitle = "Ribbon shows 95% bootstrapped percentile confidence interval", 99 | color = "") 100 | ``` 101 | 102 | ### Look at confounders 103 | 104 | ```{r} 105 | library(lubridate) 106 | 107 | bird_collisions %>% 108 | filter(date >= "2005-01-01") %>% 109 | count(month = month(date, label = TRUE), #floor_date(date, "month"), 110 | locality) %>% 111 | ggplot(aes(month, n, color = locality, group = locality)) + 112 | geom_line() 113 | ``` 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /board-games.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Board Games" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | board_games_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-12/board_games.csv") 15 | 16 | holdout_set <- board_games_raw %>% 17 | filter(game_id %% 5 == 0) 18 | 19 | board_games <- board_games_raw %>% 20 | filter(game_id %% 5 != 0) 21 | ``` 22 | 23 | ### EDA 24 | 25 | ```{r} 26 | board_games %>% 27 | count(publisher, sort = TRUE) 28 | 29 | board_games %>% 30 | count(year_published) %>% 31 | arrange(desc(year_published)) %>% 32 | ggplot(aes(year_published, n)) + 33 | geom_line() 34 | 35 | board_games %>% 36 | ggplot(aes(average_rating)) + 37 | geom_histogram() 38 | 39 | board_games %>% 40 | filter(max_playtime > 5, max_playtime < 1000) %>% 41 | ggplot(aes(max_playtime / 60)) + 42 | geom_histogram(binwidth = .25) + 43 | scale_x_log10(breaks = 2 ^ seq(-2, 4)) 44 | 45 | 
board_games %>% 46 | count(year_published) %>% 47 | ggplot(aes(year_published, n)) + 48 | geom_line() 49 | ``` 50 | 51 | Categorical variables: 52 | 53 | ```{r} 54 | categorical_variables <- board_games %>% 55 | select(game_id, name, family, category, artist, designer, mechanic) %>% 56 | gather(type, value, -game_id, -name) %>% 57 | filter(!is.na(value)) %>% 58 | separate_rows(value, sep = ",") %>% 59 | arrange(game_id) 60 | 61 | categorical_counts <- categorical_variables %>% 62 | count(type, value, sort = TRUE) 63 | 64 | library(drlib) 65 | 66 | categorical_counts %>% 67 | group_by(type) %>% 68 | top_n(10, n) %>% 69 | ungroup() %>% 70 | mutate(value = reorder_within(value, n, type), 71 | type = fct_reorder(type, n, .desc = TRUE)) %>% 72 | ggplot(aes(value, n, fill = type)) + 73 | geom_col(show.legend = FALSE) + 74 | facet_wrap(~ type, scales = "free_y") + 75 | coord_flip() + 76 | scale_x_reordered() + 77 | labs(title = "Most common categories") 78 | ``` 79 | 80 | ### Predict average rating 81 | 82 | ```{r} 83 | board_games %>% 84 | group_by(decade = 10 * (year_published %/% 10)) %>% 85 | summarize(average_rating = mean(average_rating)) %>% 86 | ggplot(aes(decade, average_rating)) + 87 | geom_line() 88 | ``` 89 | 90 | 91 | ```{r} 92 | library(broom) 93 | 94 | lm(average_rating ~ 95 | log2(max_players + 1) + 96 | log2(max_playtime + 1) + 97 | year_published, board_games) %>% 98 | tidy() 99 | ``` 100 | 101 | ```{r} 102 | by_categorical <- board_games %>% 103 | inner_join(categorical_variables, by = c("game_id", "name")) %>% 104 | select(type, value, average_rating) %>% 105 | group_by(type, value) %>% 106 | summarize(games = n(), 107 | average_rating = mean(average_rating)) %>% 108 | arrange(desc(games)) 109 | 110 | board_games %>% 111 | inner_join(categorical_variables, by = c("game_id", "name")) %>% 112 | filter(type == "category") %>% 113 | mutate(value = fct_lump(value, 15), 114 | value = fct_reorder(value, average_rating)) %>% 115 | ggplot(aes(value, average_rating)) + 116 | geom_boxplot() + 117 | coord_flip() 118 | 119 | board_games %>% 120 | inner_join(categorical_variables, by = c("game_id", "name")) %>% 121 | filter(type == "family") %>% 122 | mutate(value = fct_lump(value, 15), 123 | value = fct_reorder(value, average_rating)) %>% 124 | ggplot(aes(value, average_rating)) + 125 | geom_boxplot() + 126 | coord_flip() 127 | ``` 128 | 129 | Conclusion: categorical variables can be correlated with higher/lower rated games. 
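As a rough check of that conclusion (a sketch reusing `categorical_variables` and the broom package loaded above; games with several categories contribute one row per category, so treat the standard errors loosely):

```{r}
# Sketch: regress rating on lumped category indicators
board_games %>%
  inner_join(categorical_variables, by = c("game_id", "name")) %>%
  filter(type == "category") %>%
  mutate(value = fct_lump(value, 15)) %>%
  lm(average_rating ~ value, data = .) %>%
  broom::tidy() %>%
  arrange(desc(estimate))
```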
130 | 131 | ```{r} 132 | non_categorical_features <- board_games %>% 133 | transmute(game_id, 134 | name, 135 | year = year_published - 1950, 136 | log2_max_players = log2(max_players + 1), 137 | log2_max_playtime = log2(max_playtime + 1)) %>% 138 | gather(feature, value, -game_id, -name) 139 | 140 | features <- categorical_variables %>% 141 | unite(feature, type, value, sep = ": ") %>% 142 | add_count(feature) %>% 143 | filter(n >= 20) %>% 144 | mutate(value = 1) %>% 145 | bind_rows(non_categorical_features) 146 | 147 | library(glmnet) 148 | library(tidytext) 149 | library(Matrix) 150 | 151 | # Predictor 152 | feature_matrix <- features %>% 153 | cast_sparse(game_id, feature, value) 154 | 155 | # What I'm predicting 156 | ratings <- board_games$average_rating[match(rownames(feature_matrix), board_games$game_id)] 157 | 158 | cv_lasso <- cv.glmnet(feature_matrix, ratings) 159 | plot(cv_lasso) 160 | 161 | cv_lasso$glmnet.fit %>% 162 | tidy() %>% 163 | filter(lambda == cv_lasso$lambda.1se) %>% 164 | arrange(desc(estimate)) %>% 165 | filter(term != "(Intercept)") %>% 166 | top_n(25, abs(estimate)) %>% 167 | mutate(term = fct_reorder(term, estimate)) %>% 168 | ggplot(aes(term, estimate)) + 169 | geom_col() + 170 | coord_flip() + 171 | labs(title = "Largest coefficients in our predictive model", 172 | subtitle = "Based on a LASSO regression", 173 | x = "", 174 | y = "Coefficient") 175 | ``` 176 | 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /bob-ross.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | bob_ross <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-08-06/bob-ross.csv") 15 | ``` 16 | 17 | ```{r} 18 | bob_ross_gathered <- bob_ross %>% 19 | janitor::clean_names() %>% 20 | gather(element, present, -episode, -title) %>% 21 | filter(present == 1) %>% 22 | mutate(title = str_to_title(str_remove_all(title, '"')), 23 | element = str_to_title(str_replace(element, "_", " "))) %>% 24 | select(-present) %>% 25 | extract(episode, c("season", "episode_number"), "S(.*)E(.*)", convert = TRUE, remove = FALSE) %>% 26 | arrange(season, episode_number) 27 | ``` 28 | 29 | ### Exploring the paintings 30 | 31 | ```{r} 32 | bob_ross_gathered %>% 33 | count(element, sort = TRUE) %>% 34 | head(25) %>% 35 | mutate(element = fct_reorder(element, n)) %>% 36 | ggplot(aes(element, n)) + 37 | geom_col() + 38 | coord_flip() 39 | ``` 40 | 41 | What are the most "crowded" paintings, with the most elements in them? 42 | 43 | ```{r} 44 | bob_ross_gathered %>% 45 | add_count(episode) %>% 46 | arrange(desc(n)) 47 | ``` 48 | 49 | How have Ross's paintings been changing over time? 
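Before breaking this down by element, a quick sketch using `bob_ross_gathered` from above: has the number of elements in an average painting drifted across seasons?

```{r}
# Sketch: average number of tagged elements per episode, by season
bob_ross_gathered %>%
  count(season, episode) %>%
  group_by(season) %>%
  summarize(avg_elements = mean(n)) %>%
  ggplot(aes(season, avg_elements)) +
  geom_line() +
  expand_limits(y = 0) +
  labs(x = "Season",
       y = "Average # of elements per painting")
```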
50 | 51 | ```{r} 52 | by_season_element <- bob_ross_gathered %>% 53 | filter(!element %in% c("Tree", "Trees")) %>% 54 | group_by(season) %>% 55 | mutate(number_episodes = n_distinct(episode)) %>% 56 | count(season, element, number_episodes, sort = TRUE) %>% 57 | mutate(percent_included = n / number_episodes) %>% 58 | group_by(element) %>% 59 | mutate(element_total = sum(n)) %>% 60 | ungroup() 61 | 62 | by_season_element %>% 63 | filter(element_total >= 50) %>% 64 | ggplot(aes(season, percent_included, color = element)) + 65 | geom_line() + 66 | scale_y_continuous(labels = scales::percent_format()) + 67 | expand_limits(y = 0) + 68 | facet_wrap(~ element) 69 | ``` 70 | 71 | Could have used: [many models with broom](https://r4ds.had.co.nz/many-models.html) 72 | 73 | ### Clustering 74 | 75 | What tends to appear together? 76 | 77 | ```{r} 78 | library(widyr) 79 | 80 | correlations <- bob_ross_gathered %>% 81 | add_count(element) %>% 82 | filter(n >= 5) %>% 83 | pairwise_cor(element, episode, sort = TRUE) 84 | 85 | correlations %>% 86 | filter(item1 == "River") %>% 87 | mutate(item2 = fct_reorder(item2, correlation)) %>% 88 | ggplot(aes(item2, correlation)) + 89 | geom_col() + 90 | coord_flip() + 91 | labs(title = "What tends to appear with a river?", 92 | subtitle = "Among elements that appeared in at least 10 paintings") 93 | 94 | correlations %>% 95 | filter(item1 == "Snow") %>% 96 | mutate(item2 = fct_reorder(item2, correlation)) %>% 97 | ggplot(aes(item2, correlation)) + 98 | geom_col() + 99 | coord_flip() + 100 | labs(title = "What tends to appear with snow?", 101 | subtitle = "Among elements that appeared in at least 10 paintings") 102 | ``` 103 | 104 | ```{r} 105 | library(ggraph) 106 | library(igraph) 107 | 108 | set.seed(2019) 109 | 110 | correlations %>% 111 | head(100) %>% 112 | graph_from_data_frame() %>% 113 | ggraph() + 114 | geom_edge_link(aes(alpha = correlation)) + 115 | geom_node_point() + 116 | geom_node_text(aes(label = name), vjust = 1, hjust = 1) + 117 | theme_void() 118 | ``` 119 | 120 | ### Principal Component Analysis 121 | 122 | What dimensions drive a lot of the variation among paintings? 123 | 124 | ```{r} 125 | library(reshape2) 126 | library(broom) 127 | library(tidytext) 128 | 129 | binary_matrix <- bob_ross_gathered %>% 130 | acast(title ~ element) 131 | 132 | # Center the columns 133 | centered_matrix <- t(t(binary_matrix) - colMeans(binary_matrix)) 134 | 135 | svd_result <- svd(centered_matrix) 136 | 137 | element_weights <- tidy(svd_result, matrix = "v") %>% 138 | mutate(element = colnames(binary_matrix)[column]) 139 | 140 | element_weights %>% 141 | filter(PC <= 4) %>% 142 | group_by(PC) %>% 143 | top_n(16, abs(value)) %>% 144 | ungroup() %>% 145 | mutate(element = reorder_within(element, value, PC)) %>% 146 | ggplot(aes(element, value, fill = factor(PC))) + 147 | geom_col(show.legend = FALSE) + 148 | facet_wrap(~ PC, scales = "free") + 149 | scale_x_reordered() + 150 | coord_flip() + 151 | labs(title = "First four principal components of elements in Bob Ross paintings") 152 | ``` 153 | 154 | 1. Mountains/Conifer vs Ocean/Beach and deciduous trees 155 | 2. Trees, especially deciduous, vs Ocean 156 | 3. Spring/Summer vs Winter 157 | 4. 
Lake vs River 158 | 159 | ```{r} 160 | painting_weights <- broom::tidy(svd_result, matrix = "u") %>% 161 | mutate(painting = rownames(binary_matrix)[row]) 162 | ``` 163 | 164 | ```{r} 165 | painting_weights %>% 166 | filter(PC == 1) %>% 167 | arrange((value)) 168 | 169 | bob_ross_gathered %>% 170 | filter(title == "Frozen Solitude") 171 | 172 | painting_weights %>% 173 | filter(PC <= 4) %>% 174 | group_by(PC) %>% 175 | top_n(20, abs(value)) %>% 176 | ungroup() %>% 177 | mutate(painting = reorder_within(painting, value, PC)) %>% 178 | ggplot(aes(painting, value, fill = factor(PC))) + 179 | geom_col(show.legend = FALSE) + 180 | facet_wrap(~ PC, scales = "free") + 181 | scale_x_reordered() + 182 | coord_flip() + 183 | labs(title = "First four principal components of Bob Ross paintings") 184 | 185 | ``` 186 | 187 | ```{r} 188 | broom::tidy(svd_result, matrix = "d") %>% 189 | ggplot(aes(PC, percent)) + 190 | geom_point() 191 | ``` 192 | 193 | 194 | -------------------------------------------------------------------------------- /broadway-shinybones/.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /broadway-shinybones/README.md: -------------------------------------------------------------------------------- 1 | # 2 | 3 | -------------------------------------------------------------------------------- /broadway-shinybones/_site.yml: -------------------------------------------------------------------------------- 1 | name: Broadway Shows 2 | sidebar: 3 | - text: Revenue 4 | icon: dollar-sign 5 | module: 6 | metric_panel: 7 | metric: !expr DATA$broadway_revenue_usd_gross 8 | - text: Price 9 | icon: ticket-alt 10 | module: 11 | metric_panel: 12 | metric: !expr DATA$broadway_revenue_avg_ticket_price 13 | - text: Capacity 14 | icon: users 15 | module: 16 | metric_panel: 17 | metric: !expr DATA$broadway_revenue_pct_capacity 18 | -------------------------------------------------------------------------------- /broadway-shinybones/app.R: -------------------------------------------------------------------------------- 1 | # Load Libraries ----- 2 | library(shiny) 3 | library(shinydashboard) 4 | library(shinybones) 5 | library(shinymetrics) 6 | 7 | # Load Utilities ----- 8 | source_dirs('utils') 9 | source_dirs('components') 10 | source_dirs('pages') 11 | 12 | # Global Data ---- 13 | # This is passed to all page modules as an argument named data_global 14 | DATA <- readRDS("show_metrics.rds") 15 | 16 | # Configuration 17 | options("yaml.eval.expr" = TRUE) 18 | CONFIG <- yaml::read_yaml('_site.yml') 19 | 20 | # UI ---- 21 | ui <- function(request){ 22 | dashboardPage( 23 | # Header ---- 24 | dashboardHeader(title = CONFIG$name), 25 | 26 | # Sidebar ---- 27 | dashboardSidebar( 28 | sb_create_sidebar(CONFIG, DATA) 29 | ), 30 | 31 | # Body ----- 32 | dashboardBody( 33 | sb_create_tab_items(CONFIG, DATA) 34 | ) 35 | ) 36 | } 37 | 38 | # Server ----- 39 | server <- function(input, output, session){ 40 | sb_call_modules(CONFIG, DATA) 41 | } 42 | 43 | # Run App ---- 44 | shinyApp(ui = ui, server = server, enableBookmarking = 'url') 45 | -------------------------------------------------------------------------------- /broadway-shinybones/broadway-shinybones.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | 
AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /broadway-shinybones/components/README.md: -------------------------------------------------------------------------------- 1 | # Components 2 | 3 | Add all shared components to this directory. 4 | -------------------------------------------------------------------------------- /broadway-shinybones/pages/README.md: -------------------------------------------------------------------------------- 1 | # Pages 2 | 3 | Add all page modules to this directory. 4 | -------------------------------------------------------------------------------- /broadway-shinybones/show_metrics.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tacookson/data-screencasts/388a9cfa794a67d5cbaa3b9b860520c0a3e0868e/broadway-shinybones/show_metrics.rds -------------------------------------------------------------------------------- /broadway.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Broadway shows" 3 | name: metrics_broadway_revenue 4 | owner: admiral.david@gmail.com 5 | metrics: 6 | usd_gross: 7 | title: Total Gross (USD) 8 | description: Not adjusted for inflation. 9 | avg_ticket_price: 10 | title: Average Ticket Price 11 | description: Not adjusted for inflation. 12 | pct_capacity: 13 | title: Percent capacity 14 | description: Averaged across weeks in the time period. 15 | dimensions: 16 | show: 17 | title: Show 18 | description: Show's title 19 | --- 20 | 21 | ```{r setup, include=FALSE} 22 | knitr::opts_chunk$set(echo = TRUE) 23 | ``` 24 | 25 | ```{r} 26 | library(tidyverse) 27 | library(scales) 28 | theme_set(theme_light()) 29 | 30 | tuesdata <- tidytuesdayR::tt_load('2020-04-28') 31 | ``` 32 | 33 | ```{r} 34 | grosses <- tuesdata$grosses 35 | 36 | grosses %>% 37 | filter(show %in% c("Hamilton", "The Lion King")) %>% 38 | ggplot(aes(week_ending, weekly_gross, color = show)) + 39 | geom_line() + 40 | scale_y_continuous(labels = scales::dollar) + 41 | expand_limits(y = 0) 42 | ``` 43 | 44 | Tidymetric `cross_by_periods()` and `cross_by_dimensions()` 45 | 46 | ```{r} 47 | # devtools::install_github("ramnathv/tidymetrics") 48 | library(tidymetrics) 49 | 50 | shows_summarized <- grosses %>% 51 | filter(show %in% c("Hamilton", "The Lion King", 52 | "Les Miserables", "Rent", 53 | "The Phantom of the Opera", "Wicked", 54 | "Harry Potter and the Cursed Child, Parts One and Two", 55 | "The Book of Mormon")) %>% 56 | mutate(show = str_remove(show, "\\, Parts.*")) %>% 57 | rename(date = week_ending) %>% 58 | cross_by_dimensions(show) %>% 59 | cross_by_periods(c("month", "quarter", "year"), 60 | windows = 28) %>% 61 | summarize(usd_gross = sum(weekly_gross), 62 | avg_ticket_price = mean(avg_ticket_price), 63 | pct_capacity = mean(pct_capacity)) %>% 64 | ungroup() 65 | 66 | show_metrics <- create_metrics(shows_summarized) 67 | ``` 68 | 69 | The shinymetrics package: 70 | 71 | ```{r} 72 | # devtools::install_github("ramnathv/shinymetrics") 73 | library(shinymetrics) 74 | saveRDS(show_metrics, "broadway-shinybones/show_metrics.rds") 75 | 76 | preview_metric(show_metrics$broadway_revenue_usd_gross) 77 | preview_metric(show_metrics$broadway_revenue_avg_ticket_price) 78 | preview_metric(show_metrics$broadway_revenue_pct_capacity) 79 | ``` 80 | 
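Before plotting, a quick shape check on `shows_summarized` (a sketch: the `period` column comes from `cross_by_periods()` and the `"All"` level of `show` from `cross_by_dimensions()` above):

```{r}
shows_summarized %>%
  count(period)

shows_summarized %>%
  filter(show == "All", period == "year") %>%
  arrange(desc(date))
```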
81 | ```{r} 82 | shows_summarized %>% 83 | filter(period == "quarter", 84 | show != "All") %>% 85 | ggplot(aes(date, usd_gross, fill = show)) + 86 | geom_col() 87 | 88 | shows_summarized %>% 89 | filter(period == "quarter", 90 | show != "All") %>% 91 | ggplot(aes(date, avg_ticket_price, col = show)) + 92 | geom_line() + 93 | expand_limits(y = 0) 94 | ``` 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /car-economy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Car Fuel Economy" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | ``` 14 | 15 | ```{r} 16 | big_epa_cars <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv") %>% 17 | mutate(uses_electricity = ifelse(highwayE > 0, "Uses Electricity", "Doesn't Use Electricity")) 18 | ``` 19 | 20 | ```{r} 21 | big_epa_cars_alphabetical <- big_epa_cars %>% 22 | select(sort(colnames(big_epa_cars))) 23 | ``` 24 | 25 | ```{r} 26 | big_epa_cars %>% 27 | ggplot(aes(highway08, city08)) + 28 | geom_point() + 29 | geom_abline(color = "red") + 30 | facet_wrap(~ uses_electricity, scales = "free") + 31 | expand_limits(x = 0, y = 0) + 32 | labs(x = "Highway MPG", 33 | y = "City MPG", 34 | title = "How does fuel efficiency differ between city + highway?") 35 | ``` 36 | 37 | ```{r} 38 | big_epa_cars %>% 39 | select(city08, highway08, make, model, cylinders, displ, drive, engId, eng_dscr) 40 | ``` 41 | 42 | ```{r} 43 | big_epa_cars %>% 44 | filter(cityE == 0) %>% 45 | mutate(VClass = fct_lump(VClass, 8), 46 | VClass = fct_reorder(VClass, city08)) %>% 47 | ggplot(aes(VClass, city08)) + 48 | geom_boxplot() + 49 | coord_flip() 50 | 51 | big_epa_cars %>% 52 | filter(cityE == 0) %>% 53 | mutate(drive = fct_reorder(drive, city08)) %>% 54 | ggplot(aes(drive, city08)) + 55 | geom_boxplot() + 56 | coord_flip() 57 | 58 | big_epa_cars %>% 59 | filter(cityE == 0) %>% 60 | ggplot(aes(cylinders, city08, group = cylinders)) + 61 | geom_boxplot() 62 | 63 | big_epa_cars %>% 64 | filter(cityE == 0) %>% 65 | ggplot(aes(displ, city08)) + 66 | geom_point() + 67 | expand_limits(x = 0, y = 0) 68 | ``` 69 | 70 | ## Goal: Predict city fuel efficiency for single-fuel, non-electric cars 71 | 72 | ```{r} 73 | # Cross validation holdout set 74 | non_electric_cars <- big_epa_cars %>% 75 | filter(cityA08 == 0, 76 | cityE == 0) %>% 77 | sample_frac(1) 78 | 79 | training_set <- non_electric_cars %>% 80 | filter(row_number() %% 5 != 0) 81 | ``` 82 | 83 | ```{r} 84 | library(broom) 85 | 86 | training_set %>% 87 | ggplot(aes(displ, city08)) + 88 | geom_point() + 89 | geom_smooth(method = "lm") 90 | 91 | library(splines) 92 | augmented_data <- lm(city08 ~ ns(displ, 2), data = training_set) %>% 93 | augment(data = training_set) 94 | 95 | augmented_data %>% 96 | ggplot(aes(displ, city08)) + 97 | geom_point() + 98 | geom_line(aes(y = .fitted), color = "red", size = 2) 99 | 100 | models <- tibble(df = 1:10) %>% 101 | mutate(lm_model = map(df, ~ lm(city08 ~ ns(displ, df = .), data = training_set))) 102 | 103 | augmented_unnested <- models %>% 104 | mutate(augmented = map(lm_model, augment, data = training_set)) %>% 105 | unnest(augmented) 106 | 107 | augmented_unnested %>% 108 | ggplot(aes(displ, city08)) + 109 | geom_point(data = training_set) + 110 | geom_line(aes(y = .fitted, 
color = factor(df)), size = 2) + 111 | labs(x = "Engine volume (L)", 112 | y = "City MPG", 113 | color = "# of degrees of freedom") + 114 | expand_limits(x = 0, y = 0) 115 | 116 | augmented_unnested %>% 117 | ggplot(aes(displ, .resid)) + 118 | geom_point() + 119 | facet_wrap(~ df) 120 | 121 | glanced_models <- models %>% 122 | rename(spline_df = df) %>% 123 | mutate(glanced = map(lm_model, glance, data = training_set)) %>% 124 | unnest(glanced) 125 | 126 | glanced_models %>% 127 | ggplot(aes(spline_df, adj.r.squared)) + 128 | geom_line() 129 | ``` 130 | 131 | ```{r} 132 | lm(city08 ~ ns(displ, 4), data = training_set) %>% 133 | anova() %>% 134 | tidy() %>% 135 | mutate(pct_variation = sumsq / sum(sumsq)) 136 | 137 | ggplot(training_set, aes(cylinders, displ, group = cylinders)) + 138 | geom_boxplot() 139 | ``` 140 | 141 | ```{r} 142 | training_set %>% 143 | ggplot(aes(year, city08)) + 144 | geom_point() + 145 | geom_smooth(method = "loess") 146 | 147 | efficiency_time <- training_set %>% 148 | mutate(VClass = fct_lump(VClass, 6), 149 | guzzler = !is.na(guzzler)) %>% 150 | group_by(year = 2 * floor(year / 2), VClass) %>% 151 | summarize_at(vars(city08, cylinders, displ, guzzler), mean) 152 | 153 | efficiency_time %>% 154 | ggplot(aes(year, city08, color = VClass)) + 155 | geom_line() + 156 | expand_limits(y = 0) 157 | 158 | efficiency_time %>% 159 | ggplot(aes(year, displ, color = VClass)) + 160 | geom_line() + 161 | expand_limits(y = 0) 162 | 163 | efficiency_time %>% 164 | ggplot(aes(year, guzzler, color = VClass)) + 165 | geom_line() + 166 | expand_limits(y = 0) 167 | ``` 168 | 169 | Hypothesis: Engine size started shrinking around 2009 (probably regulatory pressure), especially in large and midsize cars, and this led to an increase in city fuel efficiency. 
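
The chunk labeled "Cross validation holdout set" above shuffles the rows and builds the training set, but never scores against the held-out fifth. As a minimal sketch (using `mtcars` as a stand-in for the EPA data), the same `row_number() %% 5` split can pick the spline degrees of freedom by out-of-sample RMSE rather than in-sample adjusted R-squared:

```{r}
library(tidyverse)
library(splines)

set.seed(2019)
shuffled <- mtcars %>% slice_sample(prop = 1)
train <- shuffled %>% filter(row_number() %% 5 != 0)
test <- shuffled %>% filter(row_number() %% 5 == 0)

# Fit a natural spline per candidate df, then score each fit on the holdout rows
tibble(df = 1:5) %>%
  mutate(model = map(df, ~ lm(mpg ~ ns(disp, df = .), data = train)),
         holdout_rmse = map_dbl(model, ~ sqrt(mean((predict(., newdata = test) - test$mpg)^2))))
```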
170 | 171 | 172 | -------------------------------------------------------------------------------- /caribou-locations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | tuesdata <- tidytuesdayR::tt_load('2020-06-23') 15 | ``` 16 | 17 | ```{r} 18 | individuals <- tuesdata$individuals 19 | locations <- tuesdata$locations 20 | ``` 21 | 22 | ```{r} 23 | individuals %>% 24 | summarize(across(sex:study_site, list(~ mean(!is.na(.))))) 25 | 26 | individuals %>% 27 | filter(deploy_off_type == "dead") %>% 28 | count(death_cause, sort = TRUE) 29 | ``` 30 | 31 | ```{r} 32 | library(sf) 33 | province_sf <- read_sf("~/Downloads/province") 34 | 35 | bc <- province_sf %>% 36 | filter(PROV == "BC") 37 | ``` 38 | 39 | ```{r} 40 | individuals %>% 41 | filter(deploy_on_latitude > 40) %>% 42 | count(study_site, deploy_on_longitude, deploy_on_latitude, sort = TRUE) %>% 43 | ggplot() + 44 | # geom_sf(data = bc) + 45 | # borders("world", regions = "canada") + 46 | geom_point(aes(deploy_on_longitude, deploy_on_latitude, 47 | size = n, 48 | color = study_site)) + 49 | scale_size_continuous(guide = FALSE) 50 | ``` 51 | 52 | ```{r} 53 | individuals %>% 54 | count(animal_id, sort = TRUE) 55 | 56 | individuals %>% 57 | filter(animal_id == "MO_car150") %>% 58 | View() 59 | ``` 60 | 61 | ### Looking at locations 62 | 63 | ```{r} 64 | locations %>% 65 | ggplot(aes(longitude, latitude, color = study_site)) + 66 | geom_point() 67 | ``` 68 | 69 | ```{r} 70 | by_animal <- locations %>% 71 | group_by(animal_id, study_site) %>% 72 | summarize(start = min(timestamp), 73 | end = max(timestamp), 74 | num_points = n()) %>% 75 | ungroup() %>% 76 | arrange(desc(num_points)) 77 | ``` 78 | 79 | ```{r} 80 | library(lubridate) 81 | 82 | example_animal <- locations %>% 83 | arrange(timestamp) %>% 84 | filter(animal_id == sample(unique(animal_id), 1)) 85 | 86 | example_animal %>% 87 | mutate(quarter = as.Date(floor_date(timestamp, "quarter"))) %>% 88 | ggplot(aes(longitude, latitude, color = timestamp)) + 89 | geom_point(alpha = .5) + 90 | geom_path(alpha = .5) + 91 | facet_wrap(~ quarter) + 92 | labs(title = "One caribou over time") 93 | ``` 94 | 95 | ```{r} 96 | library(geosphere) 97 | 98 | locations_with_gaps <- locations %>% 99 | group_by(animal_id) %>% 100 | mutate(last_lon = lag(longitude), 101 | last_lat = lag(latitude), 102 | hours = as.numeric(difftime(timestamp, lag(timestamp), unit = "hours")), 103 | km = distHaversine(cbind(longitude, latitude), cbind(last_lon, last_lat)) / 1000, 104 | kph = km / hours) %>% 105 | ungroup() 106 | 107 | locations_with_gaps %>% 108 | filter(hours <= 8) %>% 109 | ggplot(aes(kph)) + 110 | geom_histogram() + 111 | scale_x_log10(labels = scales::comma) + 112 | labs(title = "On average, how fast do caribou travel?") 113 | 114 | by_animal <- locations_with_gaps %>% 115 | filter(hours <= 8, 116 | hours >= .5) %>% 117 | group_by(animal_id, study_site) %>% 118 | summarize(start = min(timestamp), 119 | end = max(timestamp), 120 | num_points = n(), 121 | avg_speed = mean(kph, na.rm = TRUE)) %>% 122 | ungroup() %>% 123 | arrange(desc(num_points)) %>% 124 | filter(num_points >= 10) 125 | ``` 126 | 127 | ```{r} 128 | by_animal %>% 129 | ggplot(aes(num_points, avg_speed)) + 130 | geom_point() + 131 | scale_x_log10() + 132 | expand_limits(y = 0) 133 | 134 
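# Sanity check on the km/kph computation above (hypothetical coordinates;
# geosphere is already loaded). distHaversine() returns metres, and one
# degree of latitude is ~111 km, so these two points should give roughly that:
distHaversine(c(-122, 53), c(-122, 54)) / 1000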
| by_animal %>% 135 | arrange(desc(avg_speed)) 136 | 137 | locations_with_gaps %>% 138 | filter(animal_id == "QU_car107") %>% 139 | mutate(quarter = as.Date(floor_date(timestamp, "quarter"))) %>% 140 | ggplot(aes(longitude, latitude, color = kph)) + 141 | geom_point(alpha = .5) + 142 | geom_path(alpha = .5) + 143 | facet_wrap(~ quarter) + 144 | labs(title = "One caribou over time") 145 | 146 | locations_with_gaps %>% 147 | filter(animal_id == "SC_car171") %>% 148 | arrange(desc(kph)) %>% 149 | View() 150 | ``` 151 | 152 | ```{r} 153 | locations_with_gaps %>% 154 | filter(study_site != "Hart Ranges") %>% 155 | filter(hours <= 8, 156 | hours >= .5) %>% 157 | group_by(month = month(timestamp, label = TRUE), 158 | study_site) %>% 159 | summarize(avg_speed = median(kph), 160 | n = n()) %>% 161 | ggplot(aes(month, avg_speed, group = study_site, 162 | color = study_site)) + 163 | geom_line() + 164 | geom_point(aes(size = n)) + 165 | expand_limits(y = 0) + 166 | facet_wrap(~ study_site) + 167 | theme(legend.position = "none") + 168 | labs(title = "Seasonal trend in caribou speed", 169 | y = "Average speed (kph)") 170 | ``` 171 | 172 | 173 | ```{r} 174 | by_animal %>% 175 | filter(num_points >= 10) %>% 176 | arrange(desc(avg_speed)) 177 | ``` 178 | 179 | 180 | ```{r} 181 | locations %>% 182 | arrange(timestamp) %>% 183 | group_by(animal_id) %>% 184 | mutate(gap = round(difftime(timestamp, lag(timestamp), unit = "hours"))) %>% 185 | ungroup() %>% 186 | filter(!is.na(gap)) %>% 187 | filter(gap <= 24) %>% 188 | ggplot(aes(gap)) + 189 | geom_histogram(binwidth = 2) 190 | ``` 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | -------------------------------------------------------------------------------- /cetaceans.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | cetaceans_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-12-18/allCetaceanData.csv") 15 | 16 | cetaceans <- cetaceans_raw %>% 17 | select(-X1) %>% 18 | mutate(birthYear = as.integer(birthYear), 19 | age = if_else(status == "Died", year(statusDate) - birthYear, NA_real_)) 20 | ``` 21 | 22 | ```{r} 23 | library(lubridate) 24 | 25 | cetaceans %>% 26 | count(acquisition, sort = TRUE) 27 | 28 | cetaceans %>% 29 | count(status, sort = TRUE) 30 | 31 | cetaceans %>% 32 | filter(status == "Died") %>% 33 | select(birthYear, statusDate) %>% 34 | filter(!is.na(birthYear), !is.na(statusDate)) %>% 35 | mutate(age = year(statusDate) - birthYear) %>% 36 | ggplot(aes(age)) + 37 | geom_histogram() 38 | 39 | cetaceans %>% 40 | count(species = fct_lump(species, 5), sort = TRUE) %>% 41 | mutate(species = fct_reorder(species, n)) %>% 42 | ggplot(aes(species, n)) + 43 | geom_col() + 44 | coord_flip() 45 | 46 | cetaceans %>% 47 | mutate(species = fct_lump(species, 5)) %>% 48 | filter(!is.na(birthYear), !is.na(statusDate)) %>% 49 | mutate(age = year(statusDate) - birthYear) %>% 50 | ggplot(aes(species, age)) + 51 | geom_boxplot() + 52 | coord_flip() 53 | ``` 54 | 55 | ```{r} 56 | cetaceans %>% 57 | count(acquisition, originLocation, sort = TRUE) %>% 58 | filter(originLocation != "Unknown") %>% 59 | View() 60 | 61 | cetaceans %>% 62 | ggplot(aes(originDate, fill = acquisition)) + 63 | geom_histogram() 64 | 65 | library(scales) 66 |
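# The decade breakdowns in this file lean on tidyr::complete() to add
# explicit zero counts, so the stacked percentages don't silently drop
# empty acquisition/decade combinations. A tiny toy illustration:
tibble(grp = c("a", "a", "b"), period = c(1, 2, 1)) %>%
  count(grp, period) %>%
  complete(grp, period, fill = list(n = 0))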
67 | cetaceans %>% 68 | filter(originDate >= "1960-01-01") %>% 69 | count(acquisition, 70 | decade = 5 * (year(originDate) %/% 5)) %>% 71 | complete(acquisition, decade, fill = list(n = 0)) %>% 72 | mutate(acquisition = fct_reorder(acquisition, n, sum)) %>% 73 | group_by(decade) %>% 74 | mutate(percent = n / sum(n)) %>% 75 | ggplot(aes(decade, percent, fill = acquisition)) + 76 | geom_area() + 77 | scale_y_continuous(labels = percent_format()) + 78 | theme_minimal() + 79 | labs(x = "year", 80 | y = "% of dolphins recorded") 81 | ``` 82 | 83 | ```{r} 84 | library(fuzzyjoin) 85 | 86 | regexes <- tribble( 87 | ~ regex, ~ category, 88 | "Unknown", "Unknown", 89 | "Gulf of Mexico", "Gulf of Mexico", 90 | "Florida|FL", "Florida", 91 | "Texas|TX", "Texas", 92 | "SeaWorld", "SeaWorld", 93 | "Pacific", "Pacific Ocean", 94 | "Atlantic", "Atlantic Ocean" 95 | ) 96 | 97 | cetaceans_annotated <- cetaceans %>% 98 | mutate(unique_id = row_number()) %>% 99 | regex_left_join(regexes, c(originLocation = "regex")) %>% 100 | distinct(unique_id, .keep_all = TRUE) %>% 101 | mutate(category = coalesce(category, originLocation)) 102 | 103 | cetaceans_annotated %>% 104 | filter(acquisition == "Capture") %>% 105 | mutate(category = fct_lump(category, 4), 106 | category = fct_reorder(category, category, length)) %>% 107 | ggplot(aes(originDate, fill = category)) + 108 | geom_histogram() 109 | 110 | # the first lines of this pipeline were missing; cetaceans_annotated plus a "Born" filter is assumed from the chart title 111 | cetaceans_annotated %>% filter(acquisition == "Born") %>% count(category, 112 | decade = 5 * (year(originDate) %/% 5)) %>% 113 | complete(category, decade, fill = list(n = 0)) %>% 114 | mutate(category = fct_lump(category, 7), 115 | category = fct_reorder(category, n, sum)) %>% 116 | group_by(decade) %>% 117 | mutate(percent = n / sum(n)) %>% 118 | ggplot(aes(decade, percent, fill = category)) + 119 | geom_area() + 120 | scale_y_continuous(labels = percent_format()) + 121 | theme_minimal() + 122 | labs(x = "year", 123 | y = "% of dolphins recorded", 124 | title = "Dolphins born in captivity") 125 | ``` 126 | 127 | ### Survival analysis 128 | 129 | ```{r} 130 | library(survival) 131 | 132 | dolphin_survival <- cetaceans %>% 133 | filter(status %in% c("Alive", "Died")) %>% 134 | mutate(deathYear = ifelse(status == "Alive", 2017, year(statusDate)), 135 | status = ifelse(status == "Alive", 0, 1), 136 | age = deathYear - birthYear) %>% 137 | filter(!is.na(deathYear)) %>% 138 | select(birthYear, deathYear, status, sex, age, acquisition, species) %>% 139 | filter(deathYear >= birthYear) %>% 140 | filter(sex != "U") 141 | 142 | model <- survfit(Surv(age, status) ~ sex, dolphin_survival) 143 | 144 | library(broom) 145 | 146 | tidy(model) %>% 147 | ggplot(aes(time, estimate, color = strata)) + 148 | geom_line() + 149 | geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .2) + 150 | scale_y_continuous(labels = percent_format()) + 151 | labs(y = "Estimated % survival") 152 | 153 | coxph(Surv(age, status) ~ sex, dolphin_survival) %>% 154 | tidy() 155 | ``` 156 | 157 | ```{r} 158 | model <- survfit(Surv(age, status) ~ acquisition, dolphin_survival) 159 | 160 | library(broom) 161 | 162 | tidy(model) %>% 163 | filter(strata != "acquisition=Unknown") %>% 164 | ggplot(aes(time, estimate, color = strata)) + 165 | geom_line() + 166 | geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .2) + 167 | scale_y_continuous(labels = percent_format()) + 168 | labs(y = "Estimated % survival") 169 | ``` 170 | 171 |
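For readers new to the survival tooling, the same `survfit()` + `tidy()` pattern runs on the `lung` data that ships with the survival package; a self-contained sketch (toy example, not the dolphin data):

```{r}
library(survival)
library(broom)
library(tidyverse)

# Kaplan-Meier curves by sex; in lung, status == 2 marks a death event
km <- survfit(Surv(time, status == 2) ~ sex, data = lung)

tidy(km) %>%
  ggplot(aes(time, estimate, color = strata)) +
  geom_line() +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(y = "Estimated % survival")
```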
title: "Coffee Ratings" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidytuesdayR) 12 | library(tidyverse) 13 | theme_set(theme_light()) 14 | 15 | coffee <- tt_load("2020-07-07") 16 | 17 | coffee_ratings <- coffee$coffee_ratings %>% 18 | mutate(coffee_id = row_number()) %>% 19 | filter(total_cup_points > 0) 20 | ``` 21 | 22 | ```{r} 23 | View(coffee_ratings) 24 | 25 | coffee_ratings %>% 26 | count(species, sort = TRUE) 27 | 28 | coffee_lumped <- coffee_ratings %>% 29 | filter(!is.na(variety)) %>% 30 | mutate(variety = fct_lump(variety, 12), sort = TRUE) 31 | 32 | coffee_lumped %>% 33 | mutate(variety = fct_reorder(variety, total_cup_points)) %>% 34 | ggplot(aes(total_cup_points, variety)) + 35 | geom_boxplot() 36 | 37 | coffee_lumped %>% 38 | ggplot(aes(total_cup_points, fill = variety)) + 39 | geom_histogram(binwidth = 2) + 40 | facet_wrap(~ variety, scale = "free_y") + 41 | theme(legend.position = "none") 42 | ``` 43 | 44 | ```{r} 45 | coffee_ratings %>% 46 | summarize(across(everything(), ~ mean(!is.na(.)))) %>% 47 | gather() %>% 48 | View() 49 | 50 | coffee_ratings %>% 51 | count(producer, sort = TRUE) 52 | 53 | coffee_ratings %>% 54 | count(company, sort = TRUE) 55 | 56 | coffee_ratings %>% 57 | count(color, sort = TRUE) 58 | 59 | coffee_ratings %>% 60 | count(country = fct_lump(country_of_origin, 12), sort = TRUE) %>% 61 | filter(!is.na(country)) %>% 62 | mutate(country = fct_reorder(country, n)) %>% 63 | ggplot(aes(n, country)) + 64 | geom_col() 65 | 66 | coffee_ratings %>% 67 | filter(!is.na(country_of_origin)) %>% 68 | mutate(country = fct_lump(country_of_origin, 12), 69 | country = fct_reorder(country, total_cup_points)) %>% 70 | ggplot(aes(total_cup_points, country)) + 71 | geom_boxplot() 72 | ``` 73 | 74 | Interesting dimensions: 75 | 76 | * Country 77 | * Variety 78 | * Company?? 
79 | 80 | ```{r} 81 | library(ggridges) 82 | 83 | coffee_metrics <- coffee_ratings %>% 84 | select(coffee_id, total_cup_points, variety, company, 85 | country_of_origin, 86 | altitude_mean_meters, 87 | aroma:moisture) %>% 88 | pivot_longer(aroma:cupper_points, names_to = "metric", values_to = "value") 89 | 90 | coffee_metrics %>% 91 | mutate(metric = fct_reorder(metric, value)) %>% 92 | ggplot(aes(value, metric)) + 93 | geom_density_ridges() 94 | 95 | coffee_metrics %>% 96 | group_by(metric) %>% 97 | summarize(average = mean(value), 98 | sd = sd(value)) %>% 99 | arrange(desc(average)) 100 | ``` 101 | 102 | ```{r} 103 | library(widyr) 104 | library(ggraph) 105 | library(igraph) 106 | library(tidytext) 107 | 108 | correlations <- coffee_metrics %>% 109 | pairwise_cor(metric, coffee_id, value, sort = TRUE) 110 | 111 | correlations %>% 112 | head(50) %>% 113 | graph_from_data_frame() %>% 114 | ggraph() + 115 | geom_edge_link(aes(edge_alpha = correlation)) + 116 | geom_node_point() + 117 | geom_node_text(aes(label = name), repel = TRUE) 118 | 119 | coffee_metrics %>% 120 | filter(!metric %in% c("sweetness", "clean_cup", "uniformity")) %>% 121 | group_by(metric) %>% 122 | mutate(centered = value - mean(value)) %>% 123 | ungroup() %>% 124 | widely_svd(metric, coffee_id, value) %>% 125 | filter(between(dimension, 2, 5)) %>% 126 | mutate(metric = reorder_within(metric, value, dimension)) %>% 127 | ggplot(aes(value, metric)) + 128 | geom_col() + 129 | scale_y_reordered() + 130 | facet_wrap(~ dimension, scales = "free_y") 131 | ``` 132 | 133 | ```{r} 134 | 135 | coffee_ratings %>% 136 | filter(altitude_mean_meters < 10000, 137 | altitude != 1) %>% 138 | mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>% 139 | ggplot(aes(altitude_mean_meters, total_cup_points)) + 140 | geom_point() + 141 | geom_smooth(method = "lm") 142 | 143 | coffee_metrics %>% 144 | filter(altitude_mean_meters < 10000) %>% 145 | mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>% 146 | mutate(km = altitude_mean_meters / 1000) %>% 147 | group_by(metric) %>% 148 | summarize(correlation = cor(altitude_mean_meters, value), 149 | model = list(lm(value ~ km))) %>% 150 | mutate(tidied = map(model, broom::tidy, conf.int = TRUE)) %>% 151 | unnest(tidied) %>% 152 | filter(term == "km") %>% 153 | ungroup() %>% 154 | mutate(metric = fct_reorder(metric, estimate)) %>% 155 | ggplot(aes(estimate, metric, color = p.value < .05)) + 156 | geom_point() + 157 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = .1) + 158 | labs(y = "Evaluation of coffee", 159 | x = "Each kilometer of altitude contributes this much to score (95% confidence interval)") 160 | ``` 161 | 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /crop-yields-shiny/app.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Our World in Data: Crop Yields' 3 | runtime: shiny 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE) 9 | ``` 10 | 11 | ```{r} 12 | library(dplyr) 13 | library(ggplot2) 14 | library(forcats) 15 | library(shiny) 16 | library(stringr) 17 | library(plotly) 18 | theme_set(theme_light()) 19 | 20 | # Data cleaning and processing 21 | yields_tidy <- readRDS("yields_tidy.rds") 22 | 23 | top_crops <- yields_tidy %>% 24 | count(crop, sort = TRUE) %>% 25 | head(9) %>% 26 | pull(crop) 27 | ``` 28 | 29 | ```{r} 30 | plot_yields <- 
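# helper shared by the Shiny inputs below; returns an interactive plotly object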
function(tbl, facet_scales = "fixed") { 31 | g <- tbl %>% 32 | mutate(crop = fct_reorder(crop, -yield)) %>% 33 | mutate(entity = fct_reorder(entity, -yield)) %>% 34 | ggplot(aes(year, yield, color = entity)) + 35 | geom_line() + 36 | expand_limits(y = 0) + 37 | facet_wrap(~ crop, scales = facet_scales) + 38 | labs(x = "Year", 39 | y = "Yield (tonnes per hectare)", 40 | title = "Crop yields over time", 41 | color = "Country") 42 | 43 | ggplotly(g) 44 | } 45 | 46 | # yields_tidy %>% 47 | # filter(code %in% c("USA", "IND")) %>% 48 | # plot_yields() 49 | ``` 50 | 51 | 52 | ```{r eruptions, echo=FALSE} 53 | inputPanel( 54 | selectInput("entity", 55 | label = "Country/Continent/Region:", 56 | choices = unique(yields_tidy$entity), 57 | selected = c("United States", "India"), 58 | selectize = TRUE, 59 | multiple = TRUE), 60 | selectInput("crop", 61 | label = "Crops:", 62 | choices = unique(yields_tidy$crop), 63 | selected = top_crops, 64 | selectize = TRUE, 65 | multiple = TRUE), 66 | radioButtons("facet_scales", 67 | label = "", 68 | choices = c("Free y-axis" = "free_y", 69 | "Shared y-axis" = "fixed")) 70 | ) 71 | 72 | renderPlotly({ 73 | yields_tidy %>% 74 | filter(entity %in% input$entity, 75 | crop %in% input$crop) %>% 76 | plot_yields(facet_scales = input$facet_scales) 77 | }) 78 | ``` 79 | -------------------------------------------------------------------------------- /crop-yields-shiny/yields_tidy.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tacookson/data-screencasts/388a9cfa794a67d5cbaa3b9b860520c0a3e0868e/crop-yields-shiny/yields_tidy.rds -------------------------------------------------------------------------------- /data-screencasts.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /french-trains.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "French Trains" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | theme_set(theme_light()) 14 | 15 | full_trains <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-02-26/full_trains.csv") %>% 16 | mutate(pct_late_at_departure = num_late_at_departure / total_num_trips, 17 | arrival_station = str_to_title(arrival_station), 18 | departure_station = str_to_title(departure_station), 19 | date = as.Date(sprintf("%d-%02d-01", year, month))) %>% 20 | arrange(departure_station, arrival_station, month) %>% 21 | fill(service) 22 | ``` 23 | 24 | ```{r} 25 | november_2018 <- full_trains %>% 26 | filter(year == 2018, month == 11) 27 | 28 | november_2018 %>% 29 | ggplot(aes(pct_late_at_departure)) + 30 | geom_histogram(binwidth = .05) + 31 | scale_x_continuous(labels = percent_format()) 32 | 33 | november_2018 %>% 34 | mutate(departure_station = fct_lump(departure_station, 3)) %>% 35 | ggplot(aes(departure_station, pct_late_at_departure)) + 36 | geom_boxplot() + 37 | scale_y_continuous(labels = percent_format()) 38 | 39 | november_2018 %>% 40 | # mutate(arrival_station = 
fct_infreq(fct_lump(arrival_station, prop = .01))) %>% 41 | # mutate(departure_station = fct_infreq(fct_lump(departure_station, prop = .01))) %>% 42 | mutate(arrival_station = fct_reorder(fct_lump(arrival_station, prop = .01), pct_late_at_departure)) %>% 43 | mutate(departure_station = fct_reorder(fct_lump(departure_station, prop = .01), pct_late_at_departure)) %>% 44 | group_by(arrival_station, departure_station) %>% 45 | summarize(pct_late_at_departure = sum(num_late_at_departure) / sum(total_num_trips)) %>% 46 | ggplot(aes(arrival_station, departure_station, fill = pct_late_at_departure)) + 47 | geom_tile() + 48 | scale_fill_gradient2(low = "blue", high = "red", midpoint = .25, labels = percent_format()) + 49 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 50 | labs(x = "Arrival station", 51 | y = "Departure station", 52 | fill = "% late at departure", 53 | title = "Which routes have the most delayed trains in November 2018?", 54 | subtitle = "Stations with only one arriving/departing route were lumped into 'Other'") 55 | ``` 56 | 57 | ### Over time 58 | 59 | ```{r} 60 | full_trains %>% 61 | filter(departure_station == "Lyon Part Dieu") %>% 62 | ggplot(aes(date, pct_late_at_departure, color = arrival_station)) + 63 | geom_line() + 64 | scale_y_continuous(labels = percent_format()) + 65 | expand_limits(y = 0) 66 | 67 | by_departure_station_month <- full_trains %>% 68 | group_by(departure_station = fct_lump(departure_station, prop = .01), 69 | date) %>% 70 | summarize_at(vars(contains("num")), sum) %>% 71 | ungroup() %>% 72 | mutate(pct_late_at_departure = num_late_at_departure / total_num_trips) 73 | 74 | by_departure_station_month %>% 75 | mutate(departure_station = fct_reorder(departure_station, -pct_late_at_departure, last)) %>% 76 | ggplot(aes(date, pct_late_at_departure, color = departure_station)) + 77 | geom_line() + 78 | scale_y_continuous(labels = percent_format()) + 79 | labs(x = "Month", 80 | y = "% late at departure", 81 | color = "Departure station") 82 | ``` 83 | 84 | ```{r} 85 | by_departure_station_month <- full_trains %>% 86 | group_by(departure_station = ifelse(service == "International", 87 | paste0(departure_station, " (International)"), 88 | departure_station), 89 | service, 90 | year, 91 | month = fct_reorder(month.name[month], month)) %>% 92 | summarize_at(vars(contains("num")), sum) %>% 93 | ungroup() %>% 94 | mutate(pct_late_at_departure = num_late_at_departure / total_num_trips) 95 | 96 | by_departure_station_month %>% 97 | mutate(departure_station = fct_reorder(departure_station, (service != "International") + pct_late_at_departure, mean)) %>% 98 | ggplot(aes(month, departure_station, fill = pct_late_at_departure)) + 99 | geom_tile() + 100 | scale_fill_gradient2(low = "blue", high = "red", midpoint = .25, labels = percent_format()) + 101 | facet_wrap(~ year, nrow = 1, scales = "free_x") + 102 | theme(axis.text.x = element_text(angle = 90, hjust = 1), 103 | axis.ticks = element_blank(), 104 | panel.grid = element_blank()) + 105 | labs(fill = "% late at departure") + 106 | labs(x = "Month", 107 | y = "Departure station", 108 | title = "Which stations had delays in which months?", 109 | subtitle = "Ordered by the average delay, with international routes on the bottom") 110 | ``` 111 | 112 | -------------------------------------------------------------------------------- /gdpr.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | name: metrics_gdpr_violations 3 | owner: admiral.david@gmail.com 4 | metrics: 5 
| nb_violations: 6 | title: Number of GDPR Violations 7 | description: Number of GDPR Violations in this time period. 8 | nb_total_fine: 9 | title: Total Fine (Euros) 10 | description: Total fine across violations in Euros. (TODO) 11 | dimensions: 12 | country: 13 | title: Country 14 | description: France/Germany/etc 15 | article_title: 16 | title: Article 17 | description: GDPR Article 18 | type: 19 | title: Type 20 | description: Type of violation 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = TRUE) 25 | ``` 26 | 27 | ```{r} 28 | library(tidyverse) 29 | library(lubridate) 30 | library(scales) 31 | theme_set(theme_light()) 32 | 33 | 34 | tuesdata <- tidytuesdayR::tt_load('2020-04-21') 35 | 36 | gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv') %>% 37 | mutate(date = na_if(mdy(date), "1970-01-01")) %>% 38 | rename(country = name) 39 | ``` 40 | 41 | ```{r} 42 | gdpr_violations %>% 43 | summarize(total = sum(price)) 44 | 45 | gdpr_violations %>% 46 | count(country = fct_lump(country, 8, w = price), 47 | sort = TRUE, wt = price, name = "total_price") %>% 48 | mutate(country = fct_reorder(country, total_price)) %>% 49 | ggplot(aes(total_price, country)) + 50 | geom_col() + 51 | scale_x_continuous(labels = dollar_format()) 52 | ``` 53 | 54 | ```{r} 55 | gdpr_violations %>% 56 | count(month = floor_date(date, "month"), 57 | country = fct_lump(country, 6, w = price), 58 | sort = TRUE, wt = price, name = "total_price") %>% 59 | mutate(country = fct_reorder(country, -total_price, sum)) %>% 60 | ggplot(aes(month, total_price, fill = country)) + 61 | geom_col() + 62 | scale_y_continuous(labels = dollar_format()) + 63 | labs(x = "Time", 64 | y = "Total fines", 65 | fill = "Country") 66 | ``` 67 | 68 | ```{r} 69 | gdpr_violations %>% 70 | count(source, country, sort = TRUE) 71 | 72 | gdpr_violations %>% 73 | select(controller, date, article_violated, type, summary, price) %>% 74 | mutate(summary = str_trunc(summary, 140)) %>% 75 | arrange(desc(price)) %>% 76 | mutate(price = dollar(price)) %>% 77 | head(10) 78 | ``` 79 | 80 | ```{r} 81 | gdpr_violations %>% 82 | mutate(type = fct_lump(type, 8, w = price), 83 | type = fct_reorder(type, price), 84 | country = fct_lump(country, 5)) %>% 85 | ggplot(aes(price, type)) + 86 | geom_boxplot() + 87 | geom_jitter(aes(color = country), width = 0, height = .25) + 88 | scale_x_log10(labels = dollar_format()) 89 | ``` 90 | 91 | ### Which article was violated? 92 | 93 | ```{r} 94 | article_titles <- gdpr_text %>% 95 | distinct(article, article_title) 96 | 97 | separated_articles <- gdpr_violations %>% 98 | separate_rows(article_violated, sep = "\\|") %>% 99 | extract(article_violated, "article", "Art\\. ?(\\d+)", convert = TRUE, remove = FALSE) %>% 100 | left_join(article_titles, by = "article") %>% 101 | mutate(article_title = paste0(article, ". 
", str_trunc(article_title, 50)), 102 | article_title = ifelse(is.na(article), "Unknown", article_title)) %>% 103 | add_count(id) %>% 104 | mutate(price_per_article = price / n) 105 | 106 | separated_articles %>% 107 | group_by(article = fct_lump(article_title, 8, w = price)) %>% 108 | summarize(total_fine = sum(price_per_article), 109 | violations = n()) %>% 110 | arrange(desc(total_fine)) %>% 111 | mutate(article = fct_reorder(article, total_fine)) %>% 112 | ggplot(aes(total_fine, article)) + 113 | geom_col() + 114 | scale_x_continuous(labels = dollar_format()) + 115 | labs(title = "What articles got the most fines?", 116 | y = "", 117 | x = "Total fine") 118 | ``` 119 | 120 | ```{r} 121 | gdpr_text <- tuesdata$gdpr_text 122 | ``` 123 | 124 | ```{r} 125 | gdpr_violations %>% 126 | filter(str_detect(controller, "Vodafone")) %>% 127 | group_by(date, country) %>% 128 | summarize(violations = n(), 129 | total_fine = sum(price)) %>% 130 | ggplot(aes(date, total_fine, size = violations, color = country)) + 131 | geom_point() + 132 | scale_y_continuous(labels = dollar_format()) + 133 | scale_size_continuous(guide = FALSE) + 134 | labs(title = "Vodafone's GDPR violations", 135 | color = "", 136 | x = "Time", 137 | y = "Total fine on this day") 138 | ``` 139 | 140 | ```{r} 141 | library(tidymetrics) 142 | 143 | summarized <- separated_articles %>% 144 | filter(!is.na(date)) %>% 145 | mutate(country = fct_lump(country, 6, w = price_per_article), 146 | article_title = fct_lump(article_title, 6, w = price_per_article), 147 | type = fct_lump(type, 6, w = price_per_article)) %>% 148 | cross_by_dimensions(country, article_title, type) %>% 149 | cross_by_periods(c("month", "quarter")) %>% 150 | summarize(nb_violations = n_distinct(id), 151 | nb_total_fine = sum(price_per_article)) %>% 152 | ungroup() 153 | 154 | gdpr_metrics <- create_metrics(summarized) 155 | ``` 156 | 157 | ```{r} 158 | library(shinymetrics) 159 | 160 | preview_metric(gdpr_metrics$gdpr_violations_nb_violations) 161 | ``` 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /golden-age-tv.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | tv_ratings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-08/IMDb_Economist_tv_ratings.csv") 15 | ``` 16 | 17 | ```{r} 18 | tv_ratings %>% 19 | filter(title == "Buffy the Vampire Slayer") 20 | ``` 21 | 22 | ## Do most shows get better or worse over time? 23 | 24 | ```{r} 25 | tv_ratings %>% 26 | count(seasonNumber) %>% 27 | ggplot(aes(seasonNumber, n)) + 28 | geom_line() 29 | 30 | tv_ratings %>% 31 | filter(seasonNumber <= 7) %>% 32 | group_by(seasonNumber) %>% 33 | summarize(av_rating = mean(av_rating)) %>% 34 | ggplot(aes(seasonNumber, av_rating)) + 35 | geom_line() 36 | 37 | tv_ratings_full_show <- tv_ratings %>% 38 | group_by(title) %>% 39 | filter(1 %in% seasonNumber, 40 | all(seasonNumber == dplyr::row_number())) 41 | 42 | tv_ratings_full_show %>% 43 | filter(n() >= 5) %>% 44 | filter(seasonNumber <= 5) %>% 45 | group_by(seasonNumber) %>% 46 | ggplot(aes(factor(seasonNumber), av_rating)) + 47 | geom_boxplot() 48 | ``` 49 | 50 | What are the highest rated seasons of TV? 
51 | 52 | ```{r} 53 | by_show <- tv_ratings %>% 54 | group_by(title) %>% 55 | summarize(avg_rating = mean(av_rating), 56 | sd = sd(av_rating), 57 | seasons = n()) %>% 58 | arrange(desc(avg_rating)) 59 | 60 | most_variable_shows <- by_show %>% 61 | filter(seasons >= 5) %>% 62 | top_n(6, sd) 63 | 64 | tv_ratings %>% 65 | inner_join(most_variable_shows, by = "title") %>% 66 | ggplot(aes(seasonNumber, av_rating, color = title)) + 67 | geom_line() + 68 | geom_point() + 69 | scale_x_continuous(breaks = function(lim) seq(floor(lim[1]), ceiling(lim[2]), 2)) + 70 | facet_wrap(~ title) + 71 | theme(legend.position = "none") 72 | ``` 73 | 74 | ### Has TV been getting better or worse over time? 75 | 76 | ```{r} 77 | library(lubridate) 78 | 79 | tv_ratings %>% 80 | mutate(year = year(date)) %>% 81 | ggplot(aes(av_rating)) + 82 | geom_histogram() + 83 | facet_wrap(~ year) 84 | 85 | by_year <- tv_ratings %>% 86 | group_by(year = 2 * (year(date) %/% 2)) %>% 87 | summarize(mean_rating = mean(av_rating), 88 | median_rating = median(av_rating), 89 | mean_season_1 = mean(av_rating[seasonNumber == 1]), 90 | avg_season = mean(seasonNumber), 91 | sd = sd(av_rating), 92 | observations = n()) 93 | 94 | by_year %>% 95 | gather(metric, value, -year, -observations, -avg_season, -sd) %>% 96 | ggplot(aes(year, value, color = metric)) + 97 | geom_line() + 98 | geom_point() 99 | 100 | tv_ratings %>% 101 | group_by(year = 5 * (year(date) %/% 5), 102 | seasonNumber = ifelse(seasonNumber >= 4, "4+", seasonNumber)) %>% 103 | summarize(mean_rating = mean(av_rating), 104 | observations = n()) %>% 105 | ggplot(aes(year, mean_rating, color = seasonNumber)) + 106 | geom_line() + 107 | labs(color = "Season", 108 | y = "Average rating in each 5 year period") 109 | 110 | tv_ratings %>% 111 | ggplot(aes(date, av_rating)) + 112 | geom_point() + 113 | geom_smooth(method = "loess") 114 | 115 | tv_ratings %>% 116 | mutate(year = 2 * (year(date) %/% 2)) %>% 117 | ggplot(aes(year, av_rating, group = year)) + 118 | geom_boxplot() 119 | 120 | tv_ratings %>% 121 | filter(seasonNumber == 1) %>% 122 | mutate(year = 2 * (year(date) %/% 2)) %>% 123 | ggplot(aes(year, av_rating, group = year)) + 124 | geom_boxplot() 125 | ``` 126 | 127 | ### Show survival 128 | 129 | If season 1 is good, what's the probability they get a season 2? 
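Before fitting on the real data, here's a minimal sketch of the logistic regression used later in this section, on simulated data (assuming, purely for illustration, that higher season-1 ratings raise renewal odds):

```{r}
library(tidyverse)
library(broom)

set.seed(42)
sim <- tibble(season1 = runif(500, 5, 9.5)) %>%
  mutate(renewed = rbinom(n(), 1, plogis(-8 + 1.1 * season1)))

mod <- glm(renewed ~ season1, data = sim, family = "binomial")

# Predicted renewal probability across the ratings range
augment(mod, newdata = tibble(season1 = 6:9), type.predict = "response")
```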
130 | 131 | ```{r} 132 | first_three_seasons <- tv_ratings %>% 133 | filter(seasonNumber <= 3) %>% 134 | group_by(title) %>% 135 | mutate(date_first_season = min(date)) %>% 136 | ungroup() %>% 137 | transmute(titleId, 138 | title, 139 | date_first_season, 140 | seasonNumber = paste0("season", seasonNumber), 141 | av_rating) %>% 142 | distinct(title, seasonNumber, .keep_all = TRUE) %>% 143 | spread(seasonNumber, av_rating) %>% 144 | filter(!is.na(season1)) %>% 145 | filter(date_first_season <= "2017-01-01") 146 | 147 | library(broom) 148 | 149 | glm(!is.na(season2) ~ season1, data = first_three_seasons) %>% 150 | summary() 151 | ``` 152 | 153 | ```{r} 154 | first_three_seasons %>% 155 | group_by(season1_bin = cut(season1, c(0, 7, 7.5, 8, 8.5, 10)), 156 | time_bin = ifelse(date_first_season < "2000-01-01", "Before 2000", "After 2000")) %>% 157 | summarize(had_second_season = mean(!is.na(season2)), 158 | observations = n()) %>% 159 | ggplot(aes(season1_bin, 160 | had_second_season, 161 | color = time_bin, 162 | group = time_bin)) + 163 | geom_line() + 164 | geom_point() + 165 | scale_y_continuous(labels = scales::percent_format()) 166 | ``` 167 | 168 | ```{r} 169 | library(broom) 170 | 171 | new_data <- crossing( 172 | year = 1990:2018, 173 | season1 = seq(6, 9) 174 | ) 175 | 176 | library(splines) 177 | 178 | mod <- first_three_seasons %>% 179 | mutate(year = year(date_first_season), 180 | had_second_season = !is.na(season2)) %>% 181 | glm(had_second_season ~ season1 * year, data = ., family = "binomial") 182 | 183 | summary(mod) 184 | 185 | mod %>% 186 | augment(newdata = new_data, type.predict = "response") %>% 187 | ggplot(aes(year, .fitted, color = factor(season1))) + 188 | geom_line() + 189 | scale_y_continuous(labels = scales::percent_format()) + 190 | labs(title = "Probability of getting season 2 has become more dependent on ratings of season 1", 191 | color = "Season 1 rating", 192 | x = "Year show premiered", 193 | y = "Predicted probability of getting season 2") 194 | ``` 195 | 196 | 197 | 198 | Whether a show gets a second season depends on the ratings of the first season 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /honeycomb-puzzle.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | words <- tibble(word = read_lines("https://norvig.com/ngrams/enable1.txt")) %>% 14 | mutate(word_length = str_length(word)) %>% 15 | filter(word_length >= 4, 16 | !str_detect(word, "s")) %>% 17 | mutate(letters = str_split(word, ""), 18 | letters = map(letters, unique), 19 | unique_letters = lengths(letters)) %>% 20 | mutate(points = ifelse(word_length == 4, 1, word_length) + 21 | 15 * (unique_letters == 7)) %>% 22 | filter(unique_letters <= 7) %>% 23 | arrange(desc(points)) 24 | 25 | center_letter <- "g" 26 | other_letters <- c("a", "p", "x", "m", "e", "l") 27 | 28 | get_words <- function(center_letter, other_letters) { 29 | words %>% 30 | filter(str_detect(word, center_letter)) %>% 31 | mutate(invalid_letters = map(letters, setdiff, c(center_letter, other_letters))) %>% 32 | filter(lengths(invalid_letters) == 0) %>% 33 | arrange(desc(points)) 34 | } 35 | 36 | library(tidytext) 37 | 38 | letters_unnested <- words %>% 39 | select(word, points) %>% 40 | unnest_tokens(letter, word, token = "characters", drop = FALSE) 
%>% 41 | distinct(word, letter, .keep_all = TRUE) 42 | 43 | letters_summarized <- letters_unnested %>% 44 | group_by(letter) %>% 45 | summarize(n_words = n(), 46 | n_points = sum(points)) %>% 47 | arrange(desc(n_points)) 48 | ``` 49 | 50 | ```{r} 51 | word_matrix <- letters_unnested %>% 52 | reshape2::acast(word ~ letter, fun.aggregate = length) 53 | 54 | # Points per word (lines up with rows of word matrix) 55 | points_per_word <- words$points 56 | names(points_per_word) <- words$word 57 | points_per_word <- points_per_word[rownames(word_matrix)] 58 | 59 | get_score <- function(honeycomb_letters) { 60 | center_letter <- honeycomb_letters[1] 61 | 62 | permitted_letters <- colnames(word_matrix) %in% honeycomb_letters 63 | 64 | num_forbidden <- word_matrix %*% (1L - permitted_letters) 65 | word_permitted <- num_forbidden == 0L & word_matrix[, center_letter] == 1L 66 | 67 | sum(points_per_word[word_permitted]) 68 | } 69 | 70 | get_score(c("e", "i", "a", "r", "n", "t", "l")) 71 | 72 | get_words("e", c("i", "a", "r", "n", "t", "l")) 73 | ``` 74 | 75 | ```{r} 76 | center_letter <- "e" 77 | 78 | find_best_combination <- function(center_letter, possible_letters) { 79 | good_letter_combinations <- combn(possible_letters, 6) 80 | 81 | # Every column is one of the possible honeycombs 82 | forbidden_matrix <- 1L - apply(good_letter_combinations, 83 | 2, 84 | function(.) colnames(word_matrix) %in% c(center_letter, .)) 85 | 86 | filtered_word_matrix <- word_matrix[word_matrix[, center_letter] == 1, ] 87 | word_allowed_matrix <- filtered_word_matrix %*% forbidden_matrix == 0 88 | scores <- t(word_allowed_matrix) %*% points_per_word[rownames(word_allowed_matrix)] 89 | 90 | list(center_letter = center_letter, 91 | other_letters = good_letter_combinations[, which.max(scores)], 92 | score = max(scores)) 93 | } 94 | 95 | pool <- head(letters_summarized$letter, 16) 96 | 97 | find_best_combination("e", setdiff(pool, "e")) 98 | find_best_combination("i", setdiff(pool, "i")) 99 | find_best_combination("a", setdiff(pool, "a")) 100 | find_best_combination("r", setdiff(pool, "r")) 101 | find_best_combination("n", setdiff(pool, "n")) 102 | find_best_combination("t", setdiff(pool, "t")) 103 | find_best_combination("g", setdiff(pool, "g")) 104 | 105 | get_score(c("r", "e", "i", "a", "n", "t", "g")) 106 | ``` 107 | 108 | ```{r} 109 | permitted_letters <- colnames(word_matrix) %in% honeycomb_letters 110 | 111 | num_forbidden <- word_matrix %*% (1L - permitted_letters) 112 | word_permitted <- num_forbidden == 0L & word_matrix[, center_letter] == 1L 113 | 114 | sum(points_per_word[word_permitted]) 115 | ``` 116 | 117 | 118 | ```{r} 119 | words %>% 120 | unnest(letters) %>% 121 | group_by(letters) %>% 122 | summarize(total_points = sum(points), 123 | ) 124 | ``` 125 | 126 | ```{r} 127 | words 128 | ``` 129 | 130 | 131 | 132 | ```{r} 133 | 134 | ``` 135 | 136 | 137 | -------------------------------------------------------------------------------- /horror-movie-ratings.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | horror_movies_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-22/horror_movies.csv") 15 | ``` 16 | 17 | ```{r} 18 | horror_movies <- horror_movies_raw %>% 19 | arrange(desc(review_rating)) 
%>% 20 | extract(title, "year", "\\((\\d\\d\\d\\d)\\)$", remove = FALSE, convert = TRUE) %>% 21 | mutate(budget = parse_number(budget)) %>% 22 | separate(plot, c("director", "cast_sentence", "plot"), extra = "merge", sep = "\\. ", fill = "right") %>% 23 | distinct(title, .keep_all = TRUE) 24 | ``` 25 | 26 | Most of the movies are from 2012 onward. 27 | 28 | ```{r} 29 | horror_movies %>% 30 | count(genres, sort = TRUE) 31 | 32 | horror_movies %>% 33 | count(language, sort = TRUE) 34 | 35 | horror_movies %>% 36 | ggplot(aes(budget)) + 37 | geom_histogram() + 38 | scale_x_log10(labels = scales::dollar) 39 | ``` 40 | 41 | Do higher budget movies end up higher rated? 42 | 43 | ```{r} 44 | horror_movies %>% 45 | ggplot(aes(budget, review_rating)) + 46 | geom_point() + 47 | scale_x_log10(labels = scales::dollar) + 48 | geom_smooth(method = "lm") 49 | ``` 50 | 51 | No relationship between budget and rating. How about the MPAA rating and the review score? 52 | 53 | ```{r} 54 | horror_movies %>% 55 | mutate(movie_rating = fct_lump(movie_rating, 5), 56 | movie_rating = fct_reorder(movie_rating, review_rating, na.rm = TRUE)) %>% 57 | ggplot(aes(movie_rating, review_rating)) + 58 | geom_boxplot() + 59 | coord_flip() 60 | 61 | horror_movies %>% 62 | filter(!is.na(movie_rating)) %>% 63 | mutate(movie_rating = fct_lump(movie_rating, 5)) %>% 64 | lm(review_rating ~ movie_rating, data = .) %>% 65 | anova() 66 | ``` 67 | 68 | ```{r} 69 | horror_movies %>% 70 | separate_rows(genres, sep = "\\| ") %>% 71 | mutate(genre = fct_lump(genres, 5)) %>% 72 | ggplot(aes(genre, review_rating)) + 73 | geom_boxplot() 74 | ``` 75 | 76 | ```{r} 77 | library(tidytext) 78 | 79 | horror_movies_unnested <- horror_movies %>% 80 | unnest_tokens(word, plot) %>% 81 | anti_join(stop_words, by = "word") %>% 82 | filter(!is.na(word)) 83 | 84 | horror_movies_unnested %>% 85 | filter(!is.na(review_rating)) %>% 86 | group_by(word) %>% 87 | summarize(movies = n(), 88 | avg_rating = mean(review_rating)) %>% 89 | arrange(desc(movies)) %>% 90 | filter(movies >= 100) %>% 91 | mutate(word = fct_reorder(word, avg_rating)) %>% 92 | ggplot(aes(avg_rating, word)) + 93 | geom_point() 94 | ``` 95 | 96 | ### Lasso regression for predicting review rating based on words in plot 97 | 98 | ```{r} 99 | library(glmnet) 100 | library(Matrix) 101 | 102 | movie_word_matrix <- horror_movies_unnested %>% 103 | filter(!is.na(review_rating)) %>% 104 | add_count(word) %>% 105 | filter(n >= 20) %>% 106 | count(title, word) %>% 107 | cast_sparse(title, word, n) 108 | 109 | rating <- horror_movies$review_rating[match(rownames(movie_word_matrix), horror_movies$title)] 110 | 111 | lasso_model <- cv.glmnet(movie_word_matrix, rating) 112 | ``` 113 | 114 | ```{r} 115 | library(broom) 116 | 117 | tidy(lasso_model$glmnet.fit) %>% 118 | filter(term %in% c("quickly", "seek", "army", "teacher", "unexpected", "friends", "evil")) %>% 119 | ggplot(aes(lambda, estimate, color = term)) + 120 | geom_line() + 121 | scale_x_log10() + 122 | geom_vline(xintercept = lasso_model$lambda.min) + 123 | geom_hline(yintercept = 0, lty = 2) 124 | ``` 125 | 126 | ```{r} 127 | plot(lasso_model) 128 | 129 | tidy(lasso_model$glmnet.fit) %>% 130 | filter(lambda == lasso_model$lambda.min, 131 | term != "(Intercept)") %>% 132 | mutate(term = fct_reorder(term, estimate)) %>% 133 | ggplot(aes(term, estimate)) + 134 | geom_col() + 135 | coord_flip() 136 | ``` 137 | 138 | Throwing everything into a linear model: director, cast, genre, rating, plot words.
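The feature construction below hinges on collapsing metadata into `type: value` strings and widening them into a sparse 0/1 matrix; a small standalone sketch of that trick on made-up rows:

```{r}
library(tidyverse)
library(tidytext)  # cast_sparse()

toy <- tibble(title = c("m1", "m1", "m2"),
              type = c("director", "genre", "genre"),
              value = c("Smith", "Comedy", "Horror"))

# One row per (movie, feature) pair becomes a binary movie-by-feature matrix
toy %>%
  unite(feature, type, value, sep = ": ") %>%
  cast_sparse(title, feature)
```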
139 | 140 | ```{r} 141 | features <- horror_movies %>% 142 | filter(!is.na(review_rating)) %>% 143 | select(title, genres, director, cast, movie_rating, language, release_country) %>% 144 | mutate(director = str_remove(director, "Directed by ")) %>% 145 | gather(type, value, -title) %>% 146 | filter(!is.na(value)) %>% 147 | separate_rows(value, sep = "\\| ?") %>% 148 | unite(feature, type, value, sep = ": ") %>% 149 | mutate(n = 1) 150 | 151 | movie_feature_matrix <- horror_movies_unnested %>% 152 | filter(!is.na(review_rating)) %>% 153 | count(title, feature = paste0("word: ", word)) %>% 154 | bind_rows(features) %>% 155 | add_count(feature) %>% 156 | filter(n >= 10) %>% 157 | cast_sparse(title, feature) 158 | 159 | rating <- horror_movies$review_rating[match(rownames(movie_feature_matrix), horror_movies$title)] 160 | 161 | feature_lasso_model <- cv.glmnet(movie_feature_matrix, rating) 162 | ``` 163 | 164 | ```{r} 165 | plot(feature_lasso_model) 166 | 167 | tidy(feature_lasso_model$glmnet.fit) %>% 168 | filter(lambda == feature_lasso_model$lambda.1se, 169 | term != "(Intercept)") %>% 170 | mutate(term = fct_reorder(term, estimate)) %>% 171 | ggplot(aes(term, estimate)) + 172 | geom_col() + 173 | coord_flip() + 174 | labs(x = "", 175 | y = "Coefficient for predicting movie rating", 176 | title = "What affects a horror movie rating?", 177 | subtitle = "Based on a lasso regression to predict IMDb ratings of ~3000 movies") 178 | ``` 179 | 180 | What am I going to watch? 181 | 182 | ```{r} 183 | horror_movies %>% 184 | filter(str_detect(genres, "Comedy"), 185 | !is.na(movie_rating), 186 | !is.na(budget), 187 | movie_rating != "PG") %>% 188 | arrange(desc(review_rating)) %>% 189 | select(title, review_rating, movie_rating, plot, director, budget, language) %>% 190 | View() 191 | ``` 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /malaria.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Malaria Atlas" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | library(malariaAtlas) 15 | ``` 16 | 17 | ```{r} 18 | kenya_pr <- tbl_df(malariaAtlas::getPR(ISO = "KEN", species = "BOTH")) %>% 19 | filter(!is.na(pr)) 20 | ``` 21 | 22 | ```{r} 23 | kenya_pr %>% 24 | group_by(year_start) %>% 25 | summarize(examined = sum(examined), 26 | positive = sum(positive), 27 | studies = n()) %>% 28 | mutate(pr = positive / examined) %>% 29 | ggplot(aes(year_start, pr)) + 30 | geom_line() 31 | 32 | kenya_pr %>% 33 | mutate(decade = 10 * (year_start %/% 10)) %>% 34 | arrange(pr) %>% 35 | ggplot(aes(longitude, latitude, color = pr)) + 36 | borders("world", regions = "Kenya") + 37 | geom_point() + 38 | scale_color_gradient2(low = "blue", high = "red", midpoint = .5, labels = scales::percent_format()) + 39 | facet_wrap(~ decade) + 40 | theme_void() + 41 | coord_map() + 42 | labs(color = "Prevalence") 43 | ``` 44 | 45 | ### Aggregated across countries 46 | 47 | ```{r} 48 | malaria_inc <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-13/malaria_inc.csv") 49 | 50 | malaria_inc_processed <- malaria_inc %>% 51 | setNames(c("country", "code", "year", "incidence")) %>% 52 | mutate(incidence = incidence / 1000) 53 | ``` 54 | 55 | ```{r} 56 | malaria_inc_processed %>% 57 | filter(country %in% sample(unique(country), 6)) %>% 58 | 
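# six countries sampled at random, just to keep the line plot legible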
ggplot(aes(year, incidence, color = country)) + 59 | geom_line() + 60 | scale_y_continuous(labels = scales::percent_format()) 61 | ``` 62 | 63 | Looking at 2015 levels and the change from 2015 to 2000 64 | 65 | ```{r} 66 | malaria_spread <- malaria_inc_processed %>% 67 | mutate(year = paste0("Y", year)) %>% 68 | spread(year, incidence) 69 | 70 | malaria_spread %>% 71 | filter(country != "Turkey", 72 | !is.na(code)) %>% 73 | mutate(current = Y2015, 74 | change = Y2015 - Y2000) %>% 75 | ggplot(aes(current, change)) + 76 | geom_point() + 77 | geom_text(aes(label = code), vjust = 1, hjust = 1) 78 | ``` 79 | 80 | ```{r} 81 | world <- map_data("world") %>% 82 | filter(region != "Antarctica") 83 | 84 | malaria_inc_processed %>% 85 | filter(incidence < 1) %>% 86 | inner_join(maps::iso3166 %>% 87 | select(a3, mapname), by = c(code = "a3")) %>% 88 | inner_join(world, by = c(mapname = "region")) %>% 89 | ggplot(aes(long, lat, group = group, fill = incidence)) + 90 | geom_polygon() + 91 | scale_fill_gradient2(low = "blue", high = "red", midpoint = .20, labels = scales::percent_format()) + 92 | coord_map() + 93 | facet_wrap(~ year) + 94 | theme_void() + 95 | labs(title = "Malaria incidence over time around the world") 96 | ``` 97 | 98 | ### Malaria deaths over time 99 | 100 | 101 | ```{r} 102 | library(tidyverse) 103 | 104 | malaria_deaths <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-13/malaria_deaths.csv") 105 | 106 | malaria_deaths_processed <- malaria_deaths %>% 107 | setNames(c("country", "code", "year", "deaths")) 108 | ``` 109 | 110 | ```{r} 111 | malaria_deaths_processed %>% 112 | filter(country %in% sample(unique(country), 6)) %>% 113 | ggplot(aes(year, deaths, color = country)) + 114 | geom_line() + 115 | labs(y = "Deaths per 100,000") 116 | ``` 117 | 118 | ```{r} 119 | library(fuzzyjoin) 120 | library(stringr) 121 | 122 | malaria_country_data <- malaria_deaths_processed %>% 123 | inner_join(maps::iso3166 %>% 124 | select(a3, mapname), by = c(code = "a3")) %>% 125 | mutate(mapname = str_remove(mapname, "\\(.*")) 126 | 127 | malaria_map_data <- map_data("world") %>% 128 | filter(region != "Antarctica") %>% 129 | tbl_df() %>% 130 | inner_join(malaria_country_data, by = c(region = "mapname")) 131 | 132 | malaria_map_data %>% 133 | ggplot(aes(long, lat, group = group, fill = deaths)) + 134 | geom_polygon() + 135 | scale_fill_gradient2(low = "blue", high = "red", midpoint = 100) + 136 | theme_void() + 137 | labs(title = "Malaria deaths over time around the world", 138 | fill = "Deaths per 100,000") 139 | ``` 140 | 141 | ```{r} 142 | library(gganimate) 143 | library(countrycode) 144 | 145 | malaria_map_data %>% 146 | mutate(continent = countrycode(code, "iso3c", "continent")) %>% 147 | filter(continent == "Africa") %>% 148 | ggplot(aes(long, lat, group = group, fill = deaths)) + 149 | geom_polygon() + 150 | scale_fill_gradient2(low = "blue", high = "red", midpoint = 100) + 151 | theme_void() + 152 | labs(title = "Malaria deaths over time in Africa ({ current_frame })", 153 | fill = "Deaths per 100,000") + 154 | transition_manual(year) 155 | 156 | anim_save("malaria_map.gif") 157 | ``` 158 | 159 | -------------------------------------------------------------------------------- /media-franchises.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Media Franchise Revenues" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | 
```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | media_franchises <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-02/media_franchises.csv") 15 | ``` 16 | 17 | ```{r} 18 | media_franchises %>% 19 | count(franchise, sort = TRUE) 20 | 21 | franchises <- media_franchises %>% 22 | group_by(franchise, original_media, year_created, creators, owners) %>% 23 | summarize(categories = n(), 24 | total_revenue = sum(revenue), 25 | most_profitable = revenue_category[which.max(revenue)]) %>% 26 | ungroup() 27 | 28 | franchises 29 | 30 | media_franchises 31 | ``` 32 | 33 | ### Franchise-level 34 | 35 | What are the biggest franchises? 36 | 37 | ```{r} 38 | library(glue) 39 | 40 | top_franchises <- franchises %>% 41 | mutate(franchise = glue("{ franchise } ({ year_created })")) %>% 42 | top_n(20, total_revenue) 43 | 44 | media_franchises %>% 45 | mutate(franchise = glue("{ franchise } ({ year_created })")) %>% 46 | semi_join(top_franchises, by = "franchise") %>% 47 | mutate(franchise = fct_reorder(franchise, revenue, sum), 48 | revenue_category = fct_reorder(revenue_category, revenue, sum)) %>% 49 | ggplot(aes(franchise, revenue)) + 50 | geom_col(aes(fill = revenue_category)) + 51 | geom_text(aes(y = total_revenue, 52 | label = paste0(scales::dollar(total_revenue, accuracy = 1), "B")), 53 | data = top_franchises, 54 | hjust = 0) + 55 | scale_y_continuous(labels = scales::dollar) + 56 | expand_limits(y = 100) + 57 | coord_flip() + 58 | theme(panel.grid.major.y = element_blank()) + 59 | guides(fill = guide_legend(reverse = TRUE)) + 60 | labs(title = "What are the most profitable franchises of all time?", 61 | fill = "Category", 62 | x = "", 63 | y = "Revenue (Billions)") 64 | ``` 65 | 66 | ```{r} 67 | media_franchises %>% 68 | group_by(owners) %>% 69 | filter(n_distinct(franchise) > 2) %>% 70 | ungroup() %>% 71 | mutate(franchise = fct_reorder(franchise, revenue, sum), 72 | owners = fct_reorder(owners, -revenue, sum), 73 | revenue_category = fct_reorder(revenue_category, revenue, sum)) %>% 74 | ggplot(aes(franchise, revenue, fill = revenue_category)) + 75 | geom_col() + 76 | facet_wrap(~ owners, scales = "free_y") + 77 | guides(fill = guide_legend(reverse = TRUE)) + 78 | coord_flip() + 79 | labs(title = "What companies own at least 3 franchises?", 80 | fill = "Category", 81 | x = "", 82 | y = "Revenue (Billions)") 83 | ``` 84 | 85 | ```{r} 86 | franchises %>% 87 | ggplot(aes(year_created, total_revenue)) + 88 | geom_point(aes(size = total_revenue, color = original_media)) + 89 | geom_text(aes(label = franchise), check_overlap = TRUE, vjust = 1, hjust = 1) + 90 | expand_limits(x = 1910) + 91 | labs(title = "When were the 'great' franchises created?") 92 | ``` 93 | 94 | ```{r} 95 | media_franchises %>% 96 | group_by(original_media) %>% 97 | summarize(revenue = sum(revenue)) 98 | 99 | original_media_revenue_categories <- media_franchises %>% 100 | group_by(original_media) %>% 101 | filter(sum(revenue) >= 45) %>% 102 | group_by(original_media, revenue_category) %>% 103 | summarize(revenue = sum(revenue)) %>% 104 | ungroup() %>% 105 | mutate(revenue_category = fct_reorder(revenue_category, revenue, sum), 106 | original_media = fct_reorder(original_media, -revenue, sum)) 107 | 108 | original_media_revenue_categories %>% 109 | ggplot(aes(revenue_category, revenue)) + 110 | geom_col() + 111 | scale_y_continuous(labels = scales::dollar) + 112 | coord_flip() + 113 | facet_wrap(~ original_media) + 114 | labs(x = "Revenue category", 115 | y 
= "Revenue (Billions)", 116 | title = "What kinds of media lead to what types of revenue?") 117 | ``` 118 | 119 | ```{r} 120 | original_media_revenue_categories %>% 121 | mutate(revenue_category = fct_rev(revenue_category), 122 | original_media = fct_rev(original_media)) %>% 123 | ggplot(aes(revenue_category, original_media, fill = revenue)) + 124 | geom_tile() + 125 | scale_fill_gradient2(low = "white", high = "red", labels = scales::dollar) + 126 | theme(panel.grid = element_blank(), 127 | axis.text.x = element_text(angle = 90, hjust = 1)) + 128 | labs(fill = "Revenue (billions)") 129 | ``` 130 | 131 | 132 | -------------------------------------------------------------------------------- /medium-datasci.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | medium_datasci <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-12-04/medium_datasci.csv") 14 | 15 | theme_set(theme_light()) 16 | ``` 17 | 18 | ```{r} 19 | medium_processed <- medium_datasci %>% 20 | select(-x1) %>% 21 | mutate(post_id = row_number()) 22 | ``` 23 | 24 | ```{r} 25 | medium_processed %>% 26 | count(author, sort = TRUE) 27 | 28 | medium_processed %>% 29 | summarize_at(vars(starts_with("tag_")), sum) 30 | 31 | medium_gathered <- medium_processed %>% 32 | gather(tag, value, starts_with("tag")) %>% 33 | mutate(tag = str_remove(tag, "tag_")) %>% 34 | filter(value == 1) 35 | 36 | medium_gathered %>% 37 | count(tag, sort = TRUE) 38 | 39 | medium_gathered %>% 40 | group_by(tag) %>% 41 | summarize(median_claps = median(claps)) %>% 42 | arrange(desc(median_claps)) 43 | 44 | medium_processed %>% 45 | ggplot(aes(claps)) + 46 | geom_histogram() + 47 | scale_x_log10(labels = scales::comma_format()) 48 | 49 | medium_processed %>% 50 | mutate(reading_time = pmin(10, reading_time)) %>% 51 | ggplot(aes(reading_time)) + 52 | geom_histogram(binwidth = .5) + 53 | scale_x_continuous(breaks = seq(2, 10, 2), 54 | labels = c(seq(2, 8, 2), "10+")) + 55 | labs(x = "Medium reading time") 56 | 57 | medium_gathered %>% 58 | group_by(tag) %>% 59 | summarize(reading_time = mean(reading_time)) %>% 60 | arrange(desc(reading_time)) 61 | ``` 62 | 63 | ### Text mining 64 | 65 | ```{r} 66 | library(tidytext) 67 | 68 | medium_words <- medium_processed %>% 69 | filter(!is.na(title)) %>% 70 | select(post_id, title, subtitle, year, reading_time, claps) %>% 71 | unnest_tokens(word, title) %>% 72 | anti_join(stop_words, by = "word") %>% 73 | filter(!(word %in% c("de", "en", "la", "para")), 74 | str_detect(word, "[a-z]")) 75 | 76 | medium_words %>% 77 | count(word, sort = TRUE) %>% 78 | mutate(word = fct_reorder(word, n)) %>% 79 | head(20) %>% 80 | ggplot(aes(word, n)) + 81 | geom_col() + 82 | coord_flip() + 83 | labs(title = "Common words in Medium post titles") 84 | ``` 85 | 86 | ```{r} 87 | medium_words_filtered <- medium_words %>% 88 | add_count(word) %>% 89 | filter(n >= 250) 90 | 91 | tag_claps <- medium_words_filtered %>% 92 | group_by(word) %>% 93 | summarize(median_claps = median(claps), 94 | geometric_mean_claps = exp(mean(log(claps + 1))) - 1, 95 | occurences = n()) %>% 96 | arrange(desc(median_claps)) 97 | 98 | library(widyr) 99 | library(ggraph) 100 | library(igraph) 101 | 102 | top_word_cors <- medium_words_filtered %>% 103 | select(post_id, word) %>% 104 | pairwise_cor(word, 
post_id, sort = TRUE) %>% 105 | head(150) 106 | 107 | vertices <- tag_claps %>% 108 | filter(word %in% top_word_cors$item1 | 109 | word %in% top_word_cors$item2) 110 | 111 | set.seed(2018) 112 | 113 | top_word_cors %>% 114 | graph_from_data_frame(vertices = vertices) %>% 115 | ggraph() + 116 | geom_edge_link() + 117 | geom_node_point(aes(size = occurences * 1.1)) + 118 | geom_node_point(aes(size = occurences, 119 | color = geometric_mean_claps)) + 120 | geom_node_text(aes(label = name), repel = TRUE) + 121 | scale_color_gradient2(low = "blue", 122 | high = "red", 123 | midpoint = 10) + 124 | theme_void() + 125 | labs(title = "What gets claps in Medium article titles?", 126 | subtitle = "Color shows the geometric mean of # of claps on articles with this word in the title", 127 | size = "# of occurrences", 128 | color = "Claps") 129 | ``` 130 | 131 | ### Predicting # of claps based on title + tag 132 | 133 | ```{r} 134 | # turn into a sparse matrix 135 | post_word_matrix <- medium_words_filtered %>% 136 | distinct(post_id, word, claps) %>% 137 | cast_sparse(post_id, word) 138 | 139 | # Fit a LASSO model 140 | library(glmnet) 141 | 142 | claps <- medium_processed$claps[match(rownames(post_word_matrix), medium_processed$post_id)] 143 | 144 | lasso_model <- cv.glmnet(post_word_matrix, log(claps + 1)) 145 | ``` 146 | 147 | ```{r} 148 | library(broom) 149 | 150 | tidy(lasso_model$glmnet.fit) %>% 151 | filter(term %in% c("hadoop", "learning", "gdpr", "deep", "startup", "marketing")) %>% 152 | ggplot(aes(lambda, estimate, color = term)) + 153 | geom_line() + 154 | scale_x_log10() 155 | 156 | tidy(lasso_model$glmnet.fit) %>% 157 | filter(lambda == lasso_model$lambda.min) %>% 158 | arrange(desc(estimate)) %>% 159 | View() 160 | ``` 161 | 162 | -------------------------------------------------------------------------------- /nyc-pizza.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Pizza" 3 | output: html_document 4 | editor_options: 5 | chunk_output_type: console 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## R Markdown 13 | 14 | ```{r} 15 | library(tidyverse) 16 | theme_set(theme_light()) 17 | ``` 18 | 19 | ```{r} 20 | pizza_jared <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_jared.csv") 21 | pizza_barstool <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_barstool.csv") 22 | pizza_datafiniti <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_datafiniti.csv") 23 | ``` 24 | 25 | ```{r} 26 | answer_orders <- c("Never Again", "Poor", "Average", "Good", "Excellent") 27 | 28 | by_place_answer <- pizza_jared %>% 29 | mutate(time = as.POSIXct(time, origin = "1970-01-01"), 30 | date = as.Date(time), 31 | answer = fct_relevel(answer, answer_orders)) %>% 32 | group_by(place, answer) %>% 33 | summarize(votes = sum(votes)) %>% 34 | mutate(total = sum(votes), 35 | percent = votes / total, 36 | answer_integer = as.integer(answer), 37 | average = sum(answer_integer * percent)) %>% 38 | ungroup() 39 | 40 | by_place <- by_place_answer %>% 41 | distinct(place, total, average) 42 | ``` 43 | 44 | ```{r} 45 | by_place_answer %>% 46 | filter(as.integer(fct_reorder(place, total, .desc = TRUE)) <= 16, 47 | answer != "Fair") %>% 48 | mutate(place = glue::glue("{ place } ({ total })"), 49 | place = 
fct_reorder(place, average)) %>% 50 | ggplot(aes(answer, percent)) + 51 | geom_col() + 52 | facet_wrap(~ place) + 53 | scale_y_continuous(labels = scales::percent) + 54 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 55 | labs(x = "", 56 | y = "% of respondents", 57 | title = "What is the most popular pizza place in Open Stats meetup?", 58 | subtitle = "Only the 16 pizza places with the most respondents. # respondents shown in parentheses.") 59 | ``` 60 | 61 | ```{r} 62 | library(broom) 63 | 64 | # THIS TRICK DOESN'T WORK 65 | # tidy(lm(c(1, 2, 3, 4, 5) ~ 1, weights = c(100, 300, 100, 200, 150)), conf.int = TRUE) 66 | 67 | t_test_repeated <- function(x, frequency) { 68 | tidy(t.test(rep(x, frequency))) 69 | } 70 | 71 | by_place_answer %>% 72 | filter(total >= 3) %>% 73 | group_by(place, total) %>% 74 | summarize(t_test_result = list(t_test_repeated(answer_integer, votes))) %>% 75 | ungroup() %>% 76 | unnest(t_test_result) %>% 77 | select(place, total, average = estimate, low = conf.low, high = conf.high) %>% 78 | top_n(16, total) %>% 79 | mutate(place = fct_reorder(place, average)) %>% 80 | ggplot(aes(average, place)) + 81 | geom_point(aes(size = total)) + 82 | geom_errorbarh(aes(xmin = low, xmax = high)) + 83 | labs(x = "Average score (1-5 Likert Scale)", 84 | y = "", 85 | title = "What is the most popular pizza place in Open Stats meetup?", 86 | subtitle = "Only the 16 pizza places with the most respondents.", 87 | size = "# of respondents") 88 | ``` 89 | 90 | ```{r} 91 | # Don't bother comparing them, this is a bad graph 92 | pizza_barstool %>% 93 | select(place = name, 94 | barstool_total = review_stats_all_count, 95 | barstool_average = review_stats_all_average_score) %>% 96 | inner_join(by_place, by = "place") %>% 97 | group_by(place) %>% 98 | filter(n() == 1) %>% 99 | ungroup() %>% 100 | filter(barstool_total >= 5, 101 | total >= 5) %>% 102 | ggplot(aes(average, barstool_average)) + 103 | geom_point() + 104 | labs(x = "Meetup", 105 | y = "Barstool") 106 | ``` 107 | 108 | ```{r} 109 | pizza_barstool %>% 110 | top_n(50, review_stats_all_count) %>% 111 | ggplot(aes(price_level, review_stats_all_average_score, group = price_level)) + 112 | geom_boxplot() 113 | 114 | pizza_barstool %>% 115 | filter(review_stats_all_count >= 50) %>% 116 | mutate(name = fct_reorder(name, review_stats_all_average_score)) %>% 117 | ggplot(aes(review_stats_all_average_score, name, size = review_stats_all_count)) + 118 | geom_point() + 119 | labs(x = "Average rating", 120 | y = "", 121 | size = "# of reviews", 122 | title = "Barstool Sports ratings of pizza places", 123 | subtitle = "Only places with at least 50 reviews") 124 | ``` 125 | 126 | ```{r} 127 | pizza_barstool %>% 128 | filter(review_stats_all_count >= 20) %>% 129 | mutate(city = fct_lump(city, 3)) %>% 130 | add_count(city) %>% 131 | mutate(city = glue::glue("{ city } ({ n })")) %>% 132 | ggplot(aes(city, review_stats_all_average_score)) + 133 | geom_boxplot() + 134 | labs(title = "Do pizza ratings differ across cities?", 135 | subtitle = "Only pizza places with at least 20 reviews") 136 | ``` 137 | 138 | ```{r} 139 | pizza_cleaned <- pizza_barstool %>% 140 | select(place = name, 141 | price_level, 142 | contains("review")) %>% 143 | rename_all(~ str_remove(., "review_stats_")) %>% 144 | select(-contains("provider")) 145 | 146 | pizza_cleaned %>% 147 | filter(critic_count > 0) %>% 148 | ggplot(aes(critic_average_score, dave_average_score)) + 149 | geom_point() + 150 | geom_abline(color = "red") + 151 | geom_smooth(method = "lm") + 152 | 
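# note added: the red geom_abline is the y = x line of perfect agreement; the blue geom_smooth line is the fitted linear trend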
labs(title = "Does Barstool Sports' Dave agree with the critics?", 153 | x = "Critic average score", 154 | y = "Dave score") 155 | ``` 156 | 157 | ```{r} 158 | pizza_cleaned %>% 159 | filter(community_count >= 20) %>% 160 | ggplot(aes(community_average_score, dave_average_score)) + 161 | geom_point(aes(size = community_count)) + 162 | geom_abline(color = "red") + 163 | geom_smooth(method = "lm") + 164 | labs(size = "# of community reviews", 165 | x = "Community score", 166 | y = "Dave score") 167 | ``` 168 | -------------------------------------------------------------------------------- /nyc-restaurants.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NYC Restaurants" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(lubridate) 13 | 14 | theme_set(theme_light()) 15 | 16 | # You can use this url to download the data directly into R (will take a few seconds) 17 | restaurant_inspections_raw <- read_csv("https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv") 18 | 19 | restaurant_inspections <- restaurant_inspections_raw %>% 20 | janitor::clean_names() %>% 21 | select(-phone, -grade_date, -record_date, -building, -street) %>% 22 | mutate(inspection_date = mdy(inspection_date)) %>% 23 | separate(inspection_type, c("inspection_program", "inspection_type"), sep = " / ") 24 | ``` 25 | 26 | ```{r} 27 | restaurant_inspections %>% 28 | count(dba, camis, sort = TRUE) 29 | 30 | restaurant_inspections %>% 31 | count(year = year(inspection_date)) 32 | 33 | restaurant_inspections %>% 34 | count(grade, sort = TRUE) 35 | 36 | restaurant_inspections %>% 37 | count(violation_code, violation_description, sort = TRUE) 38 | 39 | restaurant_inspections %>% 40 | filter(camis == 41297769, inspection_date == "2018-09-25") %>% 41 | count(camis, dba, inspection_date, sort = TRUE) 42 | 43 | restaurant_inspections %>% 44 | count(cuisine_description, sort = TRUE) 45 | 46 | restaurant_inspections %>% 47 | filter(action == "No violations were recorded at the time of this inspection.") %>% 48 | count(critical_flag) 49 | 50 | inspections <- restaurant_inspections %>% 51 | group_by(camis, 52 | dba, 53 | boro, 54 | zipcode, 55 | cuisine_description, 56 | inspection_date, 57 | action, 58 | score, 59 | grade, 60 | inspection_type, 61 | inspection_program) %>% 62 | summarize(critical_violations = sum(critical_flag == "Critical", na.rm = TRUE), 63 | non_critical_violations = sum(critical_flag == "Not Critical", na.rm = TRUE)) %>% 64 | ungroup() 65 | 66 | most_recent_cycle_inspection <- inspections %>% 67 | filter(inspection_program == "Cycle Inspection", 68 | inspection_type == "Initial Inspection") %>% 69 | arrange(desc(inspection_date)) %>% 70 | distinct(camis, .keep_all = TRUE) 71 | ``` 72 | 73 | ```{r} 74 | by_dba <- most_recent_cycle_inspection %>% 75 | group_by(dba, cuisine = cuisine_description) %>% 76 | summarize(locations = n(), 77 | avg_score = mean(score), 78 | median_score = median(score)) %>% 79 | ungroup() %>% 80 | arrange(desc(locations)) 81 | 82 | by_dba %>% 83 | mutate(locations_bin = cut(locations, c(0, 1, 3, 10, Inf), labels = c("1", "2-3", "3-10", ">10"))) %>% 84 | ggplot(aes(locations_bin, avg_score + 1)) + 85 | geom_boxplot() + 86 | scale_y_log10() 87 | ``` 88 | 89 | ```{r} 90 | by_cuisine <- by_dba %>% 91 | group_by(cuisine) %>% 92 | summarize(avg_score = mean(avg_score), 93 | median_score = median(avg_score), 94 | restaurants = 
n()) %>% 95 | arrange(desc(restaurants)) 96 | 97 | library(broom) 98 | 99 | cuisine_conf_ints <- by_dba %>% 100 | add_count(cuisine) %>% 101 | filter(n > 100) %>% 102 | nest(-cuisine) %>% 103 | mutate(model = map(data, ~ t.test(.$avg_score))) %>% 104 | unnest(map(model, tidy)) 105 | 106 | cuisine_conf_ints %>% 107 | mutate(cuisine = str_remove(cuisine, " \\(.*"), 108 | cuisine = fct_reorder(cuisine, estimate)) %>% 109 | ggplot(aes(estimate, cuisine)) + 110 | geom_point() + 111 | geom_errorbarh(aes(xmin = conf.low, 112 | xmax = conf.high)) + 113 | labs(x = "Average inspection score (higher means more violations)", 114 | y = "Type of cuisine", 115 | title = "Average inspection score by type of cuisine in NYC", 116 | subtitle = "Each restaurant chain was counted once based on its average score") 117 | ``` 118 | 119 | ```{r} 120 | violation_cuisine_counts <- restaurant_inspections %>% 121 | semi_join(most_recent_cycle_inspection, by = c("camis", "inspection_date")) %>% 122 | count(critical_flag, violation_code, violation_description, cuisine = cuisine_description) %>% 123 | group_by(violation_code) %>% 124 | mutate(violation_total = sum(n)) %>% 125 | group_by(cuisine) %>% 126 | mutate(cuisine_total = sum(n)) %>% 127 | ungroup() %>% 128 | filter(violation_total >= 1000, 129 | cuisine_total >= 2000) %>% 130 | group_by(violation_description) %>% 131 | mutate(fraction = n / cuisine_total, 132 | avg_fraction = mean(fraction)) %>% 133 | ungroup() 134 | 135 | violation_cuisine_counts %>% 136 | mutate(relative_risk = fraction / avg_fraction) %>% 137 | arrange(desc(relative_risk)) %>% 138 | filter(str_detect(violation_description, "mice")) 139 | ``` 140 | 141 | ### What violations tend to occur together? 142 | 143 | ```{r} 144 | library(widyr) 145 | 146 | violations <- restaurant_inspections %>% 147 | semi_join(most_recent_cycle_inspection, by = c("camis", "inspection_date")) %>% 148 | filter(!is.na(violation_description)) 149 | 150 | violations %>% 151 | pairwise_cor(violation_description, camis, sort = TRUE) 152 | 153 | principal_components <- violations %>% 154 | mutate(value = 1) %>% 155 | widely_svd(violation_description, camis, value, nv = 6) 156 | 157 | principal_components %>% 158 | filter(dimension == 2) %>% 159 | top_n(10, abs(value)) %>% 160 | mutate(violation_description = str_sub(violation_description, 1, 60), 161 | violation_description = fct_reorder(violation_description, value)) %>% 162 | ggplot(aes(violation_description, value)) + 163 | geom_col() + 164 | coord_flip() 165 | ``` 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /nyc-squirrels-app/app.R: -------------------------------------------------------------------------------- 1 | # 2 | # This is a Shiny web application. You can run the application by clicking 3 | # the 'Run App' button above. 
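# NOTE (added): this app assumes by_hectare and central_park_sf already exist in the
# environment; they are created in nyc-squirrels.Rmd, which should be run first.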
4 | # 5 | # Find out more about building applications with Shiny here: 6 | # 7 | # http://shiny.rstudio.com/ 8 | # 9 | 10 | library(shiny) 11 | library(tidyverse) 12 | squirrel_variables <- by_hectare %>% 13 | select(-(hectare:lat)) %>% 14 | colnames() 15 | 16 | names(squirrel_variables) <- squirrel_variables %>% 17 | str_replace_all("_", " ") %>% 18 | str_to_title() 19 | 20 | # Define UI for the squirrel map application 21 | ui <- fluidPage( 22 | 23 | # Application title 24 | titlePanel("Central Park Squirrels"), 25 | 26 | # Sidebar with a variable selector and a minimum-squirrels slider 27 | sidebarLayout( 28 | sidebarPanel( 29 | selectInput("variable", 30 | "Variable:", 31 | choices = squirrel_variables), 32 | sliderInput("min_squirrels", 33 | "Minimum squirrels:", 34 | min = 1, 35 | max = 30, 36 | value = 10) 37 | ), 38 | 39 | # Show the map of squirrel sightings 40 | mainPanel( 41 | plotOutput("park_plot") 42 | ) 43 | ) 44 | ) 45 | 46 | # Define server logic that renders the squirrel map 47 | server <- function(input, output) { 48 | 49 | output$park_plot <- renderPlot({ 50 | # convert the selected variable name to a symbol for tidy evaluation 51 | var <- sym(input$variable) 52 | 53 | filtered <- by_hectare %>% 54 | filter(n >= input$min_squirrels) 55 | 56 | midpoint <- mean(filtered[[input$variable]]) 57 | 58 | filtered %>% 59 | ggplot() + 60 | geom_sf(data = central_park_sf) + 61 | geom_point(aes(long, lat, size = n, color = !!var)) + 62 | theme_void() + 63 | scale_color_gradient2(low = "blue", high = "red", mid = "pink", 64 | midpoint = midpoint, labels = scales::percent) + 65 | labs(color = paste("%", input$variable), 66 | size = "# of squirrels") + 67 | coord_sf(datum = NA) 68 | }) 69 | } 70 | 71 | # Run the application 72 | shinyApp(ui = ui, server = server) 73 | -------------------------------------------------------------------------------- /nyc-squirrels.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | nyc_squirrels <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-29/nyc_squirrels.csv") 15 | ``` 16 | 17 | ```{r} 18 | nyc_squirrels %>% 19 | count(zip_codes, sort = TRUE) 20 | ``` 21 | 22 | ```{r} 23 | nyc_squirrels %>% 24 | ggplot(aes(long, lat)) + 25 | geom_point() 26 | 27 | by_hectare <- nyc_squirrels %>% 28 | filter(!is.na(primary_fur_color)) %>% 29 | group_by(hectare) %>% 30 | summarize(long = mean(long), 31 | lat = mean(lat), 32 | pct_gray = mean(primary_fur_color == "Gray", na.rm = TRUE), 33 | n = n()) 34 | 35 | by_hectare %>% 36 | filter(n >= 10) %>% 37 | ggplot(aes(long, lat, size = n, color = pct_gray)) + 38 | geom_point() + 39 | theme_void() 40 | 41 | by_hectare %>% 42 | filter(n >= 10) %>% 43 | ggplot(aes(lat, pct_gray)) + 44 | geom_point() + 45 | geom_smooth() 46 | 47 | by_hectare %>% 48 | mutate(n_gray = round(pct_gray * n)) %>% 49 | glm(cbind(n_gray, n - n_gray) ~ lat, data = ., family = "binomial") %>% 50 | summary() 51 | ``` 52 | 53 | Squirrels may be more likely to be gray the farther north in the park you go, and more likely to be cinnamon toward the south. 54 | 55 | ```{r} 56 | nyc_squirrels %>% 57 | count(highlight_fur_color, sort = TRUE) 58 | 59 | nyc_squirrels %>% 60 | count(approaches, indifferent, runs_from, sort = TRUE) 61 | ``` 62 | 63 | Does a squirrel run away? 
64 | 65 | ```{r} 66 | glm(runs_from ~ lat, data = nyc_squirrels, family = "binomial") %>% 67 | summary() 68 | ``` 69 | 70 | ```{r} 71 | library(sf) 72 | 73 | central_park_sf <- read_sf("~/Downloads/CentralAndProspectParks/") 74 | 75 | by_hectare <- nyc_squirrels %>% 76 | add_count(hectare) %>% 77 | mutate(above_ground = !is.na(location) & location == "Above Ground") %>% 78 | group_by(hectare, n) %>% 79 | summarize_at(vars(long, lat, approaches:runs_from, ends_with("ing"), above_ground), mean) %>% 80 | ungroup() 81 | 82 | by_hectare %>% 83 | filter(n >= 10) %>% 84 | ggplot() + 85 | geom_sf(data = central_park_sf) + 86 | geom_point(aes(long, lat, size = n, color = runs_from)) + 87 | theme_void() + 88 | scale_color_gradient2(low = "blue", high = "red", mid = "pink", 89 | midpoint = .3, labels = scales::percent) + 90 | labs(color = "% of squirrels run", 91 | size = "# of squirrels", 92 | title = "Squirrels in the northwest corner of Central Park are more likely to run away") + 93 | coord_sf(datum = NA) 94 | ``` 95 | 96 | ```{r} 97 | central_park_sf %>% 98 | count(lanes, sort = TRUE) 99 | 100 | ggplot(central_park_sf) + 101 | geom_sf() + 102 | geom_point(aes(long, lat, color = runs_from), data = by_hectare) + 103 | coord_sf(datum = NA) 104 | ``` 105 | 106 | ```{r} 107 | central_park_sf %>% 108 | ggplot() + 109 | geom_sf(aes(color = bicycle)) + 110 | coord_sf(datum = NA) 111 | ``` 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /office-transcripts.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Office" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(schrute) 13 | theme_set(theme_light()) 14 | 15 | office_transcripts <- as_tibble(theoffice) %>% 16 | mutate(season = as.integer(season), 17 | episode = as.integer(episode)) %>% 18 | mutate(character = str_remove_all(character, '"')) %>% 19 | mutate(name = str_to_lower(str_remove_all(episode_name, "\\.| \\(Part.*"))) 20 | 21 | office_ratings <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv') %>% 22 | mutate(name = str_to_lower(str_remove_all(title, "\\.| \\(Part.*|\\: Part.*"))) 23 | ``` 24 | 25 | ```{r} 26 | library(ggrepel) 27 | 28 | office_ratings %>% 29 | group_by(season) %>% 30 | summarize(avg_rating = mean(imdb_rating)) %>% 31 | ggplot(aes(season, avg_rating)) + 32 | geom_line() + 33 | scale_x_continuous(breaks = 1:9) 34 | 35 | office_ratings %>% 36 | mutate(title = fct_inorder(title), 37 | episode_number = row_number()) %>% 38 | ggplot(aes(episode_number, imdb_rating)) + 39 | geom_line() + 40 | geom_smooth() + 41 | geom_point(aes(color = factor(season), size = total_votes)) + 42 | geom_text(aes(label = title), check_overlap = TRUE, hjust = 1) + 43 | expand_limits(x = -10) + 44 | theme(panel.grid.major.x = element_blank(), 45 | legend.position = "none") + 46 | labs(x = "Episode number", 47 | y = "IMDB Rating", 48 | title = "Popularity of The Office episodes over time", 49 | subtitle = "Color represents season, size represents # of ratings") 50 | ``` 51 | 52 | ```{r} 53 | office_ratings %>% 54 | arrange(desc(imdb_rating)) %>% 55 | mutate(title = paste0(season, ".", episode, " ", title), 56 | title = fct_reorder(title, imdb_rating)) %>% 57 | head(20) %>% 58 | ggplot(aes(title, imdb_rating, color = factor(season), size = 
total_votes)) + 59 | geom_point() + 60 | coord_flip() + 61 | labs(color = "Season", 62 | title = "Most popular episodes of The Office") 63 | ``` 64 | 65 | ### Transcripts 66 | 67 | ```{r} 68 | library(tidytext) 69 | 70 | blacklist <- c("yeah", "hey", "uh", "gonna") 71 | blacklist_characters <- c("Everyone", "All", "Both", "Guy", "Girl", "Group") 72 | 73 | transcript_words <- office_transcripts %>% 74 | group_by(character) %>% 75 | filter(n() >= 100, 76 | n_distinct(episode_name) > 2) %>% 77 | ungroup() %>% 78 | select(-text_w_direction) %>% 79 | unnest_tokens(word, text) %>% 80 | anti_join(stop_words, by = "word") %>% 81 | filter(!word %in% blacklist, 82 | !character %in% blacklist_characters) 83 | 84 | character_tf_idf <- transcript_words %>% 85 | add_count(word) %>% 86 | filter(n >= 20) %>% 87 | count(word, character) %>% 88 | bind_tf_idf(word, character, n) %>% 89 | arrange(desc(tf_idf)) 90 | ``` 91 | 92 | ```{r} 93 | character_tf_idf %>% 94 | filter(character %in% c("Dwight", "Jim", "David Wallace", "Darryl", "Jan", "Holly")) %>% 95 | group_by(character) %>% 96 | top_n(10, tf_idf) %>% 97 | ungroup() %>% 98 | mutate(word = reorder_within(word, tf_idf, character)) %>% 99 | ggplot(aes(word, tf_idf)) + 100 | geom_col() + 101 | coord_flip() + 102 | scale_x_reordered() + 103 | facet_wrap(~ character, scales = "free_y") + 104 | labs(x = "", 105 | y = "TF-IDF of character-word pairs") 106 | ``` 107 | 108 | ```{r} 109 | office_transcripts %>% 110 | count(character, sort = TRUE) %>% 111 | filter(character == "Dwight") 112 | ``` 113 | 114 | ### Machine learning model 115 | 116 | What affects popularity of an episode: 117 | 118 | * Season/time 119 | * Director 120 | * Writer 121 | * Lines per character 122 | 123 | ```{r} 124 | ratings_summarized <- office_ratings %>% 125 | group_by(name) %>% 126 | summarize(imdb_rating = mean(imdb_rating)) 127 | 128 | character_lines_ratings <- office_transcripts %>% 129 | filter(!character %in% blacklist_characters) %>% 130 | count(character, name) %>% 131 | group_by(character) %>% 132 | filter(sum(n) >= 50, 133 | n() >= 5) %>% 134 | inner_join(ratings_summarized, by = "name") 135 | 136 | character_lines_ratings %>% 137 | summarize(avg_rating = mean(imdb_rating), 138 | nb_episodes = n()) %>% 139 | arrange(desc(avg_rating)) %>% 140 | View() 141 | ``` 142 | 143 | ```{r} 144 | director_writer_features <- office_transcripts %>% 145 | distinct(name, director, writer) %>% 146 | gather(type, value, director, writer) %>% 147 | separate_rows(value, sep = ";") %>% 148 | unite(feature, type, value, sep = ": ") %>% 149 | group_by(feature) %>% 150 | filter(n() >= 3) %>% 151 | mutate(value = 1) %>% 152 | ungroup() 153 | 154 | character_line_features <- character_lines_ratings %>% 155 | ungroup() %>% 156 | transmute(name, feature = character, value = log2(n)) 157 | 158 | season_features = office_ratings %>% 159 | distinct(name, season) %>% 160 | transmute(name, feature = paste("season:", season), value = 1) 161 | 162 | features <- bind_rows(director_writer_features, 163 | character_line_features, 164 | season_features) %>% 165 | semi_join(office_ratings, by = "name") %>% 166 | semi_join(office_transcripts, by = "name") 167 | ``` 168 | 169 | ```{r} 170 | episode_feature_matrix <- features %>% 171 | cast_sparse(name, feature, value) 172 | 173 | ratings <- ratings_summarized$imdb_rating[match(rownames(episode_feature_matrix), ratings_summarized$name)] 174 | 175 | library(glmnet) 176 | library(broom) 177 | 178 | mod <- cv.glmnet(episode_feature_matrix, ratings) 179 | 180 | 
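# note added: plot(mod) below draws the cross-validation curve over the lambda penalty path;
# mod$lambda.min is the penalty that minimizes cross-validated error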
plot(mod) 181 | 182 | tidy(mod$glmnet.fit) %>% 183 | filter(lambda == mod$lambda.min, 184 | term != "(Intercept)") %>% 185 | mutate(term = fct_reorder(term, estimate)) %>% 186 | ggplot(aes(term, estimate, fill = estimate > 0)) + 187 | geom_col() + 188 | coord_flip() + 189 | labs(y = "Estimated effect on the rating of an episode") + 190 | theme(legend.position = "none") 191 | ``` 192 | -------------------------------------------------------------------------------- /pascals-triangle.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The accumulate function" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(purrr) 12 | 13 | letters 14 | 15 | accumulate(letters, paste) 16 | 17 | accumulate(1:5, ~ . ^ 2, .init = 2) 18 | ``` 19 | 20 | Pascal's Triangle 21 | 22 | ```{r} 23 | accumulate(1:6, ~ c(0, .) + c(., 0), .init = 1) 24 | ``` 25 | 26 | In this short screencast I share a #tidyverse trick of the day: Use accumulate() from the purrr package to calculate Pascal's Triangle in one line of code. 27 | 28 | -------------------------------------------------------------------------------- /plastic-waste.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plastic Waste" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | theme_set(theme_light()) 14 | 15 | coast_vs_waste <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-21/coastal-population-vs-mismanaged-plastic.csv") 16 | 17 | mismanaged_vs_gdp <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-21/per-capita-mismanaged-plastic-waste-vs-gdp-per-capita.csv") 18 | 19 | waste_vs_gdp <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-21/per-capita-plastic-waste-vs-gdp-per-capita.csv") 20 | ``` 21 | 22 | ```{r} 23 | library(janitor) 24 | 25 | # Data cleaning 26 | clean_dataset <- function(tbl) { 27 | tbl %>% 28 | clean_names() %>% 29 | rename(country = entity, 30 | country_code = code) %>% 31 | filter(year == 2010) %>% 32 | select(-year) 33 | } 34 | 35 | plastic_waste <- coast_vs_waste %>% 36 | clean_dataset() %>% 37 | select(-total_population_gapminder) %>% 38 | inner_join(clean_dataset(mismanaged_vs_gdp) %>% 39 | select(-total_population_gapminder), by = c("country", "country_code")) %>% 40 | inner_join(clean_dataset(waste_vs_gdp), by = c("country", "country_code")) %>% 41 | select(country, 42 | country_code, 43 | mismanaged_waste = mismanaged_plastic_waste_tonnes, 44 | coastal_population, 45 | total_population = total_population_gapminder, 46 | mismanaged_per_capita = per_capita_mismanaged_plastic_waste_kilograms_per_person_per_day, 47 | gdp_per_capita = gdp_per_capita_ppp_constant_2011_international_rate) %>% 48 | filter(!is.na(mismanaged_waste)) 49 | ``` 50 | 51 | ```{r} 52 | g1 <- plastic_waste %>% 53 | arrange(-total_population) %>% 54 | mutate(pct_population_coastal = pmin(1, coastal_population / total_population), 55 | high_coastal_pop = ifelse(pct_population_coastal >= .8, ">=80%", "<80%")) %>% 56 | ggplot(aes(gdp_per_capita, mismanaged_per_capita)) + 57 | geom_point(aes(size = total_population)) + 58 | geom_text(aes(label = country), vjust = 1, hjust 
= 1, check_overlap = TRUE) + 59 | scale_x_log10(labels = dollar_format()) + 60 | scale_y_log10() + 61 | scale_size_continuous(guide = FALSE) + 62 | labs(x = "GDP per capita", 63 | y = "Mismanaged plastic waste (kg per person per day)", 64 | color = "Coastal population", 65 | title = "How plastic waste mismanagement correlates with country income", 66 | subtitle = "Based on Our World in Data 2010 numbers. Size represents total population") 67 | 68 | g1 69 | ``` 70 | 71 | ```{r} 72 | plastic_waste %>% 73 | mutate(pct_population_coastal = pmin(1, coastal_population / total_population)) %>% 74 | arrange(-total_population) %>% 75 | ggplot(aes(pct_population_coastal, mismanaged_per_capita)) + 76 | geom_point(aes(size = total_population)) + 77 | geom_text(aes(label = country), vjust = 1, hjust = 1, check_overlap = TRUE) 78 | ``` 79 | 80 | ```{r} 81 | tbl_df(maps::iso3166) 82 | 83 | library(fuzzyjoin) 84 | 85 | plastic_data <- plastic_waste %>% 86 | inner_join(maps::iso3166, by = c("country_code" = "a3")) 87 | 88 | map_data("world") %>% 89 | tbl_df() %>% 90 | filter(region != "Antarctica") %>% 91 | regex_left_join(plastic_data, by = c("region" = "mapname")) %>% 92 | ggplot(aes(long, lat, group = group, fill = mismanaged_per_capita)) + 93 | geom_polygon() + 94 | scale_fill_gradient2(trans = "log10", 95 | low = "blue", 96 | high = "red", 97 | mid = "pink", 98 | midpoint = log10(.02)) + 99 | coord_fixed(2) + 100 | ggthemes::theme_map() + 101 | labs(fill = "Mismanaged plastic waste per-cap", 102 | title = "Where in the world is waste mismanaged?") 103 | ``` 104 | 105 | ### Comparing to other country stats 106 | 107 | ```{r} 108 | library(WDI) 109 | 110 | indicators <- c("co2_emissions_per_capita" = "EN.ATM.CO2E.PC", 111 | "cpia_transparency" = "IQ.CPA.TRAN.XQ") 112 | 113 | other_data <- WDI(indicator = indicators, start = 2010, end = 2010) %>% 114 | tbl_df() %>% 115 | select(-country) 116 | 117 | plastic_with_indicators <- other_data %>% 118 | inner_join(plastic_data, by = c(iso2c = "a2")) %>% 119 | arrange(desc(total_population)) 120 | 121 | g2 <- plastic_with_indicators %>% 122 | ggplot(aes(gdp_per_capita, co2_emissions_per_capita)) + 123 | geom_point(aes(size = total_population)) + 124 | geom_text(aes(label = country), vjust = 1, hjust = 1, check_overlap = TRUE) + 125 | scale_size_continuous(guide = FALSE) + 126 | scale_x_log10() + 127 | scale_y_log10() + 128 | labs(x = "GDP per capita", 129 | y = "CO2 emissions (tons per capita)", 130 | color = "Coastal population", 131 | title = "How plastic waste mismanagement correlates with country income", 132 | subtitle = "Based on Our World in Data 2010 numbers. 
Size represents total population") 133 | 134 | library(patchwork) 135 | 136 | g2 + 137 | labs(title = "CO2 emissions are correlated with country income, but not plastic waste") + 138 | g1 + 139 | labs(title = "", 140 | subtitle = "") 141 | ``` 142 | 143 | ```{r} 144 | # Looking by CPIA transparency rating (a World Bank measure of institutional trust) 145 | plastic_with_indicators %>% 146 | filter(!is.na(cpia_transparency)) %>% 147 | ggplot(aes(cpia_transparency, mismanaged_per_capita, group = cpia_transparency)) + 148 | geom_boxplot() 149 | 150 | plastic_with_indicators %>% 151 | arrange(desc(cpia_transparency)) %>% 152 | View() 153 | ``` 154 | 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /r-downloads.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Downloads" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | library(countrycode) 14 | theme_set(theme_light()) 15 | 16 | r_downloads_year_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-10-30/r_downloads_year.csv") 17 | 18 | r_downloads_year <- r_downloads_year_raw %>% 19 | select(-X1) %>% 20 | mutate(country_name = countrycode(country, "iso2c", "country.name")) 21 | ``` 22 | 23 | ```{r} 24 | r_downloads_year %>% 25 | count(date) %>% 26 | ggplot(aes(date, n)) + 27 | geom_line() + 28 | expand_limits(y = 0) + 29 | labs(y = "# of R downloads per day") 30 | 31 | library(lubridate) 32 | 33 | r_downloads_year %>% 34 | count(date) %>% 35 | group_by(weekday = wday(date, label = TRUE)) %>% 36 | summarize(average = mean(n)) %>% 37 | ggplot(aes(weekday, average)) + 38 | geom_line(group = 1) + 39 | expand_limits(y = 0) + 40 | labs(y = "Average downloads per weekday") 41 | ``` 42 | 43 | R is downloaded about 1500 times each weekend day and around 3000 times on a typical weekday (averaged over the last year). 44 | 45 | ```{r} 46 | r_downloads_year %>% 47 | group_by(week = floor_date(date, "week")) %>% 48 | summarize(n = n_distinct(ip_id)) %>% 49 | filter(week > min(week)) %>% 50 | ggplot(aes(week, n)) + 51 | geom_line() + 52 | expand_limits(y = 0) + 53 | labs(y = "# of R downloads per week (distinct IPs)") 54 | ``` 55 | 56 | What time of day were people installing R? 57 | 58 | ```{r} 59 | r_downloads_year %>% 60 | mutate(country = countrycode(country, "iso2c", "country.name")) %>% 61 | filter(!is.na(country)) %>% 62 | count(hour = hour(time), 63 | country = fct_lump(country, 8)) %>% 64 | ggplot(aes(hour, n)) + 65 | geom_line() + 66 | expand_limits(y = 0) + 67 | facet_wrap(~ country, scales = "free_y") 68 | ``` 69 | 70 | 71 | ```{r} 72 | library(countrycode) 73 | 74 | r_downloads_year %>% 75 | count(country = countrycode(country, "iso2c", "country.name"), sort = TRUE) %>% 76 | mutate(percent = n / sum(n)) %>% 77 | filter(!is.na(country)) %>% 78 | head(16) %>% 79 | mutate(country = fct_reorder(country, percent)) %>% 80 | ggplot(aes(country, percent)) + 81 | geom_col() + 82 | coord_flip() + 83 | scale_y_continuous(labels = percent_format()) + 84 | labs(title = "What countries install the most R?") 85 | ``` 86 | 87 | More than a third of R installations come from the US. 
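As a quick check of that claim (a sketch added here, not part of the original; it assumes the `country` column still holds the raw ISO2 codes, with `"US"` for the United States):

```{r}
# Sketch: share of downloads with a known country that come from the US
r_downloads_year %>%
  filter(!is.na(country)) %>%
  summarize(pct_us = mean(country == "US"))
```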
88 | 89 | ```{r} 90 | r_downloads_year %>% 91 | mutate(version = fct_lump(version, 8)) %>% 92 | count(date, version) %>% 93 | ggplot(aes(date, n, color = version)) + 94 | geom_line() 95 | ``` 96 | 97 | Which countries do downloads come from, week by week? 98 | 99 | ```{r} 100 | r_downloads_year %>% 101 | count(country = fct_lump(country, 8), 102 | week = floor_date(date, "week")) %>% 103 | filter(week > min(week)) %>% 104 | ggplot(aes(week, n, color = country)) + 105 | geom_line() 106 | ``` 107 | 108 | ### R package downloads 109 | 110 | ```{r} 111 | package_downloads <- read_csv("http://cran-logs.rstudio.com/2018/2018-10-27.csv.gz") 112 | ``` 113 | 114 | ```{r} 115 | package_downloads %>% 116 | filter(country %in% c("US", "IN")) %>% 117 | group_by(country, package) %>% 118 | summarize(n = n_distinct(ip_id)) %>% 119 | spread(country, n, fill = 0) %>% 120 | ungroup() %>% 121 | mutate(total = US + IN, 122 | IN = (IN + 1) / sum(IN + 1), 123 | US = (US + 1) / sum(US + 1), 124 | ratio = US / IN) %>% 125 | filter(total >= 1000) %>% 126 | arrange((ratio)) %>% 127 | View() 128 | ``` 129 | 130 | ```{r} 131 | library(cranlogs) 132 | 133 | cranlogs::cran_downloads(packages = c("tidyverse", "broom"), when = "last-week") 134 | ``` 135 | 136 | 137 | ### Appendix: Why count only distinct IPs? 138 | 139 | ```{r} 140 | r_download_gaps <- r_downloads_year %>% 141 | mutate(datetime = as.POSIXlt(date) + time) %>% 142 | arrange(datetime) %>% 143 | group_by(ip_id) %>% 144 | mutate(gap = as.numeric(datetime - lag(datetime))) %>% 145 | filter(!is.na(gap)) 146 | ``` 147 | 148 | ```{r} 149 | ip_counts <- r_downloads_year %>% 150 | count(ip_id, sort = TRUE) 151 | ``` 152 | 153 | Some IP addresses (`r percent(mean(ip_counts$n >= 100))` of them) installed R more than 100 times in a year -- these are likely servers or automated processes rather than individual users. 
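The sketch below (added; it assumes `gap` is measured in seconds, which the 86400-second reference line in the next chunk suggests) estimates how many repeat-download gaps fall within an hour of exactly one day, the signature of a scheduled daily job:

```{r}
# Sketch: share of repeat-download gaps within an hour of exactly one day
r_download_gaps %>%
  ungroup() %>%
  summarize(share_near_daily = mean(abs(gap - 86400) <= 3600))
```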
154 | 155 | ```{r} 156 | r_download_gaps %>% 157 | ggplot(aes(gap)) + 158 | geom_histogram() + 159 | geom_vline(color = "red", lty = 2, xintercept = 86400) + 160 | scale_x_log10(breaks = 60 ^ (0:4), 161 | labels = c("Second", "Minute", "Hour", "2.5 Days", "150 Days")) 162 | ``` 163 | 164 | ```{r} 165 | r_download_gaps 166 | ``` 167 | 168 | 169 | -------------------------------------------------------------------------------- /ramen-ratings.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ramen Ratings" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ## R Markdown 11 | 12 | ```{r} 13 | library(tidyverse) 14 | theme_set(theme_light()) 15 | 16 | ramen_ratings <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-04/ramen_ratings.csv") 17 | ``` 18 | 19 | ```{r} 20 | library(drlib) 21 | 22 | ramen_ratings_processed <- ramen_ratings %>% 23 | mutate(style = fct_lump(style, 4), 24 | country = fct_lump(country, 12), 25 | brand = fct_lump(brand, 20)) %>% 26 | replace_na(list(style = "Other")) %>% 27 | mutate(brand = fct_relevel(brand, "Other"), 28 | country = fct_relevel(country, "Other"), 29 | style = fct_relevel(style, "Pack")) 30 | 31 | ramen_ratings_processed %>% 32 | gather(category, value, -review_number, -stars) %>% 33 | count(category, value) %>% 34 | group_by(category) %>% 35 | top_n(20, n) %>% 36 | ungroup() %>% 37 | mutate(value = reorder_within(value, n, category)) %>% 38 | ggplot(aes(value, n)) + 39 | geom_col() + 40 | facet_wrap(~ category, scales = "free_y") + 41 | scale_x_reordered() + 42 | coord_flip() + 43 | labs(title = "Categorical predictors (after processing)", 44 | x = "Predictor", 45 | y = "Count") 46 | ``` 47 | 48 | ```{r} 49 | library(broom) 50 | 51 | lm(stars ~ brand + country + style, ramen_ratings_processed) %>% 52 | tidy(conf.int = TRUE) %>% 53 | filter(term != "(Intercept)") %>% 54 | arrange(desc(estimate)) %>% 55 | extract(term, c("category", "term"), "^([a-z]+)([A-Z].*)") %>% 56 | mutate(term = fct_reorder(term, estimate)) %>% 57 | ggplot(aes(estimate, term, color = category)) + 58 | geom_point() + 59 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) + 60 | geom_vline(lty = 2, xintercept = 0) + 61 | facet_wrap(~ category, ncol = 1, scales = "free_y") + 62 | theme(legend.position = "none") + 63 | labs(x = "Estimated effect on ramen rating", 64 | y = "", 65 | title = "Coefficients that predict ramen ratings", 66 | subtitle = "Less common brands and countries were lumped together as the reference level") 67 | ``` 68 | 69 | ```{r} 70 | library(tidytext) 71 | 72 | ramen_ratings_processed %>% 73 | filter(!is.na(stars)) %>% 74 | unnest_tokens(word, variety) %>% 75 | group_by(word) %>% 76 | summarize(avg_rating = mean(stars), 77 | n = n()) %>% 78 | arrange(desc(n)) 79 | ``` 80 | 81 | ### Web scraping 82 | 83 | ```{r} 84 | library(rvest) 85 | 86 | ramen_list <- read_html("https://www.theramenrater.com/resources-2/the-list/") 87 | 88 | # How the original data was (probably) created 89 | ramen_reviews <- ramen_list %>% 90 | html_node("#myTable") %>% 91 | html_table() %>% 92 | tbl_df() %>% 93 | janitor::clean_names() %>% 94 | select(-t) 95 | ``` 96 | 97 | ```{r} 98 | review_links <- read_html("https://www.theramenrater.com/resources-2/the-list/") %>% 99 | html_nodes("#myTable a") 100 | 101 | reviews <- tibble(review_number = parse_number(html_text(review_links)), 102 | link = 
html_attr(review_links, "href")) 103 | ``` 104 | 105 | See here for more about possibly and other "dealing with failure" functions: https://r4ds.had.co.nz/iteration.html#dealing-with-failure 106 | 107 | ```{r} 108 | page <- read_html("https://www.theramenrater.com/2019/05/23/3180-yum-yum-moo-deng/") 109 | 110 | get_review_text <- function(url) { 111 | message(url) 112 | 113 | read_html(url) %>% 114 | html_nodes(".entry-content > p") %>% 115 | html_text() %>% 116 | str_subset(".") 117 | } 118 | 119 | review_text <- reviews %>% 120 | head(250) %>% 121 | mutate(text = map(link, possibly(get_review_text, character(0), quiet = FALSE))) 122 | ``` 123 | 124 | More on correlation graphs: https://www.tidytextmining.com/ngrams.html 125 | 126 | ```{r} 127 | library(tidytext) 128 | 129 | review_paragraphs <- review_text %>% 130 | filter(!map_lgl(text, is.null)) %>% 131 | unnest() %>% 132 | filter(str_detect(text, "Finished")) %>% 133 | mutate(text = str_remove(text, "Finished.*?\\. ")) 134 | 135 | review_paragraphs_tokenized <- review_paragraphs %>% 136 | unnest_tokens(word, text) %>% 137 | anti_join(stop_words, by = "word") %>% 138 | filter(str_detect(word, "[a-z]")) %>% 139 | inner_join(ramen_ratings, by = "review_number") 140 | 141 | review_words <- review_paragraphs_tokenized %>% 142 | filter(!is.na(stars)) %>% 143 | group_by(word) %>% 144 | summarize(number = n(), 145 | reviews = n_distinct(review_number), 146 | avg_rating = mean(stars)) %>% 147 | arrange(desc(reviews)) 148 | 149 | review_words_filtered <- review_words %>% 150 | filter(reviews < 200, reviews >= 10) 151 | 152 | library(widyr) 153 | word_cors <- review_paragraphs_tokenized %>% 154 | semi_join(review_words_filtered, by = "word") %>% 155 | distinct(review_number, word) %>% 156 | pairwise_cor(word, review_number, sort = TRUE) 157 | ``` 158 | 159 | ```{r} 160 | library(igraph) 161 | library(ggraph) 162 | 163 | set.seed(2019) 164 | 165 | filtered_cors <- word_cors %>% 166 | head(300) 167 | 168 | nodes <- review_words_filtered %>% 169 | filter(word %in% filtered_cors$item1 | word %in% filtered_cors$item2) 170 | 171 | filtered_cors %>% 172 | graph_from_data_frame(vertices = nodes) %>% 173 | ggraph() + 174 | geom_edge_link() + 175 | geom_node_point(aes(size = reviews * 1.1)) + 176 | geom_node_point(aes(size = reviews, color = avg_rating)) + 177 | geom_node_text(aes(label = name), repel = TRUE) + 178 | scale_color_gradient2(low = "red", high = "blue", midpoint = 4) + 179 | theme_void() + 180 | labs(color = "Average rating", 181 | size = "# of reviews", 182 | title = "Network of words used together in ramen reviews", 183 | subtitle = "Based on 250 ramen reviews and their star ratings") 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /riddler-die-reroll.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Riddler: Rerolling a die" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | start <- 1:6 14 | 15 | next_rolls <- sample(start, replace = TRUE) 16 | 17 | simulate_roll <- function(num_sides = 6) { 18 | rolls <- accumulate(1:1000, function(x, ...) 
{ 19 | result <- sample(x, replace = TRUE) 20 | 21 | if (length(unique(result)) == 1) { 22 | done(result) 23 | } else { 24 | result 25 | } 26 | }, .init = 1:num_sides) 27 | 28 | length(rolls) - 1 29 | } 30 | 31 | simulations <- replicate(1e6, simulate_roll()) 32 | 33 | mean(simulations) 34 | 35 | tibble(simulations) %>% 36 | count(simulations, sort = TRUE) 37 | ``` 38 | 39 | On average, it takes 9.666 rolls to get the same number on all sides. 40 | 41 | ```{r simulations_crossed} 42 | simulations_crossed <- crossing(trial = 1:20000, 43 | num_sides = 2:10) %>% 44 | mutate(simulation = map_dbl(num_sides, simulate_roll)) 45 | ``` 46 | 47 | ```{r} 48 | summarized <- simulations_crossed %>% 49 | group_by(num_sides) %>% 50 | summarize(average_roll = mean(simulation), 51 | variance_roll = var(simulation), 52 | sd_roll = sd(simulation), 53 | cov = sd_roll / average_roll) 54 | 55 | summarized %>% 56 | ggplot(aes(num_sides, average_roll)) + 57 | geom_point() + 58 | geom_line() + 59 | expand_limits(y = 0) 60 | 61 | lm(average_roll ~ num_sides, summarized) 62 | 63 | simulations_crossed %>% 64 | ggplot(aes(simulation)) + 65 | geom_histogram(binwidth = 1) + 66 | facet_wrap(~ num_sides, scales = "free_y") 67 | ``` 68 | -------------------------------------------------------------------------------- /riddler-die-roll-low.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Riddler: How Low Can You Roll: Cumulative window functions" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | set.seed(2020) 14 | 15 | scores <- tibble(roll = sample(0:9, 1e7, replace = TRUE)) %>% 16 | group_by(group = lag(cumsum(roll == 0), default = 0)) %>% 17 | filter(roll <= cummin(roll)) %>% 18 | mutate(decimal = roll * 10 ^ -row_number()) %>% 19 | summarize(score = sum(decimal)) 20 | ``` 21 | 22 | ```{r} 23 | scores %>% 24 | summarize(mean(score)) 25 | 26 | scores %>% 27 | ggplot(aes(score)) + 28 | geom_histogram(binwidth = .001) + 29 | scale_x_continuous(breaks = seq(0, 1, .1)) 30 | 31 | mean(scores$score) 32 | ``` 33 | 34 | 35 | -------------------------------------------------------------------------------- /riddler-prisoner-coin-flip.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Riddler: Flip your way to freedom" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | theme_set(theme_light()) 14 | 15 | sim <- crossing(trial = 1:50000, 16 | probability = seq(.01, 1, .01)) %>% 17 | mutate(num_flips = rbinom(n(), 4, probability), 18 | num_tails = rbinom(n(), num_flips, .5), 19 | set_free = num_flips != 0 & num_tails == 0) 20 | 21 | probability_exact <- function(p, n = 4) { 22 | sum(dbinom(0:n, n, p) * c(0, dbinom(1:n, 1:n, .5))) 23 | } 24 | 25 | probabilities <- sim %>% 26 | group_by(probability) %>% 27 | summarize(pct_free = mean(set_free)) %>% 28 | mutate(exact = map_dbl(probability, probability_exact)) 29 | 30 | probabilities %>% 31 | ggplot(aes(probability, pct_free)) + 32 | geom_line() + 33 | geom_line(aes(y = exact), color = "red", lty = 2) 34 | 35 | opt <- optimise(probability_exact, c(0, 1), maximum = TRUE) 36 | ``` 37 | 38 | The highest chance of escape is `r percent(opt$objective)` when the prisoners use the random number generator to have a `r percent(opt$maximum)` 
chance of flipping the coin. 39 | 40 | ### Extra credit 41 | 42 | ```{r} 43 | sim_n <- crossing(trial = 1:100000, 44 | probability = seq(.02, 1, .02), 45 | n = 2:8) %>% 46 | mutate(num_flips = rbinom(n(), n, probability), 47 | num_tails = rbinom(n(), num_flips, .5), 48 | set_free = num_flips != 0 & num_tails == 0) 49 | 50 | probabilities_n <- sim_n %>% 51 | group_by(probability, n) %>% 52 | summarize(simulated = mean(set_free)) %>% 53 | ungroup() %>% 54 | mutate(exact = map2_dbl(probability, n, probability_exact)) 55 | 56 | probabilities_n %>% 57 | ggplot(aes(probability, exact, color = factor(n))) + 58 | geom_line() + 59 | geom_point(aes(y = simulated), size = .4) + 60 | scale_x_continuous(labels = percent) + 61 | scale_y_continuous(labels = percent) + 62 | labs(x = "Probability of flipping the coin", 63 | y = "Probability of escape", 64 | color = "# of prisoners", 65 | title = "What's the chance of escaping with n prisoners?", 66 | subtitle = "Points show simulations of 100,000 prisoners each") 67 | ``` 68 | 69 | ```{r} 70 | optimize_for_n <- function(n) { 71 | optimise(function(p) probability_exact(p, n), c(0, 1), maximum = TRUE) 72 | } 73 | 74 | optimal_n <- tibble(n = 2:60) %>% 75 | mutate(optimal = map(n, optimize_for_n)) %>% 76 | unnest_wider(optimal) 77 | 78 | optimal_n %>% 79 | gather(metric, value, -n) %>% 80 | mutate(metric = ifelse(metric == "maximum", "Optimal probability to flip", "Probability of escape")) %>% 81 | ggplot(aes(n, value, color = metric)) + 82 | geom_line() + 83 | scale_y_continuous(labels = percent) + 84 | expand_limits(y = 0) + 85 | labs(x = "# of prisoners", 86 | y = "Probability", 87 | color = "") 88 | 89 | optimal_n %>% 90 | arrange(desc(n)) %>% 91 | mutate(expected_coins_flipped = n * maximum) %>% 92 | ggplot(aes(n, expected_coins_flipped)) + 93 | geom_line() 94 | ``` 95 | 96 | -------------------------------------------------------------------------------- /riddler-spam-comments.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Riddler: Spam Comments" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | sim <- replicate(1e6, which(cumsum(rexp(250, 1:250)) > 3)[1] - 1) 14 | 15 | mean(sim, na.rm = TRUE) 16 | ``` 17 | 18 | ```{r} 19 | comment_counts <- tibble(n_comments = sim) %>% 20 | count(n_comments) %>% 21 | mutate(density = n / sum(n)) 22 | 23 | library(broom) 24 | 25 | augmented_exp <- nls(density ~ lambda * exp(-lambda * n_comments), 26 | data = comment_counts, 27 | start = list(lambda = 1)) %>% 28 | augment(comment_counts) 29 | 30 | augmented_geometric <- nls(density ~ (1 - p) ^ n_comments * p, 31 | data = comment_counts, 32 | start = list(p = .05)) %>% 33 | augment(comment_counts) 34 | 35 | augmented_geometric %>% 36 | ggplot(aes(n_comments, density)) + 37 | geom_line() + 38 | geom_line(aes(y = .fitted), color = "red") 39 | ``` 40 | 41 | The number of comments after 3 days is described by a geometric distribution. 42 | 43 | What determines the parameter $p$ (and therefore the expected value) of the geometric distribution? 
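One way to see where that parameter comes from (a short derivation added here, anticipating the conclusion at the end of this document): for a geometric distribution that counts failures before the first success with success probability $p$, the expected value is $(1-p)/p$. If the process ends with probability $p = e^{-n}$ after $n$ days, then

$$E[X] = \frac{1-p}{p} = \frac{1-e^{-n}}{e^{-n}} = e^{n} - 1,$$

which matches the $e^n - 1$ curve in the simulations below.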
44 | 45 | 46 | ```{r} 47 | tidy_sim <- crossing(trial = 1:5e4, 48 | step = 1:250) %>% 49 | mutate(waiting = rexp(n(), step)) %>% 50 | group_by(trial) %>% 51 | mutate(cumulative = cumsum(waiting)) %>% 52 | ungroup() 53 | 54 | ncomments <- tidy_sim %>% 55 | mutate(within_3 = cumulative < 3) %>% 56 | group_by(trial) %>% 57 | summarize(n_comments = sum(within_3)) 58 | ``` 59 | 60 | ```{r} 61 | comments_by_threshold <- tidy_sim %>% 62 | crossing(threshold = seq(.25, 3, .25)) %>% 63 | mutate(within = cumulative < threshold) %>% 64 | group_by(threshold, trial) %>% 65 | summarize(n_comments = sum(within)) 66 | 67 | comments_by_threshold %>% 68 | summarize(expected_value = mean(n_comments)) %>% 69 | ggplot(aes(threshold, expected_value)) + 70 | geom_line() + 71 | geom_line(aes(y = exp(threshold) - 1), color = "red") + 72 | labs(x = "# of days", 73 | y = "Expected number of comments", 74 | title = "How many comments cumulatively over time?", 75 | subtitle = "Red line shows exp(x) - 1") 76 | 77 | comment_counts %>% 78 | filter(!is.na(n_comments)) %>% 79 | ggplot(aes(n_comments, density)) + 80 | geom_line() + 81 | geom_line(aes(y = (1 - 1 / exp(3)) ^ n_comments / exp(3)), color = "red") 82 | ``` 83 | 84 | The number of comments after n days is described by a geometric distribution with expected value $e^{n}-1$ (that is, with success probability $exp(-n)$). 85 | -------------------------------------------------------------------------------- /seattle-pets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(lubridate) 13 | theme_set(theme_light()) 14 | 15 | seattle_pets <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-26/seattle_pets.csv") %>% 16 | mutate(license_issue_date = mdy(license_issue_date)) %>% 17 | rename(animal_name = animals_name) 18 | ``` 19 | 20 | ```{r} 21 | seattle_pets %>% 22 | filter(license_issue_date >= "2017-01-01") %>% 23 | count(species, primary_breed, sort = TRUE) %>% 24 | filter(species %in% c("Cat", "Dog")) %>% 25 | mutate(percent = n / sum(n)) %>% 26 | group_by(species) %>% 27 | top_n(10, percent) %>% 28 | ungroup() %>% 29 | mutate(primary_breed = fct_reorder(primary_breed, percent)) %>% 30 | ggplot(aes(primary_breed, percent, fill = species)) + 31 | geom_col(show.legend = FALSE) + 32 | scale_y_continuous(labels = scales::percent_format()) + 33 | facet_wrap(~ species, scales = "free_y", ncol = 1) + 34 | coord_flip() + 35 | labs(x = "Primary breed", 36 | y = "% of this species", 37 | title = "Most common cat and dog breeds", 38 | subtitle = "Of licensed pets in Seattle 2017-2018") 39 | ``` 40 | 41 | ```{r} 42 | dogs <- seattle_pets %>% 43 | filter(species == "Dog") 44 | 45 | name_counts <- dogs %>% 46 | group_by(animal_name) %>% 47 | summarize(name_total = n()) %>% 48 | filter(name_total >= 100) 49 | 50 | breed_counts <- dogs %>% 51 | group_by(primary_breed) %>% 52 | summarize(breed_total = n()) %>% 53 | filter(breed_total >= 200) 54 | 55 | total_dogs <- nrow(dogs) 56 | 57 | name_breed_counts <- dogs %>% 58 | count(primary_breed, animal_name) %>% 59 | complete(primary_breed, animal_name, fill = list(n = 0)) %>% 60 | inner_join(name_counts, by = "animal_name") %>% 61 | inner_join(breed_counts, by = "primary_breed") 62 | 63 | # One-sided hypergeometric p-value 64 | hypergeom_test <- 
name_breed_counts %>% 65 | mutate(percent_of_breed = n / breed_total, 66 | percent_overall = name_total / total_dogs) %>% 67 | mutate(overrepresented_ratio = percent_of_breed / percent_overall) %>% 68 | arrange(desc(overrepresented_ratio)) %>% 69 | mutate(hypergeom_p_value = 1 - phyper(n, name_total, total_dogs - name_total, breed_total), 70 | holm_p_value = p.adjust(hypergeom_p_value), 71 | fdr = p.adjust(hypergeom_p_value, method = "fdr")) 72 | 73 | hypergeom_test %>% 74 | filter(fdr < .05) 75 | 76 | ggplot(hypergeom_test, aes(hypergeom_p_value)) + 77 | geom_histogram(binwidth = .1) + 78 | labs(x = "One-sided hypergeometric p-values for overrepresented name") 79 | ``` 80 | 81 | ```{r} 82 | crossing(name_total = c(100, 200, 300), 83 | breed_total = seq(200, 1000, 25)) %>% 84 | mutate(max_p_value = 1 - phyper(0, name_total, total_dogs - name_total, breed_total)) %>% 85 | ggplot(aes(breed_total, max_p_value, color = factor(name_total))) + 86 | geom_line() + 87 | labs(x = "Total # of dogs in breed", 88 | y = "Maximum one-sided p-value", 89 | color = "# of dogs with name") 90 | ``` 91 | 92 | ```{r} 93 | library(scales) 94 | 95 | hypergeom_test %>% 96 | filter(fdr <= .05) %>% 97 | arrange(fdr) %>% 98 | transmute(`Breed` = primary_breed, 99 | `Name` = animal_name, 100 | `# of dogs with name` = n, 101 | `% of breed` = percent(percent_of_breed), 102 | `% overall` = percent(percent_overall), 103 | `FDR-adjusted one-sided p-value` = fdr) %>% 104 | knitr::kable() 105 | ``` 106 | 107 | -------------------------------------------------------------------------------- /simpsons-guests.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Simpsons Guests" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | simpsons <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-08-27/simpsons-guests.csv") %>% 15 | mutate(self = str_detect(role, "self|selves"), 16 | season = parse_number(season)) 17 | ``` 18 | 19 | ```{r} 20 | simpsons %>% 21 | filter(self) %>% 22 | count(guest_star, sort = TRUE) %>% 23 | filter(n > 1) %>% 24 | mutate(guest_star = fct_reorder(guest_star, n)) %>% 25 | ggplot(aes(guest_star, n)) + 26 | geom_col() + 27 | coord_flip() + 28 | labs(title = "Who has played themselves in multiple Simpsons episodes?") 29 | ``` 30 | 31 | ```{r} 32 | simpsons %>% 33 | separate_rows(role, sep = ";\\s+") %>% 34 | add_count(role) %>% 35 | filter(n >= 8) %>% 36 | count(season, role) %>% 37 | mutate(role = fct_reorder(role, -n, sum)) %>% 38 | ggplot(aes(season, n)) + 39 | geom_col() + 40 | facet_wrap(~ role) 41 | ``` 42 | 43 | ```{r} 44 | simpsons 45 | ``` 46 | 47 | ### Bringing in Simpsons dialogue 48 | 49 | ```{r} 50 | dialogue <- read_csv("~/Downloads/simpsons_dataset.csv") %>% 51 | select(role = raw_character_text, line = spoken_words) 52 | 53 | guests_processed <- simpsons %>% 54 | separate_rows(role, sep = ";\\s+") %>% 55 | mutate(role = ifelse(self, guest_star, role), 56 | role = ifelse(role == "Edna Krabappel", "Edna Krabappel-Flanders", role)) 57 | 58 | guests_summarized <- guests_processed %>% 59 | filter(season <= 27) %>% 60 | group_by(guest_star, role, self) %>% 61 | summarize(nb_episodes = n(), 62 | first_season = min(season), 63 | last_season = max(season)) %>% 64 | arrange(desc(nb_episodes)) %>% 65 | group_by(role) %>% 66 | filter(n() == 1) %>% 67 | 
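# note added: keep only roles credited to exactly one guest star, so the join to dialogue below is unambiguous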
ungroup() %>% 68 | filter(!is.na(role)) 69 | 70 | dialogue_summarized <- dialogue %>% 71 | group_by(role) %>% 72 | summarize(nb_lines = n(), 73 | random_line = sample(line, 1)) %>% 74 | arrange(desc(nb_lines)) 75 | 76 | guest_roles <- guests_summarized %>% 77 | inner_join(dialogue_summarized, by = "role") %>% 78 | mutate(lines_per_episode = nb_lines / nb_episodes) 79 | 80 | guest_roles %>% 81 | mutate(self = ifelse(self, "Playing Themselves", "Playing a Character")) %>% 82 | ggplot(aes(lines_per_episode)) + 83 | geom_histogram(binwidth = 2, center = 1) + 84 | facet_wrap(~ self, ncol = 1) + 85 | labs(x = "Average # of lines per episode", 86 | title = "Most guest stars, especially those playing themselves, have relatively few lines per episode") 87 | 88 | guest_roles %>% 89 | arrange(desc(lines_per_episode)) 90 | ``` 91 | 92 | ```{r} 93 | library(tidytext) 94 | 95 | role_words <- dialogue %>% 96 | filter(!is.na(line), !is.na(role)) %>% 97 | mutate(line_number = row_number()) %>% 98 | unnest_tokens(word, line) %>% 99 | anti_join(stop_words, by = "word") %>% 100 | distinct(role, line_number, word) %>% 101 | count(role, word, sort = TRUE) 102 | 103 | role_word_tf_idf <- role_words %>% 104 | group_by(role) %>% 105 | mutate(total_words = sum(n)) %>% 106 | ungroup() %>% 107 | bind_tf_idf(word, role, n) %>% 108 | arrange(desc(tf_idf)) 109 | 110 | role_word_tf_idf %>% 111 | filter(total_words >= 500) %>% 112 | distinct(role, .keep_all = TRUE) %>% 113 | mutate(role_word = paste0(role, ": ", word)) %>% 114 | head(20) %>% 115 | mutate(role_word = fct_reorder(role_word, tf_idf)) %>% 116 | ggplot(aes(role_word, tf_idf)) + 117 | geom_col() + 118 | coord_flip() + 119 | labs(title = "Using TF-IDF as a Simpsons catchphrase detector", 120 | subtitle = "Only the 53 characters that speak at least 500 words in 27 seasons", 121 | x = "", 122 | y = "TF-IDF") 123 | ``` 124 | 125 | ```{r} 126 | guests_summarized %>% 127 | filter(nb_episodes > 1) %>% 128 | inner_join(role_word_tf_idf, by = "role") %>% 129 | filter(total_words >= 100) %>% 130 | arrange(desc(tf_idf)) %>% 131 | distinct(role, .keep_all = TRUE) %>% 132 | select(guest_star, role, word, tf_idf) 133 | ``` 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /space-launches.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Space Launches" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | agencies <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv") 15 | launches <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv") 16 | ``` 17 | 18 | ```{r} 19 | launches %>% 20 | count(launch_year, agency_type) %>% 21 | ggplot(aes(launch_year, n, color = agency_type)) + 22 | geom_line() + 23 | labs(x = "Time", 24 | y = "# of launches this year", 25 | color = "Agency type") 26 | ``` 27 | 28 | ```{r} 29 | library(countrycode) 30 | 31 | agencies %>% 32 | View() 33 | 34 | launches %>% 35 | count(agency_type, agency, sort = TRUE) 36 | 37 | agencies %>% 38 | count(state_code, wt = count, sort = TRUE) %>% 39 | View() 40 | 41 | launches_processed <- launches %>% 42 | filter(launch_date <= Sys.Date()) %>% 43 | mutate(state_code_cleaned = fct_collapse( 44 | state_code, 45 | "RU" = c("SU", "RU"), 
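# note added: fold Soviet-era launches (SU) into Russia, matching the plot subtitle below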
46 | "FR" = "F", 47 | "JP" = "J", 48 | "IT" = "I" 49 | )) %>% 50 | mutate(state_name = countrycode(state_code_cleaned, "iso2c", "country.name"), 51 | state_name = fct_lump(state_name, 6)) %>% 52 | replace_na(list(state_name = "Other")) 53 | 54 | launches_processed %>% 55 | count(launch_year, state_name) %>% 56 | mutate(state_name = fct_reorder(state_name, -n, sum)) %>% 57 | ggplot(aes(launch_year, n, color = state_name)) + 58 | geom_line() + 59 | labs(x = "Time", 60 | y = "Launches per year", 61 | color = "Responsible state", 62 | title = "Launches per year per country", 63 | subtitle = "Combines Soviet Union (pre-1990) with Russia") 64 | ``` 65 | 66 | ### Focus on private + startup launches 67 | 68 | ```{r} 69 | agencies %>% 70 | filter(agency_type %in% c("private", "startup")) %>% 71 | View() 72 | 73 | private_startup_launches <- launches_processed %>% 74 | filter(agency_type %in% c("private", "startup")) %>% 75 | inner_join(agencies %>% 76 | select(agency, agency_name = name, short_name, parent), by = "agency") %>% 77 | mutate(agency_name_lumped = fct_lump(agency_name, 6), 78 | agency_name_lumped = if_else(agency_name_lumped == "Other" & state_name == "United States", 79 | "Other US", as.character(agency_name_lumped))) 80 | 81 | private_startup_launches %>% 82 | count(agency_name_lumped, state_name, sort = TRUE) %>% 83 | mutate(agency_name_lumped = fct_reorder(agency_name_lumped, n, sum)) %>% 84 | ggplot(aes(agency_name_lumped, n, fill = state_name)) + 85 | geom_col() + 86 | coord_flip() + 87 | labs(x = "", 88 | y = "# of launches overall", 89 | title = "What private/startup agencies have had the most launches?", 90 | fill = "Country") 91 | 92 | private_startup_launches %>% 93 | count(agency_name_lumped, 94 | decade = 5 * (launch_year %/% 5)) %>% 95 | complete(agency_name_lumped, decade, fill = list(n = 0)) %>% 96 | mutate(agency_name_lumped = fct_reorder(agency_name_lumped, -n, sum)) %>% 97 | ggplot(aes(decade, n, color = agency_name_lumped)) + 98 | geom_line() + 99 | facet_wrap(~ agency_name_lumped) + 100 | theme(legend.position = "none") + 101 | labs(x = "Time", 102 | y = "# of launches in 5 year period") 103 | ``` 104 | 105 | ```{r} 106 | vehicles <- launches_processed %>% 107 | group_by(type, state_name) %>% 108 | summarize(first_launch = min(launch_year), 109 | last_launch = max(launch_year), 110 | launches = n()) %>% 111 | ungroup() 112 | 113 | russian_vehicles <- vehicles %>% 114 | filter(state_name == "Russia") %>% 115 | arrange(desc(launches)) %>% 116 | filter(launches >= 30) 117 | 118 | launches_processed %>% 119 | semi_join(russian_vehicles, by = "type") %>% 120 | mutate(type = fct_reorder(type, launch_date, min)) %>% 121 | ggplot(aes(launch_date, type)) + 122 | geom_jitter(color = "blue", alpha = .25, width = 0, height = .2) + 123 | theme(legend.position = "none") + 124 | labs(title = "Timeline of Soviet/Russian space vehicles", 125 | x = "Launch date", 126 | y = "Vehicle type", 127 | subtitle = "Only vehicles with at least 30 launches") 128 | 129 | launches_processed %>% 130 | filter(state_code == "US") %>% 131 | add_count(type) %>% 132 | filter(n >= 20) %>% 133 | mutate(type = fct_reorder(type, launch_date, min), 134 | agency_type = str_to_title(agency_type)) %>% 135 | ggplot(aes(launch_date, type, color = agency_type)) + 136 | geom_jitter(alpha = .25, width = 0, height = .2) + 137 | labs(title = "Timeline of US space vehicles", 138 | x = "Launch date", 139 | y = "Vehicle type", 140 | color = "Agency type", 141 | subtitle = "Only vehicles with at least 20 launches") 142 | 
143 | 
144 | vehicles %>%   # per-vehicle-type summary defined above ("by_type" was never defined)
145 |   arrange(desc(launches)) %>%
146 |   View()
147 | 
148 | launches_processed %>%
149 |   count(type, sort = TRUE)
150 | ```
151 | 
152 | 
-------------------------------------------------------------------------------- /thanksgiving.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 | 
14 | thanksgiving_survey <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-20/thanksgiving_meals.csv") %>%
15 |   mutate(family_income = fct_reorder(family_income, parse_number(family_income)))
16 | ```
17 | 
18 | ```{r}
19 | thanksgiving_survey %>%
20 |   count(celebrate)
21 | ```
22 | 
23 | Almost everyone serves turkey as the main dish.
24 | 
25 | ```{r}
26 | thanksgiving_survey %>%
27 |   count(main_dish, sort = TRUE)
28 | ```
29 | 
30 | ```{r}
31 | thanksgiving_survey %>%
32 |   count(main_dish, main_prep, sort = TRUE)
33 | ```
34 | 
35 | ```{r}
36 | thanksgiving_survey %>%
37 |   count(cranberry, sort = TRUE)
38 | 
39 | thanksgiving_survey %>%
40 |   count(gravy, sort = TRUE)
41 | ```
42 | 
43 | ### Relationship with income
44 | 
45 | ```{r}
46 | thanksgiving_survey %>%
47 |   group_by(family_income) %>%
48 |   summarize(celebrate = sum(celebrate == "Yes"),
49 |             total = n(),
50 |             low = qbeta(0.025, celebrate + .5, total - celebrate + .5),   # Jeffreys-style 95% interval
51 |             high = qbeta(0.975, celebrate + .5, total - celebrate + .5)) %>%
52 |   ggplot(aes(family_income, celebrate / total, group = 1)) +
53 |   geom_line() +
54 |   geom_ribbon(aes(ymin = low, ymax = high), alpha = .2) +
55 |   scale_y_continuous(labels = scales::percent_format()) +
56 |   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
57 |   labs(x = "Family income",
58 |        y = "% celebrating Thanksgiving")
59 | ```
60 | 
61 | ```{r}
62 | thanksgiving_survey %>%
63 |   filter(cranberry %in% c("Canned", "Homemade")) %>%
64 |   group_by(family_income) %>%
65 |   summarize(homemade = sum(cranberry == "Homemade"),
66 |             total = n(),
67 |             low = qbeta(0.025, homemade + .5, total - homemade + .5),
68 |             high = qbeta(0.975, homemade + .5, total - homemade + .5)) %>%
69 |   ggplot(aes(family_income, homemade / total, group = 1)) +
70 |   geom_line() +
71 |   geom_ribbon(aes(ymin = low, ymax = high), alpha = .2) +
72 |   scale_y_continuous(labels = scales::percent_format()) +
73 |   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
74 |   labs(x = "Family income",
75 |        y = "% serving homemade")
76 | ```
77 | 
78 | ```{r}
79 | food_gathered <- thanksgiving_survey %>%
80 |   select(id, starts_with("side"),
81 |          starts_with("pie"),
82 |          starts_with("dessert")) %>%
83 |   select(-side15, -pie13, -dessert12) %>%
84 |   gather(type, value, -id) %>%
85 |   filter(!is.na(value),
86 |          !value %in% c("None", "Other (please specify)")) %>%
87 |   mutate(type = str_remove(type, "\\d+"))
88 | 
89 | n_respondents <- n_distinct(food_gathered$id)
90 | ```
91 | 
92 | What are the most common pies, sides, and desserts?
93 | 
94 | ```{r}
95 | food_gathered %>%
96 |   count(type, value, sort = TRUE) %>%
97 |   mutate(value = fct_reorder(value, n)) %>%
98 |   ggplot(aes(value, n / n_respondents, fill = type)) +
99 |   geom_col(show.legend = FALSE) +
100 |   coord_flip() +
101 |   scale_y_continuous(labels = scales::percent_format()) +
102 |   facet_wrap(~ type, scales = "free_y", ncol = 1) +
103 |   labs(x = "",
104 |        y = "% of respondents")
105 | ```
106 | 
107 | How do the foods served differ by age and region?
108 | 
109 | ```{r}
110 | food_gathered %>%
111 |   inner_join(thanksgiving_survey, by = "id") %>%
112 |   mutate(age_number = parse_number(age)) %>%
113 |   group_by(value) %>%
114 |   summarize(average_age = mean(age_number, na.rm = TRUE),
115 |             total = n()) %>%
116 |   arrange(desc(average_age))
117 | 
118 | food_by_region <- food_gathered %>%
119 |   inner_join(thanksgiving_survey, by = "id") %>%
120 |   group_by(us_region) %>%
121 |   mutate(respondents = n_distinct(id)) %>%
122 |   count(us_region, respondents, type, value) %>%
123 |   ungroup() %>%
124 |   mutate(percent = n / respondents)
125 | 
126 | food_by_region %>%
127 |   filter(value == "Cornbread") %>%
128 |   arrange(desc(percent))
129 | ```
130 | 
131 | ```{r}
132 | library(ebbr)   # remotes::install_github("dgrtwo/ebbr")
133 | 
134 | food_gathered %>%
135 |   inner_join(thanksgiving_survey, by = "id") %>%
136 |   filter(!is.na(prayer)) %>%
137 |   group_by(type, value) %>%
138 |   summarize(prayer = sum(prayer == "Yes"),
139 |             total = n()) %>%
140 |   add_ebb_estimate(prayer, total) %>%
141 |   arrange(desc(.fitted)) %>%
142 |   View()
143 | ```
144 | 
145 | ### Network of foods
146 | 
147 | What sides, pies and desserts are eaten together?
148 | 
149 | ```{r}
150 | library(widyr)
151 | 
152 | food_cors <- food_gathered %>%
153 |   pairwise_cor(value, id, sort = TRUE)
154 | ```
155 | 
156 | ```{r}
157 | library(ggraph)
158 | library(igraph)
159 | 
160 | set.seed(2018)
161 | 
162 | food_types <- food_gathered %>%
163 |   count(value, type, sort = TRUE)
164 | 
165 | food_cors %>%
166 |   head(75) %>%
167 |   graph_from_data_frame(vertices = food_types) %>%
168 |   ggraph() +
169 |   geom_edge_link() +
170 |   geom_node_point(aes(color = type, size = n / n_respondents)) +
171 |   geom_node_text(aes(label = name), vjust = 1, hjust = 1, repel = TRUE) +
172 |   scale_size_continuous(labels = scales::percent_format()) +
173 |   theme_void() +
174 |   labs(title = "What foods get served together at Thanksgiving?",
175 |        color = "",
176 |        size = "% of respondents")
177 | ```
178 | 
179 | 
180 | 
181 | 
-------------------------------------------------------------------------------- /tour-de-france.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Tour de France"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | # devtools::install_github("thebioengineer/tidytuesdayR")
12 | 
13 | library(tidytuesdayR)
14 | library(tidyverse)
15 | theme_set(theme_light())
16 | ```
17 | 
18 | ```{r}
19 | library(lubridate)
20 | 
21 | tuesdata <- tidytuesdayR::tt_load('2020-04-07')
22 | 
23 | tdf_winners <- tuesdata$tdf_winners %>%
24 |   mutate(year = year(start_date),
25 |          speed = distance / time_overall)
26 | 
27 | stage_data <- tuesdata$stage_data
28 | tdf_stages <- tuesdata$tdf_stages %>%
29 |   janitor::clean_names() %>%
30 |   mutate(year = year(date))
31 | ```
32 | 
33 | ```{r}
34 | View(tuesdata$tdf_winners)
35 | ```
36 | 
37 | ```{r}
38 | tdf_winners %>%
39 |   count(birth_country, sort = TRUE) %>%
40 |   mutate(birth_country = fct_reorder(birth_country, n)) %>%
41 | 
ggplot(aes(n, birth_country)) +
42 |   geom_col() +
43 |   labs(y = "",
44 |        title = "What countries were the most Tour de France winners born in?")
45 | 
46 | tdf_winners %>%
47 |   count(winner_name, birth_country, sort = TRUE)
48 | ```
49 | 
50 | ```{r}
51 | by_decade <- tdf_winners %>%
52 |   group_by(decade = 10 * (year %/% 10)) %>%
53 |   summarize(winner_age = mean(age),
54 |             winner_height = mean(height, na.rm = TRUE),
55 |             winner_weight = mean(weight, na.rm = TRUE),
56 |             winner_margin = mean(time_margin, na.rm = TRUE),
57 |             winner_speed = mean(speed, na.rm = TRUE))
58 | 
59 | by_decade %>%
60 |   filter(decade >= 1910) %>%
61 |   ggplot(aes(decade, winner_margin * 60)) +
62 |   geom_line() +
63 |   expand_limits(y = 0) +
64 |   labs(x = "Decade",
65 |        y = "Average margin of winner (minutes)",
66 |        title = "Tour de France races have been getting closer")
67 | 
68 | by_decade %>%
69 |   ggplot(aes(decade, winner_speed)) +
70 |   geom_line() +
71 |   expand_limits(y = 0) +
72 |   labs(x = "Decade",
73 |        y = "Average speed of winner (km/h)",
74 |        title = "Tour de France winners have been getting faster")
75 | ```
76 | 
77 | Life expectancy of Tour de France winners (still-living winners are treated as censored at 2020):
78 | 
79 | ```{r}
80 | library(survival)
81 | library(broom)
82 | 
83 | surv_model <- tdf_winners %>%
84 |   distinct(winner_name, .keep_all = TRUE) %>%
85 |   transmute(winner_name,
86 |             birth_year = year(born),
87 |             death_year = year(died),
88 |             dead = as.integer(!is.na(death_year))) %>%
89 |   mutate(age_at_death = coalesce(death_year, 2020) - birth_year) %>%
90 |   survfit(Surv(age_at_death, dead) ~ 1, data = .)
91 | 
92 | glance(surv_model)
93 | ```
94 | 
95 | Median life expectancy of a Tour de France winner is 77.
96 | 
97 | ```{r}
98 | stages_joined <- stage_data %>%
99 |   extract(stage_results_id, "stage", "stage-(.*)") %>%
100 |   inner_join(tdf_stages, by = c("year", "stage")) %>%
101 |   mutate(rank = as.integer(rank)) %>%
102 |   group_by(year, stage) %>%
103 |   mutate(finishers = sum(!is.na(rank))) %>%
104 |   ungroup() %>%
105 |   mutate(percentile = 1 - rank / finishers)
106 | 
107 | total_points <- stages_joined %>%
108 |   group_by(year, rider) %>%
109 |   summarize(total_points = sum(points, na.rm = TRUE)) %>%
110 |   mutate(points_rank = percent_rank(total_points)) %>%
111 |   ungroup()
112 | 
113 | stages_joined %>%
114 |   filter(stage == "1") %>%
115 |   group_by(winner_country) %>%
116 |   summarize(stages = n(),
117 |             median_percentile = median(percentile, na.rm = TRUE)) %>%
118 |   arrange(desc(stages))
119 | ```
120 | 
121 | Does the winner of the first stage predict their final point ranking?
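
As a quick numeric summary of this question (a sketch, not part of the original analysis), we can compute the Spearman rank correlation between a rider's first-stage percentile and their final points percentile, using the `stages_joined` and `total_points` tables defined above:

```{r}
# Rank correlation between first-stage finish and final points standing
stages_joined %>%
  filter(stage == "1") %>%
  inner_join(total_points, by = c("year", "rider")) %>%
  summarize(correlation = cor(percentile, points_rank,
                              method = "spearman", use = "complete.obs"))
```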
122 | 
123 | ```{r}
124 | stages_joined %>%
125 |   filter(stage == "1") %>%
126 |   inner_join(total_points, by = c("year", "rider")) %>%
127 |   select(year, rider,
128 |          percentile_first_stage = percentile,
129 |          points_rank) %>%
130 |   mutate(first_stage_bin = cut(percentile_first_stage, seq(0, 1, .1),
131 |                                include.lowest = TRUE)) %>%
132 |   filter(!is.na(first_stage_bin)) %>%
133 |   ggplot(aes(first_stage_bin, points_rank)) +
134 |   geom_boxplot() +
135 |   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
136 |   scale_y_continuous(labels = scales::percent) +
137 |   labs(x = "Decile performance in the first stage",
138 |        y = "Overall points percentile")
139 | ```
140 | 
141 | ```{r}
142 | library(gganimate)
143 | library(tidytext)
144 | 
145 | top_10_2017 <- total_points %>%
146 |   filter(year == max(year)) %>%
147 |   top_n(10, total_points)
148 | 
149 | stages_joined %>%
150 |   filter(year == max(year)) %>%
151 |   semi_join(top_10_2017, by = "rider") %>%
152 |   mutate(stage = as.integer(stage),
153 |          points = coalesce(points, 0)) %>%
154 |   arrange(stage) %>%
155 |   group_by(rider) %>%
156 |   mutate(cumulative_points = cumsum(points)) %>%
157 |   ungroup() %>%
158 |   # mutate(rider = reorder_within(rider, cumulative_points, stage)) %>%
159 |   ggplot(aes(cumulative_points, rider, fill = cumulative_points)) +
160 |   geom_col() +
161 |   transition_time(stage) +
162 |   theme(legend.position = "none") +
163 |   labs(title = "The 2017 Tour de France. Stage: { frame_time }",
164 |        x = "Cumulative points at this stage",
165 |        y = "")
166 | ```
167 | 
168 | 
169 | 
170 | 
171 | 
172 | 
-------------------------------------------------------------------------------- /trees.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Trees in NYC"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | This Rmd (unlike others in this repository) isn't a screencast; it was created in a [live session of the New York Open Statistical Programming Meetup](https://www.meetup.com/nyhackr/events/260545903/).
11 | 
12 | Tree data here: https://data.cityofnewyork.us/Environment/2015-Street-Tree-Census-Tree-Data/pi5s-9p35
13 | Zip code data here: https://data.cityofnewyork.us/widgets/i8iw-xf4u
14 | 
15 | ```{r}
16 | library(tidyverse)
17 | theme_set(theme_light())
18 | 
19 | tree_dataset_raw <- read_csv("~/Desktop/dataset_ideas/2015_Street_Tree_Census_-_Tree_Data.csv")
20 | 
21 | trees <- tree_dataset_raw %>%
22 |   mutate(health = fct_relevel(health, c("Poor", "Fair", "Good"))) %>%
23 |   mutate(postcode = as.character(postcode)) %>%
24 |   mutate(maple = str_detect(str_to_lower(spc_common), "maple"))
25 | ```
26 | 
27 | ```{r}
28 | View(trees)
29 | 
30 | three_zip_codes <- trees %>%   # three Manhattan zip codes (not boroughs)
31 |   filter(postcode %in% c("10023", "10002", "10009"))
32 | 
33 | three_zip_codes %>%
34 |   filter(!is.na(health)) %>%
35 |   count(postcode, health, sort = TRUE) %>%
36 |   ggplot(aes(health, n)) +
37 |   geom_col() +
38 |   facet_wrap(~ postcode)
39 | 
40 | summarize_trees <- function(data) {
41 |   data %>%
42 |     summarize(average_health = mean(as.numeric(health), na.rm = TRUE),
43 |               percent_good_health = mean(health == "Good", na.rm = TRUE),
44 |               percent_maple = mean(maple, na.rm = TRUE),
45 |               trees = n())
46 | }
47 | 
48 | trees_by_zipcode <- trees %>%
49 |   group_by(postcode, borough) %>%
50 |   summarize_trees() %>%
51 |   filter(trees >= 100)
52 | 
53 | trees_by_zipcode %>%
54 |   arrange(desc(average_health))
55 | 
56 | trees %>%
57 |   group_by(borough) %>%
58 |   summarize_trees() %>%
59 |   arrange(average_health)
60 | ```
61 | 
62 | ```{r}
63 | library(sf)
64 | 
65 | zip_codes <- read_sf("~/Downloads/ZIP_CODE_040114/")
66 | 
67 | class(zip_codes)
68 | 
69 | joined_trees <- zip_codes %>%
70 |   left_join(trees_by_zipcode, by = c("ZIPCODE" = "postcode")) %>%
71 |   mutate(population_density = POPULATION / AREA,
72 |          tree_density = trees / AREA,
73 |          people_per_tree = POPULATION / trees)
74 | ```
75 | 
76 | ```{r}
77 | ggplot(joined_trees, aes(fill = percent_good_health)) +
78 |   geom_sf() +
79 |   scale_fill_gradient2(low = "brown",
80 |                        high = "darkgreen",
81 |                        midpoint = .8,
82 |                        labels = scales::percent) +
83 |   theme_void() +
84 |   coord_sf(datum = NA) +
85 |   labs(fill = "% trees in good health",
86 |        title = "Where are the healthiest (and unhealthiest) trees in NYC?",
87 |        subtitle = "Based on a 2015 survey of 600,000 trees")
88 | ```
89 | 
90 | ### Species
91 | 
92 | ```{r}
93 | trees %>%
94 |   filter(!is.na(spc_common)) %>%
95 |   count(spc_common = fct_lump(spc_common, n = 19),
96 |         borough,
97 |         sort = TRUE) %>%
98 |   mutate(spc_common = fct_reorder(spc_common, n, sum)) %>%
99 |   ggplot(aes(spc_common, n, fill = borough)) +
100 |   geom_col() +
101 |   coord_flip() +
102 |   labs(x = "",
103 |        y = "# of trees in NYC",
104 |        fill = "Borough",
105 |        title = "What species of tree do you find in NYC?")
106 | ```
107 | 
108 | ```{r}
109 | trees %>%
110 |   group_by(spc_common, maple) %>%
111 |   summarize_trees() %>%
112 |   filter(trees >= 500) %>%
113 |   arrange(percent_good_health) %>%
114 |   ggplot(aes(trees, percent_good_health)) +
115 |   geom_point(aes(color = maple)) +
116 |   geom_text(aes(label = spc_common), vjust = 1, hjust = 1, check_overlap = TRUE) +
117 |   scale_x_log10(labels = scales::comma) +
118 |   scale_y_continuous(labels = scales::percent) +
119 |   labs(x = "# of trees in NYC",
120 |        y = "% marked as 'Good' health")
121 | ```
122 | 
123 | ```{r}
124 | trees %>%
125 |   filter(!is.na(maple)) %>%
126 |   group_by(borough, maple) %>%
127 |   summarize_trees() %>%
128 |   mutate(maple = ifelse(maple, "Maple", "Other")) %>%
129 |   select(borough, maple, percent_good_health) %>%
130 | 
spread(maple, percent_good_health) %>%
131 |   ggplot(aes(Other, Maple)) +
132 |   geom_point() +
133 |   geom_text(aes(label = borough)) +
134 |   geom_abline(color = "red")
135 | ```
136 | 
137 | 
138 | ```{r}
139 | ggplot(joined_trees, aes(fill = percent_maple)) +
140 |   geom_sf() +
141 |   scale_fill_gradient2(low = "brown",
142 |                        high = "darkgreen",
143 |                        midpoint = .1,
144 |                        labels = scales::percent) +
145 |   theme_void() +
146 |   coord_sf(datum = NA) +
147 |   labs(fill = "% trees that are maple",
148 |        title = "Where are the maple trees in NYC?",
149 |        subtitle = "Based on a 2015 survey of 600,000 trees")
150 | ```
151 | 
152 | ### Looking at tree and population density
153 | 
154 | ```{r}
155 | processed_zipcodes <- joined_trees %>%
156 |   select(ZIPCODE, POPULATION, AREA, trees) %>%
157 |   rename_all(str_to_lower) %>%          # column names are lowercase from here on
158 |   mutate(people_per_tree = population / trees)
159 | 
160 | processed_zipcodes %>%
161 |   ggplot(aes(population / area)) +
162 |   geom_histogram()
163 | 
164 | processed_zipcodes %>%
165 |   filter(population >= 1000) %>%
166 |   arrange(desc(people_per_tree))
167 | ```
168 | 
169 | ```{r}
170 | ggplot(joined_trees, aes(fill = tree_density * (5280 ^ 2))) +
171 |   geom_sf() +
172 |   scale_fill_gradient2(low = "brown",
173 |                        high = "darkgreen",
174 |                        midpoint = log10(3000),
175 |                        trans = "log10") +
176 |   theme_void() +
177 |   coord_sf(datum = NA) +
178 |   labs(fill = "# of trees per square mile",
179 |        title = "Where are the densest trees?",
180 |        subtitle = "Based on a 2015 survey of 600,000 trees")
181 | ```
182 | 
-------------------------------------------------------------------------------- /umbrella-week.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | Louie walks to and from work every day. In his city, there is a 50 percent chance of rain each morning and an independent 40 percent chance each evening. His habit is to bring (and use) an umbrella if it’s raining when he leaves the house or office, but to leave them all behind if not. Louie owns three umbrellas.
11 | 
12 | On Sunday night, two are with him at home and one is at his office. Assuming it never starts raining during his walk to his home or office, what is the probability that he makes it through the work week without getting wet?
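
The simulation below estimates this probability. As an exact cross-check (a sketch, not part of the original analysis): with only 10 commutes there are just 2^10 rain patterns, so we can enumerate every pattern, track the umbrellas, and sum the probability of the patterns in which Louie stays dry.

```{r}
library(tidyverse)

# Enumerate all 2^10 rain patterns; odd positions are mornings (p = .5), even are evenings (p = .4)
patterns <- expand.grid(rep(list(c(FALSE, TRUE)), 10))

dry_prob <- map_dbl(seq_len(nrow(patterns)), function(i) {
  rain <- unlist(patterns[i, ])
  home <- 2
  office <- 1
  for (j in 1:10) {
    if (rain[j]) {
      if (j %% 2 == 1) {             # morning: leaving home
        if (home == 0) return(0)     # wet; this pattern contributes nothing
        home <- home - 1; office <- office + 1
      } else {                       # evening: leaving the office
        if (office == 0) return(0)
        office <- office - 1; home <- home + 1
      }
    }
  }
  # Probability of this (dry) rain pattern
  prod(ifelse(rain, rep(c(.5, .4), 5), rep(c(.5, .6), 5)))
})

sum(dry_prob)   # ~0.693, matching the simulation below
```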
13 | 
14 | ```{r}
15 | library(tidyverse)
16 | 
17 | set.seed(2018)
18 | 
19 | simulations <- crossing(trial = 1:1e5,
20 |                         weekday = 1:5,
21 |                         commute = c("Morning", "Evening")) %>%
22 |   arrange(trial, weekday, desc(commute)) %>%
23 |   mutate(rain = rbinom(n(), 1, ifelse(commute == "Morning", .5, .4)),
24 |          home_change = case_when(
25 |            commute == "Morning" & rain ~ -1,
26 |            commute == "Evening" & rain ~ 1,
27 |            TRUE ~ 0),
28 |          office_change = -home_change) %>%
29 |   group_by(trial) %>%
30 |   mutate(home = 2 + cumsum(home_change),
31 |          office = 1 + cumsum(office_change))
32 | 
33 | simulations %>%
34 |   summarize(dry = !any(home < 0 | office < 0)) %>%
35 |   summarize(dry = mean(dry))
36 | 
37 | days <- c("Mon", "Tue", "Wed", "Thu", "Fri")
38 | 
39 | simulations %>%
40 |   ungroup() %>%
41 |   filter(home < 0 | office < 0) %>%
42 |   distinct(trial, .keep_all = TRUE) %>%
43 |   count(weekday, commute, sort = TRUE) %>%
44 |   mutate(weekday = factor(days[weekday], levels = days),
45 |          commute = fct_relevel(commute, "Morning")) %>%
46 |   ggplot(aes(weekday, n / 1e5, fill = commute)) +
47 |   geom_col(position = "dodge") +
48 |   scale_y_continuous(labels = scales::percent_format()) +
49 |   labs(title = "When does Louie first get wet?",
50 |        y = "Probability overall")
51 | ```
52 | 
53 | Answer to the riddle: ~69.3%.
54 | 
-------------------------------------------------------------------------------- /uncanny-xmen.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Uncanny X-Men: Claremont Run"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidytuesdayR)
12 | library(tidyverse)
13 | theme_set(theme_light())
14 | 
15 | xmen <- tidytuesdayR::tt_load('2020-06-30')
16 | characters <- tt_load(2018, week = 9)$week9_comic_characters
17 | ```
18 | 
19 | ```{r}
20 | character_visualization <- xmen$character_visualization %>%
21 |   separate(character, c("superhero", "secret_identity"), sep = " = ", fill = "right")
22 | ```
23 | 
24 | ```{r}
25 | character_visualization %>%
26 |   group_by(superhero)
27 | ```
28 | 
29 | ```{r}
30 | by_character %>%   # note: by_character is defined in a later chunk; run that chunk first
31 |   left_join(characters, by = c(superhero = "name")) %>%
32 |   View()
33 | 
34 | characters %>%
35 |   filter(str_detect(name, "Angel")) %>%
36 |   distinct(name)
37 | ```
38 | 
39 | 
40 | ```{r}
41 | library(ggrepel)
42 | 
43 | by_character <- character_visualization %>%
44 |   group_by(superhero) %>%
45 |   summarize(across(speech:depicted,
46 |                    list(total = sum,
47 |                         issues = ~ sum(.
> 0), 48 | avg = ~ mean(.[depicted > 0])))) 49 | 50 | by_character %>% 51 | arrange(desc(speech_issues)) %>% 52 | filter(speech_issues > 0) %>% 53 | mutate(superhero = fct_reorder(superhero, speech_issues)) %>% 54 | ggplot(aes(speech_issues, superhero)) + 55 | geom_col() + 56 | labs(title = "Which X-Men speak in the most issues?", 57 | x = "# of issues", 58 | y = "") 59 | 60 | by_character %>% 61 | ggplot(aes(depicted_issues, speech_avg)) + 62 | geom_point() + 63 | geom_text_repel(aes(label = superhero)) + 64 | labs(x = "# of issues in which character appears", 65 | y = "Average lines per issue depicted") + 66 | expand_limits(y = 0) 67 | 68 | by_character %>% 69 | ggplot(aes(depicted_issues, thought_avg)) + 70 | geom_point() + 71 | geom_text_repel(aes(label = superhero)) + 72 | scale_x_log10() + 73 | labs(x = "# of issues in which character appears", 74 | y = "Average thoughts per issue") + 75 | expand_limits(y = 0) 76 | 77 | by_character %>% 78 | ggplot(aes(depicted_issues, speech_avg / thought_avg)) + 79 | geom_point() + 80 | geom_text_repel(aes(label = superhero)) + 81 | expand_limits(y = 1) + 82 | scale_y_log10(breaks = c(1, 3, 10, 30), 83 | labels = c("Same", "3X", "10X", "30X")) + 84 | labs(x = "# of issues depicted", 85 | y = "Speech to thought ratio (log scale)", 86 | title = "Which X-Men think, and which speak?") 87 | ``` 88 | 89 | ```{r} 90 | by_character_costume <- character_visualization %>% 91 | group_by(superhero, costume) %>% 92 | summarize(across(speech:depicted, 93 | list(total = sum, 94 | issues = ~ sum(. > 0), 95 | avg = ~ mean(.[depicted > 0])))) %>% 96 | ungroup() %>% 97 | mutate(speech_thought_ratio = speech_avg / thought_avg) 98 | 99 | by_character_costume %>% 100 | mutate(superhero = fct_reorder(superhero, speech_total)) %>% 101 | filter(speech_avg > 0) %>% 102 | ggplot(aes(speech_total, 103 | superhero, 104 | color = costume, 105 | size = depicted_issues)) + 106 | geom_point() + 107 | labs(x = "# of lines (in/out of costume)") 108 | 109 | costume_ratios <- by_character_costume %>% 110 | filter(speech_avg > 0) %>% 111 | group_by(superhero) %>% 112 | summarize(costume_ratio = speech_total[1] / speech_total[2]) %>% 113 | inner_join(by_character, by = "superhero") 114 | 115 | costume_ratios %>% 116 | ggplot(aes(speech_total, 117 | costume_ratio)) + 118 | geom_point() + 119 | geom_text_repel(aes(label = superhero)) + 120 | scale_y_log10() + 121 | labs(x = "Total # of lines", 122 | y = "Lines in costume / lines out of costume") 123 | 124 | costume_ratios %>% 125 | mutate(superhero = fct_reorder(superhero, costume_ratio)) %>% 126 | ggplot(aes(costume_ratio, y = superhero)) + 127 | geom_errorbarh(aes(xmin = 1, xmax = costume_ratio), height = 0) + 128 | geom_point(aes(size = speech_total, color = costume_ratio > 1)) + 129 | scale_x_log10() + 130 | scale_color_discrete(guide = FALSE) + 131 | labs(size ="# of lines", 132 | x = "Lines in costume / lines out of costume", 133 | y = "", 134 | title = "Which X-Men tend to speak in costume/out of costume?") 135 | ``` 136 | 137 | ```{r} 138 | xmen$locations %>% 139 | group_by(location) %>% 140 | summarize(total = n(), 141 | issues = n_distinct(issue)) %>% 142 | arrange(desc(issues)) 143 | ``` 144 | 145 | ```{r} 146 | xmen$comic_bechdel %>% 147 | bind_rows(xmen$xmen_bechdel %>% mutate(writer = "Chris Claremont")) %>% 148 | filter(!is.na(pass_bechdel), 149 | !is.na(writer)) %>% 150 | group_by(writer = fct_lump(writer, 5)) %>% 151 | summarize(n_issues = n(), 152 | pct_bechdel = mean(pass_bechdel == "yes")) %>% 153 | 
arrange(desc(n_issues))
154 | ```
155 | 
156 | ```{r}
157 | xmen$xmen_bechdel %>%
158 |   filter(!is.na(pass_bechdel)) %>%
159 |   group_by(issue_group = 20 * (issue %/% 20)) %>%
160 |   summarize(pct_bechdel = mean(pass_bechdel == "yes"),
161 |             n = n()) %>%
162 |   ggplot(aes(issue_group, pct_bechdel)) +
163 |   geom_line() +
164 |   scale_y_continuous(labels = scales::percent)
165 | ```
166 | 
167 | ```{r}
168 | by_issue_character <- character_visualization %>%
169 |   group_by(issue_group = 20 * (issue %/% 20), superhero) %>%
170 |   summarize(across(speech:depicted, sum)) %>%
171 |   ungroup()
172 | 
173 | by_issue_character %>%
174 |   filter(fct_lump(superhero, 9, w = speech) != "Other") %>%
175 |   mutate(superhero = fct_reorder(superhero, -speech, sum)) %>%
176 |   ggplot(aes(issue_group, speech)) +
177 |   geom_col() +
178 |   facet_wrap(~ superhero)
179 | ```
180 | 
181 | 
182 | 
-------------------------------------------------------------------------------- /us-dairy.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 | 
14 | milk_products_facts <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-29/milk_products_facts.csv")
15 | ```
16 | 
17 | ```{r}
18 | milk_products_tidied <- milk_products_facts %>%
19 |   gather(product, lbs_per_person, -year) %>%
20 |   separate(product, c("category", "product"), sep = "_",
21 |            extra = "merge", fill = "right") %>%
22 |   mutate(product = coalesce(product, category),
23 |          product = str_to_title(str_replace_all(product, "_", " ")),
24 |          category = str_to_title(category),
25 |          product = ifelse(product == "Other", paste(product, category), product))
26 | 
27 | milk_products_tidied %>%
28 |   group_by(category, year) %>%
29 |   summarize(lbs_per_person = sum(lbs_per_person)) %>%
30 |   ggplot(aes(year, lbs_per_person, color = category)) +
31 |   geom_line() +
32 |   labs(title = "Dairy consumption by category",
33 |        subtitle = "Based on US consumption (source: USDA)",
34 |        x = "Year",
35 |        y = "Lbs per person")
36 | 
37 | milk_products_tidied %>%
38 |   group_by(product = fct_lump(product, 6, w = lbs_per_person),
39 |            year) %>%
40 |   summarize(lbs_per_person = sum(lbs_per_person)) %>%
41 |   ggplot(aes(year, lbs_per_person, color = product)) +
42 |   geom_line()
43 | 
44 | milk_products_tidied %>%
45 |   ggplot(aes(year, lbs_per_person)) +
46 |   geom_line() +
47 |   facet_wrap(~ product, scales = "free") +
48 |   expand_limits(y = 0)
49 | 
50 | milk_products_tidied %>%
51 |   filter(year == max(year)) %>%
52 |   mutate(product = fct_reorder(product, lbs_per_person, sum)) %>%
53 |   ggplot(aes(product, lbs_per_person, fill = category)) +
54 |   geom_col() +
55 |   coord_flip() +
56 |   labs(x = "",
57 |        y = "Pounds consumed per US person in 2017")
58 | ```
59 | 
60 | ```{r}
61 | library(sweep)
62 | library(timetk)
63 | library(lubridate); library(forecast)   # forecast supplies ets(), auto.arima(), forecast()
64 | 
65 | milk_product_ts <- milk_products_tidied %>%
66 |   mutate(year = as.Date("0001-01-01") + years(year - 1)) %>%
67 |   nest(-category, -product) %>%
68 |   mutate(ts = map(data, tk_ts, start = 1975, freq = 1))
69 | 
70 | milk_product_ets <- milk_product_ts %>%
71 |   mutate(model = map(ts, ets))
72 | 
73 | milk_product_ets %>%
74 |   unnest(map(model, sw_glance))
75 | 
76 | milk_product_ts %>%
77 |   crossing(model_name = c("auto.arima", "ets")) %>%
78 |   mutate(model = map2(model_name, ts, ~
invoke(.x, list(.y))),   # invoke() calls the forecasting function named in model_name
79 |          forecast = map(model, forecast, h = 10)) %>%
80 |   unnest(map(forecast, sw_sweep)) %>%
81 |   ggplot(aes(index, lbs_per_person, color = model_name, lty = key)) +
82 |   geom_line() +
83 |   geom_ribbon(aes(ymin = lo.80, ymax = hi.80), alpha = .5) +
84 |   facet_wrap(~ product, scales = "free_y") +
85 |   expand_limits(y = 0) +
86 |   scale_x_continuous(breaks = c(1980, 2000, 2020)) +
87 |   scale_linetype_discrete(guide = FALSE) +
88 |   labs(x = "Year",
89 |        y = "Average US consumption (lbs per person)",
90 |        title = "Forecasted consumption of dairy products",
91 |        subtitle = "Based on USDA data 1975-2017. Showing 80% prediction intervals.",
92 |        color = "Model")
93 | ```
94 | 
95 | ```{r}
96 | cheese <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-29/clean_cheese.csv")
97 | 
98 | cheese_tidied <- cheese %>%
99 |   gather(type, lbs_per_person, -Year) %>%
100 |   rename(year = Year) %>%
101 |   mutate(type = str_to_title(type),
102 |          type = fct_recode(type, "Total American Cheese" = "Total American Chese"))
103 | 
104 | cheese_tidied %>%
105 |   ggplot(aes(year, lbs_per_person)) +
106 |   geom_line() +
107 |   facet_wrap(~ type, scales = "free_y") +
108 |   expand_limits(y = 0)
109 | 
110 | cheese_ts <- cheese_tidied %>%
111 |   mutate(year = as.Date("0001-01-01") + years(year - 1)) %>%
112 |   nest(-type) %>%
113 |   mutate(ts = map(data, tk_ts, start = 1970, freq = 1))
114 | 
115 | cheese_ts %>%
116 |   crossing(model_name = c("auto.arima", "ets")) %>%
117 |   mutate(model = map2(model_name, ts, ~ invoke(.x, list(.y))),
118 |          forecast = map(model, forecast, h = 10)) %>%
119 |   unnest(map(forecast, sw_sweep)) %>%
120 |   ggplot(aes(index, lbs_per_person, color = model_name, lty = key)) +
121 |   geom_line() +
122 |   geom_ribbon(aes(ymin = lo.80, ymax = hi.80), alpha = .5) +
123 |   facet_wrap(~ type, scales = "free_y") +
124 |   expand_limits(y = 0) +
125 |   scale_x_continuous(breaks = c(1980, 2000, 2020)) +
126 |   scale_linetype_discrete(guide = FALSE) +
127 |   labs(x = "Year",
128 |        y = "Average US consumption (lbs per person)",
129 |        title = "Forecasted consumption of cheese products",
130 |        subtitle = "Based on USDA data 1970-2017. Showing 80% prediction intervals.",
131 |        color = "Model")
132 | ```
133 | 
134 | 
-------------------------------------------------------------------------------- /us-wind.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidyverse)
12 | 
13 | us_wind <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-06/us_wind.csv")
14 | ```
15 | 
16 | Look at the continental US (48 states, District of Columbia)
17 | 
18 | ```{r}
19 | us_wind_processed <- us_wind %>%
20 |   filter(!t_state %in% c("AK", "HI", "GU", "PR")) %>%
21 |   mutate(t_cap = ifelse(t_cap < 0, NA, t_cap)) %>%
22 |   mutate_if(is.numeric, ~ ifelse(. == -9999, NA, .))
23 | 
24 | wind_projects <- us_wind_processed %>%
25 |   group_by(p_name, t_state) %>%
26 |   summarize(year = min(p_year, na.rm = TRUE),
27 |             turbines = n(),
28 |             total_capacity = sum(t_cap),
29 |             lon = mean(xlong),
30 |             lat = mean(ylat),
31 |             lon_sd = sd(xlong),
32 |             lat_sd = sd(ylat)) %>%
33 |   ungroup()
34 | ```
35 | 
36 | How has turbine capacity changed over time?
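
One direct way to answer this (a sketch, not part of the original screencast): plot the median turbine capacity by project start year, using the `p_year` and `t_cap` columns already cleaned in `us_wind_processed` above. The chunk after this one instead compares turbine models.

```{r}
# Median turbine capacity among turbines whose project started in each year
us_wind_processed %>%
  filter(!is.na(p_year), !is.na(t_cap)) %>%
  group_by(p_year) %>%
  summarize(median_capacity = median(t_cap)) %>%
  ggplot(aes(p_year, median_capacity)) +
  geom_line() +
  labs(x = "Year project started",
       y = "Median turbine capacity (kW)")
```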
37 | 
38 | ```{r}
39 | turbine_models <- us_wind_processed %>%
40 |   group_by(t_model) %>%
41 |   summarize(t_cap = median(t_cap),
42 |             t_hh = median(t_hh),
43 |             t_rd = median(t_rd),
44 |             t_rsa = median(t_rsa),
45 |             t_ttlh = median(t_ttlh),
46 |             turbines = n(),
47 |             projects = n_distinct(p_name)) %>%
48 |   arrange(desc(projects))
49 | 
50 | turbine_models %>%
51 |   ggplot(aes(t_ttlh, t_cap)) +
52 |   geom_point() +
53 |   labs(title = "When it comes to turbines, bigger is better!",
54 |        x = "Turbine total height (meters)",
55 |        y = "Turbine capacity (kW)")
56 | 
57 | turbine_models %>%
58 |   ggplot(aes(t_rsa, t_cap)) +
59 |   geom_point() +
60 |   labs(title = "When it comes to turbines, bigger is better!",
61 |        x = "Turbine rotor swept area (meters ^ 2)",
62 |        y = "Turbine capacity (kW)")
63 | 
64 | ```
65 | 
66 | 
67 | ```{r}
68 | wind_projects %>%
69 |   ggplot(aes(year, total_capacity)) +
70 |   geom_point()
71 | 
72 | wind_projects %>%
73 |   ggplot(aes(year, total_capacity / turbines)) +
74 |   geom_point()
75 | ```
76 | 
77 | ```{r}
78 | wind_projects %>%
79 |   ggplot(aes(lon, lat, size = turbines, color = year)) +
80 |   borders("state") +
81 |   geom_point() +
82 |   coord_map() +
83 |   theme_void()
84 | ```
85 | 
86 | ### Animation
87 | 
88 | ```{r}
89 | library(gganimate)
90 | 
91 | ggplot(mtcars, aes(factor(cyl), mpg)) +
92 |   geom_boxplot() +
93 |   # Here comes the gganimate code
94 |   transition_states(
95 |     gear,
96 |     transition_length = 2,
97 |     state_length = 1
98 |   ) +
99 |   enter_fade() +
100 |   exit_shrink() +
101 |   ease_aes('sine-in-out')
102 | ```
103 | 
104 | ```{r}
105 | wind_projects
106 | 
107 | p <- wind_projects %>%
108 |   filter(!is.na(year), !is.infinite(year)) %>%
109 |   crossing(time = 1981:2018) %>%
110 |   filter(year <= time) %>%
111 |   ggplot(aes(lon, lat, size = turbines, color = year)) +
112 |   borders("state") +
113 |   geom_point() +
114 |   transition_manual(time) +
115 |   scale_color_continuous(guide = FALSE) +
116 |   labs(title = "Locations of wind turbine projects in continental US (1981-2018)") +
117 |   coord_map() +
118 |   theme_void()
119 | 
120 | p   # printing the animation renders it
121 | anim_save("turbines.gif")   # saves the most recently rendered animation
122 | ```
123 | 
124 | 
125 | 
126 | ```{r}
127 | us_wind_processed
128 | ```
129 | 
130 | 
-------------------------------------------------------------------------------- /us_phds.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "US PhDs"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 | 
14 | # Major field of study
15 | major_fields_raw <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab012.xlsx",
16 |                                       skip = 3)
17 | 
18 | major_fields <- major_fields_raw %>%
19 |   rename(field = `Field of study`) %>%
20 |   gather(key, value, -field) %>%
21 |   mutate(year = as.numeric(ifelse(str_detect(key, "X__"), NA, key)),
22 |          type = ifelse(!str_detect(value, "Number|Percent"), NA, value),
23 |          value = as.numeric(value)) %>%
24 |   fill(year, type) %>%
25 |   select(-key) %>%
26 |   filter(!is.na(value)) %>%
27 |   spread(type, value)
28 | ```
29 | 
30 | ```{r}
31 | fine_fields <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab013.xlsx",
32 |                                  skip = 3) %>%
33 |   rename(field = 1) %>%
34 |   gather(year, number, -field) %>%
35 |   mutate(year = as.numeric(year),
36 |          number = as.numeric(number)) %>%
37 |   filter(!is.na(number))
38 | 
39 | fine_fields %>%
40 |   filter(field %in% sample(unique(field), 6)) %>%
41 |   ggplot(aes(year, number, color = field)) +
42
| geom_line() 43 | ``` 44 | 45 | ```{r} 46 | # get the broad field names and the major field names 47 | 48 | sex <- c("All", "Male", "Female", "All doctorate recipientsa", "All fieldsa") 49 | 50 | broad_fields <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab014.xlsx", skip = 4) %>% 51 | rename(field = 1) %>% 52 | filter(!field %in% sex) %>% 53 | mutate(field = fct_recode(field, 54 | "Life sciences" = "Life sciencesb", 55 | "Other" = "Otherc")) %>% 56 | pull(field) %>% 57 | as.character() 58 | ``` 59 | 60 | 61 | ```{r} 62 | recipients_year_field_sex <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab015.xlsx", skip = 3) %>% 63 | rename(field = 1) %>% 64 | select(-contains("change")) %>% 65 | mutate(field = as.character(fct_recode(field, "All" = "All doctorate recipientsa", 66 | "Other" = "Otherb")), 67 | sex = if_else(field %in% sex, field, NA_character_), 68 | broad_field = ifelse(field %in% broad_fields, field, NA)) %>% 69 | fill(sex, broad_field) %>% 70 | gather(year, number, -sex, -broad_field, -field) %>% 71 | mutate(year = as.numeric(year), 72 | number = as.numeric(number)) %>% 73 | filter(!field %in% sex) %>% 74 | filter(!is.na(number)) 75 | 76 | recipients_year_field_sex %>% 77 | filter(sex != "All", 78 | broad_field == "Mathematics and computer sciences") %>% 79 | ggplot(aes(year, number, color = sex)) + 80 | geom_line() + 81 | expand_limits(y = 0) + 82 | facet_wrap(~ field) 83 | 84 | recipients_year_field_sex %>% 85 | spread(sex, number) %>% 86 | mutate(pct_male = Male / All) %>% 87 | filter(broad_field == "Engineering") %>% 88 | mutate(field = fct_reorder(field, -pct_male)) %>% 89 | ggplot(aes(year, pct_male, color = field)) + 90 | geom_line() + 91 | scale_y_continuous(labels = scales::percent_format()) + 92 | labs(x = "Year", 93 | y = "% of PhD recipients reporting as male", 94 | color = "Major field", 95 | title = "Breakdown by sex over time within Engineering major fields") 96 | 97 | recipients_year_field_sex %>% 98 | spread(sex, number) %>% 99 | mutate(pct_male = Male / All) %>% 100 | filter(broad_field == "Humanities and arts") %>% 101 | mutate(field = fct_reorder(field, -pct_male)) %>% 102 | ggplot(aes(year, pct_male, color = field)) + 103 | geom_line() + 104 | scale_y_continuous(labels = scales::percent_format()) + 105 | labs(x = "Year", 106 | y = "% of PhD recipients reporting as male", 107 | color = "Major field", 108 | title = "Breakdown by sex over time within Humanities & Arts major fields") 109 | 110 | recipients_year_field_sex %>% 111 | spread(sex, number) %>% 112 | mutate(pct_male = Male / All) %>% 113 | filter(broad_field == "Education") %>% 114 | mutate(field = fct_reorder(field, -pct_male)) %>% 115 | ggplot(aes(year, pct_male, color = field)) + 116 | geom_line() + 117 | scale_y_continuous(labels = scales::percent_format()) + 118 | labs(x = "Year", 119 | y = "% of PhD recipients reporting as male", 120 | color = "Major field", 121 | title = "Breakdown by sex over time within Education major fields") 122 | ``` 123 | 124 | Three levels: 125 | 126 | * Broad field (Life sciences) 127 | * Major field (Biological and biomedical sciences) 128 | * Subfield (Computational biology) 129 | 130 | 131 | -------------------------------------------------------------------------------- /volcano-eruptions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Volcanoes" 3 | output: html_document 4 | --- 5 | 6 | ```{r} 7 | library(tidyverse) 8 | theme_set(theme_light()) 9 | 10 | tuesdata <- 
tidytuesdayR::tt_load('2020-05-12')
11 | ```
12 | 
13 | ```{r}
14 | volcano <- tuesdata$volcano %>%
15 |   mutate(last_eruption_year = as.numeric(last_eruption_year))
16 | 
17 | eruptions <- tuesdata$eruptions
18 | ```
19 | 
20 | ```{r}
21 | volcano %>%
22 |   count(evidence_category, sort = TRUE)
23 | 
24 | volcano %>%
25 |   mutate(years_ago = 2020 - last_eruption_year) %>%
26 |   ggplot(aes(years_ago + 1, fill = evidence_category)) +
27 |   geom_histogram() +
28 |   scale_x_log10()
29 | ```
30 | 
31 | ```{r}
32 | volcano %>%
33 |   count(region, sort = TRUE)
34 | 
35 | volcano %>%
36 |   count(primary_volcano_type, sort = TRUE)
37 | ```
38 | 
39 | ```{r}
40 | library(ggthemes)
41 | 
42 | volcano %>%
43 |   mutate(primary_volcano_type = str_remove(primary_volcano_type, "\\(.*\\)"),
44 |          primary_volcano_type = fct_lump(primary_volcano_type, 6)) %>%
45 |   ggplot(aes(longitude, latitude)) +
46 |   borders() +
47 |   geom_point(aes(color = primary_volcano_type), size = .5) +
48 |   theme_map() +
49 |   labs(title = "Volcanoes of the World",
50 |        color = "Type")
51 | ```
52 | 
53 | ```{r}
54 | library(leaflet)
55 | library(glue)
56 | library(htmlwidgets)
57 | library(DT)
58 | 
59 | template <- "<b>{ volcano_name }</b><br>{ primary_volcano_type }"   # simple glue popup template (superseded by the kable table below)
60 | 
61 | volcano %>%
62 |   mutate(transformed_pop = log2(population_within_5_km + 1),
63 |          pop_color = colorNumeric(c("blue", "red"), transformed_pop)(transformed_pop)) %>%
64 |   gather(key, value,
65 |          volcano_name, primary_volcano_type, last_eruption_year,
66 |          country,
67 |          tectonic_settings,
68 |          population_within_5_km) %>%
69 |   mutate(key = str_to_title(str_replace_all(key, "_", " ")),
70 |          key = paste0("<b>", key, "</b>")) %>%   # bold the field names in the popup table
71 |   replace_na(list(value = "Unknown")) %>%
72 |   nest(data = c(key, value)) %>%
73 |   mutate(html = map(data,
74 |                     knitr::kable,
75 |                     format = "html",
76 |                     escape = FALSE,
77 |                     col.names = c("", ""))) %>%
78 |   leaflet() %>%
79 |   addTiles() %>%
80 |   addCircleMarkers(lat = ~ latitude,
81 |                    lng = ~ longitude,
82 |                    color = ~ pop_color,
83 |                    popup = ~ html,
84 |                    radius = 1) %>%
85 |   addMeasure()
86 | ```
87 | 
88 | ```{r}
89 | volcano %>%
90 |   ggplot(aes(population_within_5_km)) +
91 |   geom_histogram() +
92 |   scale_x_log10()
93 | ```
94 | 
95 | ```{r}
96 | library(gganimate)
97 | 
98 | eruptions %>%
99 |   ggplot(aes(2020 - start_year)) +
100 |   geom_histogram() +
101 |   scale_x_log10()
102 | 
103 | eruptions %>%
104 |   filter(start_year >= 1900,
105 |          eruption_category != "Discredited Eruption") %>%
106 |   ggplot(aes(2020 - start_year)) +
107 |   geom_histogram() +
108 |   scale_x_log10()
109 | ```
110 | 
111 | ```{r}
112 | library(gganimate)
113 | 
114 | anim_graph <- eruptions %>%
115 |   filter(start_year >= 1900) %>%
116 |   mutate(volume = .00001 * 10 ^ vei) %>%
117 |   ggplot(aes(longitude, latitude)) +
118 |   borders() +
119 |   geom_point(aes(size = volume,
120 |                  color = volume)) +
121 |   theme_map() +
122 |   scale_color_gradient2(low = "blue", high = "red",
123 |                         midpoint = log10(.01),
124 |                         trans = "log10",
125 |                         guide = FALSE) +
126 |   scale_size_continuous(range = c(1, 6)) +
127 |   # transition_reveal(start_year) +
128 |   transition_time(start_year) +
129 |   labs(title = "Eruptions: Year { round(frame_time) }") +
130 |   theme(legend.position = "none")
131 | 
132 | animate(anim_graph, 150, fps = 4)
133 | 
134 | anim_save("eruptions.gif")
135 | ```
136 | 
137 | 
138 | 
139 | 
140 | 
141 | ```{r}
142 | eruptions
143 | ```
144 | 
145 | 
-------------------------------------------------------------------------------- /women-workplace-app/app.R: --------------------------------------------------------------------------------
1 | library(shiny)
2 | library(ggplot2)
3 | library(plotly)
4 | library(dplyr)
5 | library(scales)
6 | theme_set(theme_light())
7 | 
8 | jobs_gender <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/jobs_gender.csv")
9 | 
10 | # Define the UI: a category selector and an interactive scatterplot
11 | ui <- fluidPage(
12 | 
13 |     # Application title
14 |     titlePanel("Occupation and gender explorer"),
15 | 
16 |     # Sidebar with a dropdown to choose an occupation category
17 |     sidebarLayout(
18 |         sidebarPanel(
19 |             selectInput("major_category",
20 |                         "Occupation category:",
21 |                         choices = unique(jobs_gender$major_category))
22 |         ),
23 | 
24 |         # Show the scatterplot for the selected category
25 |         mainPanel(
26 |            plotlyOutput("jobs_scatter", height = "700px")
27 |         )
28 |     )
29 | )
30 | 
31 | # Define server logic that draws the occupation scatterplot
32 | server <- function(input, output) {
33 | 
34 |     output$jobs_scatter <- renderPlotly({
35 |         p <- jobs_gender %>%
36 |             filter(year == 2016,
37 |                    total_workers >= 20000) %>%
38 |             filter(major_category == input$major_category) %>%
39 |             arrange(desc(wage_percent_of_male)) %>%
40 |             mutate(percent_female = workers_female / total_workers,
41 | 
wage_percent_female = total_earnings_female / total_earnings_male) %>% 42 | ggplot(aes(percent_female, 43 | wage_percent_female, 44 | color = minor_category, 45 | size = total_workers, 46 | label = occupation)) + 47 | geom_point() + 48 | scale_size_continuous(range = c(1, 10), guide = FALSE) + 49 | labs(x = "% of workforce reported as female", 50 | y = "% of median female salary / median male", 51 | title = "Gender disparity and pay gap in 2016", 52 | subtitle = "Only occupations with at least 20,000 workers total", 53 | color = "Minor category") + 54 | scale_x_continuous(labels = percent_format()) + 55 | scale_y_continuous(labels = percent_format()) 56 | 57 | ggplotly(p) 58 | }) 59 | } 60 | 61 | # Run the application 62 | shinyApp(ui = ui, server = server) 63 | 64 | -------------------------------------------------------------------------------- /women-workplace.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Women in the Workplace" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | theme_set(theme_light()) 14 | 15 | jobs_gender <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/jobs_gender.csv") 16 | earnings_female <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/earnings_female.csv") 17 | employed_gender <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/employed_gender.csv") 18 | ``` 19 | 20 | ```{r} 21 | summarize_jobs_gender <- function(tbl) { 22 | tbl %>% 23 | summarize(total_earnings = sum(total_earnings * total_workers) / sum(total_workers), 24 | total_earnings_male = sum(total_earnings_male * workers_male, na.rm = TRUE) / 25 | sum(workers_male[!is.na(total_earnings_male)]), 26 | total_earnings_female = sum(total_earnings_female * workers_female, na.rm = TRUE) / 27 | sum(workers_female[!is.na(total_earnings_female)]), 28 | total_workers = sum(total_workers), 29 | workers_male = sum(workers_male), 30 | workers_female = sum(workers_female)) %>% 31 | mutate(wage_percent_of_male = total_earnings_female / total_earnings_male) 32 | } 33 | 34 | by_year_major_category <- jobs_gender %>% 35 | filter(complete.cases(.)) %>% 36 | group_by(year, major_category) %>% 37 | summarize_jobs_gender() 38 | 39 | by_year_major_category %>% 40 | mutate(major_category = fct_reorder(major_category, -total_earnings)) %>% 41 | ggplot(aes(year, total_earnings, color = major_category)) + 42 | geom_line() + 43 | expand_limits(y = 0) 44 | 45 | by_year_major_category %>% 46 | mutate(major_category = fct_reorder(major_category, -wage_percent_of_male)) %>% 47 | ggplot(aes(year, wage_percent_of_male, color = major_category)) + 48 | geom_line() 49 | ``` 50 | 51 | ```{r} 52 | by_minor_category_2016 <- jobs_gender %>% 53 | filter(year == 2016) %>% 54 | group_by(major_category, minor_category) %>% 55 | summarize_jobs_gender() %>% 56 | ungroup() 57 | ``` 58 | 59 | ```{r} 60 | by_minor_category_2016 %>% 61 | mutate(minor_category = fct_reorder(minor_category, wage_percent_of_male)) %>% 62 | ggplot(aes(minor_category, wage_percent_of_male, fill = major_category)) + 63 | geom_col() + 64 | coord_flip() 65 | ``` 66 | 67 | ```{r} 68 | library(plotly) 69 | 70 | p <- jobs_gender %>% 71 | filter(year == 2016) %>% 72 | filter(major_category == "Healthcare 
Practitioners and Technical") %>% 73 | arrange(desc(wage_percent_of_male)) %>% 74 | ggplot(aes(workers_female / total_workers, 75 | total_earnings, 76 | size = total_workers, 77 | label = occupation)) + 78 | geom_point() + 79 | scale_size_continuous(range = c(1, 10)) + 80 | labs(size = "Total # of workers", 81 | x = "% of workforce reported as female", 82 | y = "Median salary in the occupation") + 83 | scale_x_continuous(labels = percent_format()) + 84 | scale_y_continuous(labels = dollar_format()) + 85 | expand_limits(y = 0) 86 | 87 | ggplotly(p) 88 | ``` 89 | 90 | ```{r} 91 | p <- jobs_gender %>% 92 | filter(year == 2016, 93 | total_workers >= 20000) %>% 94 | filter(major_category == "Computer, Engineering, and Science") %>% 95 | arrange(desc(wage_percent_of_male)) %>% 96 | ggplot(aes(workers_female / total_workers, 97 | total_earnings_female / total_earnings_male, 98 | color = minor_category, 99 | size = total_workers, 100 | label = occupation)) + 101 | geom_point() + 102 | scale_size_continuous(range = c(1, 10)) + 103 | labs(size = "Total # of workers", 104 | x = "% of workforce reported as female", 105 | y = "% of median female salary / median male") + 106 | scale_x_continuous(labels = percent_format()) + 107 | scale_y_continuous(labels = percent_format()) 108 | 109 | ggplotly(p) 110 | ``` 111 | 112 | --------------------------------------------------------------------------------