├── .gitignore ├── 2020_07_28_penguins.Rmd ├── 2020_08_04_europe_energy.Rmd ├── 2020_08_18_extinct_plants.Rmd ├── 2020_08_25_chopped.Rmd ├── 2020_09_01_crop_yields.Rmd ├── 2020_09_08_friends.Rmd ├── 2020_09_15_government_spending_kids.Rmd ├── 2020_09_22_himalayan_climbers.Rmd ├── 2020_09_29_taylor_swift_beyonce.Rmd ├── 2020_10_06_ncaa_womens_basketball.Rmd ├── 2020_10_20_beer_awards.Rmd ├── 2020_11_03_ikea.Rmd ├── 2020_11_10_phone_history.Rmd ├── README.md ├── african-american-achievements.Rmd ├── african-american-history.Rmd ├── animal-crossing.Rmd ├── australian-animal-outcomes.Rmd ├── baltimore_bridges.Rmd ├── beach-volleyball.Rmd ├── beer-production.Rmd ├── bike_traffic.Rmd ├── bird-collisions.Rmd ├── board-games.Rmd ├── bob-ross.Rmd ├── broadway-shinybones ├── .gitignore ├── README.md ├── _site.yml ├── app.R ├── broadway-shinybones.Rproj ├── components │ └── README.md ├── pages │ └── README.md └── show_metrics.rds ├── broadway.Rmd ├── car-economy.Rmd ├── caribou-locations.Rmd ├── cetaceans.Rmd ├── cocktails.Rmd ├── coffee-ratings.Rmd ├── college-majors.Rmd ├── cord-19.Rmd ├── cran-code.Rmd ├── crop-yields-shiny ├── app.Rmd └── yields_tidy.rds ├── data-screencasts.Rproj ├── french-trains.Rmd ├── gdpr.Rmd ├── golden-age-tv.Rmd ├── grand-slams.Rmd ├── honeycomb-puzzle.Rmd ├── horror-movie-ratings.Rmd ├── malaria.Rmd ├── media-franchises.Rmd ├── medium-datasci.Rmd ├── movie-profit.Rmd ├── nobel-prize.Rmd ├── nyc-pizza.Rmd ├── nyc-restaurants.Rmd ├── nyc-squirrels-app └── app.R ├── nyc-squirrels.Rmd ├── office-transcripts.Rmd ├── pascals-triangle.Rmd ├── plastic-waste.Rmd ├── r-downloads.Rmd ├── ramen-ratings.Rmd ├── riddler-die-reroll.Rmd ├── riddler-die-roll-low.Rmd ├── riddler-prisoner-coin-flip.Rmd ├── riddler-spam-comments.Rmd ├── screencast-annotations └── README.md ├── seattle-pets.Rmd ├── simpsons-guests.Rmd ├── space-launches.Rmd ├── student-teacher-ratios.Rmd ├── thanksgiving.Rmd ├── tidytuesday-tweets.Rmd ├── tour-de-france.Rmd ├── trees.Rmd ├── umbrella-week.Rmd ├── uncanny-xmen.Rmd ├── us-dairy.Rmd ├── us-wind.Rmd ├── us_phds.Rmd ├── volcano-eruptions.Rmd ├── wine-ratings.Rmd ├── women-workplace-app └── app.R ├── women-workplace.Rmd └── womens-world-cup.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /2020_09_01_crop_yields.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "TidyTemplate" 3 | date: 2020-09-01 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | library(tidyverse) 12 | library(tidytuesdayR) 13 | library(scales) 14 | library(janitor) 15 | theme_set(theme_light()) 16 | 17 | ``` 18 | 19 | # Load the weekly Data 20 | 21 | Download the weekly data and make it available in the `tt` object.
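If `tt_load()` is unavailable (e.g. GitHub rate limits), the week's files can also be read straight from the tidytuesday repository. A sketch, assuming the standard raw-GitHub file layout (the URL pattern matches the one used elsewhere in this repo, e.g. in animal-crossing.Rmd, and the file name is inferred from the `tt$key_crop_yields` accessor below):

```{r eval = FALSE}
# Fallback sketch: read one of the week's files directly (URL assumed)
key_crop_yields <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-09-01/key_crop_yields.csv")
```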
22 | 23 | ```{r Load} 24 | tt <- tt_load("2020-09-01") 25 | ``` 26 | 27 | ```{r} 28 | tt$key_crop_yields %>% 29 | View() 30 | 31 | yields <- tt$key_crop_yields %>% 32 | clean_names() %>% 33 | rename_all(str_remove, "_tonnes.*") 34 | 35 | arable_land <- tt$arable_land_pin %>% 36 | clean_names() %>% 37 | rename(arable_land_needed = 4) 38 | 39 | arable_land %>% 40 | filter(entity == "Afghanistan") %>% 41 | ggplot(aes(year, arable_land_needed)) + 42 | geom_line() 43 | 44 | fertilizer <- tt$cereal_crop_yield_vs_fertilizer_application %>% 45 | janitor::clean_names() %>% 46 | rename(yield = 4, 47 | fertilizer_use = 5) 48 | ``` 49 | 50 | ```{r} 51 | yields_tidy <- yields %>% 52 | pivot_longer(wheat:bananas, names_to = "crop", values_to = "yield") %>% 53 | filter(!is.na(yield)) %>% 54 | mutate(crop = str_replace_all(crop, "_", " "), 55 | crop = str_to_title(crop)) 56 | 57 | yields_tidy %>% 58 | write_rds("crop-yields-shiny/yields_tidy.rds") 59 | ``` 60 | 61 | ```{r} 62 | yields_tidy %>% 63 | filter(code == "USA") %>% 64 | mutate(crop = fct_reorder(crop, -yield)) %>% 65 | ggplot(aes(year, yield)) + 66 | geom_line() + 67 | facet_wrap(~ crop) 68 | 69 | yields_tidy %>% 70 | filter(code == "USA") %>% 71 | mutate(crop = fct_reorder(crop, -yield)) %>% 72 | ggplot(aes(year, yield, color = crop)) + 73 | geom_line() + 74 | labs(x = "Year", 75 | y = "Yield (tonnes per hectare)", 76 | title = "Crop yields in the US over time", 77 | color = "Crop") 78 | 79 | yields_tidy %>% 80 | filter(entity == "India") %>% 81 | mutate(crop = fct_reorder(crop, -yield)) %>% 82 | ggplot(aes(year, yield, color = crop)) + 83 | geom_line() + 84 | labs(x = "Year", 85 | y = "Yield (tonnes per hectare)", 86 | title = "Crop yields in India over time", 87 | color = "Crop") 88 | ``` 89 | 90 | ```{r} 91 | yields_tidy %>% 92 | filter(crop == "Wheat") %>% 93 | add_count(entity) %>% 94 | filter(n == max(n)) %>% 95 | filter(entity %in% sample(unique(entity), 25)) %>% 96 | ggplot(aes(year, yield)) + 97 | geom_line() + 98 | facet_wrap(~ entity) 99 | 100 | crop_yields_50_years <- yields_tidy %>% 101 | arrange(entity, year) %>% 102 | filter(year >= 1968) %>% 103 | group_by(entity, code, crop) %>% 104 | summarize(year_start = min(year), 105 | year_end = max(year), 106 | yield_start = first(yield), 107 | yield_end = last(yield)) %>% 108 | ungroup() %>% 109 | filter(year_start == 1968) %>% 110 | mutate(yield_ratio = yield_end / yield_start) 111 | 112 | crop_yields_50_years %>% 113 | filter(!is.na(code)) %>% 114 | ggplot(aes(yield_start, yield_end)) + 115 | geom_abline(color = "red") + 116 | geom_point() + 117 | facet_wrap(~ crop, scales = "free") 118 | 119 | crop_yields_50_years %>% 120 | mutate(crop = fct_reorder(crop, yield_ratio)) %>% 121 | ggplot(aes(yield_ratio, crop)) + 122 | geom_boxplot() + 123 | scale_x_log10() 124 | 125 | crop_yields_50_years %>% 126 | group_by(crop) %>% 127 | summarize(median_yield_ratio = median(yield_ratio)) %>% 128 | mutate(crop = fct_reorder(crop, median_yield_ratio)) %>% 129 | ggplot(aes(median_yield_ratio, crop)) + 130 | geom_col() + 131 | labs(title = "How much has the average country improved at producing this crop?", 132 | x = "(2018 yield) / (1968 yield)", 133 | y = "") 134 | 135 | crop_yields_50_years %>% 136 | filter(is.na(code)) %>% 137 | filter(entity %in% c("Africa", "Asia", "Northern America", "South America", "Oceania")) %>% 138 | ggplot(aes(yield_start, yield_end, color = entity)) + 139 | geom_abline(color = "red") + 140 | geom_point() + 141 | expand_limits(y = 0, x = 0) + 142 | facet_wrap(~ crop, 
scales = "free") + 143 | labs(x = "Tonnes per hectare in 1968", 144 | y = "Tonnes per hectare in 2018", 145 | color = "Continent") 146 | ``` 147 | 148 | ```{r} 149 | library(ggrepel) 150 | library(countrycode) 151 | 152 | crop_yields_50_years %>% 153 | filter(crop == "Wheat", 154 | !is.na(code)) %>% 155 | mutate(continent = countrycode(code, "iso3c", "continent")) %>% 156 | filter(!is.na(continent)) %>% 157 | ggplot(aes(yield_start, yield_ratio)) + 158 | geom_point(aes(color = continent)) + 159 | scale_x_log10() + 160 | scale_y_log10(breaks = c(.25, .5, 1, 2, 4), 161 | labels = c("1/4X", "1/2X", "Same", "2X", "4X")) + 162 | geom_hline(yintercept = 1, linetype = "dotted") + 163 | geom_text_repel(aes(label = entity), force = .1, 164 | size = 2.5) + 165 | labs(x = "1968 yield (tonnes per hectare), log scale", 166 | y = "(2018 yield) / (1968 yield), log scale", 167 | color = "Continent", 168 | title = "How has wheat efficiency changed across countries?") 169 | 170 | countrycode("USA", "iso3c", "continent") 171 | ``` 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Code from screencasts of #tidytuesday 2 | 3 | These are the R Markdown documents produced during live analyses of [#tidytuesday data](https://github.com/rfordatascience/tidytuesday). 4 | -------------------------------------------------------------------------------- /african-american-achievements.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "African-American Achievements" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | tuesdata <- tidytuesdayR::tt_load('2020-06-09') 15 | 16 | science <- tuesdata$science 17 | ``` 18 | 19 | ```{r} 20 | tuesdata$firsts %>% 21 | View() 22 | 23 | tuesdata$science %>% 24 | View() 25 | ``` 26 | 27 | ```{r} 28 | firsts <- tuesdata$firsts 29 | 30 | tuesdata$firsts %>% 31 | ggplot(aes(year)) + 32 | geom_histogram() 33 | 34 | tuesdata$firsts %>% 35 | count(category, sort = TRUE) %>% 36 | mutate(category = fct_reorder(category, n)) %>% 37 | ggplot(aes(n, category)) + 38 | geom_col() 39 | ``` 40 | 41 | ```{r} 42 | firsts <- tuesdata$firsts %>% 43 | mutate(person = str_remove(person, "[\\[\\(].*"), 44 | person = str_trim(person)) 45 | ``` 46 | 47 | ```{r} 48 | library(plotly) 49 | library(glue) 50 | 51 | g <- firsts %>% 52 | ggplot(aes(year, 53 | category, 54 | color = category, 55 | text = glue("{ year }: { accomplishment }\n{ person }"))) + 56 | geom_point() + 57 | theme(axis.text.y = element_blank(), 58 | axis.ticks.y = element_blank(), 59 | panel.grid.major.y = element_blank(), 60 | panel.grid.minor.y = element_blank(), 61 | legend.position = "none") + 62 | labs(title = "Timeline of some notable African-American achievements", 63 | subtitle = "Source: https://en.wikipedia.org/wiki/List_of_African-American_firsts", 64 | y = "Category", 65 | x = "Year") 66 | 67 | ggplotly(g, tooltip = "text") 68 | ``` 69 | 70 | ### Science 71 | 72 | ```{r} 73 | tuesdata$science %>% 74 | ggplot(aes(birth)) + 75 | geom_histogram() 76 | 77 | tuesdata$science %>% 78 | separate_rows(occupation_s, sep = "; ") %>% 79 | mutate(occupation = str_to_title(occupation_s)) %>% 80 | count(occupation, sort = TRUE) 81 | 82 | science %>% 83 | filter(str_detect(occupation_s, 
regex("istician", ignore_case = TRUE))) %>% 84 | pull(name) 85 | 86 | science %>% 87 | filter(str_detect(occupation_s, "statistician")) %>% 88 | View() 89 | ``` 90 | 91 | ```{r} 92 | library(rvest) 93 | 94 | science_html <- science %>% 95 | mutate(html = map(links, possibly(read_html, NULL, quiet = FALSE))) 96 | ``` 97 | 98 | ```{r} 99 | extract_infobox <- . %>% 100 | html_node(".vcard") %>% 101 | html_table(header = FALSE) %>% 102 | as_tibble() 103 | 104 | infoboxes <- science_html %>% 105 | filter(!map_lgl(html, is.null)) %>% 106 | mutate(infobox = map(html, possibly(extract_infobox, NULL))) %>% 107 | select(link = links, infobox) %>% 108 | unnest(infobox) %>% 109 | filter(X1 != "" | X2 != "", X1 != "Scientific career") %>% 110 | rename(key = X1, value = X2) 111 | 112 | science_infoboxes <- infoboxes %>% 113 | group_by(link) %>% 114 | mutate(name = first(key)) %>% 115 | group_by(key) %>% 116 | filter(n() >= 10) %>% 117 | ungroup() %>% 118 | distinct(name, key, .keep_all = TRUE) %>% 119 | spread(key, value) %>% 120 | janitor::clean_names() 121 | ``` 122 | 123 | ```{r} 124 | science_infoboxes %>% 125 | count(nationality, sort = TRUE) 126 | ``` 127 | 128 | 129 | 130 | 131 | 132 | -------------------------------------------------------------------------------- /african-american-history.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "African-American History" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(tidytuesdayR) 13 | library(scales) 14 | theme_set(theme_light()) 15 | 16 | tuesdata <- tt_load('2020-06-16') 17 | ``` 18 | 19 | ```{r} 20 | slave_routes <- tuesdata$slave_routes 21 | 22 | slave_routes %>% 23 | summarize(sum(n_slaves_arrived, na.rm = TRUE)) 24 | 25 | slave_routes %>% 26 | count(ship_name, sort = TRUE) 27 | 28 | slave_routes %>% 29 | ggplot(aes(n_slaves_arrived)) + 30 | geom_histogram() 31 | ``` 32 | 33 | ```{r} 34 | slave_routes %>% 35 | filter(!is.na(ship_name), !is.na(n_slaves_arrived)) %>% 36 | filter(fct_lump(ship_name, 12) != "Other") %>% 37 | mutate(ship_name = fct_reorder(ship_name, n_slaves_arrived)) %>% 38 | ggplot(aes(n_slaves_arrived, ship_name)) + 39 | geom_boxplot() 40 | ``` 41 | 42 | ```{r} 43 | slave_routes %>% 44 | count(port_arrival, sort = TRUE) 45 | 46 | slave_routes %>% 47 | count(port_origin, sort = TRUE) 48 | 49 | 50 | slave_routes %>% 51 | ggplot(aes(year_arrival)) + 52 | geom_histogram() + 53 | geom_vline(xintercept = c(1776, 1865), lty = 2) + 54 | labs(title = "# of slave ships over time") 55 | ``` 56 | 57 | ```{r} 58 | slave_routes %>% 59 | filter(!is.na(port_origin)) %>% 60 | mutate(port_origin = str_trunc(port_origin, 25), 61 | port_origin = fct_lump(port_origin, 24), 62 | port_origin = fct_reorder(port_origin, year_arrival)) %>% 63 | count(port_origin, 64 | decade = 10 * (year_arrival %/% 10)) %>% 65 | ggplot(aes(decade, n)) + 66 | geom_line() + 67 | facet_wrap(~ port_origin) + 68 | geom_vline(xintercept = c(1807), lty = 2) + 69 | theme(strip.text = element_text(size = 6)) 70 | 71 | slave_routes %>% 72 | filter(port_origin == "New York") %>% 73 | arrange(desc(year_arrival)) %>% 74 | View() 75 | ``` 76 | 77 | ```{r} 78 | slave_routes %>% 79 | filter(fct_lump(ship_name, 12) != "Other") %>% 80 | count(ship_name, port_origin, port_arrival, sort = TRUE) %>% 81 | group_by(ship_name) %>% 82 | mutate(percent = n / sum(n)) %>% 83 | top_n(1, percent) %>% 84 | 
arrange(desc(percent)) 85 | ``` 86 | 87 | ```{r} 88 | library(grid) 89 | library(ggraph) 90 | 91 | set.seed(2020) 92 | 93 | slave_routes %>% 94 | count(port_origin, port_arrival, sort = TRUE) %>% 95 | filter(port_origin != port_arrival) %>% 96 | head(40) %>% 97 | ggraph(layout = "fr") + 98 | geom_edge_link(aes(width = n), 99 | arrow = arrow(type = "closed", 100 | length = unit(.1, "inch"))) + 101 | geom_node_point() + 102 | geom_node_text(aes(label = name), repel = TRUE) + 103 | scale_edge_width_continuous(range = c(.5, 4)) + 104 | labs(title = "Common slave routes", 105 | edge_width = "# of ships") 106 | ``` 107 | 108 | ```{r} 109 | slave_routes %>% 110 | group_by(port_origin) %>% 111 | summarize(n_ships = n(), 112 | total_recorded = sum(n_slaves_arrived, na.rm = TRUE), 113 | pct_missing = mean(is.na(n_slaves_arrived)), 114 | estimated_total = mean(n_slaves_arrived, na.rm = TRUE) * n()) %>% 115 | arrange(desc(n_ships)) 116 | ``` 117 | 118 | ```{r} 119 | census <- tuesdata$census 120 | 121 | census_gathered <- census %>% 122 | mutate(other = total - white - black) %>% 123 | arrange(year) %>% 124 | gather(racial_category, population, white, black_free, black_slaves, other) 125 | 126 | census_gathered %>% 127 | filter(region == "USA Total") %>% 128 | ggplot(aes(year, population, fill = racial_category)) + 129 | geom_col() + 130 | scale_y_continuous(labels = comma) + 131 | labs(x = "Year", 132 | y = "Census Population", 133 | fill = "Racial category", 134 | title = "Census racial makeup of US, 1790-1870", 135 | subtitle = "No 'other' category existed before 1860") 136 | ``` 137 | 138 | ```{r} 139 | census_gathered %>% 140 | filter(region != "USA Total", !is.na(division)) %>% 141 | mutate(division = fct_reorder(division, -population, sum)) %>% 142 | ggplot(aes(year, population, fill = racial_category)) + 143 | geom_col() + 144 | scale_y_continuous(labels = comma) + 145 | facet_wrap(~ division) + 146 | labs(x = "Year", 147 | y = "Census Population", 148 | fill = "Racial category", 149 | title = "Census racial makeup of US, 1790-1870", 150 | subtitle = "No 'other' category existed before 1860") 151 | ``` 152 | 153 | ```{r} 154 | census_gathered %>% 155 | filter(region != "USA Total", !is.na(division)) %>% 156 | mutate(division = fct_reorder(division, -population, sum)) %>% 157 | group_by(division, year) %>% 158 | mutate(percent = population / sum(population)) %>% 159 | ggplot(aes(year, percent, fill = racial_category)) + 160 | geom_col() + 161 | scale_y_continuous(labels = percent) + 162 | facet_wrap(~ division) + 163 | labs(x = "Year", 164 | y = "% of Census Population", 165 | fill = "Racial category", 166 | title = "Census racial makeup of US, 1790-1870", 167 | subtitle = "No 'other' category existed before 1860") 168 | ``` 169 | 170 | ```{r} 171 | library(ggwordcloud) 172 | 173 | name_counts <- tuesdata$african_names %>% 174 | count(name, gender, sort = TRUE) 175 | 176 | wordcloud::wordcloud(name_counts$name, name_counts$n) 177 | 178 | name_counts %>% 179 | head(100) %>% 180 | ggplot(aes(label = name, size = n, color = gender)) + 181 | geom_text_wordcloud() 182 | ``` 183 | 184 | ```{r} 185 | library(tidytext) 186 | 187 | tuesdata$african_names %>% 188 | filter(!is.na(gender)) %>% 189 | mutate(gender = fct_recode(gender, Man = "Boy", Woman = "Girl")) %>% 190 | count(name, gender, sort = TRUE) %>% 191 | group_by(gender) %>% 192 | top_n(20, n) %>% 193 | ungroup() %>% 194 | mutate(name = reorder_within(name, n, gender)) %>% 195 | ggplot(aes(n, name)) + 196 | geom_col() + 197 | scale_y_reordered() + 198 | 
facet_wrap(~ gender, scales = "free") 199 | ``` 200 | 201 | ```{r} 202 | tuesdata$african_names %>% 203 | count(country_origin, sort = TRUE) 204 | ``` 205 | 206 | 207 | -------------------------------------------------------------------------------- /animal-crossing.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Animal Crossing" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | critic <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/critic.tsv') 15 | user_reviews <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/user_reviews.tsv') 16 | items <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/items.csv') 17 | villagers <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/villagers.csv') 18 | ``` 19 | 20 | ```{r} 21 | villagers 22 | items %>% 23 | View() 24 | ``` 25 | 26 | ### Text Analysis 27 | 28 | ```{r} 29 | critic %>% 30 | ggplot(aes(grade)) + 31 | geom_histogram() 32 | 33 | user_reviews %>% 34 | ggplot(aes(grade)) + 35 | geom_histogram() 36 | 37 | head(user_reviews$text) 38 | ``` 39 | 40 | ```{r} 41 | library(tidytext) 42 | library(lubridate) 43 | 44 | user_review_words <- user_reviews %>% 45 | unnest_tokens(word, text) %>% 46 | anti_join(stop_words, by = "word") %>% 47 | count(user_name, date, grade, word) 48 | ``` 49 | 50 | ```{r} 51 | by_week <- user_reviews %>% 52 | group_by(week = floor_date(date, "week", week_start = 1)) %>% 53 | summarize(nb_reviews = n(), 54 | avg_grade = mean(grade), 55 | pct_zero = mean(grade == 0), 56 | pct_ten = mean(grade == 10)) 57 | 58 | by_week %>% 59 | ggplot(aes(week, avg_grade)) + 60 | geom_line() + 61 | geom_point(aes(size = nb_reviews)) + 62 | expand_limits(y = 0) + 63 | labs(x = "Time", 64 | y = "Average grade", 65 | size = "# of reviews") 66 | 67 | user_reviews %>% 68 | ggplot(aes(grade)) + 69 | geom_histogram() + 70 | labs(title = "Most reviews were very low or very high") 71 | 72 | by_week %>% 73 | gather(type, value, contains("pct")) %>% 74 | mutate(type = ifelse(type == "pct_zero", "% rated 0", "% rated 10")) %>% 75 | ggplot(aes(week, value, color = type)) + 76 | geom_line() + 77 | geom_point(aes(size = nb_reviews)) + 78 | scale_y_continuous(labels = scales::percent) + 79 | expand_limits(y = 0) + 80 | labs(x = "Time", 81 | y = "% of reviews", 82 | size = "Total reviews in week", 83 | title = "Reviews got more polarizing in middle of game") 84 | ``` 85 | 86 | 87 | ```{r} 88 | # user_reviews %>% 89 | # mutate(start_text = str_sub(text, 1, pmin(30, str_length(text))), 90 | # regex = paste0(".", start_text, ".*")) %>% 91 | # mutate(new_text = map2_chr(text, regex, str_remove)) %>% 92 | # filter(str_length(new_text) != str_length(text)) 93 | 94 | by_word <- user_review_words %>% 95 | group_by(word) %>% 96 | summarize(avg_grade = mean(grade), 97 | nb_reviews = n()) %>% 98 | arrange(desc(nb_reviews)) %>% 99 | filter(nb_reviews >= 25) %>% 100 | arrange(desc(avg_grade)) 101 | 102 | by_word %>% 103 | filter(nb_reviews >= 75) %>% 104 | ggplot(aes(nb_reviews, avg_grade)) + 105 | geom_point() + 106 | geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) + 107 | scale_x_log10() 108 | ``` 109 | 110 | 
```{r} 111 | by_word %>% 112 | top_n(20, -avg_grade) %>% 113 | ggplot(aes(nb_reviews, avg_grade)) + 114 | geom_point() + 115 | geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) + 116 | scale_x_log10() + 117 | labs(title = "What words were associated with low-grade reviews?", 118 | subtitle = "20 most negative words; only words in at least 25 reviews") 119 | ``` 120 | 121 | ```{r} 122 | library(widyr) 123 | library(stm) 124 | 125 | review_matrix <- user_review_words %>% 126 | group_by(word) %>% 127 | filter(n() >= 25) %>% 128 | cast_sparse(user_name, word, n) 129 | 130 | topic_model_6 <- stm(review_matrix, 131 | K = 6, 132 | verbose = TRUE, 133 | init.type = "Spectral", 134 | emtol = 5e-5) 135 | ``` 136 | 137 | ```{r} 138 | tidy(topic_model_6) %>% 139 | group_by(topic) %>% 140 | top_n(12, beta) %>% 141 | mutate(term = reorder_within(term, beta, topic)) %>% 142 | ggplot(aes(beta, term, fill = factor(topic))) + 143 | geom_col(show.legend = FALSE) + 144 | scale_y_reordered() + 145 | facet_wrap(~ topic, scales = "free_y") 146 | 147 | tidy(topic_model_6) %>% 148 | filter(term == "progress") 149 | 150 | topic_model_gamma <- tidy(topic_model_6, matrix = "gamma") %>% 151 | mutate(user_name = rownames(review_matrix)[document]) %>% 152 | inner_join(user_reviews, by = "user_name") 153 | 154 | topic_model_gamma %>% 155 | group_by(topic) %>% 156 | top_n(1, gamma) 157 | 158 | topic_model_gamma %>% 159 | group_by(topic) %>% 160 | summarize(correlation = cor(gamma, grade), 161 | spearman_correlation = cor(gamma, grade, method = "spearman")) 162 | 163 | topic_model_gamma %>% 164 | group_by(week = floor_date(date, "week", week_start = 1), 165 | topic) %>% 166 | summarize(avg_gamma = mean(gamma)) %>% 167 | ggplot(aes(week, avg_gamma, color = factor(topic))) + 168 | geom_line() + 169 | expand_limits(y = 0) + 170 | scale_y_continuous(labels = scales::percent) + 171 | labs(x = "Time", 172 | y = "Average gamma (document-topic association)") 173 | ``` 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | -------------------------------------------------------------------------------- /beach-volleyball.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Beach Volleyball" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | library(lubridate) 14 | theme_set(theme_light()) 15 | 16 | tuesdata <- tidytuesdayR::tt_load('2020-05-19') 17 | 18 | vb_matches <- tuesdata$vb_matches %>% 19 | mutate(match_id = row_number()) 20 | ``` 21 | 22 | ```{r} 23 | vb_matches %>% 24 | count(circuit, tournament, date, sort = TRUE) 25 | 26 | vb_matches %>% 27 | count(gender, sort = TRUE) 28 | 29 | vb_matches %>% 30 | count(year) 31 | 32 | ``` 33 | 34 | ```{r} 35 | vb_long <- vb_matches %>% 36 | rename(w_p1_name = w_player1, w_p2_name = w_player2, 37 | l_p1_name = l_player1, l_p2_name = l_player2, 38 | w_team_rank = w_rank, l_team_rank = l_rank) %>% 39 | mutate_at(vars(starts_with("w_"), starts_with("l_")), as.character) %>% 40 | pivot_longer(cols = c(starts_with("w_"), starts_with("l_"))) %>% 41 | separate(name, c("winner_loser", "player", "name"), 42 | sep = "_", 43 | extra = "merge", 44 | fill = "right") %>% 45 | mutate(winner_loser = str_to_upper(winner_loser)) 46 | 47 | vb_player_matches <- vb_long %>% 48 | filter(name != "rank") %>% 49 | spread(name, value) %>% 50 | type_convert() 51 | ``` 52 | 53 | ```{r} 54 | vb_sets <- vb_matches 
%>% 55 | select(match_id, circuit:match_num, score) %>% 56 | separate_rows(score, sep = ", ") %>% 57 | mutate(score = str_remove(score, " retired")) %>% 58 | mutate(score = na_if(score, "Forfeit or other")) %>% 59 | separate(score, c("w_score", "l_score"), convert = TRUE) 60 | ``` 61 | 62 | ## Look at players 63 | 64 | ```{r} 65 | by_player <- vb_player_matches %>% 66 | group_by(name, gender) %>% 67 | summarize(n_matches = n(), 68 | pct_winner = mean(winner_loser == "W"), 69 | first_game = min(date), 70 | last_game = max(date)) %>% 71 | arrange(desc(n_matches)) %>% 72 | ungroup() 73 | 74 | by_player %>% 75 | filter(n_matches >= 200) %>% 76 | ggplot(aes(n_matches, pct_winner, color = gender)) + 77 | geom_point() + 78 | scale_x_log10() + 79 | scale_y_continuous(labels = percent) + 80 | labs(x = "# of matches since 2000 (log scale)", 81 | y = "% of matches won") 82 | 83 | by_player %>% 84 | filter(n_matches >= 200) %>% 85 | arrange(desc(pct_winner)) 86 | ``` 87 | 88 | ```{r} 89 | vb_player_matches %>% 90 | summarize_all(~ mean(!is.na(.))) %>% 91 | gather() %>% 92 | View() 93 | ``` 94 | 95 | ```{r} 96 | vb_player_matches %>% 97 | group_by(tournament) %>% 98 | summarize(pct_has_attacks = mean(!is.na(tot_attacks)), 99 | n = n()) %>% 100 | arrange(desc(n)) 101 | ``` 102 | 103 | ### How would we predict if a player will win in 2019? 104 | 105 | ```{r} 106 | summarize_players <- . %>% 107 | summarize(n_matches = n(), 108 | pct_winner = mean(winner_loser == "W"), 109 | avg_attacks = mean(tot_attacks, na.rm = TRUE), 110 | avg_errors = mean(tot_errors, na.rm = TRUE), 111 | avg_serve_errors = mean(tot_serve_errors, na.rm = TRUE), 112 | avg_kills = mean(tot_kills, na.rm = TRUE), 113 | avg_aces = mean(tot_aces, na.rm = TRUE), 114 | n_with_data = sum(!is.na(tot_attacks))) %>% 115 | ungroup() %>% 116 | arrange(desc(n_matches)) 117 | 118 | players_before_2019 <- vb_player_matches %>% 119 | filter(year < 2019) %>% 120 | group_by(name, gender, hgt, birthdate, country) %>% 121 | summarize_players() %>% 122 | filter(!is.na(avg_attacks)) 123 | 124 | players_2019 <- vb_player_matches %>% 125 | filter(year == 2019) %>% 126 | group_by(name, gender, hgt, birthdate, country, year, 127 | age = year - year(birthdate)) %>% 128 | summarize_players() 129 | ``` 130 | 131 | ```{r} 132 | performance_joined <- players_before_2019 %>% 133 | inner_join(players_2019 %>% 134 | select(name, n_matches, pct_winner), 135 | by = "name", 136 | suffix = c("", "_2019")) 137 | 138 | performance_joined %>% 139 | filter(n_matches >= 10, 140 | n_matches_2019 >= 5) %>% 141 | ggplot(aes(pct_winner, pct_winner_2019)) + 142 | geom_point() + 143 | geom_abline(color = "red") + 144 | geom_smooth(method = "lm") 145 | 146 | performance_joined %>% 147 | mutate(n_wins_2019 = n_matches_2019 * pct_winner_2019, 148 | country = fct_lump(country, 3)) %>% 149 | glm(cbind(n_wins_2019, n_matches_2019 - n_wins_2019) ~ 150 | pct_winner + avg_errors + avg_serve_errors, 151 | data = ., 152 | family = "binomial") %>% 153 | summary() 154 | ``` 155 | 156 | 157 | ```{r} 158 | players_before_2019 %>% 159 | filter(n_with_data >= 20) %>% 160 | ggplot(aes(avg_serve_errors, avg_aces, size = n_with_data)) + 161 | geom_point() + 162 | labs(size = "Matches", 163 | title = "DON'T TRUST THIS") 164 | ``` 165 | 166 | ```{r} 167 | 168 | ``` 169 | 170 | 171 | 172 | ```{r} 173 | 174 | ``` 175 | 176 | 177 | ```{r} 178 | player_first_year_summarized %>% 179 | filter(!is.nan(avg_attacks)) 180 | ``` 181 | 182 | 183 | 184 | 
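The last chunk above references `player_first_year_summarized`, which is never defined in this document. A minimal sketch of how it could be built from the objects above, assuming the intent was to apply the `summarize_players()` helper to each player's first year on tour:

```{r}
# Sketch (assumed intent): per-player stats from their first year on tour
player_first_year_summarized <- vb_player_matches %>%
  group_by(name) %>%
  filter(year == min(year)) %>%
  group_by(name, gender, year) %>%
  summarize_players()
```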
-------------------------------------------------------------------------------- /bike_traffic.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bike Traffic" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(lubridate) 13 | library(scales) 14 | theme_set(theme_light()) 15 | 16 | bike_traffic_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-02/bike_traffic.csv") 17 | 18 | bike_traffic <- bike_traffic_raw %>% 19 | mutate(date = mdy_hms(date)) %>% 20 | filter(bike_count < 2000) %>% 21 | select(-ped_count) 22 | ``` 23 | 24 | ```{r} 25 | bike_traffic %>% 26 | count(crossing, direction) 27 | ``` 28 | 29 | ```{r} 30 | bike_traffic %>% 31 | ggplot(aes(date, fill = is.na(bike_count))) + 32 | geom_histogram() + 33 | facet_grid(crossing ~ direction) 34 | ``` 35 | 36 | ### When in the day do we see bikers? 37 | 38 | ```{r} 39 | bike_traffic %>% 40 | group_by(crossing, 41 | hour = hour(date)) %>% 42 | summarize(bike_count = sum(bike_count, na.rm = TRUE)) %>% 43 | mutate(pct_bike = bike_count / sum(bike_count)) %>% 44 | ggplot(aes(hour, pct_bike, color = crossing)) + 45 | geom_line() + 46 | geom_point() + 47 | scale_y_continuous(labels = percent_format()) + 48 | labs(title = "When in the day do people bike through these Seattle crossings?", 49 | subtitle = "Based on crossings from 2014-February 2019", 50 | color = "Crossing", 51 | x = "Time of day (local time)", 52 | y = "% of bike crossings that happen in this hour") 53 | ``` 54 | 55 | ```{r} 56 | bike_by_time_window <- bike_traffic %>% 57 | mutate(hour = hour(date)) %>% 58 | mutate(time_window = case_when( 59 | between(hour, 7, 10) ~ "Morning Commute", 60 | between(hour, 11, 15) ~ "Midday", 61 | between(hour, 16, 18) ~ "Evening Commute", 62 | TRUE ~ "Night" 63 | )) %>% 64 | group_by(crossing, 65 | time_window) %>% 66 | summarize(number_missing = sum(is.na(bike_count)), 67 | bike_count = sum(bike_count, na.rm = TRUE)) %>% 68 | mutate(pct_bike = bike_count / sum(bike_count)) 69 | 70 | bike_by_time_window %>% 71 | select(-number_missing, -bike_count) %>% 72 | spread(time_window, pct_bike) %>% 73 | mutate(TotalCommute = `Evening Commute` + `Morning Commute`) %>% 74 | arrange(desc(TotalCommute)) 75 | 76 | bike_by_time_window %>% 77 | ggplot(aes(time_window, pct_bike)) + 78 | geom_col() + 79 | coord_flip() + 80 | facet_wrap(~ crossing) 81 | 82 | bike_by_time_window %>% 83 | group_by(crossing) %>% 84 | summarize(total_bikes = sum(bike_count), 85 | pct_commute = sum(bike_count[str_detect(time_window, "Commute")]) / total_bikes) %>% 86 | ggplot(aes(total_bikes, pct_commute)) + 87 | geom_point() + 88 | scale_x_log10() 89 | ``` 90 | 91 | ```{r} 92 | bike_traffic %>% 93 | group_by(crossing, 94 | weekday = wday(date, label = TRUE), 95 | hour = hour(date)) %>% 96 | summarize(total_bikes = sum(bike_count, na.rm = TRUE)) %>% 97 | group_by(crossing) %>% 98 | mutate(pct_bike = total_bikes / sum(total_bikes)) %>% 99 | ggplot(aes(hour, pct_bike, color = crossing)) + 100 | geom_line(show.legend = FALSE) + 101 | facet_grid(crossing ~ weekday) + 102 | scale_y_continuous(labels = percent_format()) + 103 | labs(x = "Time of week", 104 | y = "% of bike crossings happening in this hour", 105 | title = "When in the week do people in Seattle bike?", 106 | subtitle = "Based on crossings from 2014-February 2019") 107 | ``` 108 | 109 | ```{r} 110 | 
bike_traffic %>% 111 | filter(date < "2018-01-01") %>% 112 | group_by(crossing, 113 | month = fct_relevel(month.name[month(date)], month.name)) %>% 114 | summarize(total_bikes = sum(bike_count, na.rm = TRUE)) %>% 115 | mutate(pct_bike = total_bikes / sum(total_bikes)) %>% 116 | ggplot(aes(month, pct_bike, color = crossing, group = crossing)) + 117 | geom_line() + 118 | expand_limits(y = 0) + 119 | scale_y_continuous(labels = percent_format()) + 120 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 121 | labs(title = "What time of year do people bike?", 122 | subtitle = "Based on 2014-2017 bike crossings", 123 | y = "% of yearly trips in this month", 124 | x = "") 125 | ``` 126 | 127 | ### What directions do people commute by bike? 128 | 129 | ```{r} 130 | bike_by_direction_hour_crossing <- bike_traffic %>% 131 | filter(crossing != "MTS Trail", 132 | !wday(date, label = TRUE) %in% c("Sat", "Sun"), 133 | direction %in% c("North", "South")) %>% 134 | mutate(hour = hour(date)) %>% 135 | group_by(crossing, 136 | direction, 137 | hour) %>% 138 | summarize(bike_count = sum(bike_count, na.rm = TRUE)) %>% 139 | mutate(pct_bike = bike_count / sum(bike_count)) 140 | 141 | bike_by_direction_hour_crossing %>% 142 | group_by(crossing) %>% 143 | mutate(average_hour = sum((hour * pct_bike)[direction == "North"])) %>% 144 | ungroup() %>% 145 | mutate(crossing = fct_reorder(crossing, average_hour)) %>% 146 | ggplot(aes(hour, pct_bike, color = direction)) + 147 | geom_line() + 148 | facet_grid(crossing ~ .) + 149 | scale_y_continuous(labels = percent_format()) + 150 | labs(x = "Time of day", 151 | y = "% of bike crossings happening in this hour", 152 | title = "In which directions do people commute by bike?", 153 | subtitle = "Based on weekday crossings at six Seattle locations from 2014-February 2019", 154 | color = "Direction") 155 | ``` 156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /bird-collisions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Bird Collisions" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ## R Markdown 11 | 12 | ```{r} 13 | library(tidyverse) 14 | theme_set(theme_light()) 15 | 16 | mp_light <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-30/mp_light.csv") 17 | 18 | bird_collisions <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-30/bird_collisions.csv") %>% 19 | left_join(mp_light, by = "date") 20 | ``` 21 | 22 | ```{r} 23 | bird_collisions %>% 24 | ggplot(aes(date, fill = locality)) + 25 | geom_histogram() 26 | ``` 27 | 28 | ```{r} 29 | bird_collisions %>% 30 | gather(category, value, -date, -light_score) %>% 31 | count(category, value, light_score_missing = is.na(light_score)) %>% 32 | group_by(category) %>% 33 | top_n(16, n) %>% 34 | ungroup() %>% 35 | mutate(value = fct_reorder(value, n, sum), 36 | category = fct_reorder(category, n, length)) %>% 37 | ggplot(aes(value, n, fill = light_score_missing)) + 38 | geom_col() + 39 | facet_wrap(~ category, scales = "free_y") + 40 | coord_flip() + 41 | labs(x = "# of collisions", 42 | y = "", 43 | fill = "Light score missing", 44 | title = "Category breakdowns of collisions") 45 | ``` 46 | 47 | ```{r} 48 | bird_collisions %>% 49 | filter(!is.na(light_score)) %>% 50 | count(date, locality) %>% 51 | 
ggplot(aes(n, color = locality)) + 52 | geom_density() + 53 | scale_x_log10() + 54 | labs(x = "# of collisions per night") 55 | 56 | bird_collisions %>% 57 | filter(!is.na(light_score)) %>% 58 | distinct(date, light_score) %>% 59 | ggplot(aes(light_score)) + 60 | geom_histogram() 61 | 62 | geom_mean <- function(x) { 63 | exp(mean(log(x + 1))) - 1 64 | } 65 | 66 | by_day_mp <- bird_collisions %>% 67 | filter(!is.na(light_score)) %>% 68 | group_by(date, locality) %>% 69 | summarize(collisions = n()) %>% 70 | ungroup() %>% 71 | complete(date, locality, fill = list(collisions = 0)) %>% 72 | right_join(mp_light %>% crossing(locality = c("CHI", "MP")), by = c("date", "locality")) %>% 73 | filter(date <= "2016-11-13") %>% 74 | replace_na(list(collisions = 0)) %>% 75 | mutate(locality = ifelse(locality == "CHI", "Greater Chicago", "McCormick Place")) 76 | 77 | bootstrap_cis <- by_day_mp %>% 78 | rsample::bootstraps(times = 1000) %>% 79 | unnest(map(splits, as.data.frame)) %>% 80 | group_by(light_score, locality, id) %>% 81 | summarize(avg_collisions = geom_mean(collisions)) %>% 82 | summarize(bootstrap_low = quantile(avg_collisions, .025), 83 | bootstrap_high = quantile(avg_collisions, .975)) 84 | 85 | by_day_mp %>% 86 | group_by(light_score, locality) %>% 87 | summarize(avg_collisions = geom_mean(collisions), 88 | nights = n()) %>% 89 | ggplot(aes(light_score, color = locality)) + 90 | geom_line(aes(y = avg_collisions)) + 91 | geom_ribbon(aes(ymin = bootstrap_low, ymax = bootstrap_high), 92 | data = bootstrap_cis, 93 | alpha = .25) + 94 | expand_limits(y = 0) + 95 | labs(x = "Light score at McCormick place (higher means more lights on)", 96 | y = "Geometric mean of the number of collisions", 97 | title = "Brighter lights at McCormick place correlate with more bird collisions there, and not with Chicago overall", 98 | subtitle = "Ribbon shows 95% bootstrapped percentile confidence interval", 99 | color = "") 100 | ``` 101 | 102 | ### Look at confounders 103 | 104 | ```{r} 105 | library(lubridate) 106 | 107 | bird_collisions %>% 108 | filter(date >= "2005-01-01") %>% 109 | count(month = month(date, label = TRUE), #floor_date(date, "month"), 110 | locality) %>% 111 | ggplot(aes(month, n, color = locality, group = locality)) + 112 | geom_line() 113 | ``` 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /board-games.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Board Games" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | board_games_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-12/board_games.csv") 15 | 16 | holdout_set <- board_games_raw %>% 17 | filter(game_id %% 5 == 0) 18 | 19 | board_games <- board_games_raw %>% 20 | filter(game_id %% 5 != 0) 21 | ``` 22 | 23 | ### EDA 24 | 25 | ```{r} 26 | board_games %>% 27 | count(publisher, sort = TRUE) 28 | 29 | board_games %>% 30 | count(year_published) %>% 31 | arrange(desc(year_published)) %>% 32 | ggplot(aes(year_published, n)) + 33 | geom_line() 34 | 35 | board_games %>% 36 | ggplot(aes(average_rating)) + 37 | geom_histogram() 38 | 39 | board_games %>% 40 | filter(max_playtime > 5, max_playtime < 1000) %>% 41 | ggplot(aes(max_playtime / 60)) + 42 | geom_histogram(binwidth = .25) + 43 | scale_x_log10(breaks = 2 ^ seq(-2, 4)) 44 | 45 | 
board_games %>% 46 | count(year_published) %>% 47 | ggplot(aes(year_published, n)) + 48 | geom_line() 49 | ``` 50 | 51 | Categorical variables: 52 | 53 | ```{r} 54 | categorical_variables <- board_games %>% 55 | select(game_id, name, family, category, artist, designer, mechanic) %>% 56 | gather(type, value, -game_id, -name) %>% 57 | filter(!is.na(value)) %>% 58 | separate_rows(value, sep = ",") %>% 59 | arrange(game_id) 60 | 61 | categorical_counts <- categorical_variables %>% 62 | count(type, value, sort = TRUE) 63 | 64 | library(drlib) 65 | 66 | categorical_counts %>% 67 | group_by(type) %>% 68 | top_n(10, n) %>% 69 | ungroup() %>% 70 | mutate(value = reorder_within(value, n, type), 71 | type = fct_reorder(type, n, .desc = TRUE)) %>% 72 | ggplot(aes(value, n, fill = type)) + 73 | geom_col(show.legend = FALSE) + 74 | facet_wrap(~ type, scales = "free_y") + 75 | coord_flip() + 76 | scale_x_reordered() + 77 | labs(title = "Most common categories") 78 | ``` 79 | 80 | ### Predict average rating 81 | 82 | ```{r} 83 | board_games %>% 84 | group_by(decade = 10 * (year_published %/% 10)) %>% 85 | summarize(average_rating = mean(average_rating)) %>% 86 | ggplot(aes(decade, average_rating)) + 87 | geom_line() 88 | ``` 89 | 90 | 91 | ```{r} 92 | library(broom) 93 | 94 | lm(average_rating ~ 95 | log2(max_players + 1) + 96 | log2(max_playtime + 1) + 97 | year_published, board_games) %>% 98 | tidy() 99 | ``` 100 | 101 | ```{r} 102 | by_categorical <- board_games %>% 103 | inner_join(categorical_variables, by = c("game_id", "name")) %>% 104 | select(type, value, average_rating) %>% 105 | group_by(type, value) %>% 106 | summarize(games = n(), 107 | average_rating = mean(average_rating)) %>% 108 | arrange(desc(games)) 109 | 110 | board_games %>% 111 | inner_join(categorical_variables, by = c("game_id", "name")) %>% 112 | filter(type == "category") %>% 113 | mutate(value = fct_lump(value, 15), 114 | value = fct_reorder(value, average_rating)) %>% 115 | ggplot(aes(value, average_rating)) + 116 | geom_boxplot() + 117 | coord_flip() 118 | 119 | board_games %>% 120 | inner_join(categorical_variables, by = c("game_id", "name")) %>% 121 | filter(type == "family") %>% 122 | mutate(value = fct_lump(value, 15), 123 | value = fct_reorder(value, average_rating)) %>% 124 | ggplot(aes(value, average_rating)) + 125 | geom_boxplot() + 126 | coord_flip() 127 | ``` 128 | 129 | Conclusion: categorical variables can be correlated with higher/lower rated games. 
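As a rough check of that conclusion (a sketch reusing `categorical_variables` and the broom package loaded above; games with several categories contribute one row per category, so treat the standard errors loosely):

```{r}
# Sketch: regress rating on lumped category indicators
board_games %>%
  inner_join(categorical_variables, by = c("game_id", "name")) %>%
  filter(type == "category") %>%
  mutate(value = fct_lump(value, 15)) %>%
  lm(average_rating ~ value, data = .) %>%
  broom::tidy() %>%
  arrange(desc(estimate))
```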
130 | 131 | ```{r} 132 | non_categorical_features <- board_games %>% 133 | transmute(game_id, 134 | name, 135 | year = year_published - 1950, 136 | log2_max_players = log2(max_players + 1), 137 | log2_max_playtime = log2(max_playtime + 1)) %>% 138 | gather(feature, value, -game_id, -name) 139 | 140 | features <- categorical_variables %>% 141 | unite(feature, type, value, sep = ": ") %>% 142 | add_count(feature) %>% 143 | filter(n >= 20) %>% 144 | mutate(value = 1) %>% 145 | bind_rows(non_categorical_features) 146 | 147 | library(glmnet) 148 | library(tidytext) 149 | library(Matrix) 150 | 151 | # Predictor 152 | feature_matrix <- features %>% 153 | cast_sparse(game_id, feature, value) 154 | 155 | # What I'm predicting 156 | ratings <- board_games$average_rating[match(rownames(feature_matrix), board_games$game_id)] 157 | 158 | cv_lasso <- cv.glmnet(feature_matrix, ratings) 159 | plot(cv_lasso) 160 | 161 | cv_lasso$glmnet.fit %>% 162 | tidy() %>% 163 | filter(lambda == cv_lasso$lambda.1se) %>% 164 | arrange(desc(estimate)) %>% 165 | filter(term != "(Intercept)") %>% 166 | top_n(25, abs(estimate)) %>% 167 | mutate(term = fct_reorder(term, estimate)) %>% 168 | ggplot(aes(term, estimate)) + 169 | geom_col() + 170 | coord_flip() + 171 | labs(title = "Largest coefficients in our predictive model", 172 | subtitle = "Based on a LASSO regression", 173 | x = "", 174 | y = "Coefficient") 175 | ``` 176 | 177 | 178 | 179 | 180 | -------------------------------------------------------------------------------- /bob-ross.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | bob_ross <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-08-06/bob-ross.csv") 15 | ``` 16 | 17 | ```{r} 18 | bob_ross_gathered <- bob_ross %>% 19 | janitor::clean_names() %>% 20 | gather(element, present, -episode, -title) %>% 21 | filter(present == 1) %>% 22 | mutate(title = str_to_title(str_remove_all(title, '"')), 23 | element = str_to_title(str_replace(element, "_", " "))) %>% 24 | select(-present) %>% 25 | extract(episode, c("season", "episode_number"), "S(.*)E(.*)", convert = TRUE, remove = FALSE) %>% 26 | arrange(season, episode_number) 27 | ``` 28 | 29 | ### Exploring the paintings 30 | 31 | ```{r} 32 | bob_ross_gathered %>% 33 | count(element, sort = TRUE) %>% 34 | head(25) %>% 35 | mutate(element = fct_reorder(element, n)) %>% 36 | ggplot(aes(element, n)) + 37 | geom_col() + 38 | coord_flip() 39 | ``` 40 | 41 | What are the most "crowded" paintings, with the most elements in them? 42 | 43 | ```{r} 44 | bob_ross_gathered %>% 45 | add_count(episode) %>% 46 | arrange(desc(n)) 47 | ``` 48 | 49 | How have Ross's paintings been changing over time? 
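Before breaking this down by element, a quick sketch using `bob_ross_gathered` from above: has the number of elements in an average painting drifted across seasons?

```{r}
# Sketch: average number of tagged elements per episode, by season
bob_ross_gathered %>%
  count(season, episode) %>%
  group_by(season) %>%
  summarize(avg_elements = mean(n)) %>%
  ggplot(aes(season, avg_elements)) +
  geom_line() +
  expand_limits(y = 0) +
  labs(x = "Season",
       y = "Average # of elements per painting")
```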
50 | 51 | ```{r} 52 | by_season_element <- bob_ross_gathered %>% 53 | filter(!element %in% c("Tree", "Trees")) %>% 54 | group_by(season) %>% 55 | mutate(number_episodes = n_distinct(episode)) %>% 56 | count(season, element, number_episodes, sort = TRUE) %>% 57 | mutate(percent_included = n / number_episodes) %>% 58 | group_by(element) %>% 59 | mutate(element_total = sum(n)) %>% 60 | ungroup() 61 | 62 | by_season_element %>% 63 | filter(element_total >= 50) %>% 64 | ggplot(aes(season, percent_included, color = element)) + 65 | geom_line() + 66 | scale_y_continuous(labels = scales::percent_format()) + 67 | expand_limits(y = 0) + 68 | facet_wrap(~ element) 69 | ``` 70 | 71 | Could have used: [many models with broom](https://r4ds.had.co.nz/many-models.html) 72 | 73 | ### Clustering 74 | 75 | What tends to appear together? 76 | 77 | ```{r} 78 | library(widyr) 79 | 80 | correlations <- bob_ross_gathered %>% 81 | add_count(element) %>% 82 | filter(n >= 5) %>% 83 | pairwise_cor(element, episode, sort = TRUE) 84 | 85 | correlations %>% 86 | filter(item1 == "River") %>% 87 | mutate(item2 = fct_reorder(item2, correlation)) %>% 88 | ggplot(aes(item2, correlation)) + 89 | geom_col() + 90 | coord_flip() + 91 | labs(title = "What tends to appear with a river?", 92 | subtitle = "Among elements that appeared in at least 10 paintings") 93 | 94 | correlations %>% 95 | filter(item1 == "Snow") %>% 96 | mutate(item2 = fct_reorder(item2, correlation)) %>% 97 | ggplot(aes(item2, correlation)) + 98 | geom_col() + 99 | coord_flip() + 100 | labs(title = "What tends to appear with snow?", 101 | subtitle = "Among elements that appeared in at least 10 paintings") 102 | ``` 103 | 104 | ```{r} 105 | library(ggraph) 106 | library(igraph) 107 | 108 | set.seed(2019) 109 | 110 | correlations %>% 111 | head(100) %>% 112 | graph_from_data_frame() %>% 113 | ggraph() + 114 | geom_edge_link(aes(alpha = correlation)) + 115 | geom_node_point() + 116 | geom_node_text(aes(label = name), vjust = 1, hjust = 1) + 117 | theme_void() 118 | ``` 119 | 120 | ### Principal Component Analysis 121 | 122 | What dimensions drive a lot of the variation among paintings? 123 | 124 | ```{r} 125 | library(reshape2) 126 | library(broom) 127 | library(tidytext) 128 | 129 | binary_matrix <- bob_ross_gathered %>% 130 | acast(title ~ element) 131 | 132 | # Center the columns 133 | centered_matrix <- t(t(binary_matrix) - colMeans(binary_matrix)) 134 | 135 | svd_result <- svd(centered_matrix) 136 | 137 | element_weights <- tidy(svd_result, matrix = "v") %>% 138 | mutate(element = colnames(binary_matrix)[column]) 139 | 140 | element_weights %>% 141 | filter(PC <= 4) %>% 142 | group_by(PC) %>% 143 | top_n(16, abs(value)) %>% 144 | ungroup() %>% 145 | mutate(element = reorder_within(element, value, PC)) %>% 146 | ggplot(aes(element, value, fill = factor(PC))) + 147 | geom_col(show.legend = FALSE) + 148 | facet_wrap(~ PC, scales = "free") + 149 | scale_x_reordered() + 150 | coord_flip() + 151 | labs(title = "First four principal components of elements in Bob Ross paintings") 152 | ``` 153 | 154 | 1. Mountains/Conifer vs Ocean/Beach and deciduous trees 155 | 2. Trees, especially deciduous, vs Ocean 156 | 3. Spring/Summer vs Winter 157 | 4. 
Lake vs River 158 | 159 | ```{r} 160 | painting_weights <- broom::tidy(svd_result, matrix = "u") %>% 161 | mutate(painting = rownames(binary_matrix)[row]) 162 | ``` 163 | 164 | ```{r} 165 | painting_weights %>% 166 | filter(PC == 1) %>% 167 | arrange((value)) 168 | 169 | bob_ross_gathered %>% 170 | filter(title == "Frozen Solitude") 171 | 172 | painting_weights %>% 173 | filter(PC <= 4) %>% 174 | group_by(PC) %>% 175 | top_n(20, abs(value)) %>% 176 | ungroup() %>% 177 | mutate(painting = reorder_within(painting, value, PC)) %>% 178 | ggplot(aes(painting, value, fill = factor(PC))) + 179 | geom_col(show.legend = FALSE) + 180 | facet_wrap(~ PC, scales = "free") + 181 | scale_x_reordered() + 182 | coord_flip() + 183 | labs(title = "First four principal components of Bob Ross paintings") 184 | 185 | ``` 186 | 187 | ```{r} 188 | broom::tidy(svd_result, matrix = "d") %>% 189 | ggplot(aes(PC, percent)) + 190 | geom_point() 191 | ``` 192 | 193 | 194 | -------------------------------------------------------------------------------- /broadway-shinybones/.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /broadway-shinybones/README.md: -------------------------------------------------------------------------------- 1 | # 2 | 3 | -------------------------------------------------------------------------------- /broadway-shinybones/_site.yml: -------------------------------------------------------------------------------- 1 | name: Broadway Shows 2 | sidebar: 3 | - text: Revenue 4 | icon: dollar-sign 5 | module: 6 | metric_panel: 7 | metric: !expr DATA$broadway_revenue_usd_gross 8 | - text: Price 9 | icon: ticket-alt 10 | module: 11 | metric_panel: 12 | metric: !expr DATA$broadway_revenue_avg_ticket_price 13 | - text: Capacity 14 | icon: users 15 | module: 16 | metric_panel: 17 | metric: !expr DATA$broadway_revenue_pct_capacity 18 | -------------------------------------------------------------------------------- /broadway-shinybones/app.R: -------------------------------------------------------------------------------- 1 | # Load Libraries ----- 2 | library(shiny) 3 | library(shinydashboard) 4 | library(shinybones) 5 | library(shinymetrics) 6 | 7 | # Load Utilities ----- 8 | source_dirs('utils') 9 | source_dirs('components') 10 | source_dirs('pages') 11 | 12 | # Global Data ---- 13 | # This is passed to all page modules as an argument named data_global 14 | DATA <- readRDS("show_metrics.rds") 15 | 16 | # Configuration 17 | options("yaml.eval.expr" = TRUE) 18 | CONFIG <- yaml::read_yaml('_site.yml') 19 | 20 | # UI ---- 21 | ui <- function(request){ 22 | dashboardPage( 23 | # Header ---- 24 | dashboardHeader(title = CONFIG$name), 25 | 26 | # Sidebar ---- 27 | dashboardSidebar( 28 | sb_create_sidebar(CONFIG, DATA) 29 | ), 30 | 31 | # Body ----- 32 | dashboardBody( 33 | sb_create_tab_items(CONFIG, DATA) 34 | ) 35 | ) 36 | } 37 | 38 | # Server ----- 39 | server <- function(input, output, session){ 40 | sb_call_modules(CONFIG, DATA) 41 | } 42 | 43 | # Run App ---- 44 | shinyApp(ui = ui, server = server, enableBookmarking = 'url') 45 | -------------------------------------------------------------------------------- /broadway-shinybones/broadway-shinybones.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | 
AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /broadway-shinybones/components/README.md: -------------------------------------------------------------------------------- 1 | # Components 2 | 3 | Add all shared components to this directory. 4 | -------------------------------------------------------------------------------- /broadway-shinybones/pages/README.md: -------------------------------------------------------------------------------- 1 | # Pages 2 | 3 | Add all page modules to this directory. 4 | -------------------------------------------------------------------------------- /broadway-shinybones/show_metrics.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tacookson/data-screencasts/388a9cfa794a67d5cbaa3b9b860520c0a3e0868e/broadway-shinybones/show_metrics.rds -------------------------------------------------------------------------------- /broadway.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Broadway shows" 3 | name: metrics_broadway_revenue 4 | owner: admiral.david@gmail.com 5 | metrics: 6 | usd_gross: 7 | title: Total Gross (USD) 8 | description: Not adjusted for inflation. 9 | avg_ticket_price: 10 | title: Average Ticket Price 11 | description: Not adjusted for inflation. 12 | pct_capacity: 13 | title: Percent capacity 14 | description: Averaged across weeks in the time period. 15 | dimensions: 16 | show: 17 | title: Show 18 | description: Show's title 19 | --- 20 | 21 | ```{r setup, include=FALSE} 22 | knitr::opts_chunk$set(echo = TRUE) 23 | ``` 24 | 25 | ```{r} 26 | library(tidyverse) 27 | library(scales) 28 | theme_set(theme_light()) 29 | 30 | tuesdata <- tidytuesdayR::tt_load('2020-04-28') 31 | ``` 32 | 33 | ```{r} 34 | grosses <- tuesdata$grosses 35 | 36 | grosses %>% 37 | filter(show %in% c("Hamilton", "The Lion King")) %>% 38 | ggplot(aes(week_ending, weekly_gross, color = show)) + 39 | geom_line() + 40 | scale_y_continuous(labels = scales::dollar) + 41 | expand_limits(y = 0) 42 | ``` 43 | 44 | Tidymetric `cross_by_periods()` and `cross_by_dimensions()` 45 | 46 | ```{r} 47 | # devtools::install_github("ramnathv/tidymetrics") 48 | library(tidymetrics) 49 | 50 | shows_summarized <- grosses %>% 51 | filter(show %in% c("Hamilton", "The Lion King", 52 | "Les Miserables", "Rent", 53 | "The Phantom of the Opera", "Wicked", 54 | "Harry Potter and the Cursed Child, Parts One and Two", 55 | "The Book of Mormon")) %>% 56 | mutate(show = str_remove(show, "\\, Parts.*")) %>% 57 | rename(date = week_ending) %>% 58 | cross_by_dimensions(show) %>% 59 | cross_by_periods(c("month", "quarter", "year"), 60 | windows = 28) %>% 61 | summarize(usd_gross = sum(weekly_gross), 62 | avg_ticket_price = mean(avg_ticket_price), 63 | pct_capacity = mean(pct_capacity)) %>% 64 | ungroup() 65 | 66 | show_metrics <- create_metrics(shows_summarized) 67 | ``` 68 | 69 | The shinymetrics package: 70 | 71 | ```{r} 72 | # devtools::install_github("ramnathv/shinymetrics") 73 | library(shinymetrics) 74 | saveRDS(show_metrics, "broadway-shinybones/show_metrics.rds") 75 | 76 | preview_metric(show_metrics$broadway_revenue_usd_gross) 77 | preview_metric(show_metrics$broadway_revenue_avg_ticket_price) 78 | preview_metric(show_metrics$broadway_revenue_pct_capacity) 79 | ``` 80 | 
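Before plotting, a quick shape check on `shows_summarized` (a sketch: the `period` column comes from `cross_by_periods()` and the `"All"` level of `show` from `cross_by_dimensions()` above):

```{r}
shows_summarized %>%
  count(period)

shows_summarized %>%
  filter(show == "All", period == "year") %>%
  arrange(desc(date))
```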
81 | ```{r} 82 | shows_summarized %>% 83 | filter(period == "quarter", 84 | show != "All") %>% 85 | ggplot(aes(date, usd_gross, fill = show)) + 86 | geom_col() 87 | 88 | shows_summarized %>% 89 | filter(period == "quarter", 90 | show != "All") %>% 91 | ggplot(aes(date, avg_ticket_price, col = show)) + 92 | geom_line() + 93 | expand_limits(y = 0) 94 | ``` 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /car-economy.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Car Fuel Economy" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | ``` 14 | 15 | ```{r} 16 | big_epa_cars <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv") %>% 17 | mutate(uses_electricity = ifelse(highwayE > 0, "Uses Electricity", "Doesn't Use Electricity")) 18 | ``` 19 | 20 | ```{r} 21 | big_epa_cars_alphabetical <- big_epa_cars %>% 22 | select(sort(colnames(big_epa_cars))) 23 | ``` 24 | 25 | ```{r} 26 | big_epa_cars %>% 27 | ggplot(aes(highway08, city08)) + 28 | geom_point() + 29 | geom_abline(color = "red") + 30 | facet_wrap(~ uses_electricity, scales = "free") + 31 | expand_limits(x = 0, y = 0) + 32 | labs(x = "Highway MPG", 33 | y = "City MPG", 34 | title = "How does fuel efficiency differ between city + highway?") 35 | ``` 36 | 37 | ```{r} 38 | big_epa_cars %>% 39 | select(city08, highway08, make, model, cylinders, displ, drive, engId, eng_dscr) 40 | ``` 41 | 42 | ```{r} 43 | big_epa_cars %>% 44 | filter(cityE == 0) %>% 45 | mutate(VClass = fct_lump(VClass, 8), 46 | VClass = fct_reorder(VClass, city08)) %>% 47 | ggplot(aes(VClass, city08)) + 48 | geom_boxplot() + 49 | coord_flip() 50 | 51 | big_epa_cars %>% 52 | filter(cityE == 0) %>% 53 | mutate(drive = fct_reorder(drive, city08)) %>% 54 | ggplot(aes(drive, city08)) + 55 | geom_boxplot() + 56 | coord_flip() 57 | 58 | big_epa_cars %>% 59 | filter(cityE == 0) %>% 60 | ggplot(aes(cylinders, city08, group = cylinders)) + 61 | geom_boxplot() 62 | 63 | big_epa_cars %>% 64 | filter(cityE == 0) %>% 65 | ggplot(aes(displ, city08)) + 66 | geom_point() + 67 | expand_limits(x = 0, y = 0) 68 | ``` 69 | 70 | ## Goal: Predict city fuel efficiency for single-fuel, non-electric cars 71 | 72 | ```{r} 73 | # Cross validation holdout set 74 | non_electric_cars <- big_epa_cars %>% 75 | filter(cityA08 == 0, 76 | cityE == 0) %>% 77 | sample_frac(1) 78 | 79 | training_set <- non_electric_cars %>% 80 | filter(row_number() %% 5 != 0) 81 | ``` 82 | 83 | ```{r} 84 | library(broom) 85 | 86 | training_set %>% 87 | ggplot(aes(displ, city08)) + 88 | geom_point() + 89 | geom_smooth(method = "lm") 90 | 91 | library(splines) 92 | augmented_data <- lm(city08 ~ ns(displ, 2), data = training_set) %>% 93 | augment(data = training_set) 94 | 95 | augmented_data %>% 96 | ggplot(aes(displ, city08)) + 97 | geom_point() + 98 | geom_line(aes(y = .fitted), color = "red", size = 2) 99 | 100 | models <- tibble(df = 1:10) %>% 101 | mutate(lm_model = map(df, ~ lm(city08 ~ ns(displ, df = .), data = training_set))) 102 | 103 | augmented_unnested <- models %>% 104 | mutate(augmented = map(lm_model, augment, data = training_set)) %>% 105 | unnest(augmented) 106 | 107 | augmented_unnested %>% 108 | ggplot(aes(displ, city08)) + 109 | geom_point(data = training_set) + 110 | geom_line(aes(y = .fitted, 
color = factor(df)), size = 2) + 111 | labs(x = "Engine volume (L)", 112 | y = "City MPG", 113 | color = "# of degrees of freedom") + 114 | expand_limits(x = 0, y = 0) 115 | 116 | augmented_unnested %>% 117 | ggplot(aes(displ, .resid)) + 118 | geom_point() + 119 | facet_wrap(~ df) 120 | 121 | glanced_models <- models %>% 122 | rename(spline_df = df) %>% 123 | mutate(glanced = map(lm_model, glance, data = training_set)) %>% 124 | unnest(glanced) 125 | 126 | glanced_models %>% 127 | ggplot(aes(spline_df, adj.r.squared)) + 128 | geom_line() 129 | ``` 130 | 131 | ```{r} 132 | lm(city08 ~ ns(displ, 4), data = training_set) %>% 133 | anova() %>% 134 | tidy() %>% 135 | mutate(pct_variation = sumsq / sum(sumsq)) 136 | 137 | ggplot(training_set, aes(cylinders, displ, group = cylinders)) + 138 | geom_boxplot() 139 | ``` 140 | 141 | ```{r} 142 | training_set %>% 143 | ggplot(aes(year, city08)) + 144 | geom_point() + 145 | geom_smooth(method = "loess") 146 | 147 | efficiency_time <- training_set %>% 148 | mutate(VClass = fct_lump(VClass, 6), 149 | guzzler = !is.na(guzzler)) %>% 150 | group_by(year = 2 * floor(year / 2), VClass) %>% 151 | summarize_at(vars(city08, cylinders, displ, guzzler), mean) 152 | 153 | efficiency_time %>% 154 | ggplot(aes(year, city08, color = VClass)) + 155 | geom_line() + 156 | expand_limits(y = 0) 157 | 158 | efficiency_time %>% 159 | ggplot(aes(year, displ, color = VClass)) + 160 | geom_line() + 161 | expand_limits(y = 0) 162 | 163 | efficiency_time %>% 164 | ggplot(aes(year, guzzler, color = VClass)) + 165 | geom_line() + 166 | expand_limits(y = 0) 167 | ``` 168 | 169 | Hypothesis: Engine size started shrinking around 2009 (probably regulatory pressure), especially in large and midsize cars, and this led to an increase in city fuel efficiency. 
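
The chunk labeled "Cross validation holdout set" above shuffles the rows and builds the training set, but never scores against the held-out fifth. As a minimal sketch (using `mtcars` as a stand-in for the EPA data), the same `row_number() %% 5` split can pick the spline degrees of freedom by out-of-sample RMSE rather than in-sample adjusted R-squared:

```{r}
library(tidyverse)
library(splines)

set.seed(2019)
shuffled <- mtcars %>% slice_sample(prop = 1)
train <- shuffled %>% filter(row_number() %% 5 != 0)
test <- shuffled %>% filter(row_number() %% 5 == 0)

# Fit a natural spline per candidate df, then score each fit on the holdout rows
tibble(df = 1:5) %>%
  mutate(model = map(df, ~ lm(mpg ~ ns(disp, df = .), data = train)),
         holdout_rmse = map_dbl(model, ~ sqrt(mean((predict(., newdata = test) - test$mpg)^2))))
```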
170 | 171 | 172 | -------------------------------------------------------------------------------- /caribou-locations.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | tuesdata <- tidytuesdayR::tt_load('2020-06-23') 15 | ``` 16 | 17 | ```{r} 18 | individuals <- tuesdata$individuals 19 | locations <- tuesdata$locations 20 | ``` 21 | 22 | ```{r} 23 | individuals %>% 24 | summarize(across(sex:study_site, list(~ mean(!is.na(.))))) 25 | 26 | individuals %>% 27 | filter(deploy_off_type == "dead") %>% 28 | count(death_cause, sort = TRUE) 29 | ``` 30 | 31 | ```{r} 32 | library(sf) 33 | province_sf <- read_sf("~/Downloads/province") 34 | 35 | bc <- province_sf %>% 36 | filter(PROV == "BC") 37 | ``` 38 | 39 | ```{r} 40 | individuals %>% 41 | filter(deploy_on_latitude > 40) %>% 42 | count(study_site, deploy_on_longitude, deploy_on_latitude, sort = TRUE) %>% 43 | ggplot() + 44 | # geom_sf(data = bc) + 45 | # borders("world", regions = "canada") + 46 | geom_point(aes(deploy_on_longitude, deploy_on_latitude, 47 | size = n, 48 | color = study_site)) + 49 | scale_size_continuous(guide = FALSE) 50 | ``` 51 | 52 | ```{r} 53 | individuals %>% 54 | count(animal_id, sort = TRUE) 55 | 56 | individuals %>% 57 | filter(animal_id == "MO_car150") %>% 58 | View() 59 | ``` 60 | 61 | ### Looking at locations 62 | 63 | ```{r} 64 | locations %>% 65 | ggplot(aes(longitude, latitude, color = study_site)) + 66 | geom_point() 67 | ``` 68 | 69 | ```{r} 70 | by_animal <- locations %>% 71 | group_by(animal_id, study_site) %>% 72 | summarize(start = min(timestamp), 73 | end = max(timestamp), 74 | num_points = n()) %>% 75 | ungroup() %>% 76 | arrange(desc(num_points)) 77 | ``` 78 | 79 | ```{r} 80 | library(lubridate) 81 | 82 | example_animal <- locations %>% 83 | arrange(timestamp) %>% 84 | filter(animal_id == sample(unique(animal_id), 1)) 85 | 86 | example_animal %>% 87 | mutate(quarter = as.Date(floor_date(timestamp, "quarter"))) %>% 88 | ggplot(aes(longitude, latitude, color = timestamp)) + 89 | geom_point(alpha = .5) + 90 | geom_path(alpha = .5) + 91 | facet_wrap(~ quarter) + 92 | labs(title = "One caribou over time") 93 | ``` 94 | 95 | ```{r} 96 | library(geosphere) 97 | 98 | locations_with_gaps <- locations %>% 99 | group_by(animal_id) %>% 100 | mutate(last_lon = lag(longitude), 101 | last_lat = lag(latitude), 102 | hours = as.numeric(difftime(timestamp, lag(timestamp), unit = "hours")), 103 | km = distHaversine(cbind(longitude, latitude), cbind(last_lon, last_lat)) / 1000, 104 | kph = km / hours) %>% 105 | ungroup() 106 | 107 | locations_with_gaps %>% 108 | filter(hours <= 8) %>% 109 | ggplot(aes(kph)) + 110 | geom_histogram() + 111 | scale_x_log10(labels = scales::comma) + 112 | labs(title = "On average, how fast do caribou travel?") 113 | 114 | by_animal <- locations_with_gaps %>% 115 | filter(hours <= 8, 116 | hours >= .5) %>% 117 | group_by(animal_id, study_site) %>% 118 | summarize(start = min(timestamp), 119 | end = max(timestamp), 120 | num_points = n(), 121 | avg_speed = mean(kph, na.rm = TRUE)) %>% 122 | ungroup() %>% 123 | arrange(desc(num_points)) %>% 124 | filter(num_points >= 10) 125 | ``` 126 | 127 | ```{r} 128 | by_animal %>% 129 | ggplot(aes(num_points, avg_speed)) + 130 | geom_point() + 131 | scale_x_log10() + 132 | expand_limits(y = 0) 133 | 134 
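# Sanity check on the km/kph computation above (hypothetical coordinates;
# geosphere is already loaded). distHaversine() returns metres, and one
# degree of latitude is ~111 km, so these two points should give roughly that:
distHaversine(c(-122, 53), c(-122, 54)) / 1000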
| by_animal %>% 135 | arrange(desc(avg_speed)) 136 | 137 | locations_with_gaps %>% 138 | filter(animal_id == "QU_car107") %>% 139 | mutate(quarter = as.Date(floor_date(timestamp, "quarter"))) %>% 140 | ggplot(aes(longitude, latitude, color = kph)) + 141 | geom_point(alpha = .5) + 142 | geom_path(alpha = .5) + 143 | facet_wrap(~ quarter) + 144 | labs(title = "One caribou over time") 145 | 146 | locations_with_gaps %>% 147 | filter(animal_id == "SC_car171") %>% 148 | arrange(desc(kph)) %>% 149 | View() 150 | ``` 151 | 152 | ```{r} 153 | locations_with_gaps %>% 154 | filter(study_site != "Hart Ranges") %>% 155 | filter(hours <= 8, 156 | hours >= .5) %>% 157 | group_by(month = month(timestamp, label = TRUE), 158 | study_site) %>% 159 | summarize(avg_speed = median(kph), 160 | n = n()) %>% 161 | ggplot(aes(month, avg_speed, group = study_site, 162 | color = study_site)) + 163 | geom_line() + 164 | geom_point(aes(size = n)) + 165 | expand_limits(y = 0) + 166 | facet_wrap(~ study_site) + 167 | theme(legend.position = "none") + 168 | labs(title = "Seasonal trend in caribou speed", 169 | y = "Average speed (kph)") 170 | ``` 171 | 172 | 173 | ```{r} 174 | by_animal %>% 175 | filter(num_points >= 10) %>% 176 | arrange(desc(avg_speed)) 177 | ``` 178 | 179 | 180 | ```{r} 181 | locations %>% 182 | arrange(timestamp) %>% 183 | group_by(animal_id) %>% 184 | mutate(gap = round(difftime(timestamp, lag(timestamp), unit = "hours"))) %>% 185 | ungroup() %>% 186 | filter(!is.na(gap)) %>% 187 | filter(gap <= 24) %>% 188 | ggplot(aes(gap)) + 189 | geom_histogram(binwidth = 2) 190 | ``` 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | -------------------------------------------------------------------------------- /cetaceans.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | cetaceans_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-12-18/allCetaceanData.csv") 15 | 16 | cetaceans <- cetaceans_raw %>% 17 | select(-X1) %>% 18 | mutate(birthYear = as.integer(birthYear), 19 | age = if_else(status == "Died", year(statusDate) - birthYear, NA_real_)) 20 | ``` 21 | 22 | ```{r} 23 | library(lubridate) 24 | 25 | cetaceans %>% 26 | count(acquisition, sort = TRUE) 27 | 28 | cetaceans %>% 29 | count(status, sort = TRUE) 30 | 31 | cetaceans %>% 32 | filter(status == "Died") %>% 33 | select(birthYear, statusDate) %>% 34 | filter(!is.na(birthYear), !is.na(statusDate)) %>% 35 | mutate(age = year(statusDate) - birthYear) %>% 36 | ggplot(aes(age)) + 37 | geom_histogram() 38 | 39 | cetaceans %>% 40 | count(species = fct_lump(species, 5), sort = TRUE) %>% 41 | mutate(species = fct_reorder(species, n)) %>% 42 | ggplot(aes(species, n)) + 43 | geom_col() + 44 | coord_flip() 45 | 46 | cetaceans %>% 47 | mutate(species = fct_lump(species, 5)) %>% 48 | filter(!is.na(birthYear), !is.na(statusDate)) %>% 49 | mutate(age = year(statusDate) - birthYear) %>% 50 | ggplot(aes(species, age)) + 51 | geom_boxplot() + 52 | coord_flip() 53 | ``` 54 | 55 | ```{r} 56 | cetaceans %>% 57 | count(acquisition, originLocation, sort = TRUE) %>% 58 | filter(originLocation != "Unknown") %>% 59 | View() 60 | 61 | cetaceans %>% 62 | ggplot(aes(originDate, fill = acquisition)) + 63 | geom_histogram() 64 | 65 | library(scales) 66 |
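# The decade breakdowns in this file lean on tidyr::complete() to add
# explicit zero counts, so the stacked percentages don't silently drop
# empty acquisition/decade combinations. A tiny toy illustration:
tibble(grp = c("a", "a", "b"), period = c(1, 2, 1)) %>%
  count(grp, period) %>%
  complete(grp, period, fill = list(n = 0))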
67 | cetaceans %>% 68 | filter(originDate >= "1960-01-01") %>% 69 | count(acquisition, 70 | decade = 5 * (year(originDate) %/% 5)) %>% 71 | complete(acquisition, decade, fill = list(n = 0)) %>% 72 | mutate(acquisition = fct_reorder(acquisition, n, sum)) %>% 73 | group_by(decade) %>% 74 | mutate(percent = n / sum(n)) %>% 75 | ggplot(aes(decade, percent, fill = acquisition)) + 76 | geom_area() + 77 | scale_y_continuous(labels = percent_format()) + 78 | theme_minimal() + 79 | labs(x = "year", 80 | y = "% of dolphins recorded") 81 | ``` 82 | 83 | ```{r} 84 | library(fuzzyjoin) 85 | 86 | regexes <- tribble( 87 | ~ regex, ~ category, 88 | "Unknown", "Unknown", 89 | "Gulf of Mexico", "Gulf of Mexico", 90 | "Florida|FL", "Florida", 91 | "Texas|TX", "Texas", 92 | "SeaWorld", "SeaWorld", 93 | "Pacific", "Pacific Ocean", 94 | "Atlantic", "Atlantic Ocean" 95 | ) 96 | 97 | cetaceans_annotated <- cetaceans %>% 98 | mutate(unique_id = row_number()) %>% 99 | regex_left_join(regexes, c(originLocation = "regex")) %>% 100 | distinct(unique_id, .keep_all = TRUE) %>% 101 | mutate(category = coalesce(category, originLocation)) 102 | 103 | cetaceans_annotated %>% 104 | filter(acquisition == "Capture") %>% 105 | mutate(category = fct_lump(category, 4), 106 | category = fct_reorder(category, category, length)) %>% 107 | ggplot(aes(originDate, fill = category)) + 108 | geom_histogram() 109 | 110 | # the first lines of this pipeline were missing; cetaceans_annotated plus a "Born" filter is assumed from the chart title 111 | cetaceans_annotated %>% filter(acquisition == "Born") %>% count(category, 112 | decade = 5 * (year(originDate) %/% 5)) %>% 113 | complete(category, decade, fill = list(n = 0)) %>% 114 | mutate(category = fct_lump(category, 7), 115 | category = fct_reorder(category, n, sum)) %>% 116 | group_by(decade) %>% 117 | mutate(percent = n / sum(n)) %>% 118 | ggplot(aes(decade, percent, fill = category)) + 119 | geom_area() + 120 | scale_y_continuous(labels = percent_format()) + 121 | theme_minimal() + 122 | labs(x = "year", 123 | y = "% of dolphins recorded", 124 | title = "Dolphins born in captivity") 125 | ``` 126 | 127 | ### Survival analysis 128 | 129 | ```{r} 130 | library(survival) 131 | 132 | dolphin_survival <- cetaceans %>% 133 | filter(status %in% c("Alive", "Died")) %>% 134 | mutate(deathYear = ifelse(status == "Alive", 2017, year(statusDate)), 135 | status = ifelse(status == "Alive", 0, 1), 136 | age = deathYear - birthYear) %>% 137 | filter(!is.na(deathYear)) %>% 138 | select(birthYear, deathYear, status, sex, age, acquisition, species) %>% 139 | filter(deathYear >= birthYear) %>% 140 | filter(sex != "U") 141 | 142 | model <- survfit(Surv(age, status) ~ sex, dolphin_survival) 143 | 144 | library(broom) 145 | 146 | tidy(model) %>% 147 | ggplot(aes(time, estimate, color = strata)) + 148 | geom_line() + 149 | geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .2) + 150 | scale_y_continuous(labels = percent_format()) + 151 | labs(y = "Estimated % survival") 152 | 153 | coxph(Surv(age, status) ~ sex, dolphin_survival) %>% 154 | tidy() 155 | ``` 156 | 157 | ```{r} 158 | model <- survfit(Surv(age, status) ~ acquisition, dolphin_survival) 159 | 160 | library(broom) 161 | 162 | tidy(model) %>% 163 | filter(strata != "acquisition=Unknown") %>% 164 | ggplot(aes(time, estimate, color = strata)) + 165 | geom_line() + 166 | geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .2) + 167 | scale_y_continuous(labels = percent_format()) + 168 | labs(y = "Estimated % survival") 169 | ``` 170 | 171 |
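For readers new to the survival tooling, the same `survfit()` + `tidy()` pattern runs on the `lung` data that ships with the survival package; a self-contained sketch (toy example, not the dolphin data):

```{r}
library(survival)
library(broom)
library(tidyverse)

# Kaplan-Meier curves by sex; in lung, status == 2 marks a death event
km <- survfit(Surv(time, status == 2) ~ sex, data = lung)

tidy(km) %>%
  ggplot(aes(time, estimate, color = strata)) +
  geom_line() +
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(y = "Estimated % survival")
```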
title: "Coffee Ratings" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidytuesdayR) 12 | library(tidyverse) 13 | theme_set(theme_light()) 14 | 15 | coffee <- tt_load("2020-07-07") 16 | 17 | coffee_ratings <- coffee$coffee_ratings %>% 18 | mutate(coffee_id = row_number()) %>% 19 | filter(total_cup_points > 0) 20 | ``` 21 | 22 | ```{r} 23 | View(coffee_ratings) 24 | 25 | coffee_ratings %>% 26 | count(species, sort = TRUE) 27 | 28 | coffee_lumped <- coffee_ratings %>% 29 | filter(!is.na(variety)) %>% 30 | mutate(variety = fct_lump(variety, 12), sort = TRUE) 31 | 32 | coffee_lumped %>% 33 | mutate(variety = fct_reorder(variety, total_cup_points)) %>% 34 | ggplot(aes(total_cup_points, variety)) + 35 | geom_boxplot() 36 | 37 | coffee_lumped %>% 38 | ggplot(aes(total_cup_points, fill = variety)) + 39 | geom_histogram(binwidth = 2) + 40 | facet_wrap(~ variety, scale = "free_y") + 41 | theme(legend.position = "none") 42 | ``` 43 | 44 | ```{r} 45 | coffee_ratings %>% 46 | summarize(across(everything(), ~ mean(!is.na(.)))) %>% 47 | gather() %>% 48 | View() 49 | 50 | coffee_ratings %>% 51 | count(producer, sort = TRUE) 52 | 53 | coffee_ratings %>% 54 | count(company, sort = TRUE) 55 | 56 | coffee_ratings %>% 57 | count(color, sort = TRUE) 58 | 59 | coffee_ratings %>% 60 | count(country = fct_lump(country_of_origin, 12), sort = TRUE) %>% 61 | filter(!is.na(country)) %>% 62 | mutate(country = fct_reorder(country, n)) %>% 63 | ggplot(aes(n, country)) + 64 | geom_col() 65 | 66 | coffee_ratings %>% 67 | filter(!is.na(country_of_origin)) %>% 68 | mutate(country = fct_lump(country_of_origin, 12), 69 | country = fct_reorder(country, total_cup_points)) %>% 70 | ggplot(aes(total_cup_points, country)) + 71 | geom_boxplot() 72 | ``` 73 | 74 | Interesting dimensions: 75 | 76 | * Country 77 | * Variety 78 | * Company?? 
79 | 80 | ```{r} 81 | library(ggridges) 82 | 83 | coffee_metrics <- coffee_ratings %>% 84 | select(coffee_id, total_cup_points, variety, company, 85 | country_of_origin, 86 | altitude_mean_meters, 87 | aroma:moisture) %>% 88 | pivot_longer(aroma:cupper_points, names_to = "metric", values_to = "value") 89 | 90 | coffee_metrics %>% 91 | mutate(metric = fct_reorder(metric, value)) %>% 92 | ggplot(aes(value, metric)) + 93 | geom_density_ridges() 94 | 95 | coffee_metrics %>% 96 | group_by(metric) %>% 97 | summarize(average = mean(value), 98 | sd = sd(value)) %>% 99 | arrange(desc(average)) 100 | ``` 101 | 102 | ```{r} 103 | library(widyr) 104 | library(ggraph) 105 | library(igraph) 106 | library(tidytext) 107 | 108 | correlations <- coffee_metrics %>% 109 | pairwise_cor(metric, coffee_id, value, sort = TRUE) 110 | 111 | correlations %>% 112 | head(50) %>% 113 | graph_from_data_frame() %>% 114 | ggraph() + 115 | geom_edge_link(aes(edge_alpha = correlation)) + 116 | geom_node_point() + 117 | geom_node_text(aes(label = name), repel = TRUE) 118 | 119 | coffee_metrics %>% 120 | filter(!metric %in% c("sweetness", "clean_cup", "uniformity")) %>% 121 | group_by(metric) %>% 122 | mutate(centered = value - mean(value)) %>% 123 | ungroup() %>% 124 | widely_svd(metric, coffee_id, value) %>% 125 | filter(between(dimension, 2, 5)) %>% 126 | mutate(metric = reorder_within(metric, value, dimension)) %>% 127 | ggplot(aes(value, metric)) + 128 | geom_col() + 129 | scale_y_reordered() + 130 | facet_wrap(~ dimension, scales = "free_y") 131 | ``` 132 | 133 | ```{r} 134 | 135 | coffee_ratings %>% 136 | filter(altitude_mean_meters < 10000, 137 | altitude != 1) %>% 138 | mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>% 139 | ggplot(aes(altitude_mean_meters, total_cup_points)) + 140 | geom_point() + 141 | geom_smooth(method = "lm") 142 | 143 | coffee_metrics %>% 144 | filter(altitude_mean_meters < 10000) %>% 145 | mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>% 146 | mutate(km = altitude_mean_meters / 1000) %>% 147 | group_by(metric) %>% 148 | summarize(correlation = cor(altitude_mean_meters, value), 149 | model = list(lm(value ~ km))) %>% 150 | mutate(tidied = map(model, broom::tidy, conf.int = TRUE)) %>% 151 | unnest(tidied) %>% 152 | filter(term == "km") %>% 153 | ungroup() %>% 154 | mutate(metric = fct_reorder(metric, estimate)) %>% 155 | ggplot(aes(estimate, metric, color = p.value < .05)) + 156 | geom_point() + 157 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = .1) + 158 | labs(y = "Evaluation of coffee", 159 | x = "Each kilometer of altitude contributes this much to score (95% confidence interval)") 160 | ``` 161 | 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /crop-yields-shiny/app.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: 'Our World in Data: Crop Yields' 3 | runtime: shiny 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE) 9 | ``` 10 | 11 | ```{r} 12 | library(dplyr) 13 | library(ggplot2) 14 | library(forcats) 15 | library(shiny) 16 | library(stringr) 17 | library(plotly) 18 | theme_set(theme_light()) 19 | 20 | # Data cleaning and processing 21 | yields_tidy <- readRDS("yields_tidy.rds") 22 | 23 | top_crops <- yields_tidy %>% 24 | count(crop, sort = TRUE) %>% 25 | head(9) %>% 26 | pull(crop) 27 | ``` 28 | 29 | ```{r} 30 | plot_yields <- 
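# helper shared by the Shiny inputs below; returns an interactive plotly object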
function(tbl, facet_scales = "fixed") { 31 | g <- tbl %>% 32 | mutate(crop = fct_reorder(crop, -yield)) %>% 33 | mutate(entity = fct_reorder(entity, -yield)) %>% 34 | ggplot(aes(year, yield, color = entity)) + 35 | geom_line() + 36 | expand_limits(y = 0) + 37 | facet_wrap(~ crop, scales = facet_scales) + 38 | labs(x = "Year", 39 | y = "Yield (tonnes per hectare)", 40 | title = "Crop yields over time", 41 | color = "Country") 42 | 43 | ggplotly(g) 44 | } 45 | 46 | # yields_tidy %>% 47 | # filter(code %in% c("USA", "IND")) %>% 48 | # plot_yields() 49 | ``` 50 | 51 | 52 | ```{r eruptions, echo=FALSE} 53 | inputPanel( 54 | selectInput("entity", 55 | label = "Country/Continent/Region:", 56 | choices = unique(yields_tidy$entity), 57 | selected = c("United States", "India"), 58 | selectize = TRUE, 59 | multiple = TRUE), 60 | selectInput("crop", 61 | label = "Crops:", 62 | choices = unique(yields_tidy$crop), 63 | selected = top_crops, 64 | selectize = TRUE, 65 | multiple = TRUE), 66 | radioButtons("facet_scales", 67 | label = "", 68 | choices = c("Free y-axis" = "free_y", 69 | "Shared y-axis" = "fixed")) 70 | ) 71 | 72 | renderPlotly({ 73 | yields_tidy %>% 74 | filter(entity %in% input$entity, 75 | crop %in% input$crop) %>% 76 | plot_yields(facet_scales = input$facet_scales) 77 | }) 78 | ``` 79 | -------------------------------------------------------------------------------- /crop-yields-shiny/yields_tidy.rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tacookson/data-screencasts/388a9cfa794a67d5cbaa3b9b860520c0a3e0868e/crop-yields-shiny/yields_tidy.rds -------------------------------------------------------------------------------- /data-screencasts.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /french-trains.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "French Trains" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | theme_set(theme_light()) 14 | 15 | full_trains <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-02-26/full_trains.csv") %>% 16 | mutate(pct_late_at_departure = num_late_at_departure / total_num_trips, 17 | arrival_station = str_to_title(arrival_station), 18 | departure_station = str_to_title(departure_station), 19 | date = as.Date(sprintf("%d-%02d-01", year, month))) %>% 20 | arrange(departure_station, arrival_station, month) %>% 21 | fill(service) 22 | ``` 23 | 24 | ```{r} 25 | november_2018 <- full_trains %>% 26 | filter(year == 2018, month == 11) 27 | 28 | november_2018 %>% 29 | ggplot(aes(pct_late_at_departure)) + 30 | geom_histogram(binwidth = .05) + 31 | scale_x_continuous(labels = percent_format()) 32 | 33 | november_2018 %>% 34 | mutate(departure_station = fct_lump(departure_station, 3)) %>% 35 | ggplot(aes(departure_station, pct_late_at_departure)) + 36 | geom_boxplot() + 37 | scale_y_continuous(labels = percent_format()) 38 | 39 | november_2018 %>% 40 | # mutate(arrival_station = 
fct_infreq(fct_lump(arrival_station, prop = .01))) %>% 41 | # mutate(departure_station = fct_infreq(fct_lump(departure_station, prop = .01))) %>% 42 | mutate(arrival_station = fct_reorder(fct_lump(arrival_station, prop = .01), pct_late_at_departure)) %>% 43 | mutate(departure_station = fct_reorder(fct_lump(departure_station, prop = .01), pct_late_at_departure)) %>% 44 | group_by(arrival_station, departure_station) %>% 45 | summarize(pct_late_at_departure = sum(num_late_at_departure) / sum(total_num_trips)) %>% 46 | ggplot(aes(arrival_station, departure_station, fill = pct_late_at_departure)) + 47 | geom_tile() + 48 | scale_fill_gradient2(low = "blue", high = "red", midpoint = .25, labels = percent_format()) + 49 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 50 | labs(x = "Arrival station", 51 | y = "Departure station", 52 | fill = "% late at departure", 53 | title = "Which routes have the most delayed trains in November 2018?", 54 | subtitle = "Stations with only one arriving/departing route were lumped into 'Other'") 55 | ``` 56 | 57 | ### Over time 58 | 59 | ```{r} 60 | full_trains %>% 61 | filter(departure_station == "Lyon Part Dieu") %>% 62 | ggplot(aes(date, pct_late_at_departure, color = arrival_station)) + 63 | geom_line() + 64 | scale_y_continuous(labels = percent_format()) + 65 | expand_limits(y = 0) 66 | 67 | by_departure_station_month <- full_trains %>% 68 | group_by(departure_station = fct_lump(departure_station, prop = .01), 69 | date) %>% 70 | summarize_at(vars(contains("num")), sum) %>% 71 | ungroup() %>% 72 | mutate(pct_late_at_departure = num_late_at_departure / total_num_trips) 73 | 74 | by_departure_station_month %>% 75 | mutate(departure_station = fct_reorder(departure_station, -pct_late_at_departure, last)) %>% 76 | ggplot(aes(date, pct_late_at_departure, color = departure_station)) + 77 | geom_line() + 78 | scale_y_continuous(labels = percent_format()) + 79 | labs(x = "Month", 80 | y = "% late at departure", 81 | color = "Departure station") 82 | ``` 83 | 84 | ```{r} 85 | by_departure_station_month <- full_trains %>% 86 | group_by(departure_station = ifelse(service == "International", 87 | paste0(departure_station, " (International)"), 88 | departure_station), 89 | service, 90 | year, 91 | month = fct_reorder(month.name[month], month)) %>% 92 | summarize_at(vars(contains("num")), sum) %>% 93 | ungroup() %>% 94 | mutate(pct_late_at_departure = num_late_at_departure / total_num_trips) 95 | 96 | by_departure_station_month %>% 97 | mutate(departure_station = fct_reorder(departure_station, (service != "International") + pct_late_at_departure, mean)) %>% 98 | ggplot(aes(month, departure_station, fill = pct_late_at_departure)) + 99 | geom_tile() + 100 | scale_fill_gradient2(low = "blue", high = "red", midpoint = .25, labels = percent_format()) + 101 | facet_wrap(~ year, nrow = 1, scales = "free_x") + 102 | theme(axis.text.x = element_text(angle = 90, hjust = 1), 103 | axis.ticks = element_blank(), 104 | panel.grid = element_blank()) + 105 | labs(fill = "% late at departure") + 106 | labs(x = "Month", 107 | y = "Departure station", 108 | title = "Which stations had delays in which months?", 109 | subtitle = "Ordered by the average delay, with international routes on the bottom") 110 | ``` 111 | 112 | -------------------------------------------------------------------------------- /gdpr.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | name: metrics_gdpr_violations 3 | owner: admiral.david@gmail.com 4 | metrics: 5 
| nb_violations: 6 | title: Number of GDPR Violations 7 | description: Number of GDPR Violations in this time period. 8 | nb_total_fine: 9 | title: Total Fine (Euros) 10 | description: Total fine across violations in Euros. (TODO) 11 | dimensions: 12 | country: 13 | title: Country 14 | description: France/Germany/etc 15 | article_title: 16 | title: Article 17 | description: GDPR Article 18 | type: 19 | title: Type 20 | description: Type of violation 21 | --- 22 | 23 | ```{r setup, include=FALSE} 24 | knitr::opts_chunk$set(echo = TRUE) 25 | ``` 26 | 27 | ```{r} 28 | library(tidyverse) 29 | library(lubridate) 30 | library(scales) 31 | theme_set(theme_light()) 32 | 33 | 34 | tuesdata <- tidytuesdayR::tt_load('2020-04-21') 35 | 36 | gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv') %>% 37 | mutate(date = na_if(mdy(date), "1970-01-01")) %>% 38 | rename(country = name) 39 | ``` 40 | 41 | ```{r} 42 | gdpr_violations %>% 43 | summarize(total = sum(price)) 44 | 45 | gdpr_violations %>% 46 | count(country = fct_lump(country, 8, w = price), 47 | sort = TRUE, wt = price, name = "total_price") %>% 48 | mutate(country = fct_reorder(country, total_price)) %>% 49 | ggplot(aes(total_price, country)) + 50 | geom_col() + 51 | scale_x_continuous(labels = dollar_format()) 52 | ``` 53 | 54 | ```{r} 55 | gdpr_violations %>% 56 | count(month = floor_date(date, "month"), 57 | country = fct_lump(country, 6, w = price), 58 | sort = TRUE, wt = price, name = "total_price") %>% 59 | mutate(country = fct_reorder(country, -total_price, sum)) %>% 60 | ggplot(aes(month, total_price, fill = country)) + 61 | geom_col() + 62 | scale_y_continuous(labels = dollar_format()) + 63 | labs(x = "Time", 64 | y = "Total fines", 65 | fill = "Country") 66 | ``` 67 | 68 | ```{r} 69 | gdpr_violations %>% 70 | count(source, country, sort = TRUE) 71 | 72 | gdpr_violations %>% 73 | select(controller, date, article_violated, type, summary, price) %>% 74 | mutate(summary = str_trunc(summary, 140)) %>% 75 | arrange(desc(price)) %>% 76 | mutate(price = dollar(price)) %>% 77 | head(10) 78 | ``` 79 | 80 | ```{r} 81 | gdpr_violations %>% 82 | mutate(type = fct_lump(type, 8, w = price), 83 | type = fct_reorder(type, price), 84 | country = fct_lump(country, 5)) %>% 85 | ggplot(aes(price, type)) + 86 | geom_boxplot() + 87 | geom_jitter(aes(color = country), width = 0, height = .25) + 88 | scale_x_log10(labels = dollar_format()) 89 | ``` 90 | 91 | ### Which article was violated? 92 | 93 | ```{r} 94 | article_titles <- gdpr_text %>% 95 | distinct(article, article_title) 96 | 97 | separated_articles <- gdpr_violations %>% 98 | separate_rows(article_violated, sep = "\\|") %>% 99 | extract(article_violated, "article", "Art\\. ?(\\d+)", convert = TRUE, remove = FALSE) %>% 100 | left_join(article_titles, by = "article") %>% 101 | mutate(article_title = paste0(article, ". 
", str_trunc(article_title, 50)), 102 | article_title = ifelse(is.na(article), "Unknown", article_title)) %>% 103 | add_count(id) %>% 104 | mutate(price_per_article = price / n) 105 | 106 | separated_articles %>% 107 | group_by(article = fct_lump(article_title, 8, w = price)) %>% 108 | summarize(total_fine = sum(price_per_article), 109 | violations = n()) %>% 110 | arrange(desc(total_fine)) %>% 111 | mutate(article = fct_reorder(article, total_fine)) %>% 112 | ggplot(aes(total_fine, article)) + 113 | geom_col() + 114 | scale_x_continuous(labels = dollar_format()) + 115 | labs(title = "What articles got the most fines?", 116 | y = "", 117 | x = "Total fine") 118 | ``` 119 | 120 | ```{r} 121 | gdpr_text <- tuesdata$gdpr_text 122 | ``` 123 | 124 | ```{r} 125 | gdpr_violations %>% 126 | filter(str_detect(controller, "Vodafone")) %>% 127 | group_by(date, country) %>% 128 | summarize(violations = n(), 129 | total_fine = sum(price)) %>% 130 | ggplot(aes(date, total_fine, size = violations, color = country)) + 131 | geom_point() + 132 | scale_y_continuous(labels = dollar_format()) + 133 | scale_size_continuous(guide = FALSE) + 134 | labs(title = "Vodafone's GDPR violations", 135 | color = "", 136 | x = "Time", 137 | y = "Total fine on this day") 138 | ``` 139 | 140 | ```{r} 141 | library(tidymetrics) 142 | 143 | summarized <- separated_articles %>% 144 | filter(!is.na(date)) %>% 145 | mutate(country = fct_lump(country, 6, w = price_per_article), 146 | article_title = fct_lump(article_title, 6, w = price_per_article), 147 | type = fct_lump(type, 6, w = price_per_article)) %>% 148 | cross_by_dimensions(country, article_title, type) %>% 149 | cross_by_periods(c("month", "quarter")) %>% 150 | summarize(nb_violations = n_distinct(id), 151 | nb_total_fine = sum(price_per_article)) %>% 152 | ungroup() 153 | 154 | gdpr_metrics <- create_metrics(summarized) 155 | ``` 156 | 157 | ```{r} 158 | library(shinymetrics) 159 | 160 | preview_metric(gdpr_metrics$gdpr_violations_nb_violations) 161 | ``` 162 | 163 | 164 | 165 | 166 | -------------------------------------------------------------------------------- /golden-age-tv.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | tv_ratings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-08/IMDb_Economist_tv_ratings.csv") 15 | ``` 16 | 17 | ```{r} 18 | tv_ratings %>% 19 | filter(title == "Buffy the Vampire Slayer") 20 | ``` 21 | 22 | ## Do most shows get better or worse over time? 23 | 24 | ```{r} 25 | tv_ratings %>% 26 | count(seasonNumber) %>% 27 | ggplot(aes(seasonNumber, n)) + 28 | geom_line() 29 | 30 | tv_ratings %>% 31 | filter(seasonNumber <= 7) %>% 32 | group_by(seasonNumber) %>% 33 | summarize(av_rating = mean(av_rating)) %>% 34 | ggplot(aes(seasonNumber, av_rating)) + 35 | geom_line() 36 | 37 | tv_ratings_full_show <- tv_ratings %>% 38 | group_by(title) %>% 39 | filter(1 %in% seasonNumber, 40 | all(seasonNumber == dplyr::row_number())) 41 | 42 | tv_ratings_full_show %>% 43 | filter(n() >= 5) %>% 44 | filter(seasonNumber <= 5) %>% 45 | group_by(seasonNumber) %>% 46 | ggplot(aes(factor(seasonNumber), av_rating)) + 47 | geom_boxplot() 48 | ``` 49 | 50 | What are the highest rated seasons of TV? 
51 | 52 | ```{r} 53 | by_show <- tv_ratings %>% 54 | group_by(title) %>% 55 | summarize(avg_rating = mean(av_rating), 56 | sd = sd(av_rating), 57 | seasons = n()) %>% 58 | arrange(desc(avg_rating)) 59 | 60 | most_variable_shows <- by_show %>% 61 | filter(seasons >= 5) %>% 62 | top_n(6, sd) 63 | 64 | tv_ratings %>% 65 | inner_join(most_variable_shows, by = "title") %>% 66 | ggplot(aes(seasonNumber, av_rating, color = title)) + 67 | geom_line() + 68 | geom_point() + 69 | scale_x_continuous(breaks = function(lim) seq(floor(lim[1]), ceiling(lim[2]), 2)) + 70 | facet_wrap(~ title) + 71 | theme(legend.position = "none") 72 | ``` 73 | 74 | ### Has TV been getting better or worse over time? 75 | 76 | ```{r} 77 | library(lubridate) 78 | 79 | tv_ratings %>% 80 | mutate(year = year(date)) %>% 81 | ggplot(aes(av_rating)) + 82 | geom_histogram() + 83 | facet_wrap(~ year) 84 | 85 | by_year <- tv_ratings %>% 86 | group_by(year = 2 * (year(date) %/% 2)) %>% 87 | summarize(mean_rating = mean(av_rating), 88 | median_rating = median(av_rating), 89 | mean_season_1 = mean(av_rating[seasonNumber == 1]), 90 | avg_season = mean(seasonNumber), 91 | sd = sd(av_rating), 92 | observations = n()) 93 | 94 | by_year %>% 95 | gather(metric, value, -year, -observations, -avg_season, -sd) %>% 96 | ggplot(aes(year, value, color = metric)) + 97 | geom_line() + 98 | geom_point() 99 | 100 | tv_ratings %>% 101 | group_by(year = 5 * (year(date) %/% 5), 102 | seasonNumber = ifelse(seasonNumber >= 4, "4+", seasonNumber)) %>% 103 | summarize(mean_rating = mean(av_rating), 104 | observations = n()) %>% 105 | ggplot(aes(year, mean_rating, color = seasonNumber)) + 106 | geom_line() + 107 | labs(color = "Season", 108 | y = "Average rating in each 5 year period") 109 | 110 | tv_ratings %>% 111 | ggplot(aes(date, av_rating)) + 112 | geom_point() + 113 | geom_smooth(method = "loess") 114 | 115 | tv_ratings %>% 116 | mutate(year = 2 * (year(date) %/% 2)) %>% 117 | ggplot(aes(year, av_rating, group = year)) + 118 | geom_boxplot() 119 | 120 | tv_ratings %>% 121 | filter(seasonNumber == 1) %>% 122 | mutate(year = 2 * (year(date) %/% 2)) %>% 123 | ggplot(aes(year, av_rating, group = year)) + 124 | geom_boxplot() 125 | ``` 126 | 127 | ### Show survival 128 | 129 | If season 1 is good, what's the probability they get a season 2? 
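Before fitting on the real data, here's a minimal sketch of the logistic regression used later in this section, on simulated data (assuming, purely for illustration, that higher season-1 ratings raise renewal odds):

```{r}
library(tidyverse)
library(broom)

set.seed(42)
sim <- tibble(season1 = runif(500, 5, 9.5)) %>%
  mutate(renewed = rbinom(n(), 1, plogis(-8 + 1.1 * season1)))

mod <- glm(renewed ~ season1, data = sim, family = "binomial")

# Predicted renewal probability across the ratings range
augment(mod, newdata = tibble(season1 = 6:9), type.predict = "response")
```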
130 | 131 | ```{r} 132 | first_three_seasons <- tv_ratings %>% 133 | filter(seasonNumber <= 3) %>% 134 | group_by(title) %>% 135 | mutate(date_first_season = min(date)) %>% 136 | ungroup() %>% 137 | transmute(titleId, 138 | title, 139 | date_first_season, 140 | seasonNumber = paste0("season", seasonNumber), 141 | av_rating) %>% 142 | distinct(title, seasonNumber, .keep_all = TRUE) %>% 143 | spread(seasonNumber, av_rating) %>% 144 | filter(!is.na(season1)) %>% 145 | filter(date_first_season <= "2017-01-01") 146 | 147 | library(broom) 148 | 149 | glm(!is.na(season2) ~ season1, data = first_three_seasons) %>% 150 | summary() 151 | ``` 152 | 153 | ```{r} 154 | first_three_seasons %>% 155 | group_by(season1_bin = cut(season1, c(0, 7, 7.5, 8, 8.5, 10)), 156 | time_bin = ifelse(date_first_season < "2000-01-01", "Before 2000", "After 2000")) %>% 157 | summarize(had_second_season = mean(!is.na(season2)), 158 | observations = n()) %>% 159 | ggplot(aes(season1_bin, 160 | had_second_season, 161 | color = time_bin, 162 | group = time_bin)) + 163 | geom_line() + 164 | geom_point() + 165 | scale_y_continuous(labels = scales::percent_format()) 166 | ``` 167 | 168 | ```{r} 169 | library(broom) 170 | 171 | new_data <- crossing( 172 | year = 1990:2018, 173 | season1 = seq(6, 9) 174 | ) 175 | 176 | library(splines) 177 | 178 | mod <- first_three_seasons %>% 179 | mutate(year = year(date_first_season), 180 | had_second_season = !is.na(season2)) %>% 181 | glm(had_second_season ~ season1 * year, data = ., family = "binomial") 182 | 183 | summary(mod) 184 | 185 | mod %>% 186 | augment(newdata = new_data, type.predict = "response") %>% 187 | ggplot(aes(year, .fitted, color = factor(season1))) + 188 | geom_line() + 189 | scale_y_continuous(labels = scales::percent_format()) + 190 | labs(title = "Probability of getting season 2 has become more dependent on ratings of season 1", 191 | color = "Season 1 rating", 192 | x = "Year show premiered", 193 | y = "Predicted probability of getting season 2") 194 | ``` 195 | 196 | 197 | 198 | Whether a show gets a second season depends on the ratings of the first season 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /honeycomb-puzzle.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | words <- tibble(word = read_lines("https://norvig.com/ngrams/enable1.txt")) %>% 14 | mutate(word_length = str_length(word)) %>% 15 | filter(word_length >= 4, 16 | !str_detect(word, "s")) %>% 17 | mutate(letters = str_split(word, ""), 18 | letters = map(letters, unique), 19 | unique_letters = lengths(letters)) %>% 20 | mutate(points = ifelse(word_length == 4, 1, word_length) + 21 | 15 * (unique_letters == 7)) %>% 22 | filter(unique_letters <= 7) %>% 23 | arrange(desc(points)) 24 | 25 | center_letter <- "g" 26 | other_letters <- c("a", "p", "x", "m", "e", "l") 27 | 28 | get_words <- function(center_letter, other_letters) { 29 | words %>% 30 | filter(str_detect(word, center_letter)) %>% 31 | mutate(invalid_letters = map(letters, setdiff, c(center_letter, other_letters))) %>% 32 | filter(lengths(invalid_letters) == 0) %>% 33 | arrange(desc(points)) 34 | } 35 | 36 | library(tidytext) 37 | 38 | letters_unnested <- words %>% 39 | select(word, points) %>% 40 | unnest_tokens(letter, word, token = "characters", drop = FALSE) 
%>% 41 | distinct(word, letter, .keep_all = TRUE) 42 | 43 | letters_summarized <- letters_unnested %>% 44 | group_by(letter) %>% 45 | summarize(n_words = n(), 46 | n_points = sum(points)) %>% 47 | arrange(desc(n_points)) 48 | ``` 49 | 50 | ```{r} 51 | word_matrix <- letters_unnested %>% 52 | reshape2::acast(word ~ letter, fun.aggregate = length) 53 | 54 | # Points per word (lines up with rows of word matrix) 55 | points_per_word <- words$points 56 | names(points_per_word) <- words$word 57 | points_per_word <- points_per_word[rownames(word_matrix)] 58 | 59 | get_score <- function(honeycomb_letters) { 60 | center_letter <- honeycomb_letters[1] 61 | 62 | permitted_letters <- colnames(word_matrix) %in% honeycomb_letters 63 | 64 | num_forbidden <- word_matrix %*% (1L - permitted_letters) 65 | word_permitted <- num_forbidden == 0L & word_matrix[, center_letter] == 1L 66 | 67 | sum(points_per_word[word_permitted]) 68 | } 69 | 70 | get_score(c("e", "i", "a", "r", "n", "t", "l")) 71 | 72 | get_words("e", c("i", "a", "r", "n", "t", "l")) 73 | ``` 74 | 75 | ```{r} 76 | center_letter <- "e" 77 | 78 | find_best_combination <- function(center_letter, possible_letters) { 79 | good_letter_combinations <- combn(possible_letters, 6) 80 | 81 | # Every column is one of the possible honeycombs 82 | forbidden_matrix <- 1L - apply(good_letter_combinations, 83 | 2, 84 | function(.) colnames(word_matrix) %in% c(center_letter, .)) 85 | 86 | filtered_word_matrix <- word_matrix[word_matrix[, center_letter] == 1, ] 87 | word_allowed_matrix <- filtered_word_matrix %*% forbidden_matrix == 0 88 | scores <- t(word_allowed_matrix) %*% points_per_word[rownames(word_allowed_matrix)] 89 | 90 | list(center_letter = center_letter, 91 | other_letters = good_letter_combinations[, which.max(scores)], 92 | score = max(scores)) 93 | } 94 | 95 | pool <- head(letters_summarized$letter, 16) 96 | 97 | find_best_combination("e", setdiff(pool, "e")) 98 | find_best_combination("i", setdiff(pool, "i")) 99 | find_best_combination("a", setdiff(pool, "a")) 100 | find_best_combination("r", setdiff(pool, "r")) 101 | find_best_combination("n", setdiff(pool, "n")) 102 | find_best_combination("t", setdiff(pool, "t")) 103 | find_best_combination("g", setdiff(pool, "g")) 104 | 105 | get_score(c("r", "e", "i", "a", "n", "t", "g")) 106 | ``` 107 | 108 | ```{r} 109 | permitted_letters <- colnames(word_matrix) %in% honeycomb_letters 110 | 111 | num_forbidden <- word_matrix %*% (1L - permitted_letters) 112 | word_permitted <- num_forbidden == 0L & word_matrix[, center_letter] == 1L 113 | 114 | sum(points_per_word[word_permitted]) 115 | ``` 116 | 117 | 118 | ```{r} 119 | words %>% 120 | unnest(letters) %>% 121 | group_by(letters) %>% 122 | summarize(total_points = sum(points), 123 | ) 124 | ``` 125 | 126 | ```{r} 127 | words 128 | ``` 129 | 130 | 131 | 132 | ```{r} 133 | 134 | ``` 135 | 136 | 137 | -------------------------------------------------------------------------------- /horror-movie-ratings.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | horror_movies_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-22/horror_movies.csv") 15 | ``` 16 | 17 | ```{r} 18 | horror_movies <- horror_movies_raw %>% 19 | arrange(desc(review_rating)) 
%>% 20 | extract(title, "year", "\\((\\d\\d\\d\\d)\\)$", remove = FALSE, convert = TRUE) %>% 21 | mutate(budget = parse_number(budget)) %>% 22 | separate(plot, c("director", "cast_sentence", "plot"), extra = "merge", sep = "\\. ", fill = "right") %>% 23 | distinct(title, .keep_all = TRUE) 24 | ``` 25 | 26 | Most of the movies are from 2012 onward. 27 | 28 | ```{r} 29 | horror_movies %>% 30 | count(genres, sort = TRUE) 31 | 32 | horror_movies %>% 33 | count(language, sort = TRUE) 34 | 35 | horror_movies %>% 36 | ggplot(aes(budget)) + 37 | geom_histogram() + 38 | scale_x_log10(labels = scales::dollar) 39 | ``` 40 | 41 | Do higher budget movies end up higher rated? 42 | 43 | ```{r} 44 | horror_movies %>% 45 | ggplot(aes(budget, review_rating)) + 46 | geom_point() + 47 | scale_x_log10(labels = scales::dollar) + 48 | geom_smooth(method = "lm") 49 | ``` 50 | 51 | No relationship between budget and rating. How about the MPAA rating and the review score? 52 | 53 | ```{r} 54 | horror_movies %>% 55 | mutate(movie_rating = fct_lump(movie_rating, 5), 56 | movie_rating = fct_reorder(movie_rating, review_rating, na.rm = TRUE)) %>% 57 | ggplot(aes(movie_rating, review_rating)) + 58 | geom_boxplot() + 59 | coord_flip() 60 | 61 | horror_movies %>% 62 | filter(!is.na(movie_rating)) %>% 63 | mutate(movie_rating = fct_lump(movie_rating, 5)) %>% 64 | lm(review_rating ~ movie_rating, data = .) %>% 65 | anova() 66 | ``` 67 | 68 | ```{r} 69 | horror_movies %>% 70 | separate_rows(genres, sep = "\\| ") %>% 71 | mutate(genre = fct_lump(genres, 5)) %>% 72 | ggplot(aes(genre, review_rating)) + 73 | geom_boxplot() 74 | ``` 75 | 76 | ```{r} 77 | library(tidytext) 78 | 79 | horror_movies_unnested <- horror_movies %>% 80 | unnest_tokens(word, plot) %>% 81 | anti_join(stop_words, by = "word") %>% 82 | filter(!is.na(word)) 83 | 84 | horror_movies_unnested %>% 85 | filter(!is.na(review_rating)) %>% 86 | group_by(word) %>% 87 | summarize(movies = n(), 88 | avg_rating = mean(review_rating)) %>% 89 | arrange(desc(movies)) %>% 90 | filter(movies >= 100) %>% 91 | mutate(word = fct_reorder(word, avg_rating)) %>% 92 | ggplot(aes(avg_rating, word)) + 93 | geom_point() 94 | ``` 95 | 96 | ### Lasso regression for predicting review rating based on words in plot 97 | 98 | ```{r} 99 | library(glmnet) 100 | library(Matrix) 101 | 102 | movie_word_matrix <- horror_movies_unnested %>% 103 | filter(!is.na(review_rating)) %>% 104 | add_count(word) %>% 105 | filter(n >= 20) %>% 106 | count(title, word) %>% 107 | cast_sparse(title, word, n) 108 | 109 | rating <- horror_movies$review_rating[match(rownames(movie_word_matrix), horror_movies$title)] 110 | 111 | lasso_model <- cv.glmnet(movie_word_matrix, rating) 112 | ``` 113 | 114 | ```{r} 115 | library(broom) 116 | 117 | tidy(lasso_model$glmnet.fit) %>% 118 | filter(term %in% c("quickly", "seek", "army", "teacher", "unexpected", "friends", "evil")) %>% 119 | ggplot(aes(lambda, estimate, color = term)) + 120 | geom_line() + 121 | scale_x_log10() + 122 | geom_vline(xintercept = lasso_model$lambda.min) + 123 | geom_hline(yintercept = 0, lty = 2) 124 | ``` 125 | 126 | ```{r} 127 | plot(lasso_model) 128 | 129 | tidy(lasso_model$glmnet.fit) %>% 130 | filter(lambda == lasso_model$lambda.min, 131 | term != "(Intercept)") %>% 132 | mutate(term = fct_reorder(term, estimate)) %>% 133 | ggplot(aes(term, estimate)) + 134 | geom_col() + 135 | coord_flip() 136 | ``` 137 | 138 | Throwing everything into a linear model: director, cast, genre, rating, plot words.
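The feature construction below hinges on collapsing metadata into `type: value` strings and widening them into a sparse 0/1 matrix; a small standalone sketch of that trick on made-up rows:

```{r}
library(tidyverse)
library(tidytext)  # cast_sparse()

toy <- tibble(title = c("m1", "m1", "m2"),
              type = c("director", "genre", "genre"),
              value = c("Smith", "Comedy", "Horror"))

# One row per (movie, feature) pair becomes a binary movie-by-feature matrix
toy %>%
  unite(feature, type, value, sep = ": ") %>%
  cast_sparse(title, feature)
```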
139 | 140 | ```{r} 141 | features <- horror_movies %>% 142 | filter(!is.na(review_rating)) %>% 143 | select(title, genres, director, cast, movie_rating, language, release_country) %>% 144 | mutate(director = str_remove(director, "Directed by ")) %>% 145 | gather(type, value, -title) %>% 146 | filter(!is.na(value)) %>% 147 | separate_rows(value, sep = "\\| ?") %>% 148 | unite(feature, type, value, sep = ": ") %>% 149 | mutate(n = 1) 150 | 151 | movie_feature_matrix <- horror_movies_unnested %>% 152 | filter(!is.na(review_rating)) %>% 153 | count(title, feature = paste0("word: ", word)) %>% 154 | bind_rows(features) %>% 155 | add_count(feature) %>% 156 | filter(n >= 10) %>% 157 | cast_sparse(title, feature) 158 | 159 | rating <- horror_movies$review_rating[match(rownames(movie_feature_matrix), horror_movies$title)] 160 | 161 | feature_lasso_model <- cv.glmnet(movie_feature_matrix, rating) 162 | ``` 163 | 164 | ```{r} 165 | plot(feature_lasso_model) 166 | 167 | tidy(feature_lasso_model$glmnet.fit) %>% 168 | filter(lambda == feature_lasso_model$lambda.1se, 169 | term != "(Intercept)") %>% 170 | mutate(term = fct_reorder(term, estimate)) %>% 171 | ggplot(aes(term, estimate)) + 172 | geom_col() + 173 | coord_flip() + 174 | labs(x = "", 175 | y = "Coefficient for predicting movie rating", 176 | title = "What affects a horror movie rating?", 177 | subtitle = "Based on a lasso regression to predict IMDb ratings of ~3000 movies") 178 | ``` 179 | 180 | What am I going to watch? 181 | 182 | ```{r} 183 | horror_movies %>% 184 | filter(str_detect(genres, "Comedy"), 185 | !is.na(movie_rating), 186 | !is.na(budget), 187 | movie_rating != "PG") %>% 188 | arrange(desc(review_rating)) %>% 189 | select(title, review_rating, movie_rating, plot, director, budget, language) %>% 190 | View() 191 | ``` 192 | 193 | 194 | 195 | 196 | -------------------------------------------------------------------------------- /malaria.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Malaria Atlas" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | library(malariaAtlas) 15 | ``` 16 | 17 | ```{r} 18 | kenya_pr <- tbl_df(malariaAtlas::getPR(ISO = "KEN", species = "BOTH")) %>% 19 | filter(!is.na(pr)) 20 | ``` 21 | 22 | ```{r} 23 | kenya_pr %>% 24 | group_by(year_start) %>% 25 | summarize(examined = sum(examined), 26 | positive = sum(positive), 27 | studies = n()) %>% 28 | mutate(pr = positive / examined) %>% 29 | ggplot(aes(year_start, pr)) + 30 | geom_line() 31 | 32 | kenya_pr %>% 33 | mutate(decade = 10 * (year_start %/% 10)) %>% 34 | arrange(pr) %>% 35 | ggplot(aes(longitude, latitude, color = pr)) + 36 | borders("world", regions = "Kenya") + 37 | geom_point() + 38 | scale_color_gradient2(low = "blue", high = "red", midpoint = .5, labels = scales::percent_format()) + 39 | facet_wrap(~ decade) + 40 | theme_void() + 41 | coord_map() + 42 | labs(color = "Prevalence") 43 | ``` 44 | 45 | ### Aggregated across countries 46 | 47 | ```{r} 48 | malaria_inc <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-13/malaria_inc.csv") 49 | 50 | malaria_inc_processed <- malaria_inc %>% 51 | setNames(c("country", "code", "year", "incidence")) %>% 52 | mutate(incidence = incidence / 1000) 53 | ``` 54 | 55 | ```{r} 56 | malaria_inc_processed %>% 57 | filter(country %in% sample(unique(country), 6)) %>% 58 | 
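# six countries sampled at random, just to keep the line plot legible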
ggplot(aes(year, incidence, color = country)) + 59 | geom_line() + 60 | scale_y_continuous(labels = scales::percent_format()) 61 | ``` 62 | 63 | Looking at 2015 levels and the change from 2015 to 2000 64 | 65 | ```{r} 66 | malaria_spread <- malaria_inc_processed %>% 67 | mutate(year = paste0("Y", year)) %>% 68 | spread(year, incidence) 69 | 70 | malaria_spread %>% 71 | filter(country != "Turkey", 72 | !is.na(code)) %>% 73 | mutate(current = Y2015, 74 | change = Y2015 - Y2000) %>% 75 | ggplot(aes(current, change)) + 76 | geom_point() + 77 | geom_text(aes(label = code), vjust = 1, hjust = 1) 78 | ``` 79 | 80 | ```{r} 81 | world <- map_data("world") %>% 82 | filter(region != "Antarctica") 83 | 84 | malaria_inc_processed %>% 85 | filter(incidence < 1) %>% 86 | inner_join(maps::iso3166 %>% 87 | select(a3, mapname), by = c(code = "a3")) %>% 88 | inner_join(world, by = c(mapname = "region")) %>% 89 | ggplot(aes(long, lat, group = group, fill = incidence)) + 90 | geom_polygon() + 91 | scale_fill_gradient2(low = "blue", high = "red", midpoint = .20, labels = scales::percent_format()) + 92 | coord_map() + 93 | facet_wrap(~ year) + 94 | theme_void() + 95 | labs(title = "Malaria incidence over time around the world") 96 | ``` 97 | 98 | ### Malaria deaths over time 99 | 100 | 101 | ```{r} 102 | library(tidyverse) 103 | 104 | malaria_deaths <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-13/malaria_deaths.csv") 105 | 106 | malaria_deaths_processed <- malaria_deaths %>% 107 | setNames(c("country", "code", "year", "deaths")) 108 | ``` 109 | 110 | ```{r} 111 | malaria_deaths_processed %>% 112 | filter(country %in% sample(unique(country), 6)) %>% 113 | ggplot(aes(year, deaths, color = country)) + 114 | geom_line() + 115 | labs(y = "Deaths per 100,000") 116 | ``` 117 | 118 | ```{r} 119 | library(fuzzyjoin) 120 | library(stringr) 121 | 122 | malaria_country_data <- malaria_deaths_processed %>% 123 | inner_join(maps::iso3166 %>% 124 | select(a3, mapname), by = c(code = "a3")) %>% 125 | mutate(mapname = str_remove(mapname, "\\(.*")) 126 | 127 | malaria_map_data <- map_data("world") %>% 128 | filter(region != "Antarctica") %>% 129 | tbl_df() %>% 130 | inner_join(malaria_country_data, by = c(region = "mapname")) 131 | 132 | malaria_map_data %>% 133 | ggplot(aes(long, lat, group = group, fill = deaths)) + 134 | geom_polygon() + 135 | scale_fill_gradient2(low = "blue", high = "red", midpoint = 100) + 136 | theme_void() + 137 | labs(title = "Malaria deaths over time around the world", 138 | fill = "Deaths per 100,000") 139 | ``` 140 | 141 | ```{r} 142 | library(gganimate) 143 | library(countrycode) 144 | 145 | malaria_map_data %>% 146 | mutate(continent = countrycode(code, "iso3c", "continent")) %>% 147 | filter(continent == "Africa") %>% 148 | ggplot(aes(long, lat, group = group, fill = deaths)) + 149 | geom_polygon() + 150 | scale_fill_gradient2(low = "blue", high = "red", midpoint = 100) + 151 | theme_void() + 152 | labs(title = "Malaria deaths over time in Africa ({ current_frame })", 153 | fill = "Deaths per 100,000") + 154 | transition_manual(year) 155 | 156 | anim_save("malaria_map.gif") 157 | ``` 158 | 159 | -------------------------------------------------------------------------------- /media-franchises.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Media Franchise Revenues" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | 
```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | media_franchises <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-02/media_franchises.csv") 15 | ``` 16 | 17 | ```{r} 18 | media_franchises %>% 19 | count(franchise, sort = TRUE) 20 | 21 | franchises <- media_franchises %>% 22 | group_by(franchise, original_media, year_created, creators, owners) %>% 23 | summarize(categories = n(), 24 | total_revenue = sum(revenue), 25 | most_profitable = revenue_category[which.max(revenue)]) %>% 26 | ungroup() 27 | 28 | franchises 29 | 30 | media_franchises 31 | ``` 32 | 33 | ### Franchise-level 34 | 35 | What are the biggest franchises? 36 | 37 | ```{r} 38 | library(glue) 39 | 40 | top_franchises <- franchises %>% 41 | mutate(franchise = glue("{ franchise } ({ year_created })")) %>% 42 | top_n(20, total_revenue) 43 | 44 | media_franchises %>% 45 | mutate(franchise = glue("{ franchise } ({ year_created })")) %>% 46 | semi_join(top_franchises, by = "franchise") %>% 47 | mutate(franchise = fct_reorder(franchise, revenue, sum), 48 | revenue_category = fct_reorder(revenue_category, revenue, sum)) %>% 49 | ggplot(aes(franchise, revenue)) + 50 | geom_col(aes(fill = revenue_category)) + 51 | geom_text(aes(y = total_revenue, 52 | label = paste0(scales::dollar(total_revenue, accuracy = 1), "B")), 53 | data = top_franchises, 54 | hjust = 0) + 55 | scale_y_continuous(labels = scales::dollar) + 56 | expand_limits(y = 100) + 57 | coord_flip() + 58 | theme(panel.grid.major.y = element_blank()) + 59 | guides(fill = guide_legend(reverse = TRUE)) + 60 | labs(title = "What are the most profitable franchises of all time?", 61 | fill = "Category", 62 | x = "", 63 | y = "Revenue (Billions)") 64 | ``` 65 | 66 | ```{r} 67 | media_franchises %>% 68 | group_by(owners) %>% 69 | filter(n_distinct(franchise) > 2) %>% 70 | ungroup() %>% 71 | mutate(franchise = fct_reorder(franchise, revenue, sum), 72 | owners = fct_reorder(owners, -revenue, sum), 73 | revenue_category = fct_reorder(revenue_category, revenue, sum)) %>% 74 | ggplot(aes(franchise, revenue, fill = revenue_category)) + 75 | geom_col() + 76 | facet_wrap(~ owners, scales = "free_y") + 77 | guides(fill = guide_legend(reverse = TRUE)) + 78 | coord_flip() + 79 | labs(title = "What companies own at least 3 franchises?", 80 | fill = "Category", 81 | x = "", 82 | y = "Revenue (Billions)") 83 | ``` 84 | 85 | ```{r} 86 | franchises %>% 87 | ggplot(aes(year_created, total_revenue)) + 88 | geom_point(aes(size = total_revenue, color = original_media)) + 89 | geom_text(aes(label = franchise), check_overlap = TRUE, vjust = 1, hjust = 1) + 90 | expand_limits(x = 1910) + 91 | labs(title = "When were the 'great' franchises created?") 92 | ``` 93 | 94 | ```{r} 95 | media_franchises %>% 96 | group_by(original_media) %>% 97 | summarize(revenue = sum(revenue)) 98 | 99 | original_media_revenue_categories <- media_franchises %>% 100 | group_by(original_media) %>% 101 | filter(sum(revenue) >= 45) %>% 102 | group_by(original_media, revenue_category) %>% 103 | summarize(revenue = sum(revenue)) %>% 104 | ungroup() %>% 105 | mutate(revenue_category = fct_reorder(revenue_category, revenue, sum), 106 | original_media = fct_reorder(original_media, -revenue, sum)) 107 | 108 | original_media_revenue_categories %>% 109 | ggplot(aes(revenue_category, revenue)) + 110 | geom_col() + 111 | scale_y_continuous(labels = scales::dollar) + 112 | coord_flip() + 113 | facet_wrap(~ original_media) + 114 | labs(x = "Revenue category", 115 | y 
= "Revenue (Billions)", 116 | title = "What kinds of media lead to what types of revenue?") 117 | ``` 118 | 119 | ```{r} 120 | original_media_revenue_categories %>% 121 | mutate(revenue_category = fct_rev(revenue_category), 122 | original_media = fct_rev(original_media)) %>% 123 | ggplot(aes(revenue_category, original_media, fill = revenue)) + 124 | geom_tile() + 125 | scale_fill_gradient2(low = "white", high = "red", labels = scales::dollar) + 126 | theme(panel.grid = element_blank(), 127 | axis.text.x = element_text(angle = 90, hjust = 1)) + 128 | labs(fill = "Revenue (billions)") 129 | ``` 130 | 131 | 132 | -------------------------------------------------------------------------------- /medium-datasci.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | medium_datasci <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-12-04/medium_datasci.csv") 14 | 15 | theme_set(theme_light()) 16 | ``` 17 | 18 | ```{r} 19 | medium_processed <- medium_datasci %>% 20 | select(-x1) %>% 21 | mutate(post_id = row_number()) 22 | ``` 23 | 24 | ```{r} 25 | medium_processed %>% 26 | count(author, sort = TRUE) 27 | 28 | medium_processed %>% 29 | summarize_at(vars(starts_with("tag_")), sum) 30 | 31 | medium_gathered <- medium_processed %>% 32 | gather(tag, value, starts_with("tag")) %>% 33 | mutate(tag = str_remove(tag, "tag_")) %>% 34 | filter(value == 1) 35 | 36 | medium_gathered %>% 37 | count(tag, sort = TRUE) 38 | 39 | medium_gathered %>% 40 | group_by(tag) %>% 41 | summarize(median_claps = median(claps)) %>% 42 | arrange(desc(median_claps)) 43 | 44 | medium_processed %>% 45 | ggplot(aes(claps)) + 46 | geom_histogram() + 47 | scale_x_log10(labels = scales::comma_format()) 48 | 49 | medium_processed %>% 50 | mutate(reading_time = pmin(10, reading_time)) %>% 51 | ggplot(aes(reading_time)) + 52 | geom_histogram(binwidth = .5) + 53 | scale_x_continuous(breaks = seq(2, 10, 2), 54 | labels = c(seq(2, 8, 2), "10+")) + 55 | labs(x = "Medium reading time") 56 | 57 | medium_gathered %>% 58 | group_by(tag) %>% 59 | summarize(reading_time = mean(reading_time)) %>% 60 | arrange(desc(reading_time)) 61 | ``` 62 | 63 | ### Text mining 64 | 65 | ```{r} 66 | library(tidytext) 67 | 68 | medium_words <- medium_processed %>% 69 | filter(!is.na(title)) %>% 70 | select(post_id, title, subtitle, year, reading_time, claps) %>% 71 | unnest_tokens(word, title) %>% 72 | anti_join(stop_words, by = "word") %>% 73 | filter(!(word %in% c("de", "en", "la", "para")), 74 | str_detect(word, "[a-z]")) 75 | 76 | medium_words %>% 77 | count(word, sort = TRUE) %>% 78 | mutate(word = fct_reorder(word, n)) %>% 79 | head(20) %>% 80 | ggplot(aes(word, n)) + 81 | geom_col() + 82 | coord_flip() + 83 | labs(title = "Common words in Medium post titles") 84 | ``` 85 | 86 | ```{r} 87 | medium_words_filtered <- medium_words %>% 88 | add_count(word) %>% 89 | filter(n >= 250) 90 | 91 | tag_claps <- medium_words_filtered %>% 92 | group_by(word) %>% 93 | summarize(median_claps = median(claps), 94 | geometric_mean_claps = exp(mean(log(claps + 1))) - 1, 95 | occurences = n()) %>% 96 | arrange(desc(median_claps)) 97 | 98 | library(widyr) 99 | library(ggraph) 100 | library(igraph) 101 | 102 | top_word_cors <- medium_words_filtered %>% 103 | select(post_id, word) %>% 104 | pairwise_cor(word, 
post_id, sort = TRUE) %>% 105 | head(150) 106 | 107 | vertices <- tag_claps %>% 108 | filter(word %in% top_word_cors$item1 | 109 | word %in% top_word_cors$item2) 110 | 111 | set.seed(2018) 112 | 113 | top_word_cors %>% 114 | graph_from_data_frame(vertices = vertices) %>% 115 | ggraph() + 116 | geom_edge_link() + 117 | geom_node_point(aes(size = occurences * 1.1)) + 118 | geom_node_point(aes(size = occurences, 119 | color = geometric_mean_claps)) + 120 | geom_node_text(aes(label = name), repel = TRUE) + 121 | scale_color_gradient2(low = "blue", 122 | high = "red", 123 | midpoint = 10) + 124 | theme_void() + 125 | labs(title = "What gets claps in Medium article titles?", 126 | subtitle = "Color shows the geometric mean of # of claps on articles with this word in the title", 127 | size = "# of occurrences", 128 | color = "Claps") 129 | ``` 130 | 131 | ### Predicting # of claps based on title + tag 132 | 133 | ```{r} 134 | # turn into a sparse matrix 135 | post_word_matrix <- medium_words_filtered %>% 136 | distinct(post_id, word, claps) %>% 137 | cast_sparse(post_id, word) 138 | 139 | # Fit a LASSO model 140 | library(glmnet) 141 | 142 | claps <- medium_processed$claps[match(rownames(post_word_matrix), medium_processed$post_id)] 143 | 144 | lasso_model <- cv.glmnet(post_word_matrix, log(claps + 1)) 145 | ``` 146 | 147 | ```{r} 148 | library(broom) 149 | 150 | tidy(lasso_model$glmnet.fit) %>% 151 | filter(term %in% c("hadoop", "learning", "gdpr", "deep", "startup", "marketing")) %>% 152 | ggplot(aes(lambda, estimate, color = term)) + 153 | geom_line() + 154 | scale_x_log10() 155 | 156 | tidy(lasso_model$glmnet.fit) %>% 157 | filter(lambda == lasso_model$lambda.min) %>% 158 | arrange(desc(estimate)) %>% 159 | View() 160 | ``` 161 | 162 | -------------------------------------------------------------------------------- /nyc-pizza.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Pizza" 3 | output: html_document 4 | editor_options: 5 | chunk_output_type: console 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | ``` 11 | 12 | ## R Markdown 13 | 14 | ```{r} 15 | library(tidyverse) 16 | theme_set(theme_light()) 17 | ``` 18 | 19 | ```{r} 20 | pizza_jared <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_jared.csv") 21 | pizza_barstool <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_barstool.csv") 22 | pizza_datafiniti <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_datafiniti.csv") 23 | ``` 24 | 25 | ```{r} 26 | answer_orders <- c("Never Again", "Poor", "Average", "Good", "Excellent") 27 | 28 | by_place_answer <- pizza_jared %>% 29 | mutate(time = as.POSIXct(time, origin = "1970-01-01"), 30 | date = as.Date(time), 31 | answer = fct_relevel(answer, answer_orders)) %>% 32 | group_by(place, answer) %>% 33 | summarize(votes = sum(votes)) %>% 34 | mutate(total = sum(votes), 35 | percent = votes / total, 36 | answer_integer = as.integer(answer), 37 | average = sum(answer_integer * percent)) %>% 38 | ungroup() 39 | 40 | by_place <- by_place_answer %>% 41 | distinct(place, total, average) 42 | ``` 43 | 44 | ```{r} 45 | by_place_answer %>% 46 | filter(as.integer(fct_reorder(place, total, .desc = TRUE)) <= 16, 47 | answer != "Fair") %>% 48 | mutate(place = glue::glue("{ place } ({ total })"), 49 | place = 
fct_reorder(place, average)) %>% 50 | ggplot(aes(answer, percent)) + 51 | geom_col() + 52 | facet_wrap(~ place) + 53 | scale_y_continuous(labels = scales::percent) + 54 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 55 | labs(x = "", 56 | y = "% of respondents", 57 | title = "What is the most popular pizza place in Open Stats meetup?", 58 | subtitle = "Only the 16 pizza places with the most respondents. # respondents shown in parentheses.") 59 | ``` 60 | 61 | ```{r} 62 | library(broom) 63 | 64 | # THIS TRICK DOESN'T WORK 65 | # tidy(lm(c(1, 2, 3, 4, 5) ~ 1, weights = c(100, 300, 100, 200, 150)), conf.int = TRUE) 66 | 67 | t_test_repeated <- function(x, frequency) { 68 | tidy(t.test(rep(x, frequency))) 69 | } 70 | 71 | by_place_answer %>% 72 | filter(total >= 3) %>% 73 | group_by(place, total) %>% 74 | summarize(t_test_result = list(t_test_repeated(answer_integer, votes))) %>% 75 | ungroup() %>% 76 | unnest(t_test_result) %>% 77 | select(place, total, average = estimate, low = conf.low, high = conf.high) %>% 78 | top_n(16, total) %>% 79 | mutate(place = fct_reorder(place, average)) %>% 80 | ggplot(aes(average, place)) + 81 | geom_point(aes(size = total)) + 82 | geom_errorbarh(aes(xmin = low, xmax = high)) + 83 | labs(x = "Average score (1-5 Likert Scale)", 84 | y = "", 85 | title = "What is the most popular pizza place in Open Stats meetup?", 86 | subtitle = "Only the 16 pizza places with the most respondents.", 87 | size = "# of respondents") 88 | ``` 89 | 90 | ```{r} 91 | # Don't bother comparing them, this is a bad graph 92 | pizza_barstool %>% 93 | select(place = name, 94 | barstool_total = review_stats_all_count, 95 | barstool_average = review_stats_all_average_score) %>% 96 | inner_join(by_place, by = "place") %>% 97 | group_by(place) %>% 98 | filter(n() == 1) %>% 99 | ungroup() %>% 100 | filter(barstool_total >= 5, 101 | total >= 5) %>% 102 | ggplot(aes(average, barstool_average)) + 103 | geom_point() + 104 | labs(x = "Meetup", 105 | y = "Barstool") 106 | ``` 107 | 108 | ```{r} 109 | pizza_barstool %>% 110 | top_n(50, review_stats_all_count) %>% 111 | ggplot(aes(price_level, review_stats_all_average_score, group = price_level)) + 112 | geom_boxplot() 113 | 114 | pizza_barstool %>% 115 | filter(review_stats_all_count >= 50) %>% 116 | mutate(name = fct_reorder(name, review_stats_all_average_score)) %>% 117 | ggplot(aes(review_stats_all_average_score, name, size = review_stats_all_count)) + 118 | geom_point() + 119 | labs(x = "Average rating", 120 | y = "", 121 | size = "# of reviews", 122 | title = "Barstool Sports ratings of pizza places", 123 | subtitle = "Only places with at least 50 reviews") 124 | ``` 125 | 126 | ```{r} 127 | pizza_barstool %>% 128 | filter(review_stats_all_count >= 20) %>% 129 | mutate(city = fct_lump(city, 3)) %>% 130 | add_count(city) %>% 131 | mutate(city = glue::glue("{ city } ({ n })")) %>% 132 | ggplot(aes(city, review_stats_all_average_score)) + 133 | geom_boxplot() + 134 | labs(title = "Do pizza ratings differ across cities?", 135 | subtitle = "Only pizza places with at least 20 reviews") 136 | ``` 137 | 138 | ```{r} 139 | pizza_cleaned <- pizza_barstool %>% 140 | select(place = name, 141 | price_level, 142 | contains("review")) %>% 143 | rename_all(~ str_remove(., "review_stats_")) %>% 144 | select(-contains("provider")) 145 | 146 | pizza_cleaned %>% 147 | filter(critic_count > 0) %>% 148 | ggplot(aes(critic_average_score, dave_average_score)) + 149 | geom_point() + 150 | geom_abline(color = "red") + 151 | geom_smooth(method = "lm") + 152 | 
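# note added: the red geom_abline is the y = x line of perfect agreement; the blue geom_smooth line is the fitted linear trend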
labs(title = "Does Barstool Sports' Dave agree with the critics?", 153 | x = "Critic average score", 154 | y = "Dave score") 155 | ``` 156 | 157 | ```{r} 158 | pizza_cleaned %>% 159 | filter(community_count >= 20) %>% 160 | ggplot(aes(community_average_score, dave_average_score)) + 161 | geom_point(aes(size = community_count)) + 162 | geom_abline(color = "red") + 163 | geom_smooth(method = "lm") + 164 | labs(size = "# of community reviews", 165 | x = "Community score", 166 | y = "Dave score") 167 | ``` 168 | -------------------------------------------------------------------------------- /nyc-restaurants.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "NYC Restaurants" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(lubridate) 13 | 14 | theme_set(theme_light()) 15 | 16 | # You can use this url to download the data directly into R (will take a few seconds) 17 | restaurant_inspections_raw <- read_csv("https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv") 18 | 19 | restaurant_inspections <- restaurant_inspections_raw %>% 20 | janitor::clean_names() %>% 21 | select(-phone, -grade_date, -record_date, -building, -street) %>% 22 | mutate(inspection_date = mdy(inspection_date)) %>% 23 | separate(inspection_type, c("inspection_program", "inspection_type"), sep = " / ") 24 | ``` 25 | 26 | ```{r} 27 | restaurant_inspections %>% 28 | count(dba, camis, sort = TRUE) 29 | 30 | restaurant_inspections %>% 31 | count(year = year(inspection_date)) 32 | 33 | restaurant_inspections %>% 34 | count(grade, sort = TRUE) 35 | 36 | restaurant_inspections %>% 37 | count(violation_code, violation_description, sort = TRUE) 38 | 39 | restaurant_inspections %>% 40 | filter(camis == 41297769, inspection_date == "2018-09-25") %>% 41 | count(camis, dba, inspection_date, sort = TRUE) 42 | 43 | restaurant_inspections %>% 44 | count(cuisine_description, sort = TRUE) 45 | 46 | restaurant_inspections %>% 47 | filter(action == "No violations were recorded at the time of this inspection.") %>% 48 | count(critical_flag) 49 | 50 | inspections <- restaurant_inspections %>% 51 | group_by(camis, 52 | dba, 53 | boro, 54 | zipcode, 55 | cuisine_description, 56 | inspection_date, 57 | action, 58 | score, 59 | grade, 60 | inspection_type, 61 | inspection_program) %>% 62 | summarize(critical_violations = sum(critical_flag == "Critical", na.rm = TRUE), 63 | non_critical_violations = sum(critical_flag == "Not Critical", na.rm = TRUE)) %>% 64 | ungroup() 65 | 66 | most_recent_cycle_inspection <- inspections %>% 67 | filter(inspection_program == "Cycle Inspection", 68 | inspection_type == "Initial Inspection") %>% 69 | arrange(desc(inspection_date)) %>% 70 | distinct(camis, .keep_all = TRUE) 71 | ``` 72 | 73 | ```{r} 74 | by_dba <- most_recent_cycle_inspection %>% 75 | group_by(dba, cuisine = cuisine_description) %>% 76 | summarize(locations = n(), 77 | avg_score = mean(score), 78 | median_score = median(score)) %>% 79 | ungroup() %>% 80 | arrange(desc(locations)) 81 | 82 | by_dba %>% 83 | mutate(locations_bin = cut(locations, c(0, 1, 3, 10, Inf), labels = c("1", "2-3", "3-10", ">10"))) %>% 84 | ggplot(aes(locations_bin, avg_score + 1)) + 85 | geom_boxplot() + 86 | scale_y_log10() 87 | ``` 88 | 89 | ```{r} 90 | by_cuisine <- by_dba %>% 91 | group_by(cuisine) %>% 92 | summarize(avg_score = mean(avg_score), 93 | median_score = median(avg_score), 94 | restaurants = 
n()) %>% 95 | arrange(desc(restaurants)) 96 | 97 | library(broom) 98 | 99 | cuisine_conf_ints <- by_dba %>% 100 | add_count(cuisine) %>% 101 | filter(n > 100) %>% 102 | nest(-cuisine) %>% 103 | mutate(model = map(data, ~ t.test(.$avg_score))) %>% 104 | unnest(map(model, tidy)) 105 | 106 | cuisine_conf_ints %>% 107 | mutate(cuisine = str_remove(cuisine, " \\(.*"), 108 | cuisine = fct_reorder(cuisine, estimate)) %>% 109 | ggplot(aes(estimate, cuisine)) + 110 | geom_point() + 111 | geom_errorbarh(aes(xmin = conf.low, 112 | xmax = conf.high)) + 113 | labs(x = "Average inspection score (higher means more violations)", 114 | y = "Type of cuisine", 115 | title = "Average inspection score by type of cuisine in NYC", 116 | subtitle = "Each restaurant chain was counted once based on its average score") 117 | ``` 118 | 119 | ```{r} 120 | violation_cuisine_counts <- restaurant_inspections %>% 121 | semi_join(most_recent_cycle_inspection, by = c("camis", "inspection_date")) %>% 122 | count(critical_flag, violation_code, violation_description, cuisine = cuisine_description) %>% 123 | group_by(violation_code) %>% 124 | mutate(violation_total = sum(n)) %>% 125 | group_by(cuisine) %>% 126 | mutate(cuisine_total = sum(n)) %>% 127 | ungroup() %>% 128 | filter(violation_total >= 1000, 129 | cuisine_total >= 2000) %>% 130 | group_by(violation_description) %>% 131 | mutate(fraction = n / cuisine_total, 132 | avg_fraction = mean(fraction)) %>% 133 | ungroup() 134 | 135 | violation_cuisine_counts %>% 136 | mutate(relative_risk = fraction / avg_fraction) %>% 137 | arrange(desc(relative_risk)) %>% 138 | filter(str_detect(violation_description, "mice")) 139 | ``` 140 | 141 | ### What violations tend to occur together? 142 | 143 | ```{r} 144 | library(widyr) 145 | 146 | violations <- restaurant_inspections %>% 147 | semi_join(most_recent_cycle_inspection, by = c("camis", "inspection_date")) %>% 148 | filter(!is.na(violation_description)) 149 | 150 | violations %>% 151 | pairwise_cor(violation_description, camis, sort = TRUE) 152 | 153 | principal_components <- violations %>% 154 | mutate(value = 1) %>% 155 | widely_svd(violation_description, camis, value, nv = 6) 156 | 157 | principal_components %>% 158 | filter(dimension == 2) %>% 159 | top_n(10, abs(value)) %>% 160 | mutate(violation_description = str_sub(violation_description, 1, 60), 161 | violation_description = fct_reorder(violation_description, value)) %>% 162 | ggplot(aes(violation_description, value)) + 163 | geom_col() + 164 | coord_flip() 165 | ``` 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /nyc-squirrels-app/app.R: -------------------------------------------------------------------------------- 1 | # 2 | # This is a Shiny web application. You can run the application by clicking 3 | # the 'Run App' button above. 
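# NOTE (added): this app assumes by_hectare and central_park_sf already exist in the
# environment; they are created in nyc-squirrels.Rmd, which should be run first.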
4 | # 5 | # Find out more about building applications with Shiny here: 6 | # 7 | # http://shiny.rstudio.com/ 8 | # 9 | 10 | library(shiny) 11 | library(tidyverse) 12 | squirrel_variables <- by_hectare %>% 13 | select(-(hectare:lat)) %>% 14 | colnames() 15 | 16 | names(squirrel_variables) <- squirrel_variables %>% 17 | str_replace_all("_", " ") %>% 18 | str_to_title() 19 | 20 | # Define UI for the squirrel map application 21 | ui <- fluidPage( 22 | 23 | # Application title 24 | titlePanel("Central Park Squirrels"), 25 | 26 | # Sidebar with a variable selector and a minimum-squirrels slider 27 | sidebarLayout( 28 | sidebarPanel( 29 | selectInput("variable", 30 | "Variable:", 31 | choices = squirrel_variables), 32 | sliderInput("min_squirrels", 33 | "Minimum squirrels:", 34 | min = 1, 35 | max = 30, 36 | value = 10) 37 | ), 38 | 39 | # Show the map of squirrel sightings 40 | mainPanel( 41 | plotOutput("park_plot") 42 | ) 43 | ) 44 | ) 45 | 46 | # Define server logic that renders the squirrel map 47 | server <- function(input, output) { 48 | 49 | output$park_plot <- renderPlot({ 50 | # convert the selected variable name to a symbol for tidy evaluation 51 | var <- sym(input$variable) 52 | 53 | filtered <- by_hectare %>% 54 | filter(n >= input$min_squirrels) 55 | 56 | midpoint <- mean(filtered[[input$variable]]) 57 | 58 | filtered %>% 59 | ggplot() + 60 | geom_sf(data = central_park_sf) + 61 | geom_point(aes(long, lat, size = n, color = !!var)) + 62 | theme_void() + 63 | scale_color_gradient2(low = "blue", high = "red", mid = "pink", 64 | midpoint = midpoint, labels = scales::percent) + 65 | labs(color = paste("%", input$variable), 66 | size = "# of squirrels") + 67 | coord_sf(datum = NA) 68 | }) 69 | } 70 | 71 | # Run the application 72 | shinyApp(ui = ui, server = server) 73 | -------------------------------------------------------------------------------- /nyc-squirrels.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | nyc_squirrels <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-29/nyc_squirrels.csv") 15 | ``` 16 | 17 | ```{r} 18 | nyc_squirrels %>% 19 | count(zip_codes, sort = TRUE) 20 | ``` 21 | 22 | ```{r} 23 | nyc_squirrels %>% 24 | ggplot(aes(long, lat)) + 25 | geom_point() 26 | 27 | by_hectare <- nyc_squirrels %>% 28 | filter(!is.na(primary_fur_color)) %>% 29 | group_by(hectare) %>% 30 | summarize(long = mean(long), 31 | lat = mean(lat), 32 | pct_gray = mean(primary_fur_color == "Gray", na.rm = TRUE), 33 | n = n()) 34 | 35 | by_hectare %>% 36 | filter(n >= 10) %>% 37 | ggplot(aes(long, lat, size = n, color = pct_gray)) + 38 | geom_point() + 39 | theme_void() 40 | 41 | by_hectare %>% 42 | filter(n >= 10) %>% 43 | ggplot(aes(lat, pct_gray)) + 44 | geom_point() + 45 | geom_smooth() 46 | 47 | by_hectare %>% 48 | mutate(n_gray = round(pct_gray * n)) %>% 49 | glm(cbind(n_gray, n - n_gray) ~ lat, data = ., family = "binomial") %>% 50 | summary() 51 | ``` 52 | 53 | Squirrels may be more likely to be gray the farther north in the park you go, and more likely to be cinnamon toward the south. 54 | 55 | ```{r} 56 | nyc_squirrels %>% 57 | count(highlight_fur_color, sort = TRUE) 58 | 59 | nyc_squirrels %>% 60 | count(approaches, indifferent, runs_from, sort = TRUE) 61 | ``` 62 | 63 | Does a squirrel run away? 
64 | 65 | ```{r} 66 | glm(runs_from ~ lat, data = nyc_squirrels, family = "binomial") %>% 67 | summary() 68 | ``` 69 | 70 | ```{r} 71 | library(sf) 72 | 73 | central_park_sf <- read_sf("~/Downloads/CentralAndProspectParks/") 74 | 75 | by_hectare <- nyc_squirrels %>% 76 | add_count(hectare) %>% 77 | mutate(above_ground = !is.na(location) & location == "Above Ground") %>% 78 | group_by(hectare, n) %>% 79 | summarize_at(vars(long, lat, approaches:runs_from, ends_with("ing"), above_ground), mean) %>% 80 | ungroup() 81 | 82 | by_hectare %>% 83 | filter(n >= 10) %>% 84 | ggplot() + 85 | geom_sf(data = central_park_sf) + 86 | geom_point(aes(long, lat, size = n, color = runs_from)) + 87 | theme_void() + 88 | scale_color_gradient2(low = "blue", high = "red", mid = "pink", 89 | midpoint = .3, labels = scales::percent) + 90 | labs(color = "% of squirrels run", 91 | size = "# of squirrels", 92 | title = "Squirrels in the northwest corner of Central Park are more likely to run away") + 93 | coord_sf(datum = NA) 94 | ``` 95 | 96 | ```{r} 97 | central_park_sf %>% 98 | count(lanes, sort = TRUE) 99 | 100 | ggplot(central_park_sf) + 101 | geom_sf() + 102 | geom_point(aes(long, lat, color = runs_from), data = by_hectare) + 103 | coord_sf(datum = NA) 104 | ``` 105 | 106 | ```{r} 107 | central_park_sf %>% 108 | ggplot() + 109 | geom_sf(aes(color = bicycle)) + 110 | coord_sf(datum = NA) 111 | ``` 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /office-transcripts.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Office" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(schrute) 13 | theme_set(theme_light()) 14 | 15 | office_transcripts <- as_tibble(theoffice) %>% 16 | mutate(season = as.integer(season), 17 | episode = as.integer(episode)) %>% 18 | mutate(character = str_remove_all(character, '"')) %>% 19 | mutate(name = str_to_lower(str_remove_all(episode_name, "\\.| \\(Part.*"))) 20 | 21 | office_ratings <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv') %>% 22 | mutate(name = str_to_lower(str_remove_all(title, "\\.| \\(Part.*|\\: Part.*"))) 23 | ``` 24 | 25 | ```{r} 26 | library(ggrepel) 27 | 28 | office_ratings %>% 29 | group_by(season) %>% 30 | summarize(avg_rating = mean(imdb_rating)) %>% 31 | ggplot(aes(season, avg_rating)) + 32 | geom_line() + 33 | scale_x_continuous(breaks = 1:9) 34 | 35 | office_ratings %>% 36 | mutate(title = fct_inorder(title), 37 | episode_number = row_number()) %>% 38 | ggplot(aes(episode_number, imdb_rating)) + 39 | geom_line() + 40 | geom_smooth() + 41 | geom_point(aes(color = factor(season), size = total_votes)) + 42 | geom_text(aes(label = title), check_overlap = TRUE, hjust = 1) + 43 | expand_limits(x = -10) + 44 | theme(panel.grid.major.x = element_blank(), 45 | legend.position = "none") + 46 | labs(x = "Episode number", 47 | y = "IMDB Rating", 48 | title = "Popularity of The Office episodes over time", 49 | subtitle = "Color represents season, size represents # of ratings") 50 | ``` 51 | 52 | ```{r} 53 | office_ratings %>% 54 | arrange(desc(imdb_rating)) %>% 55 | mutate(title = paste0(season, ".", episode, " ", title), 56 | title = fct_reorder(title, imdb_rating)) %>% 57 | head(20) %>% 58 | ggplot(aes(title, imdb_rating, color = factor(season), size = 
total_votes)) + 59 | geom_point() + 60 | coord_flip() + 61 | labs(color = "Season", 62 | title = "Most popular episodes of The Office") 63 | ``` 64 | 65 | ### Transcripts 66 | 67 | ```{r} 68 | library(tidytext) 69 | 70 | blacklist <- c("yeah", "hey", "uh", "gonna") 71 | blacklist_characters <- c("Everyone", "All", "Both", "Guy", "Girl", "Group") 72 | 73 | transcript_words <- office_transcripts %>% 74 | group_by(character) %>% 75 | filter(n() >= 100, 76 | n_distinct(episode_name) > 2) %>% 77 | ungroup() %>% 78 | select(-text_w_direction) %>% 79 | unnest_tokens(word, text) %>% 80 | anti_join(stop_words, by = "word") %>% 81 | filter(!word %in% blacklist, 82 | !character %in% blacklist_characters) 83 | 84 | character_tf_idf <- transcript_words %>% 85 | add_count(word) %>% 86 | filter(n >= 20) %>% 87 | count(word, character) %>% 88 | bind_tf_idf(word, character, n) %>% 89 | arrange(desc(tf_idf)) 90 | ``` 91 | 92 | ```{r} 93 | character_tf_idf %>% 94 | filter(character %in% c("Dwight", "Jim", "David Wallace", "Darryl", "Jan", "Holly")) %>% 95 | group_by(character) %>% 96 | top_n(10, tf_idf) %>% 97 | ungroup() %>% 98 | mutate(word = reorder_within(word, tf_idf, character)) %>% 99 | ggplot(aes(word, tf_idf)) + 100 | geom_col() + 101 | coord_flip() + 102 | scale_x_reordered() + 103 | facet_wrap(~ character, scales = "free_y") + 104 | labs(x = "", 105 | y = "TF-IDF of character-word pairs") 106 | ``` 107 | 108 | ```{r} 109 | office_transcripts %>% 110 | count(character, sort = TRUE) %>% 111 | filter(character == "Dwight") 112 | ``` 113 | 114 | ### Machine learning model 115 | 116 | What affects popularity of an episode: 117 | 118 | * Season/time 119 | * Director 120 | * Writer 121 | * Lines per character 122 | 123 | ```{r} 124 | ratings_summarized <- office_ratings %>% 125 | group_by(name) %>% 126 | summarize(imdb_rating = mean(imdb_rating)) 127 | 128 | character_lines_ratings <- office_transcripts %>% 129 | filter(!character %in% blacklist_characters) %>% 130 | count(character, name) %>% 131 | group_by(character) %>% 132 | filter(sum(n) >= 50, 133 | n() >= 5) %>% 134 | inner_join(ratings_summarized, by = "name") 135 | 136 | character_lines_ratings %>% 137 | summarize(avg_rating = mean(imdb_rating), 138 | nb_episodes = n()) %>% 139 | arrange(desc(avg_rating)) %>% 140 | View() 141 | ``` 142 | 143 | ```{r} 144 | director_writer_features <- office_transcripts %>% 145 | distinct(name, director, writer) %>% 146 | gather(type, value, director, writer) %>% 147 | separate_rows(value, sep = ";") %>% 148 | unite(feature, type, value, sep = ": ") %>% 149 | group_by(feature) %>% 150 | filter(n() >= 3) %>% 151 | mutate(value = 1) %>% 152 | ungroup() 153 | 154 | character_line_features <- character_lines_ratings %>% 155 | ungroup() %>% 156 | transmute(name, feature = character, value = log2(n)) 157 | 158 | season_features = office_ratings %>% 159 | distinct(name, season) %>% 160 | transmute(name, feature = paste("season:", season), value = 1) 161 | 162 | features <- bind_rows(director_writer_features, 163 | character_line_features, 164 | season_features) %>% 165 | semi_join(office_ratings, by = "name") %>% 166 | semi_join(office_transcripts, by = "name") 167 | ``` 168 | 169 | ```{r} 170 | episode_feature_matrix <- features %>% 171 | cast_sparse(name, feature, value) 172 | 173 | ratings <- ratings_summarized$imdb_rating[match(rownames(episode_feature_matrix), ratings_summarized$name)] 174 | 175 | library(glmnet) 176 | library(broom) 177 | 178 | mod <- cv.glmnet(episode_feature_matrix, ratings) 179 | 180 | 
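# note added: plot(mod) below draws the cross-validation curve over the lambda penalty path;
# mod$lambda.min is the penalty that minimizes cross-validated error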
plot(mod) 181 | 182 | tidy(mod$glmnet.fit) %>% 183 | filter(lambda == mod$lambda.min, 184 | term != "(Intercept)") %>% 185 | mutate(term = fct_reorder(term, estimate)) %>% 186 | ggplot(aes(term, estimate, fill = estimate > 0)) + 187 | geom_col() + 188 | coord_flip() + 189 | labs(y = "Estimated effect on the rating of an episode") + 190 | theme(legend.position = "none") 191 | ``` 192 | -------------------------------------------------------------------------------- /pascals-triangle.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The accumulate function" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(purrr) 12 | 13 | letters 14 | 15 | accumulate(letters, paste) 16 | 17 | accumulate(1:5, ~ . ^ 2, .init = 2) 18 | ``` 19 | 20 | Pascal's Triangle 21 | 22 | ```{r} 23 | accumulate(1:6, ~ c(0, .) + c(., 0), .init = 1) 24 | ``` 25 | 26 | In this short screencast I share a #tidyverse trick of the day: Use accumulate() from the purrr package to calculate Pascal's Triangle in one line of code. 27 | 28 | -------------------------------------------------------------------------------- /plastic-waste.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plastic Waste" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | theme_set(theme_light()) 14 | 15 | coast_vs_waste <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-21/coastal-population-vs-mismanaged-plastic.csv") 16 | 17 | mismanaged_vs_gdp <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-21/per-capita-mismanaged-plastic-waste-vs-gdp-per-capita.csv") 18 | 19 | waste_vs_gdp <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-21/per-capita-plastic-waste-vs-gdp-per-capita.csv") 20 | ``` 21 | 22 | ```{r} 23 | library(janitor) 24 | 25 | # Data cleaning 26 | clean_dataset <- function(tbl) { 27 | tbl %>% 28 | clean_names() %>% 29 | rename(country = entity, 30 | country_code = code) %>% 31 | filter(year == 2010) %>% 32 | select(-year) 33 | } 34 | 35 | plastic_waste <- coast_vs_waste %>% 36 | clean_dataset() %>% 37 | select(-total_population_gapminder) %>% 38 | inner_join(clean_dataset(mismanaged_vs_gdp) %>% 39 | select(-total_population_gapminder), by = c("country", "country_code")) %>% 40 | inner_join(clean_dataset(waste_vs_gdp), by = c("country", "country_code")) %>% 41 | select(country, 42 | country_code, 43 | mismanaged_waste = mismanaged_plastic_waste_tonnes, 44 | coastal_population, 45 | total_population = total_population_gapminder, 46 | mismanaged_per_capita = per_capita_mismanaged_plastic_waste_kilograms_per_person_per_day, 47 | gdp_per_capita = gdp_per_capita_ppp_constant_2011_international_rate) %>% 48 | filter(!is.na(mismanaged_waste)) 49 | ``` 50 | 51 | ```{r} 52 | g1 <- plastic_waste %>% 53 | arrange(-total_population) %>% 54 | mutate(pct_population_coastal = pmin(1, coastal_population / total_population), 55 | high_coastal_pop = ifelse(pct_population_coastal >= .8, ">=80%", "<80%")) %>% 56 | ggplot(aes(gdp_per_capita, mismanaged_per_capita)) + 57 | geom_point(aes(size = total_population)) + 58 | geom_text(aes(label = country), vjust = 1, hjust 
= 1, check_overlap = TRUE) + 59 | scale_x_log10(labels = dollar_format()) + 60 | scale_y_log10() + 61 | scale_size_continuous(guide = FALSE) + 62 | labs(x = "GDP per capita", 63 | y = "Mismanaged plastic waste (kg per person per day)", 64 | color = "Coastal population", 65 | title = "How plastic waste mismanagement correlates with country income", 66 | subtitle = "Based on Our World in Data 2010 numbers. Size represents total population") 67 | 68 | g1 69 | ``` 70 | 71 | ```{r} 72 | plastic_waste %>% 73 | mutate(pct_population_coastal = pmin(1, coastal_population / total_population)) %>% 74 | arrange(-total_population) %>% 75 | ggplot(aes(pct_population_coastal, mismanaged_per_capita)) + 76 | geom_point(aes(size = total_population)) + 77 | geom_text(aes(label = country), vjust = 1, hjust = 1, check_overlap = TRUE) 78 | ``` 79 | 80 | ```{r} 81 | tbl_df(maps::iso3166) 82 | 83 | library(fuzzyjoin) 84 | 85 | plastic_data <- plastic_waste %>% 86 | inner_join(maps::iso3166, by = c("country_code" = "a3")) 87 | 88 | map_data("world") %>% 89 | tbl_df() %>% 90 | filter(region != "Antarctica") %>% 91 | regex_left_join(plastic_data, by = c("region" = "mapname")) %>% 92 | ggplot(aes(long, lat, group = group, fill = mismanaged_per_capita)) + 93 | geom_polygon() + 94 | scale_fill_gradient2(trans = "log10", 95 | low = "blue", 96 | high = "red", 97 | mid = "pink", 98 | midpoint = log10(.02)) + 99 | coord_fixed(2) + 100 | ggthemes::theme_map() + 101 | labs(fill = "Mismanaged plastic waste per-cap", 102 | title = "Where in the world is waste mismanaged?") 103 | ``` 104 | 105 | ### Comparing to other country stats 106 | 107 | ```{r} 108 | library(WDI) 109 | 110 | indicators <- c("co2_emissions_per_capita" = "EN.ATM.CO2E.PC", 111 | "cpia_transparency" = "IQ.CPA.TRAN.XQ") 112 | 113 | other_data <- WDI(indicator = indicators, start = 2010, end = 2010) %>% 114 | tbl_df() %>% 115 | select(-country) 116 | 117 | plastic_with_indicators <- other_data %>% 118 | inner_join(plastic_data, by = c(iso2c = "a2")) %>% 119 | arrange(desc(total_population)) 120 | 121 | g2 <- plastic_with_indicators %>% 122 | ggplot(aes(gdp_per_capita, co2_emissions_per_capita)) + 123 | geom_point(aes(size = total_population)) + 124 | geom_text(aes(label = country), vjust = 1, hjust = 1, check_overlap = TRUE) + 125 | scale_size_continuous(guide = FALSE) + 126 | scale_x_log10() + 127 | scale_y_log10() + 128 | labs(x = "GDP per capita", 129 | y = "CO2 emissions (tons per capita)", 130 | color = "Coastal population", 131 | title = "How plastic waste mismanagement correlates with country income", 132 | subtitle = "Based on Our World in Data 2010 numbers. 
Size represents total population") 133 | 134 | library(patchwork) 135 | 136 | g2 + 137 | labs(title = "CO2 emissions are correlated with country income, but not plastic waste") + 138 | g1 + 139 | labs(title = "", 140 | subtitle = "") 141 | ``` 142 | 143 | ```{r} 144 | # Looking by CPIA transparency rating (a World Bank measure of institutional trust) 145 | plastic_with_indicators %>% 146 | filter(!is.na(cpia_transparency)) %>% 147 | ggplot(aes(cpia_transparency, mismanaged_per_capita, group = cpia_transparency)) + 148 | geom_boxplot() 149 | 150 | plastic_with_indicators %>% 151 | arrange(desc(cpia_transparency)) %>% 152 | View() 153 | ``` 154 | 155 | 156 | 157 | 158 | 159 | 160 | -------------------------------------------------------------------------------- /r-downloads.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Downloads" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | library(countrycode) 14 | theme_set(theme_light()) 15 | 16 | r_downloads_year_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-10-30/r_downloads_year.csv") 17 | 18 | r_downloads_year <- r_downloads_year_raw %>% 19 | select(-X1) %>% 20 | mutate(country_name = countrycode(country, "iso2c", "country.name")) 21 | ``` 22 | 23 | ```{r} 24 | r_downloads_year %>% 25 | count(date) %>% 26 | ggplot(aes(date, n)) + 27 | geom_line() + 28 | expand_limits(y = 0) + 29 | labs(y = "# of R downloads per day") 30 | 31 | library(lubridate) 32 | 33 | r_downloads_year %>% 34 | count(date) %>% 35 | group_by(weekday = wday(date, label = TRUE)) %>% 36 | summarize(average = mean(n)) %>% 37 | ggplot(aes(weekday, average)) + 38 | geom_line(group = 1) + 39 | expand_limits(y = 0) + 40 | labs(y = "Average downloads per weekday") 41 | ``` 42 | 43 | R is downloaded about 1500 times each weekend day and around 3000 times on a typical weekday (averaged over the last year). 44 | 45 | ```{r} 46 | r_downloads_year %>% 47 | group_by(week = floor_date(date, "week")) %>% 48 | summarize(n = n_distinct(ip_id)) %>% 49 | filter(week > min(week)) %>% 50 | ggplot(aes(week, n)) + 51 | geom_line() + 52 | expand_limits(y = 0) + 53 | labs(y = "# of R downloads per week (distinct IPs)") 54 | ``` 55 | 56 | What time of day were people installing R? 57 | 58 | ```{r} 59 | r_downloads_year %>% 60 | mutate(country = countrycode(country, "iso2c", "country.name")) %>% 61 | filter(!is.na(country)) %>% 62 | count(hour = hour(time), 63 | country = fct_lump(country, 8)) %>% 64 | ggplot(aes(hour, n)) + 65 | geom_line() + 66 | expand_limits(y = 0) + 67 | facet_wrap(~ country, scales = "free_y") 68 | ``` 69 | 70 | 71 | ```{r} 72 | library(countrycode) 73 | 74 | r_downloads_year %>% 75 | count(country = countrycode(country, "iso2c", "country.name"), sort = TRUE) %>% 76 | mutate(percent = n / sum(n)) %>% 77 | filter(!is.na(country)) %>% 78 | head(16) %>% 79 | mutate(country = fct_reorder(country, percent)) %>% 80 | ggplot(aes(country, percent)) + 81 | geom_col() + 82 | coord_flip() + 83 | scale_y_continuous(labels = percent_format()) + 84 | labs(title = "What countries install the most R?") 85 | ``` 86 | 87 | More than a third of R installations come from the US. 
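As a quick check of that claim (a sketch added here, not part of the original; it assumes the `country` column still holds the raw ISO2 codes, with `"US"` for the United States):

```{r}
# Sketch: share of downloads with a known country that come from the US
r_downloads_year %>%
  filter(!is.na(country)) %>%
  summarize(pct_us = mean(country == "US"))
```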
88 | 89 | ```{r} 90 | r_downloads_year %>% 91 | mutate(version = fct_lump(version, 8)) %>% 92 | count(date, version) %>% 93 | ggplot(aes(date, n, color = version)) + 94 | geom_line() 95 | ``` 96 | 97 | Which countries do downloads come from, week by week? 98 | 99 | ```{r} 100 | r_downloads_year %>% 101 | count(country = fct_lump(country, 8), 102 | week = floor_date(date, "week")) %>% 103 | filter(week > min(week)) %>% 104 | ggplot(aes(week, n, color = country)) + 105 | geom_line() 106 | ``` 107 | 108 | ### R package downloads 109 | 110 | ```{r} 111 | package_downloads <- read_csv("http://cran-logs.rstudio.com/2018/2018-10-27.csv.gz") 112 | ``` 113 | 114 | ```{r} 115 | package_downloads %>% 116 | filter(country %in% c("US", "IN")) %>% 117 | group_by(country, package) %>% 118 | summarize(n = n_distinct(ip_id)) %>% 119 | spread(country, n, fill = 0) %>% 120 | ungroup() %>% 121 | mutate(total = US + IN, 122 | IN = (IN + 1) / sum(IN + 1), 123 | US = (US + 1) / sum(US + 1), 124 | ratio = US / IN) %>% 125 | filter(total >= 1000) %>% 126 | arrange((ratio)) %>% 127 | View() 128 | ``` 129 | 130 | ```{r} 131 | library(cranlogs) 132 | 133 | cranlogs::cran_downloads(packages = c("tidyverse", "broom"), when = "last-week") 134 | ``` 135 | 136 | 137 | ### Appendix: Why count only distinct IPs? 138 | 139 | ```{r} 140 | r_download_gaps <- r_downloads_year %>% 141 | mutate(datetime = as.POSIXlt(date) + time) %>% 142 | arrange(datetime) %>% 143 | group_by(ip_id) %>% 144 | mutate(gap = as.numeric(datetime - lag(datetime))) %>% 145 | filter(!is.na(gap)) 146 | ``` 147 | 148 | ```{r} 149 | ip_counts <- r_downloads_year %>% 150 | count(ip_id, sort = TRUE) 151 | ``` 152 | 153 | Some IP addresses (`r percent(mean(ip_counts$n >= 100))` of them) installed R more than 100 times in a year -- these are likely servers or automated processes rather than individual users. 
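The sketch below (added; it assumes `gap` is measured in seconds, which the 86400-second reference line in the next chunk suggests) estimates how many repeat-download gaps fall within an hour of exactly one day, the signature of a scheduled daily job:

```{r}
# Sketch: share of repeat-download gaps within an hour of exactly one day
r_download_gaps %>%
  ungroup() %>%
  summarize(share_near_daily = mean(abs(gap - 86400) <= 3600))
```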
154 | 155 | ```{r} 156 | r_download_gaps %>% 157 | ggplot(aes(gap)) + 158 | geom_histogram() + 159 | geom_vline(color = "red", lty = 2, xintercept = 86400) + 160 | scale_x_log10(breaks = 60 ^ (0:4), 161 | labels = c("Second", "Minute", "Hour", "2.5 Days", "150 Days")) 162 | ``` 163 | 164 | ```{r} 165 | r_download_gaps 166 | ``` 167 | 168 | 169 | -------------------------------------------------------------------------------- /ramen-ratings.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Ramen Ratings" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ## R Markdown 11 | 12 | ```{r} 13 | library(tidyverse) 14 | theme_set(theme_light()) 15 | 16 | ramen_ratings <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-04/ramen_ratings.csv") 17 | ``` 18 | 19 | ```{r} 20 | library(drlib) 21 | 22 | ramen_ratings_processed <- ramen_ratings %>% 23 | mutate(style = fct_lump(style, 4), 24 | country = fct_lump(country, 12), 25 | brand = fct_lump(brand, 20)) %>% 26 | replace_na(list(style = "Other")) %>% 27 | mutate(brand = fct_relevel(brand, "Other"), 28 | country = fct_relevel(country, "Other"), 29 | style = fct_relevel(style, "Pack")) 30 | 31 | ramen_ratings_processed %>% 32 | gather(category, value, -review_number, -stars) %>% 33 | count(category, value) %>% 34 | group_by(category) %>% 35 | top_n(20, n) %>% 36 | ungroup() %>% 37 | mutate(value = reorder_within(value, n, category)) %>% 38 | ggplot(aes(value, n)) + 39 | geom_col() + 40 | facet_wrap(~ category, scales = "free_y") + 41 | scale_x_reordered() + 42 | coord_flip() + 43 | labs(title = "Categorical predictors (after processing)", 44 | x = "Predictor", 45 | y = "Count") 46 | ``` 47 | 48 | ```{r} 49 | library(broom) 50 | 51 | lm(stars ~ brand + country + style, ramen_ratings_processed) %>% 52 | tidy(conf.int = TRUE) %>% 53 | filter(term != "(Intercept)") %>% 54 | arrange(desc(estimate)) %>% 55 | extract(term, c("category", "term"), "^([a-z]+)([A-Z].*)") %>% 56 | mutate(term = fct_reorder(term, estimate)) %>% 57 | ggplot(aes(estimate, term, color = category)) + 58 | geom_point() + 59 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) + 60 | geom_vline(lty = 2, xintercept = 0) + 61 | facet_wrap(~ category, ncol = 1, scales = "free_y") + 62 | theme(legend.position = "none") + 63 | labs(x = "Estimated effect on ramen rating", 64 | y = "", 65 | title = "Coefficients that predict ramen ratings", 66 | subtitle = "Less common brands and countries were lumped together as the reference level") 67 | ``` 68 | 69 | ```{r} 70 | library(tidytext) 71 | 72 | ramen_ratings_processed %>% 73 | filter(!is.na(stars)) %>% 74 | unnest_tokens(word, variety) %>% 75 | group_by(word) %>% 76 | summarize(avg_rating = mean(stars), 77 | n = n()) %>% 78 | arrange(desc(n)) 79 | ``` 80 | 81 | ### Web scraping 82 | 83 | ```{r} 84 | library(rvest) 85 | 86 | ramen_list <- read_html("https://www.theramenrater.com/resources-2/the-list/") 87 | 88 | # How the original data was (probably) created 89 | ramen_reviews <- ramen_list %>% 90 | html_node("#myTable") %>% 91 | html_table() %>% 92 | tbl_df() %>% 93 | janitor::clean_names() %>% 94 | select(-t) 95 | ``` 96 | 97 | ```{r} 98 | review_links <- read_html("https://www.theramenrater.com/resources-2/the-list/") %>% 99 | html_nodes("#myTable a") 100 | 101 | reviews <- tibble(review_number = parse_number(html_text(review_links)), 102 | link = 
html_attr(review_links, "href")) 103 | ``` 104 | 105 | See here for more about possibly and other "dealing with failure" functions: https://r4ds.had.co.nz/iteration.html#dealing-with-failure 106 | 107 | ```{r} 108 | page <- read_html("https://www.theramenrater.com/2019/05/23/3180-yum-yum-moo-deng/") 109 | 110 | get_review_text <- function(url) { 111 | message(url) 112 | 113 | read_html(url) %>% 114 | html_nodes(".entry-content > p") %>% 115 | html_text() %>% 116 | str_subset(".") 117 | } 118 | 119 | review_text <- reviews %>% 120 | head(250) %>% 121 | mutate(text = map(link, possibly(get_review_text, character(0), quiet = FALSE))) 122 | ``` 123 | 124 | More on correlation graphs: https://www.tidytextmining.com/ngrams.html 125 | 126 | ```{r} 127 | library(tidytext) 128 | 129 | review_paragraphs <- review_text %>% 130 | filter(!map_lgl(text, is.null)) %>% 131 | unnest() %>% 132 | filter(str_detect(text, "Finished")) %>% 133 | mutate(text = str_remove(text, "Finished.*?\\. ")) 134 | 135 | review_paragraphs_tokenized <- review_paragraphs %>% 136 | unnest_tokens(word, text) %>% 137 | anti_join(stop_words, by = "word") %>% 138 | filter(str_detect(word, "[a-z]")) %>% 139 | inner_join(ramen_ratings, by = "review_number") 140 | 141 | review_words <- review_paragraphs_tokenized %>% 142 | filter(!is.na(stars)) %>% 143 | group_by(word) %>% 144 | summarize(number = n(), 145 | reviews = n_distinct(review_number), 146 | avg_rating = mean(stars)) %>% 147 | arrange(desc(reviews)) 148 | 149 | review_words_filtered <- review_words %>% 150 | filter(reviews < 200, reviews >= 10) 151 | 152 | library(widyr) 153 | word_cors <- review_paragraphs_tokenized %>% 154 | semi_join(review_words_filtered, by = "word") %>% 155 | distinct(review_number, word) %>% 156 | pairwise_cor(word, review_number, sort = TRUE) 157 | ``` 158 | 159 | ```{r} 160 | library(igraph) 161 | library(ggraph) 162 | 163 | set.seed(2019) 164 | 165 | filtered_cors <- word_cors %>% 166 | head(300) 167 | 168 | nodes <- review_words_filtered %>% 169 | filter(word %in% filtered_cors$item1 | word %in% filtered_cors$item2) 170 | 171 | filtered_cors %>% 172 | graph_from_data_frame(vertices = nodes) %>% 173 | ggraph() + 174 | geom_edge_link() + 175 | geom_node_point(aes(size = reviews * 1.1)) + 176 | geom_node_point(aes(size = reviews, color = avg_rating)) + 177 | geom_node_text(aes(label = name), repel = TRUE) + 178 | scale_color_gradient2(low = "red", high = "blue", midpoint = 4) + 179 | theme_void() + 180 | labs(color = "Average rating", 181 | size = "# of reviews", 182 | title = "Network of words used together in ramen reviews", 183 | subtitle = "Based on 250 ramen reviews and their star ratings") 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /riddler-die-reroll.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "The Riddler: Rerolling a die" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | start <- 1:6 14 | 15 | next_rolls <- sample(start, replace = TRUE) 16 | 17 | simulate_roll <- function(num_sides = 6) { 18 | rolls <- accumulate(1:1000, function(x, ...) 
{ 19 | result <- sample(x, replace = TRUE) 20 | 21 | if (length(unique(result)) == 1) { 22 | done(result) 23 | } else { 24 | result 25 | } 26 | }, .init = 1:num_sides) 27 | 28 | length(rolls) - 1 29 | } 30 | 31 | simulations <- replicate(1e6, simulate_roll()) 32 | 33 | mean(simulations) 34 | 35 | tibble(simulations) %>% 36 | count(simulations, sort = TRUE) 37 | ``` 38 | 39 | On average, it takes 9.666 rolls to get the same number on all sides. 40 | 41 | ```{r simulations_crossed} 42 | simulations_crossed <- crossing(trial = 1:20000, 43 | num_sides = 2:10) %>% 44 | mutate(simulation = map_dbl(num_sides, simulate_roll)) 45 | ``` 46 | 47 | ```{r} 48 | summarized <- simulations_crossed %>% 49 | group_by(num_sides) %>% 50 | summarize(average_roll = mean(simulation), 51 | variance_roll = var(simulation), 52 | sd_roll = sd(simulation), 53 | cov = sd_roll / average_roll) 54 | 55 | summarized %>% 56 | ggplot(aes(num_sides, average_roll)) + 57 | geom_point() + 58 | geom_line() + 59 | expand_limits(y = 0) 60 | 61 | lm(average_roll ~ num_sides, summarized) 62 | 63 | simulations_crossed %>% 64 | ggplot(aes(simulation)) + 65 | geom_histogram(binwidth = 1) + 66 | facet_wrap(~ num_sides, scales = "free_y") 67 | ``` 68 | -------------------------------------------------------------------------------- /riddler-die-roll-low.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Riddler: How Low Can You Roll: Cumulative window functions" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | set.seed(2020) 14 | 15 | scores <- tibble(roll = sample(0:9, 1e7, replace = TRUE)) %>% 16 | group_by(group = lag(cumsum(roll == 0), default = 0)) %>% 17 | filter(roll <= cummin(roll)) %>% 18 | mutate(decimal = roll * 10 ^ -row_number()) %>% 19 | summarize(score = sum(decimal)) 20 | ``` 21 | 22 | ```{r} 23 | scores %>% 24 | summarize(mean(score)) 25 | 26 | scores %>% 27 | ggplot(aes(score)) + 28 | geom_histogram(binwidth = .001) + 29 | scale_x_continuous(breaks = seq(0, 1, .1)) 30 | 31 | mean(scores$score) 32 | ``` 33 | 34 | 35 | -------------------------------------------------------------------------------- /riddler-prisoner-coin-flip.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Riddler: Flip your way to freedom" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | theme_set(theme_light()) 14 | 15 | sim <- crossing(trial = 1:50000, 16 | probability = seq(.01, 1, .01)) %>% 17 | mutate(num_flips = rbinom(n(), 4, probability), 18 | num_tails = rbinom(n(), num_flips, .5), 19 | set_free = num_flips != 0 & num_tails == 0) 20 | 21 | probability_exact <- function(p, n = 4) { 22 | sum(dbinom(0:n, n, p) * c(0, dbinom(1:n, 1:n, .5))) 23 | } 24 | 25 | probabilities <- sim %>% 26 | group_by(probability) %>% 27 | summarize(pct_free = mean(set_free)) %>% 28 | mutate(exact = map_dbl(probability, probability_exact)) 29 | 30 | probabilities %>% 31 | ggplot(aes(probability, pct_free)) + 32 | geom_line() + 33 | geom_line(aes(y = exact), color = "red", lty = 2) 34 | 35 | opt <- optimise(probability_exact, c(0, 1), maximum = TRUE) 36 | ``` 37 | 38 | The highest chance of escape is `r percent(opt$objective)` when the prisoners use the random number generator to have a `r percent(opt$maximum)` 
chance of flipping the coin. 39 | 40 | ### Extra credit 41 | 42 | ```{r} 43 | sim_n <- crossing(trial = 1:100000, 44 | probability = seq(.02, 1, .02), 45 | n = 2:8) %>% 46 | mutate(num_flips = rbinom(n(), n, probability), 47 | num_tails = rbinom(n(), num_flips, .5), 48 | set_free = num_flips != 0 & num_tails == 0) 49 | 50 | probabilities_n <- sim_n %>% 51 | group_by(probability, n) %>% 52 | summarize(simulated = mean(set_free)) %>% 53 | ungroup() %>% 54 | mutate(exact = map2_dbl(probability, n, probability_exact)) 55 | 56 | probabilities_n %>% 57 | ggplot(aes(probability, exact, color = factor(n))) + 58 | geom_line() + 59 | geom_point(aes(y = simulated), size = .4) + 60 | scale_x_continuous(labels = percent) + 61 | scale_y_continuous(labels = percent) + 62 | labs(x = "Probability of flipping the coin", 63 | y = "Probability of escape", 64 | color = "# of prisoners", 65 | title = "What's the chance of escaping with n prisoners?", 66 | subtitle = "Points show simulations of 100,000 prisoners each") 67 | ``` 68 | 69 | ```{r} 70 | optimize_for_n <- function(n) { 71 | optimise(function(p) probability_exact(p, n), c(0, 1), maximum = TRUE) 72 | } 73 | 74 | optimal_n <- tibble(n = 2:60) %>% 75 | mutate(optimal = map(n, optimize_for_n)) %>% 76 | unnest_wider(optimal) 77 | 78 | optimal_n %>% 79 | gather(metric, value, -n) %>% 80 | mutate(metric = ifelse(metric == "maximum", "Optimal probability to flip", "Probability of escape")) %>% 81 | ggplot(aes(n, value, color = metric)) + 82 | geom_line() + 83 | scale_y_continuous(labels = percent) + 84 | expand_limits(y = 0) + 85 | labs(x = "# of prisoners", 86 | y = "Probability", 87 | color = "") 88 | 89 | optimal_n %>% 90 | arrange(desc(n)) %>% 91 | mutate(expected_coins_flipped = n * maximum) %>% 92 | ggplot(aes(n, expected_coins_flipped)) + 93 | geom_line() 94 | ``` 95 | 96 | -------------------------------------------------------------------------------- /riddler-spam-comments.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Riddler: Spam Comments" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | 13 | sim <- replicate(1e6, which(cumsum(rexp(250, 1:250)) > 3)[1] - 1) 14 | 15 | mean(sim, na.rm = TRUE) 16 | ``` 17 | 18 | ```{r} 19 | comment_counts <- tibble(n_comments = sim) %>% 20 | count(n_comments) %>% 21 | mutate(density = n / sum(n)) 22 | 23 | library(broom) 24 | 25 | augmented_exp <- nls(density ~ lambda * exp(-lambda * n_comments), 26 | data = comment_counts, 27 | start = list(lambda = 1)) %>% 28 | augment(comment_counts) 29 | 30 | augmented_geometric <- nls(density ~ (1 - p) ^ n_comments * p, 31 | data = comment_counts, 32 | start = list(p = .05)) %>% 33 | augment(comment_counts) 34 | 35 | augmented_geometric %>% 36 | ggplot(aes(n_comments, density)) + 37 | geom_line() + 38 | geom_line(aes(y = .fitted), color = "red") 39 | ``` 40 | 41 | The number of comments after 3 days is described by a geometric distribution. 42 | 43 | What determines the parameter $p$ (and therefore the expected value) of the geometric distribution? 
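One way to see where that parameter comes from (a short derivation added here, anticipating the conclusion at the end of this document): for a geometric distribution that counts failures before the first success with success probability $p$, the expected value is $(1-p)/p$. If the process ends with probability $p = e^{-n}$ after $n$ days, then

$$E[X] = \frac{1-p}{p} = \frac{1-e^{-n}}{e^{-n}} = e^{n} - 1,$$

which matches the $e^n - 1$ curve in the simulations below.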
44 | 45 | 46 | ```{r} 47 | tidy_sim <- crossing(trial = 1:5e4, 48 | step = 1:250) %>% 49 | mutate(waiting = rexp(n(), step)) %>% 50 | group_by(trial) %>% 51 | mutate(cumulative = cumsum(waiting)) %>% 52 | ungroup() 53 | 54 | ncomments <- tidy_sim %>% 55 | mutate(within_3 = cumulative < 3) %>% 56 | group_by(trial) %>% 57 | summarize(n_comments = sum(within_3)) 58 | ``` 59 | 60 | ```{r} 61 | comments_by_threshold <- tidy_sim %>% 62 | crossing(threshold = seq(.25, 3, .25)) %>% 63 | mutate(within = cumulative < threshold) %>% 64 | group_by(threshold, trial) %>% 65 | summarize(n_comments = sum(within)) 66 | 67 | comments_by_threshold %>% 68 | summarize(expected_value = mean(n_comments)) %>% 69 | ggplot(aes(threshold, expected_value)) + 70 | geom_line() + 71 | geom_line(aes(y = exp(threshold) - 1), color = "red") + 72 | labs(x = "# of days", 73 | y = "Expected number of comments", 74 | title = "How many comments cumulatively over time?", 75 | subtitle = "Red line shows exp(x) - 1") 76 | 77 | comment_counts %>% 78 | filter(!is.na(n_comments)) %>% 79 | ggplot(aes(n_comments, density)) + 80 | geom_line() + 81 | geom_line(aes(y = (1 - 1 / exp(3)) ^ n_comments / exp(3)), color = "red") 82 | ``` 83 | 84 | The number of comments after n days is described by a geometric distribution with expected value $e^{n}-1$ (that is, with success probability $exp(-n)$). 85 | -------------------------------------------------------------------------------- /seattle-pets.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Untitled" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(lubridate) 13 | theme_set(theme_light()) 14 | 15 | seattle_pets <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-26/seattle_pets.csv") %>% 16 | mutate(license_issue_date = mdy(license_issue_date)) %>% 17 | rename(animal_name = animals_name) 18 | ``` 19 | 20 | ```{r} 21 | seattle_pets %>% 22 | filter(license_issue_date >= "2017-01-01") %>% 23 | count(species, primary_breed, sort = TRUE) %>% 24 | filter(species %in% c("Cat", "Dog")) %>% 25 | mutate(percent = n / sum(n)) %>% 26 | group_by(species) %>% 27 | top_n(10, percent) %>% 28 | ungroup() %>% 29 | mutate(primary_breed = fct_reorder(primary_breed, percent)) %>% 30 | ggplot(aes(primary_breed, percent, fill = species)) + 31 | geom_col(show.legend = FALSE) + 32 | scale_y_continuous(labels = scales::percent_format()) + 33 | facet_wrap(~ species, scales = "free_y", ncol = 1) + 34 | coord_flip() + 35 | labs(x = "Primary breed", 36 | y = "% of this species", 37 | title = "Most common cat and dog breeds", 38 | subtitle = "Of licensed pets in Seattle 2017-2018") 39 | ``` 40 | 41 | ```{r} 42 | dogs <- seattle_pets %>% 43 | filter(species == "Dog") 44 | 45 | name_counts <- dogs %>% 46 | group_by(animal_name) %>% 47 | summarize(name_total = n()) %>% 48 | filter(name_total >= 100) 49 | 50 | breed_counts <- dogs %>% 51 | group_by(primary_breed) %>% 52 | summarize(breed_total = n()) %>% 53 | filter(breed_total >= 200) 54 | 55 | total_dogs <- nrow(dogs) 56 | 57 | name_breed_counts <- dogs %>% 58 | count(primary_breed, animal_name) %>% 59 | complete(primary_breed, animal_name, fill = list(n = 0)) %>% 60 | inner_join(name_counts, by = "animal_name") %>% 61 | inner_join(breed_counts, by = "primary_breed") 62 | 63 | # One-sided hypergeometric p-value 64 | hypergeom_test <- 
name_breed_counts %>% 65 | mutate(percent_of_breed = n / breed_total, 66 | percent_overall = name_total / total_dogs) %>% 67 | mutate(overrepresented_ratio = percent_of_breed / percent_overall) %>% 68 | arrange(desc(overrepresented_ratio)) %>% 69 | mutate(hypergeom_p_value = 1 - phyper(n, name_total, total_dogs - name_total, breed_total), 70 | holm_p_value = p.adjust(hypergeom_p_value), 71 | fdr = p.adjust(hypergeom_p_value, method = "fdr")) 72 | 73 | hypergeom_test %>% 74 | filter(fdr < .05) 75 | 76 | ggplot(hypergeom_test, aes(hypergeom_p_value)) + 77 | geom_histogram(binwidth = .1) + 78 | labs(x = "One-sided hypergeometric p-values for overrepresented name") 79 | ``` 80 | 81 | ```{r} 82 | crossing(name_total = c(100, 200, 300), 83 | breed_total = seq(200, 1000, 25)) %>% 84 | mutate(max_p_value = 1 - phyper(0, name_total, total_dogs - name_total, breed_total)) %>% 85 | ggplot(aes(breed_total, max_p_value, color = factor(name_total))) + 86 | geom_line() + 87 | labs(x = "Total # of dogs in breed", 88 | y = "Maximum one-sided p-value", 89 | color = "# of dogs with name") 90 | ``` 91 | 92 | ```{r} 93 | library(scales) 94 | 95 | hypergeom_test %>% 96 | filter(fdr <= .05) %>% 97 | arrange(fdr) %>% 98 | transmute(`Breed` = primary_breed, 99 | `Name` = animal_name, 100 | `# of dogs with name` = n, 101 | `% of breed` = percent(percent_of_breed), 102 | `% overall` = percent(percent_overall), 103 | `FDR-adjusted one-sided p-value` = fdr) %>% 104 | knitr::kable() 105 | ``` 106 | 107 | -------------------------------------------------------------------------------- /simpsons-guests.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Simpsons Guests" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | simpsons <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-08-27/simpsons-guests.csv") %>% 15 | mutate(self = str_detect(role, "self|selves"), 16 | season = parse_number(season)) 17 | ``` 18 | 19 | ```{r} 20 | simpsons %>% 21 | filter(self) %>% 22 | count(guest_star, sort = TRUE) %>% 23 | filter(n > 1) %>% 24 | mutate(guest_star = fct_reorder(guest_star, n)) %>% 25 | ggplot(aes(guest_star, n)) + 26 | geom_col() + 27 | coord_flip() + 28 | labs(title = "Who has played themselves in multiple Simpsons episodes?") 29 | ``` 30 | 31 | ```{r} 32 | simpsons %>% 33 | separate_rows(role, sep = ";\\s+") %>% 34 | add_count(role) %>% 35 | filter(n >= 8) %>% 36 | count(season, role) %>% 37 | mutate(role = fct_reorder(role, -n, sum)) %>% 38 | ggplot(aes(season, n)) + 39 | geom_col() + 40 | facet_wrap(~ role) 41 | ``` 42 | 43 | ```{r} 44 | simpsons 45 | ``` 46 | 47 | ### Bringing in Simpsons dialogue 48 | 49 | ```{r} 50 | dialogue <- read_csv("~/Downloads/simpsons_dataset.csv") %>% 51 | select(role = raw_character_text, line = spoken_words) 52 | 53 | guests_processed <- simpsons %>% 54 | separate_rows(role, sep = ";\\s+") %>% 55 | mutate(role = ifelse(self, guest_star, role), 56 | role = ifelse(role == "Edna Krabappel", "Edna Krabappel-Flanders", role)) 57 | 58 | guests_summarized <- guests_processed %>% 59 | filter(season <= 27) %>% 60 | group_by(guest_star, role, self) %>% 61 | summarize(nb_episodes = n(), 62 | first_season = min(season), 63 | last_season = max(season)) %>% 64 | arrange(desc(nb_episodes)) %>% 65 | group_by(role) %>% 66 | filter(n() == 1) %>% 67 | 
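# note added: keep only roles credited to exactly one guest star, so the join to dialogue below is unambiguous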
ungroup() %>% 68 | filter(!is.na(role)) 69 | 70 | dialogue_summarized <- dialogue %>% 71 | group_by(role) %>% 72 | summarize(nb_lines = n(), 73 | random_line = sample(line, 1)) %>% 74 | arrange(desc(nb_lines)) 75 | 76 | guest_roles <- guests_summarized %>% 77 | inner_join(dialogue_summarized, by = "role") %>% 78 | mutate(lines_per_episode = nb_lines / nb_episodes) 79 | 80 | guest_roles %>% 81 | mutate(self = ifelse(self, "Playing Themselves", "Playing a Character")) %>% 82 | ggplot(aes(lines_per_episode)) + 83 | geom_histogram(binwidth = 2, center = 1) + 84 | facet_wrap(~ self, ncol = 1) + 85 | labs(x = "Average # of lines per episode", 86 | title = "Most guest stars, especially those playing themselves, have relatively few lines per episode") 87 | 88 | guest_roles %>% 89 | arrange(desc(lines_per_episode)) 90 | ``` 91 | 92 | ```{r} 93 | library(tidytext) 94 | 95 | role_words <- dialogue %>% 96 | filter(!is.na(line), !is.na(role)) %>% 97 | mutate(line_number = row_number()) %>% 98 | unnest_tokens(word, line) %>% 99 | anti_join(stop_words, by = "word") %>% 100 | distinct(role, line_number, word) %>% 101 | count(role, word, sort = TRUE) 102 | 103 | role_word_tf_idf <- role_words %>% 104 | group_by(role) %>% 105 | mutate(total_words = sum(n)) %>% 106 | ungroup() %>% 107 | bind_tf_idf(word, role, n) %>% 108 | arrange(desc(tf_idf)) 109 | 110 | role_word_tf_idf %>% 111 | filter(total_words >= 500) %>% 112 | distinct(role, .keep_all = TRUE) %>% 113 | mutate(role_word = paste0(role, ": ", word)) %>% 114 | head(20) %>% 115 | mutate(role_word = fct_reorder(role_word, tf_idf)) %>% 116 | ggplot(aes(role_word, tf_idf)) + 117 | geom_col() + 118 | coord_flip() + 119 | labs(title = "Using TF-IDF as a Simpsons catchphrase detector", 120 | subtitle = "Only the 53 characters that speak at least 500 words in 27 seasons", 121 | x = "", 122 | y = "TF-IDF") 123 | ``` 124 | 125 | ```{r} 126 | guests_summarized %>% 127 | filter(nb_episodes > 1) %>% 128 | inner_join(role_word_tf_idf, by = "role") %>% 129 | filter(total_words >= 100) %>% 130 | arrange(desc(tf_idf)) %>% 131 | distinct(role, .keep_all = TRUE) %>% 132 | select(guest_star, role, word, tf_idf) 133 | ``` 134 | 135 | 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /space-launches.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Space Launches" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | theme_set(theme_light()) 13 | 14 | agencies <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv") 15 | launches <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv") 16 | ``` 17 | 18 | ```{r} 19 | launches %>% 20 | count(launch_year, agency_type) %>% 21 | ggplot(aes(launch_year, n, color = agency_type)) + 22 | geom_line() + 23 | labs(x = "Time", 24 | y = "# of launches this year", 25 | color = "Agency type") 26 | ``` 27 | 28 | ```{r} 29 | library(countrycode) 30 | 31 | agencies %>% 32 | View() 33 | 34 | launches %>% 35 | count(agency_type, agency, sort = TRUE) 36 | 37 | agencies %>% 38 | count(state_code, wt = count, sort = TRUE) %>% 39 | View() 40 | 41 | launches_processed <- launches %>% 42 | filter(launch_date <= Sys.Date()) %>% 43 | mutate(state_code_cleaned = fct_collapse( 44 | state_code, 45 | "RU" = c("SU", "RU"), 
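# note added: fold Soviet-era launches (SU) into Russia, matching the plot subtitle below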
46 | "FR" = "F", 47 | "JP" = "J", 48 | "IT" = "I" 49 | )) %>% 50 | mutate(state_name = countrycode(state_code_cleaned, "iso2c", "country.name"), 51 | state_name = fct_lump(state_name, 6)) %>% 52 | replace_na(list(state_name = "Other")) 53 | 54 | launches_processed %>% 55 | count(launch_year, state_name) %>% 56 | mutate(state_name = fct_reorder(state_name, -n, sum)) %>% 57 | ggplot(aes(launch_year, n, color = state_name)) + 58 | geom_line() + 59 | labs(x = "Time", 60 | y = "Launches per year", 61 | color = "Responsible state", 62 | title = "Launches per year per country", 63 | subtitle = "Combines Soviet Union (pre-1990) with Russia") 64 | ``` 65 | 66 | ### Focus on private + startup launches 67 | 68 | ```{r} 69 | agencies %>% 70 | filter(agency_type %in% c("private", "startup")) %>% 71 | View() 72 | 73 | private_startup_launches <- launches_processed %>% 74 | filter(agency_type %in% c("private", "startup")) %>% 75 | inner_join(agencies %>% 76 | select(agency, agency_name = name, short_name, parent), by = "agency") %>% 77 | mutate(agency_name_lumped = fct_lump(agency_name, 6), 78 | agency_name_lumped = if_else(agency_name_lumped == "Other" & state_name == "United States", 79 | "Other US", as.character(agency_name_lumped))) 80 | 81 | private_startup_launches %>% 82 | count(agency_name_lumped, state_name, sort = TRUE) %>% 83 | mutate(agency_name_lumped = fct_reorder(agency_name_lumped, n, sum)) %>% 84 | ggplot(aes(agency_name_lumped, n, fill = state_name)) + 85 | geom_col() + 86 | coord_flip() + 87 | labs(x = "", 88 | y = "# of launches overall", 89 | title = "What private/startup agencies have had the most launches?", 90 | fill = "Country") 91 | 92 | private_startup_launches %>% 93 | count(agency_name_lumped, 94 | decade = 5 * (launch_year %/% 5)) %>% 95 | complete(agency_name_lumped, decade, fill = list(n = 0)) %>% 96 | mutate(agency_name_lumped = fct_reorder(agency_name_lumped, -n, sum)) %>% 97 | ggplot(aes(decade, n, color = agency_name_lumped)) + 98 | geom_line() + 99 | facet_wrap(~ agency_name_lumped) + 100 | theme(legend.position = "none") + 101 | labs(x = "Time", 102 | y = "# of launches in 5 year period") 103 | ``` 104 | 105 | ```{r} 106 | vehicles <- launches_processed %>% 107 | group_by(type, state_name) %>% 108 | summarize(first_launch = min(launch_year), 109 | last_launch = max(launch_year), 110 | launches = n()) %>% 111 | ungroup() 112 | 113 | russian_vehicles <- vehicles %>% 114 | filter(state_name == "Russia") %>% 115 | arrange(desc(launches)) %>% 116 | filter(launches >= 30) 117 | 118 | launches_processed %>% 119 | semi_join(russian_vehicles, by = "type") %>% 120 | mutate(type = fct_reorder(type, launch_date, min)) %>% 121 | ggplot(aes(launch_date, type)) + 122 | geom_jitter(color = "blue", alpha = .25, width = 0, height = .2) + 123 | theme(legend.position = "none") + 124 | labs(title = "Timeline of Soviet/Russian space vehicles", 125 | x = "Launch date", 126 | y = "Vehicle type", 127 | subtitle = "Only vehicles with at least 30 launches") 128 | 129 | launches_processed %>% 130 | filter(state_code == "US") %>% 131 | add_count(type) %>% 132 | filter(n >= 20) %>% 133 | mutate(type = fct_reorder(type, launch_date, min), 134 | agency_type = str_to_title(agency_type)) %>% 135 | ggplot(aes(launch_date, type, color = agency_type)) + 136 | geom_jitter(alpha = .25, width = 0, height = .2) + 137 | labs(title = "Timeline of US space vehicles", 138 | x = "Launch date", 139 | y = "Vehicle type", 140 | color = "Agency type", 141 | subtitle = "Only vehicles with at least 20 launches") 142 | 
143 | 
144 | vehicles %>%   # per-vehicle-type summary defined above ("by_type" was never defined)
145 |   arrange(desc(launches)) %>%
146 |   View()
147 | 
148 | launches_processed %>%
149 |   count(type, sort = TRUE)
150 | ```
151 | 
152 | 
-------------------------------------------------------------------------------- /thanksgiving.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 | 
14 | thanksgiving_survey <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-20/thanksgiving_meals.csv") %>%
15 |   mutate(family_income = fct_reorder(family_income, parse_number(family_income)))
16 | ```
17 | 
18 | ```{r}
19 | thanksgiving_survey %>%
20 |   count(celebrate)
21 | ```
22 | 
23 | Almost everyone serves turkey as the main dish.
24 | 
25 | ```{r}
26 | thanksgiving_survey %>%
27 |   count(main_dish, sort = TRUE)
28 | ```
29 | 
30 | ```{r}
31 | thanksgiving_survey %>%
32 |   count(main_dish, main_prep, sort = TRUE)
33 | ```
34 | 
35 | ```{r}
36 | thanksgiving_survey %>%
37 |   count(cranberry, sort = TRUE)
38 | 
39 | thanksgiving_survey %>%
40 |   count(gravy, sort = TRUE)
41 | ```
42 | 
43 | ### Relationship with income
44 | 
45 | ```{r}
46 | thanksgiving_survey %>%
47 |   group_by(family_income) %>%
48 |   summarize(celebrate = sum(celebrate == "Yes"),
49 |             total = n(),
50 |             low = qbeta(0.025, celebrate + .5, total - celebrate + .5),   # Jeffreys-style 95% interval
51 |             high = qbeta(0.975, celebrate + .5, total - celebrate + .5)) %>%
52 |   ggplot(aes(family_income, celebrate / total, group = 1)) +
53 |   geom_line() +
54 |   geom_ribbon(aes(ymin = low, ymax = high), alpha = .2) +
55 |   scale_y_continuous(labels = scales::percent_format()) +
56 |   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
57 |   labs(x = "Family income",
58 |        y = "% celebrating Thanksgiving")
59 | ```
60 | 
61 | ```{r}
62 | thanksgiving_survey %>%
63 |   filter(cranberry %in% c("Canned", "Homemade")) %>%
64 |   group_by(family_income) %>%
65 |   summarize(homemade = sum(cranberry == "Homemade"),
66 |             total = n(),
67 |             low = qbeta(0.025, homemade + .5, total - homemade + .5),
68 |             high = qbeta(0.975, homemade + .5, total - homemade + .5)) %>%
69 |   ggplot(aes(family_income, homemade / total, group = 1)) +
70 |   geom_line() +
71 |   geom_ribbon(aes(ymin = low, ymax = high), alpha = .2) +
72 |   scale_y_continuous(labels = scales::percent_format()) +
73 |   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
74 |   labs(x = "Family income",
75 |        y = "% serving homemade")
76 | ```
77 | 
78 | ```{r}
79 | food_gathered <- thanksgiving_survey %>%
80 |   select(id, starts_with("side"),
81 |          starts_with("pie"),
82 |          starts_with("dessert")) %>%
83 |   select(-side15, -pie13, -dessert12) %>%
84 |   gather(type, value, -id) %>%
85 |   filter(!is.na(value),
86 |          !value %in% c("None", "Other (please specify)")) %>%
87 |   mutate(type = str_remove(type, "\\d+"))
88 | 
89 | n_respondents <- n_distinct(food_gathered$id)
90 | ```
91 | 
92 | What are the most common pies, sides, and desserts?
93 | 
94 | ```{r}
95 | food_gathered %>%
96 |   count(type, value, sort = TRUE) %>%
97 |   mutate(value = fct_reorder(value, n)) %>%
98 |   ggplot(aes(value, n / n_respondents, fill = type)) +
99 |   geom_col(show.legend = FALSE) +
100 |   coord_flip() +
101 |   scale_y_continuous(labels = scales::percent_format()) +
102 |   facet_wrap(~ type, scales = "free_y", ncol = 1) +
103 |   labs(x = "",
104 |        y = "% of respondents")
105 | ```
106 | 
107 | How do the foods served differ by age and region?
108 | 
109 | ```{r}
110 | food_gathered %>%
111 |   inner_join(thanksgiving_survey, by = "id") %>%
112 |   mutate(age_number = parse_number(age)) %>%
113 |   group_by(value) %>%
114 |   summarize(average_age = mean(age_number, na.rm = TRUE),
115 |             total = n()) %>%
116 |   arrange(desc(average_age))
117 | 
118 | food_by_region <- food_gathered %>%
119 |   inner_join(thanksgiving_survey, by = "id") %>%
120 |   group_by(us_region) %>%
121 |   mutate(respondents = n_distinct(id)) %>%
122 |   count(us_region, respondents, type, value) %>%
123 |   ungroup() %>%
124 |   mutate(percent = n / respondents)
125 | 
126 | food_by_region %>%
127 |   filter(value == "Cornbread") %>%
128 |   arrange(desc(percent))
129 | ```
130 | 
131 | ```{r}
132 | library(ebbr)   # remotes::install_github("dgrtwo/ebbr")
133 | 
134 | food_gathered %>%
135 |   inner_join(thanksgiving_survey, by = "id") %>%
136 |   filter(!is.na(prayer)) %>%
137 |   group_by(type, value) %>%
138 |   summarize(prayer = sum(prayer == "Yes"),
139 |             total = n()) %>%
140 |   add_ebb_estimate(prayer, total) %>%
141 |   arrange(desc(.fitted)) %>%
142 |   View()
143 | ```
144 | 
145 | ### Network of foods
146 | 
147 | What sides, pies and desserts are eaten together?
148 | 
149 | ```{r}
150 | library(widyr)
151 | 
152 | food_cors <- food_gathered %>%
153 |   pairwise_cor(value, id, sort = TRUE)
154 | ```
155 | 
156 | ```{r}
157 | library(ggraph)
158 | library(igraph)
159 | 
160 | set.seed(2018)
161 | 
162 | food_types <- food_gathered %>%
163 |   count(value, type, sort = TRUE)
164 | 
165 | food_cors %>%
166 |   head(75) %>%
167 |   graph_from_data_frame(vertices = food_types) %>%
168 |   ggraph() +
169 |   geom_edge_link() +
170 |   geom_node_point(aes(color = type, size = n / n_respondents)) +
171 |   geom_node_text(aes(label = name), vjust = 1, hjust = 1, repel = TRUE) +
172 |   scale_size_continuous(labels = scales::percent_format()) +
173 |   theme_void() +
174 |   labs(title = "What foods get served together at Thanksgiving?",
175 |        color = "",
176 |        size = "% of respondents")
177 | ```
178 | 
179 | 
180 | 
181 | 
-------------------------------------------------------------------------------- /tour-de-france.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Tour de France"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | # devtools::install_github("thebioengineer/tidytuesdayR")
12 | 
13 | library(tidytuesdayR)
14 | library(tidyverse)
15 | theme_set(theme_light())
16 | ```
17 | 
18 | ```{r}
19 | library(lubridate)
20 | 
21 | tuesdata <- tidytuesdayR::tt_load('2020-04-07')
22 | 
23 | tdf_winners <- tuesdata$tdf_winners %>%
24 |   mutate(year = year(start_date),
25 |          speed = distance / time_overall)
26 | 
27 | stage_data <- tuesdata$stage_data
28 | tdf_stages <- tuesdata$tdf_stages %>%
29 |   janitor::clean_names() %>%
30 |   mutate(year = year(date))
31 | ```
32 | 
33 | ```{r}
34 | View(tuesdata$tdf_winners)
35 | ```
36 | 
37 | ```{r}
38 | tdf_winners %>%
39 |   count(birth_country, sort = TRUE) %>%
40 |   mutate(birth_country = fct_reorder(birth_country, n)) %>%
41 | 
ggplot(aes(n, birth_country)) +
42 |   geom_col() +
43 |   labs(y = "",
44 |        title = "What countries were the most Tour de France winners born in?")
45 | 
46 | tdf_winners %>%
47 |   count(winner_name, birth_country, sort = TRUE)
48 | ```
49 | 
50 | ```{r}
51 | by_decade <- tdf_winners %>%
52 |   group_by(decade = 10 * (year %/% 10)) %>%
53 |   summarize(winner_age = mean(age),
54 |             winner_height = mean(height, na.rm = TRUE),
55 |             winner_weight = mean(weight, na.rm = TRUE),
56 |             winner_margin = mean(time_margin, na.rm = TRUE),
57 |             winner_speed = mean(speed, na.rm = TRUE))
58 | 
59 | by_decade %>%
60 |   filter(decade >= 1910) %>%
61 |   ggplot(aes(decade, winner_margin * 60)) +
62 |   geom_line() +
63 |   expand_limits(y = 0) +
64 |   labs(x = "Decade",
65 |        y = "Average margin of winner (minutes)",
66 |        title = "Tour de France races have been getting closer")
67 | 
68 | by_decade %>%
69 |   ggplot(aes(decade, winner_speed)) +
70 |   geom_line() +
71 |   expand_limits(y = 0) +
72 |   labs(x = "Decade",
73 |        y = "Average speed of winner (km/h)",
74 |        title = "Tour de France winners have been getting faster")
75 | ```
76 | 
77 | Life expectancy of Tour de France winners (still-living winners are treated as censored at 2020):
78 | 
79 | ```{r}
80 | library(survival)
81 | library(broom)
82 | 
83 | surv_model <- tdf_winners %>%
84 |   distinct(winner_name, .keep_all = TRUE) %>%
85 |   transmute(winner_name,
86 |             birth_year = year(born),
87 |             death_year = year(died),
88 |             dead = as.integer(!is.na(death_year))) %>%
89 |   mutate(age_at_death = coalesce(death_year, 2020) - birth_year) %>%
90 |   survfit(Surv(age_at_death, dead) ~ 1, data = .)
91 | 
92 | glance(surv_model)
93 | ```
94 | 
95 | Median life expectancy of a Tour de France winner is 77.
96 | 
97 | ```{r}
98 | stages_joined <- stage_data %>%
99 |   extract(stage_results_id, "stage", "stage-(.*)") %>%
100 |   inner_join(tdf_stages, by = c("year", "stage")) %>%
101 |   mutate(rank = as.integer(rank)) %>%
102 |   group_by(year, stage) %>%
103 |   mutate(finishers = sum(!is.na(rank))) %>%
104 |   ungroup() %>%
105 |   mutate(percentile = 1 - rank / finishers)
106 | 
107 | total_points <- stages_joined %>%
108 |   group_by(year, rider) %>%
109 |   summarize(total_points = sum(points, na.rm = TRUE)) %>%
110 |   mutate(points_rank = percent_rank(total_points)) %>%
111 |   ungroup()
112 | 
113 | stages_joined %>%
114 |   filter(stage == "1") %>%
115 |   group_by(winner_country) %>%
116 |   summarize(stages = n(),
117 |             median_percentile = median(percentile, na.rm = TRUE)) %>%
118 |   arrange(desc(stages))
119 | ```
120 | 
121 | Does the winner of the first stage predict their final point ranking?
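
As a quick numeric summary of this question (a sketch, not part of the original analysis), we can compute the Spearman rank correlation between a rider's first-stage percentile and their final points percentile, using the `stages_joined` and `total_points` tables defined above:

```{r}
# Rank correlation between first-stage finish and final points standing
stages_joined %>%
  filter(stage == "1") %>%
  inner_join(total_points, by = c("year", "rider")) %>%
  summarize(correlation = cor(percentile, points_rank,
                              method = "spearman", use = "complete.obs"))
```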
122 | 
123 | ```{r}
124 | stages_joined %>%
125 |   filter(stage == "1") %>%
126 |   inner_join(total_points, by = c("year", "rider")) %>%
127 |   select(year, rider,
128 |          percentile_first_stage = percentile,
129 |          points_rank) %>%
130 |   mutate(first_stage_bin = cut(percentile_first_stage, seq(0, 1, .1),
131 |                                include.lowest = TRUE)) %>%
132 |   filter(!is.na(first_stage_bin)) %>%
133 |   ggplot(aes(first_stage_bin, points_rank)) +
134 |   geom_boxplot() +
135 |   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
136 |   scale_y_continuous(labels = scales::percent) +
137 |   labs(x = "Decile performance in the first stage",
138 |        y = "Overall points percentile")
139 | ```
140 | 
141 | ```{r}
142 | library(gganimate)
143 | library(tidytext)
144 | 
145 | top_10_2017 <- total_points %>%
146 |   filter(year == max(year)) %>%
147 |   top_n(10, total_points)
148 | 
149 | stages_joined %>%
150 |   filter(year == max(year)) %>%
151 |   semi_join(top_10_2017, by = "rider") %>%
152 |   mutate(stage = as.integer(stage),
153 |          points = coalesce(points, 0)) %>%
154 |   arrange(stage) %>%
155 |   group_by(rider) %>%
156 |   mutate(cumulative_points = cumsum(points)) %>%
157 |   ungroup() %>%
158 |   # mutate(rider = reorder_within(rider, cumulative_points, stage)) %>%
159 |   ggplot(aes(cumulative_points, rider, fill = cumulative_points)) +
160 |   geom_col() +
161 |   transition_time(stage) +
162 |   theme(legend.position = "none") +
163 |   labs(title = "The 2017 Tour de France. Stage: { frame_time }",
164 |        x = "Cumulative points at this stage",
165 |        y = "")
166 | ```
167 | 
168 | 
169 | 
170 | 
171 | 
172 | 
-------------------------------------------------------------------------------- /trees.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Trees in NYC"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | This Rmd (unlike others in this repository) isn't a screencast; it was created in a [live session of the New York Open Statistical Programming Meetup](https://www.meetup.com/nyhackr/events/260545903/).
11 | 
12 | Tree data here: https://data.cityofnewyork.us/Environment/2015-Street-Tree-Census-Tree-Data/pi5s-9p35
13 | Zip code data here: https://data.cityofnewyork.us/widgets/i8iw-xf4u
14 | 
15 | ```{r}
16 | library(tidyverse)
17 | theme_set(theme_light())
18 | 
19 | tree_dataset_raw <- read_csv("~/Desktop/dataset_ideas/2015_Street_Tree_Census_-_Tree_Data.csv")
20 | 
21 | trees <- tree_dataset_raw %>%
22 |   mutate(health = fct_relevel(health, c("Poor", "Fair", "Good"))) %>%
23 |   mutate(postcode = as.character(postcode)) %>%
24 |   mutate(maple = str_detect(str_to_lower(spc_common), "maple"))
25 | ```
26 | 
27 | ```{r}
28 | View(trees)
29 | 
30 | three_zip_codes <- trees %>%   # three Manhattan zip codes (not boroughs)
31 |   filter(postcode %in% c("10023", "10002", "10009"))
32 | 
33 | three_zip_codes %>%
34 |   filter(!is.na(health)) %>%
35 |   count(postcode, health, sort = TRUE) %>%
36 |   ggplot(aes(health, n)) +
37 |   geom_col() +
38 |   facet_wrap(~ postcode)
39 | 
40 | summarize_trees <- function(data) {
41 |   data %>%
42 |     summarize(average_health = mean(as.numeric(health), na.rm = TRUE),
43 |               percent_good_health = mean(health == "Good", na.rm = TRUE),
44 |               percent_maple = mean(maple, na.rm = TRUE),
45 |               trees = n())
46 | }
47 | 
48 | trees_by_zipcode <- trees %>%
49 |   group_by(postcode, borough) %>%
50 |   summarize_trees() %>%
51 |   filter(trees >= 100)
52 | 
53 | trees_by_zipcode %>%
54 |   arrange(desc(average_health))
55 | 
56 | trees %>%
57 |   group_by(borough) %>%
58 |   summarize_trees() %>%
59 |   arrange(average_health)
60 | ```
61 | 
62 | ```{r}
63 | library(sf)
64 | 
65 | zip_codes <- read_sf("~/Downloads/ZIP_CODE_040114/")
66 | 
67 | class(zip_codes)
68 | 
69 | joined_trees <- zip_codes %>%
70 |   left_join(trees_by_zipcode, by = c("ZIPCODE" = "postcode")) %>%
71 |   mutate(population_density = POPULATION / AREA,
72 |          tree_density = trees / AREA,
73 |          people_per_tree = POPULATION / trees)
74 | ```
75 | 
76 | ```{r}
77 | ggplot(joined_trees, aes(fill = percent_good_health)) +
78 |   geom_sf() +
79 |   scale_fill_gradient2(low = "brown",
80 |                        high = "darkgreen",
81 |                        midpoint = .8,
82 |                        labels = scales::percent) +
83 |   theme_void() +
84 |   coord_sf(datum = NA) +
85 |   labs(fill = "% trees in good health",
86 |        title = "Where are the healthiest (and unhealthiest) trees in NYC?",
87 |        subtitle = "Based on a 2015 survey of 600,000 trees")
88 | ```
89 | 
90 | ### Species
91 | 
92 | ```{r}
93 | trees %>%
94 |   filter(!is.na(spc_common)) %>%
95 |   count(spc_common = fct_lump(spc_common, n = 19),
96 |         borough,
97 |         sort = TRUE) %>%
98 |   mutate(spc_common = fct_reorder(spc_common, n, sum)) %>%
99 |   ggplot(aes(spc_common, n, fill = borough)) +
100 |   geom_col() +
101 |   coord_flip() +
102 |   labs(x = "",
103 |        y = "# of trees in NYC",
104 |        fill = "Borough",
105 |        title = "What species of tree do you find in NYC?")
106 | ```
107 | 
108 | ```{r}
109 | trees %>%
110 |   group_by(spc_common, maple) %>%
111 |   summarize_trees() %>%
112 |   filter(trees >= 500) %>%
113 |   arrange(percent_good_health) %>%
114 |   ggplot(aes(trees, percent_good_health)) +
115 |   geom_point(aes(color = maple)) +
116 |   geom_text(aes(label = spc_common), vjust = 1, hjust = 1, check_overlap = TRUE) +
117 |   scale_x_log10(labels = scales::comma) +
118 |   scale_y_continuous(labels = scales::percent) +
119 |   labs(x = "# of trees in NYC",
120 |        y = "% marked as 'Good' health")
121 | ```
122 | 
123 | ```{r}
124 | trees %>%
125 |   filter(!is.na(maple)) %>%
126 |   group_by(borough, maple) %>%
127 |   summarize_trees() %>%
128 |   mutate(maple = ifelse(maple, "Maple", "Other")) %>%
129 |   select(borough, maple, percent_good_health) %>%
130 | 
spread(maple, percent_good_health) %>%
131 |   ggplot(aes(Other, Maple)) +
132 |   geom_point() +
133 |   geom_text(aes(label = borough)) +
134 |   geom_abline(color = "red")
135 | ```
136 | 
137 | 
138 | ```{r}
139 | ggplot(joined_trees, aes(fill = percent_maple)) +
140 |   geom_sf() +
141 |   scale_fill_gradient2(low = "brown",
142 |                        high = "darkgreen",
143 |                        midpoint = .1,
144 |                        labels = scales::percent) +
145 |   theme_void() +
146 |   coord_sf(datum = NA) +
147 |   labs(fill = "% trees that are maple",
148 |        title = "Where are the maple trees in NYC?",
149 |        subtitle = "Based on a 2015 survey of 600,000 trees")
150 | ```
151 | 
152 | ### Looking at tree and population density
153 | 
154 | ```{r}
155 | processed_zipcodes <- joined_trees %>%
156 |   select(ZIPCODE, POPULATION, AREA, trees) %>%
157 |   rename_all(str_to_lower) %>%          # column names are lowercase from here on
158 |   mutate(people_per_tree = population / trees)
159 | 
160 | processed_zipcodes %>%
161 |   ggplot(aes(population / area)) +
162 |   geom_histogram()
163 | 
164 | processed_zipcodes %>%
165 |   filter(population >= 1000) %>%
166 |   arrange(desc(people_per_tree))
167 | ```
168 | 
169 | ```{r}
170 | ggplot(joined_trees, aes(fill = tree_density * (5280 ^ 2))) +
171 |   geom_sf() +
172 |   scale_fill_gradient2(low = "brown",
173 |                        high = "darkgreen",
174 |                        midpoint = log10(3000),
175 |                        trans = "log10") +
176 |   theme_void() +
177 |   coord_sf(datum = NA) +
178 |   labs(fill = "# of trees per square mile",
179 |        title = "Where are the densest trees?",
180 |        subtitle = "Based on a 2015 survey of 600,000 trees")
181 | ```
182 | 
-------------------------------------------------------------------------------- /umbrella-week.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | Louie walks to and from work every day. In his city, there is a 50 percent chance of rain each morning and an independent 40 percent chance each evening. His habit is to bring (and use) an umbrella if it’s raining when he leaves the house or office, but to leave them all behind if not. Louie owns three umbrellas.
11 | 
12 | On Sunday night, two are with him at home and one is at his office. Assuming it never starts raining during his walk to his home or office, what is the probability that he makes it through the work week without getting wet?
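
The simulation below estimates this probability. As an exact cross-check (a sketch, not part of the original analysis): with only 10 commutes there are just 2^10 rain patterns, so we can enumerate every pattern, track the umbrellas, and sum the probability of the patterns in which Louie stays dry.

```{r}
library(tidyverse)

# Enumerate all 2^10 rain patterns; odd positions are mornings (p = .5), even are evenings (p = .4)
patterns <- expand.grid(rep(list(c(FALSE, TRUE)), 10))

dry_prob <- map_dbl(seq_len(nrow(patterns)), function(i) {
  rain <- unlist(patterns[i, ])
  home <- 2
  office <- 1
  for (j in 1:10) {
    if (rain[j]) {
      if (j %% 2 == 1) {             # morning: leaving home
        if (home == 0) return(0)     # wet; this pattern contributes nothing
        home <- home - 1; office <- office + 1
      } else {                       # evening: leaving the office
        if (office == 0) return(0)
        office <- office - 1; home <- home + 1
      }
    }
  }
  # Probability of this (dry) rain pattern
  prod(ifelse(rain, rep(c(.5, .4), 5), rep(c(.5, .6), 5)))
})

sum(dry_prob)   # ~0.693, matching the simulation below
```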
13 | 
14 | ```{r}
15 | library(tidyverse)
16 | 
17 | set.seed(2018)
18 | 
19 | simulations <- crossing(trial = 1:1e5,
20 |                         weekday = 1:5,
21 |                         commute = c("Morning", "Evening")) %>%
22 |   arrange(trial, weekday, desc(commute)) %>%
23 |   mutate(rain = rbinom(n(), 1, ifelse(commute == "Morning", .5, .4)),
24 |          home_change = case_when(
25 |            commute == "Morning" & rain ~ -1,
26 |            commute == "Evening" & rain ~ 1,
27 |            TRUE ~ 0),
28 |          office_change = -home_change) %>%
29 |   group_by(trial) %>%
30 |   mutate(home = 2 + cumsum(home_change),
31 |          office = 1 + cumsum(office_change))
32 | 
33 | simulations %>%
34 |   summarize(dry = !any(home < 0 | office < 0)) %>%
35 |   summarize(dry = mean(dry))
36 | 
37 | days <- c("Mon", "Tue", "Wed", "Thu", "Fri")
38 | 
39 | simulations %>%
40 |   ungroup() %>%
41 |   filter(home < 0 | office < 0) %>%
42 |   distinct(trial, .keep_all = TRUE) %>%
43 |   count(weekday, commute, sort = TRUE) %>%
44 |   mutate(weekday = factor(days[weekday], levels = days),
45 |          commute = fct_relevel(commute, "Morning")) %>%
46 |   ggplot(aes(weekday, n / 1e5, fill = commute)) +
47 |   geom_col(position = "dodge") +
48 |   scale_y_continuous(labels = scales::percent_format()) +
49 |   labs(title = "When does Louie first get wet?",
50 |        y = "Probability overall")
51 | ```
52 | 
53 | Answer to the riddle: ~69.3%.
54 | 
-------------------------------------------------------------------------------- /uncanny-xmen.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Uncanny X-Men: Claremont Run"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidytuesdayR)
12 | library(tidyverse)
13 | theme_set(theme_light())
14 | 
15 | xmen <- tidytuesdayR::tt_load('2020-06-30')
16 | characters <- tt_load(2018, week = 9)$week9_comic_characters
17 | ```
18 | 
19 | ```{r}
20 | character_visualization <- xmen$character_visualization %>%
21 |   separate(character, c("superhero", "secret_identity"), sep = " = ", fill = "right")
22 | ```
23 | 
24 | ```{r}
25 | character_visualization %>%
26 |   group_by(superhero)
27 | ```
28 | 
29 | ```{r}
30 | by_character %>%   # note: by_character is defined in a later chunk; run that chunk first
31 |   left_join(characters, by = c(superhero = "name")) %>%
32 |   View()
33 | 
34 | characters %>%
35 |   filter(str_detect(name, "Angel")) %>%
36 |   distinct(name)
37 | ```
38 | 
39 | 
40 | ```{r}
41 | library(ggrepel)
42 | 
43 | by_character <- character_visualization %>%
44 |   group_by(superhero) %>%
45 |   summarize(across(speech:depicted,
46 |                    list(total = sum,
47 |                         issues = ~ sum(.
> 0), 48 | avg = ~ mean(.[depicted > 0])))) 49 | 50 | by_character %>% 51 | arrange(desc(speech_issues)) %>% 52 | filter(speech_issues > 0) %>% 53 | mutate(superhero = fct_reorder(superhero, speech_issues)) %>% 54 | ggplot(aes(speech_issues, superhero)) + 55 | geom_col() + 56 | labs(title = "Which X-Men speak in the most issues?", 57 | x = "# of issues", 58 | y = "") 59 | 60 | by_character %>% 61 | ggplot(aes(depicted_issues, speech_avg)) + 62 | geom_point() + 63 | geom_text_repel(aes(label = superhero)) + 64 | labs(x = "# of issues in which character appears", 65 | y = "Average lines per issue depicted") + 66 | expand_limits(y = 0) 67 | 68 | by_character %>% 69 | ggplot(aes(depicted_issues, thought_avg)) + 70 | geom_point() + 71 | geom_text_repel(aes(label = superhero)) + 72 | scale_x_log10() + 73 | labs(x = "# of issues in which character appears", 74 | y = "Average thoughts per issue") + 75 | expand_limits(y = 0) 76 | 77 | by_character %>% 78 | ggplot(aes(depicted_issues, speech_avg / thought_avg)) + 79 | geom_point() + 80 | geom_text_repel(aes(label = superhero)) + 81 | expand_limits(y = 1) + 82 | scale_y_log10(breaks = c(1, 3, 10, 30), 83 | labels = c("Same", "3X", "10X", "30X")) + 84 | labs(x = "# of issues depicted", 85 | y = "Speech to thought ratio (log scale)", 86 | title = "Which X-Men think, and which speak?") 87 | ``` 88 | 89 | ```{r} 90 | by_character_costume <- character_visualization %>% 91 | group_by(superhero, costume) %>% 92 | summarize(across(speech:depicted, 93 | list(total = sum, 94 | issues = ~ sum(. > 0), 95 | avg = ~ mean(.[depicted > 0])))) %>% 96 | ungroup() %>% 97 | mutate(speech_thought_ratio = speech_avg / thought_avg) 98 | 99 | by_character_costume %>% 100 | mutate(superhero = fct_reorder(superhero, speech_total)) %>% 101 | filter(speech_avg > 0) %>% 102 | ggplot(aes(speech_total, 103 | superhero, 104 | color = costume, 105 | size = depicted_issues)) + 106 | geom_point() + 107 | labs(x = "# of lines (in/out of costume)") 108 | 109 | costume_ratios <- by_character_costume %>% 110 | filter(speech_avg > 0) %>% 111 | group_by(superhero) %>% 112 | summarize(costume_ratio = speech_total[1] / speech_total[2]) %>% 113 | inner_join(by_character, by = "superhero") 114 | 115 | costume_ratios %>% 116 | ggplot(aes(speech_total, 117 | costume_ratio)) + 118 | geom_point() + 119 | geom_text_repel(aes(label = superhero)) + 120 | scale_y_log10() + 121 | labs(x = "Total # of lines", 122 | y = "Lines in costume / lines out of costume") 123 | 124 | costume_ratios %>% 125 | mutate(superhero = fct_reorder(superhero, costume_ratio)) %>% 126 | ggplot(aes(costume_ratio, y = superhero)) + 127 | geom_errorbarh(aes(xmin = 1, xmax = costume_ratio), height = 0) + 128 | geom_point(aes(size = speech_total, color = costume_ratio > 1)) + 129 | scale_x_log10() + 130 | scale_color_discrete(guide = FALSE) + 131 | labs(size ="# of lines", 132 | x = "Lines in costume / lines out of costume", 133 | y = "", 134 | title = "Which X-Men tend to speak in costume/out of costume?") 135 | ``` 136 | 137 | ```{r} 138 | xmen$locations %>% 139 | group_by(location) %>% 140 | summarize(total = n(), 141 | issues = n_distinct(issue)) %>% 142 | arrange(desc(issues)) 143 | ``` 144 | 145 | ```{r} 146 | xmen$comic_bechdel %>% 147 | bind_rows(xmen$xmen_bechdel %>% mutate(writer = "Chris Claremont")) %>% 148 | filter(!is.na(pass_bechdel), 149 | !is.na(writer)) %>% 150 | group_by(writer = fct_lump(writer, 5)) %>% 151 | summarize(n_issues = n(), 152 | pct_bechdel = mean(pass_bechdel == "yes")) %>% 153 | 
arrange(desc(n_issues))
154 | ```
155 | 
156 | ```{r}
157 | xmen$xmen_bechdel %>%
158 |   filter(!is.na(pass_bechdel)) %>%
159 |   group_by(issue_group = 20 * (issue %/% 20)) %>%
160 |   summarize(pct_bechdel = mean(pass_bechdel == "yes"),
161 |             n = n()) %>%
162 |   ggplot(aes(issue_group, pct_bechdel)) +
163 |   geom_line() +
164 |   scale_y_continuous(labels = scales::percent)
165 | ```
166 | 
167 | ```{r}
168 | by_issue_character <- character_visualization %>%
169 |   group_by(issue_group = 20 * (issue %/% 20), superhero) %>%
170 |   summarize(across(speech:depicted, sum)) %>%
171 |   ungroup()
172 | 
173 | by_issue_character %>%
174 |   filter(fct_lump(superhero, 9, w = speech) != "Other") %>%
175 |   mutate(superhero = fct_reorder(superhero, -speech, sum)) %>%
176 |   ggplot(aes(issue_group, speech)) +
177 |   geom_col() +
178 |   facet_wrap(~ superhero)
179 | ```
180 | 
181 | 
182 | 
-------------------------------------------------------------------------------- /us-dairy.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 | 
14 | milk_products_facts <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-29/milk_products_facts.csv")
15 | ```
16 | 
17 | ```{r}
18 | milk_products_tidied <- milk_products_facts %>%
19 |   gather(product, lbs_per_person, -year) %>%
20 |   separate(product, c("category", "product"), sep = "_",
21 |            extra = "merge", fill = "right") %>%
22 |   mutate(product = coalesce(product, category),
23 |          product = str_to_title(str_replace_all(product, "_", " ")),
24 |          category = str_to_title(category),
25 |          product = ifelse(product == "Other", paste(product, category), product))
26 | 
27 | milk_products_tidied %>%
28 |   group_by(category, year) %>%
29 |   summarize(lbs_per_person = sum(lbs_per_person)) %>%
30 |   ggplot(aes(year, lbs_per_person, color = category)) +
31 |   geom_line() +
32 |   labs(title = "Dairy consumption by category",
33 |        subtitle = "Based on US consumption (source: USDA)",
34 |        x = "Year",
35 |        y = "Lbs per person")
36 | 
37 | milk_products_tidied %>%
38 |   group_by(product = fct_lump(product, 6, w = lbs_per_person),
39 |            year) %>%
40 |   summarize(lbs_per_person = sum(lbs_per_person)) %>%
41 |   ggplot(aes(year, lbs_per_person, color = product)) +
42 |   geom_line()
43 | 
44 | milk_products_tidied %>%
45 |   ggplot(aes(year, lbs_per_person)) +
46 |   geom_line() +
47 |   facet_wrap(~ product, scales = "free") +
48 |   expand_limits(y = 0)
49 | 
50 | milk_products_tidied %>%
51 |   filter(year == max(year)) %>%
52 |   mutate(product = fct_reorder(product, lbs_per_person, sum)) %>%
53 |   ggplot(aes(product, lbs_per_person, fill = category)) +
54 |   geom_col() +
55 |   coord_flip() +
56 |   labs(x = "",
57 |        y = "Pounds consumed per US person in 2017")
58 | ```
59 | 
60 | ```{r}
61 | library(sweep)
62 | library(timetk)
63 | library(lubridate); library(forecast)   # forecast supplies ets(), auto.arima(), forecast()
64 | 
65 | milk_product_ts <- milk_products_tidied %>%
66 |   mutate(year = as.Date("0001-01-01") + years(year - 1)) %>%
67 |   nest(-category, -product) %>%
68 |   mutate(ts = map(data, tk_ts, start = 1975, freq = 1))
69 | 
70 | milk_product_ets <- milk_product_ts %>%
71 |   mutate(model = map(ts, ets))
72 | 
73 | milk_product_ets %>%
74 |   unnest(map(model, sw_glance))
75 | 
76 | milk_product_ts %>%
77 |   crossing(model_name = c("auto.arima", "ets")) %>%
78 |   mutate(model = map2(model_name, ts, ~
invoke(.x, list(.y))),   # invoke() calls the forecasting function named in model_name
79 |          forecast = map(model, forecast, h = 10)) %>%
80 |   unnest(map(forecast, sw_sweep)) %>%
81 |   ggplot(aes(index, lbs_per_person, color = model_name, lty = key)) +
82 |   geom_line() +
83 |   geom_ribbon(aes(ymin = lo.80, ymax = hi.80), alpha = .5) +
84 |   facet_wrap(~ product, scales = "free_y") +
85 |   expand_limits(y = 0) +
86 |   scale_x_continuous(breaks = c(1980, 2000, 2020)) +
87 |   scale_linetype_discrete(guide = FALSE) +
88 |   labs(x = "Year",
89 |        y = "Average US consumption (lbs per person)",
90 |        title = "Forecasted consumption of dairy products",
91 |        subtitle = "Based on USDA data 1975-2017. Showing 80% prediction intervals.",
92 |        color = "Model")
93 | ```
94 | 
95 | ```{r}
96 | cheese <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-29/clean_cheese.csv")
97 | 
98 | cheese_tidied <- cheese %>%
99 |   gather(type, lbs_per_person, -Year) %>%
100 |   rename(year = Year) %>%
101 |   mutate(type = str_to_title(type),
102 |          type = fct_recode(type, "Total American Cheese" = "Total American Chese"))
103 | 
104 | cheese_tidied %>%
105 |   ggplot(aes(year, lbs_per_person)) +
106 |   geom_line() +
107 |   facet_wrap(~ type, scales = "free_y") +
108 |   expand_limits(y = 0)
109 | 
110 | cheese_ts <- cheese_tidied %>%
111 |   mutate(year = as.Date("0001-01-01") + years(year - 1)) %>%
112 |   nest(-type) %>%
113 |   mutate(ts = map(data, tk_ts, start = 1970, freq = 1))
114 | 
115 | cheese_ts %>%
116 |   crossing(model_name = c("auto.arima", "ets")) %>%
117 |   mutate(model = map2(model_name, ts, ~ invoke(.x, list(.y))),
118 |          forecast = map(model, forecast, h = 10)) %>%
119 |   unnest(map(forecast, sw_sweep)) %>%
120 |   ggplot(aes(index, lbs_per_person, color = model_name, lty = key)) +
121 |   geom_line() +
122 |   geom_ribbon(aes(ymin = lo.80, ymax = hi.80), alpha = .5) +
123 |   facet_wrap(~ type, scales = "free_y") +
124 |   expand_limits(y = 0) +
125 |   scale_x_continuous(breaks = c(1980, 2000, 2020)) +
126 |   scale_linetype_discrete(guide = FALSE) +
127 |   labs(x = "Year",
128 |        y = "Average US consumption (lbs per person)",
129 |        title = "Forecasted consumption of cheese products",
130 |        subtitle = "Based on USDA data 1970-2017. Showing 80% prediction intervals.",
131 |        color = "Model")
132 | ```
133 | 
134 | 
-------------------------------------------------------------------------------- /us-wind.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidyverse)
12 | 
13 | us_wind <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-06/us_wind.csv")
14 | ```
15 | 
16 | Look at the continental US (48 states, District of Columbia)
17 | 
18 | ```{r}
19 | us_wind_processed <- us_wind %>%
20 |   filter(!t_state %in% c("AK", "HI", "GU", "PR")) %>%
21 |   mutate(t_cap = ifelse(t_cap < 0, NA, t_cap)) %>%
22 |   mutate_if(is.numeric, ~ ifelse(. == -9999, NA, .))
23 | 
24 | wind_projects <- us_wind_processed %>%
25 |   group_by(p_name, t_state) %>%
26 |   summarize(year = min(p_year, na.rm = TRUE),
27 |             turbines = n(),
28 |             total_capacity = sum(t_cap),
29 |             lon = mean(xlong),
30 |             lat = mean(ylat),
31 |             lon_sd = sd(xlong),
32 |             lat_sd = sd(ylat)) %>%
33 |   ungroup()
34 | ```
35 | 
36 | How has turbine capacity changed over time?
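
One direct way to answer this (a sketch, not part of the original screencast): plot the median turbine capacity by project start year, using the `p_year` and `t_cap` columns already cleaned in `us_wind_processed` above. The chunk after this one instead compares turbine models.

```{r}
# Median turbine capacity among turbines whose project started in each year
us_wind_processed %>%
  filter(!is.na(p_year), !is.na(t_cap)) %>%
  group_by(p_year) %>%
  summarize(median_capacity = median(t_cap)) %>%
  ggplot(aes(p_year, median_capacity)) +
  geom_line() +
  labs(x = "Year project started",
       y = "Median turbine capacity (kW)")
```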
37 | 
38 | ```{r}
39 | turbine_models <- us_wind_processed %>%
40 |   group_by(t_model) %>%
41 |   summarize(t_cap = median(t_cap),
42 |             t_hh = median(t_hh),
43 |             t_rd = median(t_rd),
44 |             t_rsa = median(t_rsa),
45 |             t_ttlh = median(t_ttlh),
46 |             turbines = n(),
47 |             projects = n_distinct(p_name)) %>%
48 |   arrange(desc(projects))
49 | 
50 | turbine_models %>%
51 |   ggplot(aes(t_ttlh, t_cap)) +
52 |   geom_point() +
53 |   labs(title = "When it comes to turbines, bigger is better!",
54 |        x = "Turbine total height (meters)",
55 |        y = "Turbine capacity (kW)")
56 | 
57 | turbine_models %>%
58 |   ggplot(aes(t_rsa, t_cap)) +
59 |   geom_point() +
60 |   labs(title = "When it comes to turbines, bigger is better!",
61 |        x = "Turbine rotor swept area (meters ^ 2)",
62 |        y = "Turbine capacity (kW)")
63 | 
64 | ```
65 | 
66 | 
67 | ```{r}
68 | wind_projects %>%
69 |   ggplot(aes(year, total_capacity)) +
70 |   geom_point()
71 | 
72 | wind_projects %>%
73 |   ggplot(aes(year, total_capacity / turbines)) +
74 |   geom_point()
75 | ```
76 | 
77 | ```{r}
78 | wind_projects %>%
79 |   ggplot(aes(lon, lat, size = turbines, color = year)) +
80 |   borders("state") +
81 |   geom_point() +
82 |   coord_map() +
83 |   theme_void()
84 | ```
85 | 
86 | ### Animation
87 | 
88 | ```{r}
89 | library(gganimate)
90 | 
91 | ggplot(mtcars, aes(factor(cyl), mpg)) +
92 |   geom_boxplot() +
93 |   # Here comes the gganimate code
94 |   transition_states(
95 |     gear,
96 |     transition_length = 2,
97 |     state_length = 1
98 |   ) +
99 |   enter_fade() +
100 |   exit_shrink() +
101 |   ease_aes('sine-in-out')
102 | ```
103 | 
104 | ```{r}
105 | wind_projects
106 | 
107 | p <- wind_projects %>%
108 |   filter(!is.na(year), !is.infinite(year)) %>%
109 |   crossing(time = 1981:2018) %>%
110 |   filter(year <= time) %>%
111 |   ggplot(aes(lon, lat, size = turbines, color = year)) +
112 |   borders("state") +
113 |   geom_point() +
114 |   transition_manual(time) +
115 |   scale_color_continuous(guide = FALSE) +
116 |   labs(title = "Locations of wind turbine projects in continental US (1981-2018)") +
117 |   coord_map() +
118 |   theme_void()
119 | 
120 | p   # printing the animation renders it
121 | anim_save("turbines.gif")   # saves the most recently rendered animation
122 | ```
123 | 
124 | 
125 | 
126 | ```{r}
127 | us_wind_processed
128 | ```
129 | 
130 | 
-------------------------------------------------------------------------------- /us_phds.Rmd: --------------------------------------------------------------------------------
1 | ---
2 | title: "US PhDs"
3 | output: html_document
4 | ---
5 | 
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 | 
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 | 
14 | # Major field of study
15 | major_fields_raw <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab012.xlsx",
16 |                                       skip = 3)
17 | 
18 | major_fields <- major_fields_raw %>%
19 |   rename(field = `Field of study`) %>%
20 |   gather(key, value, -field) %>%
21 |   mutate(year = as.numeric(ifelse(str_detect(key, "X__"), NA, key)),
22 |          type = ifelse(!str_detect(value, "Number|Percent"), NA, value),
23 |          value = as.numeric(value)) %>%
24 |   fill(year, type) %>%
25 |   select(-key) %>%
26 |   filter(!is.na(value)) %>%
27 |   spread(type, value)
28 | ```
29 | 
30 | ```{r}
31 | fine_fields <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab013.xlsx",
32 |                                  skip = 3) %>%
33 |   rename(field = 1) %>%
34 |   gather(year, number, -field) %>%
35 |   mutate(year = as.numeric(year),
36 |          number = as.numeric(number)) %>%
37 |   filter(!is.na(number))
38 | 
39 | fine_fields %>%
40 |   filter(field %in% sample(unique(field), 6)) %>%
41 |   ggplot(aes(year, number, color = field)) +
42
| geom_line() 43 | ``` 44 | 45 | ```{r} 46 | # get the broad field names and the major field names 47 | 48 | sex <- c("All", "Male", "Female", "All doctorate recipientsa", "All fieldsa") 49 | 50 | broad_fields <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab014.xlsx", skip = 4) %>% 51 | rename(field = 1) %>% 52 | filter(!field %in% sex) %>% 53 | mutate(field = fct_recode(field, 54 | "Life sciences" = "Life sciencesb", 55 | "Other" = "Otherc")) %>% 56 | pull(field) %>% 57 | as.character() 58 | ``` 59 | 60 | 61 | ```{r} 62 | recipients_year_field_sex <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab015.xlsx", skip = 3) %>% 63 | rename(field = 1) %>% 64 | select(-contains("change")) %>% 65 | mutate(field = as.character(fct_recode(field, "All" = "All doctorate recipientsa", 66 | "Other" = "Otherb")), 67 | sex = if_else(field %in% sex, field, NA_character_), 68 | broad_field = ifelse(field %in% broad_fields, field, NA)) %>% 69 | fill(sex, broad_field) %>% 70 | gather(year, number, -sex, -broad_field, -field) %>% 71 | mutate(year = as.numeric(year), 72 | number = as.numeric(number)) %>% 73 | filter(!field %in% sex) %>% 74 | filter(!is.na(number)) 75 | 76 | recipients_year_field_sex %>% 77 | filter(sex != "All", 78 | broad_field == "Mathematics and computer sciences") %>% 79 | ggplot(aes(year, number, color = sex)) + 80 | geom_line() + 81 | expand_limits(y = 0) + 82 | facet_wrap(~ field) 83 | 84 | recipients_year_field_sex %>% 85 | spread(sex, number) %>% 86 | mutate(pct_male = Male / All) %>% 87 | filter(broad_field == "Engineering") %>% 88 | mutate(field = fct_reorder(field, -pct_male)) %>% 89 | ggplot(aes(year, pct_male, color = field)) + 90 | geom_line() + 91 | scale_y_continuous(labels = scales::percent_format()) + 92 | labs(x = "Year", 93 | y = "% of PhD recipients reporting as male", 94 | color = "Major field", 95 | title = "Breakdown by sex over time within Engineering major fields") 96 | 97 | recipients_year_field_sex %>% 98 | spread(sex, number) %>% 99 | mutate(pct_male = Male / All) %>% 100 | filter(broad_field == "Humanities and arts") %>% 101 | mutate(field = fct_reorder(field, -pct_male)) %>% 102 | ggplot(aes(year, pct_male, color = field)) + 103 | geom_line() + 104 | scale_y_continuous(labels = scales::percent_format()) + 105 | labs(x = "Year", 106 | y = "% of PhD recipients reporting as male", 107 | color = "Major field", 108 | title = "Breakdown by sex over time within Humanities & Arts major fields") 109 | 110 | recipients_year_field_sex %>% 111 | spread(sex, number) %>% 112 | mutate(pct_male = Male / All) %>% 113 | filter(broad_field == "Education") %>% 114 | mutate(field = fct_reorder(field, -pct_male)) %>% 115 | ggplot(aes(year, pct_male, color = field)) + 116 | geom_line() + 117 | scale_y_continuous(labels = scales::percent_format()) + 118 | labs(x = "Year", 119 | y = "% of PhD recipients reporting as male", 120 | color = "Major field", 121 | title = "Breakdown by sex over time within Education major fields") 122 | ``` 123 | 124 | Three levels: 125 | 126 | * Broad field (Life sciences) 127 | * Major field (Biological and biomedical sciences) 128 | * Subfield (Computational biology) 129 | 130 | 131 | -------------------------------------------------------------------------------- /volcano-eruptions.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Volcanoes" 3 | output: html_document 4 | --- 5 | 6 | ```{r} 7 | library(tidyverse) 8 | theme_set(theme_light()) 9 | 10 | tuesdata <- 
tidytuesdayR::tt_load('2020-05-12')
11 | ```
12 | 
13 | ```{r}
14 | volcano <- tuesdata$volcano %>%
15 |   mutate(last_eruption_year = as.numeric(last_eruption_year))
16 | 
17 | eruptions <- tuesdata$eruptions
18 | ```
19 | 
20 | ```{r}
21 | volcano %>%
22 |   count(evidence_category, sort = TRUE)
23 | 
24 | volcano %>%
25 |   mutate(years_ago = 2020 - last_eruption_year) %>%
26 |   ggplot(aes(years_ago + 1, fill = evidence_category)) +
27 |   geom_histogram() +
28 |   scale_x_log10()
29 | ```
30 | 
31 | ```{r}
32 | volcano %>%
33 |   count(region, sort = TRUE)
34 | 
35 | volcano %>%
36 |   count(primary_volcano_type, sort = TRUE)
37 | ```
38 | 
39 | ```{r}
40 | library(ggthemes)
41 | 
42 | volcano %>%
43 |   mutate(primary_volcano_type = str_remove(primary_volcano_type, "\\(.*\\)"),
44 |          primary_volcano_type = fct_lump(primary_volcano_type, 6)) %>%
45 |   ggplot(aes(longitude, latitude)) +
46 |   borders() +
47 |   geom_point(aes(color = primary_volcano_type), size = .5) +
48 |   theme_map() +
49 |   labs(title = "Volcanoes of the World",
50 |        color = "Type")
51 | ```
52 | 
53 | ```{r}
54 | library(leaflet)
55 | library(glue)
56 | library(htmlwidgets)
57 | library(DT)
58 | 
59 | template <- "<b>{ volcano_name }</b><br>{ primary_volcano_type }"   # simple glue popup template (superseded by the kable table below)
60 | 
61 | volcano %>%
62 |   mutate(transformed_pop = log2(population_within_5_km + 1),
63 |          pop_color = colorNumeric(c("blue", "red"), transformed_pop)(transformed_pop)) %>%
64 |   gather(key, value,
65 |          volcano_name, primary_volcano_type, last_eruption_year,
66 |          country,
67 |          tectonic_settings,
68 |          population_within_5_km) %>%
69 |   mutate(key = str_to_title(str_replace_all(key, "_", " ")),
70 |          key = paste0("<b>", key, "</b>")) %>%   # bold the field names in the popup table
71 |   replace_na(list(value = "Unknown")) %>%
72 |   nest(data = c(key, value)) %>%
73 |   mutate(html = map(data,
74 |                     knitr::kable,
75 |                     format = "html",
76 |                     escape = FALSE,
77 |                     col.names = c("", ""))) %>%
78 |   leaflet() %>%
79 |   addTiles() %>%
80 |   addCircleMarkers(lat = ~ latitude,
81 |                    lng = ~ longitude,
82 |                    color = ~ pop_color,
83 |                    popup = ~ html,
84 |                    radius = 1) %>%
85 |   addMeasure()
86 | ```
87 | 
88 | ```{r}
89 | volcano %>%
90 |   ggplot(aes(population_within_5_km)) +
91 |   geom_histogram() +
92 |   scale_x_log10()
93 | ```
94 | 
95 | ```{r}
96 | library(gganimate)
97 | 
98 | eruptions %>%
99 |   ggplot(aes(2020 - start_year)) +
100 |   geom_histogram() +
101 |   scale_x_log10()
102 | 
103 | eruptions %>%
104 |   filter(start_year >= 1900,
105 |          eruption_category != "Discredited Eruption") %>%
106 |   ggplot(aes(2020 - start_year)) +
107 |   geom_histogram() +
108 |   scale_x_log10()
109 | ```
110 | 
111 | ```{r}
112 | library(gganimate)
113 | 
114 | anim_graph <- eruptions %>%
115 |   filter(start_year >= 1900) %>%
116 |   mutate(volume = .00001 * 10 ^ vei) %>%
117 |   ggplot(aes(longitude, latitude)) +
118 |   borders() +
119 |   geom_point(aes(size = volume,
120 |                  color = volume)) +
121 |   theme_map() +
122 |   scale_color_gradient2(low = "blue", high = "red",
123 |                         midpoint = log10(.01),
124 |                         trans = "log10",
125 |                         guide = FALSE) +
126 |   scale_size_continuous(range = c(1, 6)) +
127 |   # transition_reveal(start_year) +
128 |   transition_time(start_year) +
129 |   labs(title = "Eruptions: Year { round(frame_time) }") +
130 |   theme(legend.position = "none")
131 | 
132 | animate(anim_graph, 150, fps = 4)
133 | 
134 | anim_save("eruptions.gif")
135 | ```
136 | 
137 | 
138 | 
139 | 
140 | 
141 | ```{r}
142 | eruptions
143 | ```
144 | 
145 | 
-------------------------------------------------------------------------------- /women-workplace-app/app.R: --------------------------------------------------------------------------------
1 | library(shiny)
2 | library(ggplot2)
3 | library(plotly)
4 | library(dplyr)
5 | library(scales)
6 | theme_set(theme_light())
7 | 
8 | jobs_gender <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/jobs_gender.csv")
9 | 
10 | # Define the UI: a category selector and an interactive scatterplot
11 | ui <- fluidPage(
12 | 
13 |     # Application title
14 |     titlePanel("Occupation and gender explorer"),
15 | 
16 |     # Sidebar with a dropdown to choose an occupation category
17 |     sidebarLayout(
18 |         sidebarPanel(
19 |             selectInput("major_category",
20 |                         "Occupation category:",
21 |                         choices = unique(jobs_gender$major_category))
22 |         ),
23 | 
24 |         # Show the scatterplot for the selected category
25 |         mainPanel(
26 |            plotlyOutput("jobs_scatter", height = "700px")
27 |         )
28 |     )
29 | )
30 | 
31 | # Define server logic that draws the occupation scatterplot
32 | server <- function(input, output) {
33 | 
34 |     output$jobs_scatter <- renderPlotly({
35 |         p <- jobs_gender %>%
36 |             filter(year == 2016,
37 |                    total_workers >= 20000) %>%
38 |             filter(major_category == input$major_category) %>%
39 |             arrange(desc(wage_percent_of_male)) %>%
40 |             mutate(percent_female = workers_female / total_workers,
41 | 
wage_percent_female = total_earnings_female / total_earnings_male) %>% 42 | ggplot(aes(percent_female, 43 | wage_percent_female, 44 | color = minor_category, 45 | size = total_workers, 46 | label = occupation)) + 47 | geom_point() + 48 | scale_size_continuous(range = c(1, 10), guide = FALSE) + 49 | labs(x = "% of workforce reported as female", 50 | y = "% of median female salary / median male", 51 | title = "Gender disparity and pay gap in 2016", 52 | subtitle = "Only occupations with at least 20,000 workers total", 53 | color = "Minor category") + 54 | scale_x_continuous(labels = percent_format()) + 55 | scale_y_continuous(labels = percent_format()) 56 | 57 | ggplotly(p) 58 | }) 59 | } 60 | 61 | # Run the application 62 | shinyApp(ui = ui, server = server) 63 | 64 | -------------------------------------------------------------------------------- /women-workplace.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Women in the Workplace" 3 | output: html_document 4 | --- 5 | 6 | ```{r setup, include=FALSE} 7 | knitr::opts_chunk$set(echo = TRUE) 8 | ``` 9 | 10 | ```{r} 11 | library(tidyverse) 12 | library(scales) 13 | theme_set(theme_light()) 14 | 15 | jobs_gender <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/jobs_gender.csv") 16 | earnings_female <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/earnings_female.csv") 17 | employed_gender <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/employed_gender.csv") 18 | ``` 19 | 20 | ```{r} 21 | summarize_jobs_gender <- function(tbl) { 22 | tbl %>% 23 | summarize(total_earnings = sum(total_earnings * total_workers) / sum(total_workers), 24 | total_earnings_male = sum(total_earnings_male * workers_male, na.rm = TRUE) / 25 | sum(workers_male[!is.na(total_earnings_male)]), 26 | total_earnings_female = sum(total_earnings_female * workers_female, na.rm = TRUE) / 27 | sum(workers_female[!is.na(total_earnings_female)]), 28 | total_workers = sum(total_workers), 29 | workers_male = sum(workers_male), 30 | workers_female = sum(workers_female)) %>% 31 | mutate(wage_percent_of_male = total_earnings_female / total_earnings_male) 32 | } 33 | 34 | by_year_major_category <- jobs_gender %>% 35 | filter(complete.cases(.)) %>% 36 | group_by(year, major_category) %>% 37 | summarize_jobs_gender() 38 | 39 | by_year_major_category %>% 40 | mutate(major_category = fct_reorder(major_category, -total_earnings)) %>% 41 | ggplot(aes(year, total_earnings, color = major_category)) + 42 | geom_line() + 43 | expand_limits(y = 0) 44 | 45 | by_year_major_category %>% 46 | mutate(major_category = fct_reorder(major_category, -wage_percent_of_male)) %>% 47 | ggplot(aes(year, wage_percent_of_male, color = major_category)) + 48 | geom_line() 49 | ``` 50 | 51 | ```{r} 52 | by_minor_category_2016 <- jobs_gender %>% 53 | filter(year == 2016) %>% 54 | group_by(major_category, minor_category) %>% 55 | summarize_jobs_gender() %>% 56 | ungroup() 57 | ``` 58 | 59 | ```{r} 60 | by_minor_category_2016 %>% 61 | mutate(minor_category = fct_reorder(minor_category, wage_percent_of_male)) %>% 62 | ggplot(aes(minor_category, wage_percent_of_male, fill = major_category)) + 63 | geom_col() + 64 | coord_flip() 65 | ``` 66 | 67 | ```{r} 68 | library(plotly) 69 | 70 | p <- jobs_gender %>% 71 | filter(year == 2016) %>% 72 | filter(major_category == "Healthcare 
Practitioners and Technical") %>% 73 | arrange(desc(wage_percent_of_male)) %>% 74 | ggplot(aes(workers_female / total_workers, 75 | total_earnings, 76 | size = total_workers, 77 | label = occupation)) + 78 | geom_point() + 79 | scale_size_continuous(range = c(1, 10)) + 80 | labs(size = "Total # of workers", 81 | x = "% of workforce reported as female", 82 | y = "Median salary in the occupation") + 83 | scale_x_continuous(labels = percent_format()) + 84 | scale_y_continuous(labels = dollar_format()) + 85 | expand_limits(y = 0) 86 | 87 | ggplotly(p) 88 | ``` 89 | 90 | ```{r} 91 | p <- jobs_gender %>% 92 | filter(year == 2016, 93 | total_workers >= 20000) %>% 94 | filter(major_category == "Computer, Engineering, and Science") %>% 95 | arrange(desc(wage_percent_of_male)) %>% 96 | ggplot(aes(workers_female / total_workers, 97 | total_earnings_female / total_earnings_male, 98 | color = minor_category, 99 | size = total_workers, 100 | label = occupation)) + 101 | geom_point() + 102 | scale_size_continuous(range = c(1, 10)) + 103 | labs(size = "Total # of workers", 104 | x = "% of workforce reported as female", 105 | y = "% of median female salary / median male") + 106 | scale_x_continuous(labels = percent_format()) + 107 | scale_y_continuous(labels = percent_format()) 108 | 109 | ggplotly(p) 110 | ``` 111 | 112 | --------------------------------------------------------------------------------