├── .gitignore
├── 2020_07_28_penguins.Rmd
├── 2020_08_04_europe_energy.Rmd
├── 2020_08_18_extinct_plants.Rmd
├── 2020_08_25_chopped.Rmd
├── 2020_09_01_crop_yields.Rmd
├── 2020_09_08_friends.Rmd
├── 2020_09_15_government_spending_kids.Rmd
├── 2020_09_22_himalayan_climbers.Rmd
├── 2020_09_29_taylor_swift_beyonce.Rmd
├── 2020_10_06_ncaa_womens_basketball.Rmd
├── 2020_10_20_beer_awards.Rmd
├── 2020_11_03_ikea.Rmd
├── 2020_11_10_phone_history.Rmd
├── README.md
├── african-american-achievements.Rmd
├── african-american-history.Rmd
├── animal-crossing.Rmd
├── australian-animal-outcomes.Rmd
├── baltimore_bridges.Rmd
├── beach-volleyball.Rmd
├── beer-production.Rmd
├── bike_traffic.Rmd
├── bird-collisions.Rmd
├── board-games.Rmd
├── bob-ross.Rmd
├── broadway-shinybones
│   ├── .gitignore
│   ├── README.md
│   ├── _site.yml
│   ├── app.R
│   ├── broadway-shinybones.Rproj
│   ├── components
│   │   └── README.md
│   ├── pages
│   │   └── README.md
│   └── show_metrics.rds
├── broadway.Rmd
├── car-economy.Rmd
├── caribou-locations.Rmd
├── cetaceans.Rmd
├── cocktails.Rmd
├── coffee-ratings.Rmd
├── college-majors.Rmd
├── cord-19.Rmd
├── cran-code.Rmd
├── crop-yields-shiny
│   ├── app.Rmd
│   └── yields_tidy.rds
├── data-screencasts.Rproj
├── french-trains.Rmd
├── gdpr.Rmd
├── golden-age-tv.Rmd
├── grand-slams.Rmd
├── honeycomb-puzzle.Rmd
├── horror-movie-ratings.Rmd
├── malaria.Rmd
├── media-franchises.Rmd
├── medium-datasci.Rmd
├── movie-profit.Rmd
├── nobel-prize.Rmd
├── nyc-pizza.Rmd
├── nyc-restaurants.Rmd
├── nyc-squirrels-app
│   └── app.R
├── nyc-squirrels.Rmd
├── office-transcripts.Rmd
├── pascals-triangle.Rmd
├── plastic-waste.Rmd
├── r-downloads.Rmd
├── ramen-ratings.Rmd
├── riddler-die-reroll.Rmd
├── riddler-die-roll-low.Rmd
├── riddler-prisoner-coin-flip.Rmd
├── riddler-spam-comments.Rmd
├── screencast-annotations
│   └── README.md
├── seattle-pets.Rmd
├── simpsons-guests.Rmd
├── space-launches.Rmd
├── student-teacher-ratios.Rmd
├── thanksgiving.Rmd
├── tidytuesday-tweets.Rmd
├── tour-de-france.Rmd
├── trees.Rmd
├── umbrella-week.Rmd
├── uncanny-xmen.Rmd
├── us-dairy.Rmd
├── us-wind.Rmd
├── us_phds.Rmd
├── volcano-eruptions.Rmd
├── wine-ratings.Rmd
├── women-workplace-app
│   └── app.R
├── women-workplace.Rmd
└── womens-world-cup.Rmd
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 |
--------------------------------------------------------------------------------
/2020_09_01_crop_yields.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "TidyTemplate"
3 | date: 2020-09-01
4 | output: html_document
5 | ---
6 |
7 | ```{r setup, include=FALSE}
8 |
9 | knitr::opts_chunk$set(echo = TRUE)
10 |
11 | library(tidyverse)
12 | library(tidytuesdayR)
13 | library(scales)
14 | library(janitor)
15 | theme_set(theme_light())
16 |
17 | ```
18 |
19 | # Load the weekly Data
20 |
21 | Download the weekly data and make it available in the `tt` object.
22 |
23 | ```{r Load}
24 | tt <- tt_load("2020-09-01")
25 | ```
26 |
27 | ```{r}
28 | tt$key_crop_yields %>%
29 | View()
30 |
31 | yields <- tt$key_crop_yields %>%
32 | clean_names() %>%
33 | rename_all(str_remove, "_tonnes.*")
34 |
35 | arable_land <- tt$arable_land_pin %>%
36 | clean_names() %>%
37 | rename(arable_land_needed = 4)
38 |
39 | arable_land %>%
40 | filter(entity == "Afghanistan") %>%
41 | ggplot(aes(year, arable_land_needed)) +
42 | geom_line()
43 |
44 | fertilizer <- tt$cereal_crop_yield_vs_fertilizer_application %>%
45 | janitor::clean_names() %>%
46 | rename(yield = 4,
47 | fertilizer_use = 5)
48 | ```
49 |
50 | ```{r}
51 | yields_tidy <- yields %>%
52 | pivot_longer(wheat:bananas, names_to = "crop", values_to = "yield") %>%
53 | filter(!is.na(yield)) %>%
54 | mutate(crop = str_replace_all(crop, "_", " "),
55 | crop = str_to_title(crop))
56 |
57 | yields_tidy %>%
58 | write_rds("crop-yields-shiny/yields_tidy.rds")
59 | ```
60 |
61 | ```{r}
62 | yields_tidy %>%
63 | filter(code == "USA") %>%
64 | mutate(crop = fct_reorder(crop, -yield)) %>%
65 | ggplot(aes(year, yield)) +
66 | geom_line() +
67 | facet_wrap(~ crop)
68 |
69 | yields_tidy %>%
70 | filter(code == "USA") %>%
71 | mutate(crop = fct_reorder(crop, -yield)) %>%
72 | ggplot(aes(year, yield, color = crop)) +
73 | geom_line() +
74 | labs(x = "Year",
75 | y = "Yield (tonnes per hectare)",
76 | title = "Crop yields in the US over time",
77 | color = "Crop")
78 |
79 | yields_tidy %>%
80 | filter(entity == "India") %>%
81 | mutate(crop = fct_reorder(crop, -yield)) %>%
82 | ggplot(aes(year, yield, color = crop)) +
83 | geom_line() +
84 | labs(x = "Year",
85 | y = "Yield (tonnes per hectare)",
86 |        title = "Crop yields in India over time",
87 | color = "Crop")
88 | ```
89 |
90 | ```{r}
91 | yields_tidy %>%
92 | filter(crop == "Wheat") %>%
93 | add_count(entity) %>%
94 | filter(n == max(n)) %>%
95 | filter(entity %in% sample(unique(entity), 25)) %>%
96 | ggplot(aes(year, yield)) +
97 | geom_line() +
98 | facet_wrap(~ entity)
99 |
100 | crop_yields_50_years <- yields_tidy %>%
101 | arrange(entity, year) %>%
102 | filter(year >= 1968) %>%
103 | group_by(entity, code, crop) %>%
104 | summarize(year_start = min(year),
105 | year_end = max(year),
106 | yield_start = first(yield),
107 | yield_end = last(yield)) %>%
108 | ungroup() %>%
109 | filter(year_start == 1968) %>%
110 | mutate(yield_ratio = yield_end / yield_start)
111 |
112 | crop_yields_50_years %>%
113 | filter(!is.na(code)) %>%
114 | ggplot(aes(yield_start, yield_end)) +
115 | geom_abline(color = "red") +
116 | geom_point() +
117 | facet_wrap(~ crop, scales = "free")
118 |
119 | crop_yields_50_years %>%
120 | mutate(crop = fct_reorder(crop, yield_ratio)) %>%
121 | ggplot(aes(yield_ratio, crop)) +
122 | geom_boxplot() +
123 | scale_x_log10()
124 |
125 | crop_yields_50_years %>%
126 | group_by(crop) %>%
127 | summarize(median_yield_ratio = median(yield_ratio)) %>%
128 | mutate(crop = fct_reorder(crop, median_yield_ratio)) %>%
129 | ggplot(aes(median_yield_ratio, crop)) +
130 | geom_col() +
131 | labs(title = "How much has the average country improved at producing this crop?",
132 | x = "(2018 yield) / (1968 yield)",
133 | y = "")
134 |
135 | crop_yields_50_years %>%
136 | filter(is.na(code)) %>%
137 | filter(entity %in% c("Africa", "Asia", "Northern America", "South America", "Oceania")) %>%
138 | ggplot(aes(yield_start, yield_end, color = entity)) +
139 | geom_abline(color = "red") +
140 | geom_point() +
141 | expand_limits(y = 0, x = 0) +
142 | facet_wrap(~ crop, scales = "free") +
143 | labs(x = "Tonnes per hectare in 1968",
144 | y = "Tonnes per hectare in 2018",
145 | color = "Continent")
146 | ```
147 |
148 | ```{r}
149 | library(ggrepel)
150 | library(countrycode)
151 |
152 | crop_yields_50_years %>%
153 | filter(crop == "Wheat",
154 | !is.na(code)) %>%
155 | mutate(continent = countrycode(code, "iso3c", "continent")) %>%
156 | filter(!is.na(continent)) %>%
157 | ggplot(aes(yield_start, yield_ratio)) +
158 | geom_point(aes(color = continent)) +
159 | scale_x_log10() +
160 | scale_y_log10(breaks = c(.25, .5, 1, 2, 4),
161 | labels = c("1/4X", "1/2X", "Same", "2X", "4X")) +
162 | geom_hline(yintercept = 1, linetype = "dotted") +
163 | geom_text_repel(aes(label = entity), force = .1,
164 | size = 2.5) +
165 | labs(x = "1968 yield (tonnes per hectare), log scale",
166 | y = "(2018 yield) / (1968 yield), log scale",
167 | color = "Continent",
168 | title = "How has wheat efficiency changed across countries?")
169 |
170 | countrycode("USA", "iso3c", "continent")
171 | ```
172 |
173 |
174 |
175 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Code from screencasts of #tidytuesday
2 |
3 | These are the R Markdown documents produced during live analyses of [#tidytuesday data](https://github.com/rfordatascience/tidytuesday).
4 |
--------------------------------------------------------------------------------
/african-american-achievements.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "African-American Achievements"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | tuesdata <- tidytuesdayR::tt_load('2020-06-09')
15 |
16 | science <- tuesdata$science
17 | ```
18 |
19 | ```{r}
20 | tuesdata$firsts %>%
21 | View()
22 |
23 | tuesdata$science %>%
24 | View()
25 | ```
26 |
27 | ```{r}
28 | firsts <- tuesdata$firsts
29 |
30 | tuesdata$firsts %>%
31 | ggplot(aes(year)) +
32 | geom_histogram()
33 |
34 | tuesdata$firsts %>%
35 | count(category, sort = TRUE) %>%
36 | mutate(category = fct_reorder(category, n)) %>%
37 | ggplot(aes(n, category)) +
38 | geom_col()
39 | ```
40 |
41 | ```{r}
42 | firsts <- tuesdata$firsts %>%
43 | mutate(person = str_remove(person, "[\\[\\(].*"),
44 | person = str_trim(person))
45 | ```
46 |
47 | ```{r}
48 | library(plotly)
49 | library(glue)
50 |
51 | g <- firsts %>%
52 | ggplot(aes(year,
53 | category,
54 | color = category,
55 | text = glue("{ year }: { accomplishment }\n{ person }"))) +
56 | geom_point() +
57 | theme(axis.text.y = element_blank(),
58 | axis.ticks.y = element_blank(),
59 | panel.grid.major.y = element_blank(),
60 | panel.grid.minor.y = element_blank(),
61 | legend.position = "none") +
62 | labs(title = "Timeline of some notable African-American achievements",
63 | subtitle = "Source: https://en.wikipedia.org/wiki/List_of_African-American_firsts",
64 | y = "Category",
65 | x = "Year")
66 |
67 | ggplotly(g, tooltip = "text")
68 | ```
69 |
70 | ### Science
71 |
72 | ```{r}
73 | tuesdata$science %>%
74 | ggplot(aes(birth)) +
75 | geom_histogram()
76 |
77 | tuesdata$science %>%
78 | separate_rows(occupation_s, sep = "; ") %>%
79 | mutate(occupation = str_to_title(occupation_s)) %>%
80 | count(occupation, sort = TRUE)
81 |
82 | science %>%
83 | filter(str_detect(occupation_s, regex("istician", ignore_case = TRUE))) %>%
84 | pull(name)
85 |
86 | science %>%
87 | filter(str_detect(occupation_s, "statistician")) %>%
88 | View()
89 | ```
90 |
91 | ```{r}
92 | library(rvest)
93 |
94 | science_html <- science %>%
95 | mutate(html = map(links, possibly(read_html, NULL, quiet = FALSE)))
96 | ```
97 |
98 | ```{r}
99 | extract_infobox <- . %>%
100 | html_node(".vcard") %>%
101 | html_table(header = FALSE) %>%
102 | as_tibble()
103 |
104 | infoboxes <- science_html %>%
105 | filter(!map_lgl(html, is.null)) %>%
106 | mutate(infobox = map(html, possibly(extract_infobox, NULL))) %>%
107 | select(link = links, infobox) %>%
108 | unnest(infobox) %>%
109 | filter(X1 != "" | X2 != "", X1 != "Scientific career") %>%
110 | rename(key = X1, value = X2)
111 |
112 | science_infoboxes <- infoboxes %>%
113 | group_by(link) %>%
114 | mutate(name = first(key)) %>%
115 | group_by(key) %>%
116 | filter(n() >= 10) %>%
117 | ungroup() %>%
118 | distinct(name, key, .keep_all = TRUE) %>%
119 | spread(key, value) %>%
120 | janitor::clean_names()
121 | ```
122 |
123 | ```{r}
124 | science_infoboxes %>%
125 | count(nationality, sort = TRUE)
126 | ```
127 |
128 |
129 |
130 |
131 |
132 |
--------------------------------------------------------------------------------
/african-american-history.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "African-American History"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(tidytuesdayR)
13 | library(scales)
14 | theme_set(theme_light())
15 |
16 | tuesdata <- tt_load('2020-06-16')
17 | ```
18 |
19 | ```{r}
20 | slave_routes <- tuesdata$slave_routes
21 |
22 | slave_routes %>%
23 | summarize(sum(n_slaves_arrived, na.rm = TRUE))
24 |
25 | slave_routes %>%
26 | count(ship_name, sort = TRUE)
27 |
28 | slave_routes %>%
29 | ggplot(aes(n_slaves_arrived)) +
30 | geom_histogram()
31 | ```
32 |
33 | ```{r}
34 | slave_routes %>%
35 | filter(!is.na(ship_name), !is.na(n_slaves_arrived)) %>%
36 | filter(fct_lump(ship_name, 12) != "Other") %>%
37 | mutate(ship_name = fct_reorder(ship_name, n_slaves_arrived)) %>%
38 | ggplot(aes(n_slaves_arrived, ship_name)) +
39 | geom_boxplot()
40 | ```
41 |
42 | ```{r}
43 | slave_routes %>%
44 | count(port_arrival, sort = TRUE)
45 |
46 | slave_routes %>%
47 | count(port_origin, sort = TRUE)
48 |
49 |
50 | slave_routes %>%
51 | ggplot(aes(year_arrival)) +
52 | geom_histogram() +
53 | geom_vline(xintercept = c(1776, 1865), lty = 2) +
54 | labs(title = "# of slave ships over time")
55 | ```
56 |
57 | ```{r}
58 | slave_routes %>%
59 | filter(!is.na(port_origin)) %>%
60 | mutate(port_origin = str_trunc(port_origin, 25),
61 | port_origin = fct_lump(port_origin, 24),
62 | port_origin = fct_reorder(port_origin, year_arrival)) %>%
63 | count(port_origin,
64 | decade = 10 * (year_arrival %/% 10)) %>%
65 | ggplot(aes(decade, n)) +
66 | geom_line() +
67 | facet_wrap(~ port_origin) +
68 | geom_vline(xintercept = c(1807), lty = 2) +
69 | theme(strip.text = element_text(size = 6))
70 |
71 | slave_routes %>%
72 | filter(port_origin == "New York") %>%
73 | arrange(desc(year_arrival)) %>%
74 | View()
75 | ```
76 |
77 | ```{r}
78 | slave_routes %>%
79 | filter(fct_lump(ship_name, 12) != "Other") %>%
80 | count(ship_name, port_origin, port_arrival, sort = TRUE) %>%
81 | group_by(ship_name) %>%
82 | mutate(percent = n / sum(n)) %>%
83 | top_n(1, percent) %>%
84 | arrange(desc(percent))
85 | ```
86 |
87 | ```{r}
88 | library(grid)
89 | library(ggraph)
90 |
91 | set.seed(2020)
92 |
93 | slave_routes %>%
94 | count(port_origin, port_arrival, sort = TRUE) %>%
95 | filter(port_origin != port_arrival) %>%
96 | head(40) %>%
97 | ggraph(layout = "fr") +
98 | geom_edge_link(aes(width = n),
99 | arrow = arrow(type = "closed",
100 | length = unit(.1, "inch"))) +
101 | geom_node_point() +
102 | geom_node_text(aes(label = name), repel = TRUE) +
103 | scale_edge_width_continuous(range = c(.5, 4)) +
104 | labs(title = "Common slave routes",
105 | edge_width = "# of ships")
106 | ```
107 |
108 | ```{r}
109 | slave_routes %>%
110 | group_by(port_origin) %>%
111 | summarize(n_ships = n(),
112 | total_recorded = sum(n_slaves_arrived, na.rm = TRUE),
113 | pct_missing = mean(is.na(n_slaves_arrived)),
114 | estimated_total = mean(n_slaves_arrived, na.rm = TRUE) * n()) %>%
115 | arrange(desc(n_ships))
116 | ```
117 |
118 | ```{r}
119 | census <- tuesdata$census
120 |
121 | census_gathered <- census %>%
122 | mutate(other = total - white - black) %>%
123 | arrange(year) %>%
124 | gather(racial_category, population, white, black_free, black_slaves, other)
125 |
126 | census_gathered %>%
127 | filter(region == "USA Total") %>%
128 | ggplot(aes(year, population, fill = racial_category)) +
129 | geom_col() +
130 | scale_y_continuous(labels = comma) +
131 | labs(x = "Year",
132 | y = "Census Population",
133 | fill = "Racial category",
134 | title = "Census racial makeup of US, 1790-1870",
135 | subtitle = "No 'other' category existed before 1860")
136 | ```
137 |
138 | ```{r}
139 | census_gathered %>%
140 | filter(region != "USA Total", !is.na(division)) %>%
141 | mutate(division = fct_reorder(division, -population, sum)) %>%
142 | ggplot(aes(year, population, fill = racial_category)) +
143 | geom_col() +
144 | scale_y_continuous(labels = comma) +
145 | facet_wrap(~ division) +
146 | labs(x = "Year",
147 | y = "Census Population",
148 | fill = "Racial category",
149 | title = "Census racial makeup of US, 1790-1870",
150 | subtitle = "No 'other' category existed before 1860")
151 | ```
152 |
153 | ```{r}
154 | census_gathered %>%
155 | filter(region != "USA Total", !is.na(division)) %>%
156 | mutate(division = fct_reorder(division, -population, sum)) %>%
157 | group_by(division, year) %>%
158 | mutate(percent = population / sum(population)) %>%
159 | ggplot(aes(year, percent, fill = racial_category)) +
160 | geom_col() +
161 | scale_y_continuous(labels = percent) +
162 | facet_wrap(~ division) +
163 | labs(x = "Year",
164 | y = "% of Census Population",
165 | fill = "Racial category",
166 | title = "Census racial makeup of US, 1790-1870",
167 | subtitle = "No 'other' category existed before 1860")
168 | ```
169 |
170 | ```{r}
171 | library(ggwordcloud)
172 |
173 | name_counts <- tuesdata$african_names %>%
174 | count(name, gender, sort = TRUE)
175 | library(wordcloud)
176 | wordcloud(name_counts$name, name_counts$n)
177 |
178 | name_counts %>%
179 | head(100) %>%
180 | ggplot(aes(label = name, size = n, color = gender)) +
181 | geom_text_wordcloud()
182 | ```
183 |
184 | ```{r}
185 | library(tidytext)
186 |
187 | tuesdata$african_names %>%
188 | filter(!is.na(gender)) %>%
189 | mutate(gender = fct_recode(gender, Man = "Boy", Woman = "Girl")) %>%
190 | count(name, gender, sort = TRUE) %>%
191 | group_by(gender) %>%
192 | top_n(20, n) %>%
193 | ungroup() %>%
194 | mutate(name = reorder_within(name, n, gender)) %>%
195 | ggplot(aes(n, name)) +
196 | geom_col() +
197 | scale_y_reordered() +
198 | facet_wrap(~ gender, scales = "free")
199 | ```
200 |
201 | ```{r}
202 | tuesdata$african_names %>%
203 | count(country_origin, sort = TRUE)
204 | ```
205 |
206 |
207 |
--------------------------------------------------------------------------------
/animal-crossing.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Animal Crossing"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | critic <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/critic.tsv')
15 | user_reviews <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/user_reviews.tsv')
16 | items <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/items.csv')
17 | villagers <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-05-05/villagers.csv')
18 | ```
19 |
20 | ```{r}
21 | villagers
22 | items %>%
23 | View()
24 | ```
25 |
26 | ### Text Analysis
27 |
28 | ```{r}
29 | critic %>%
30 | ggplot(aes(grade)) +
31 | geom_histogram()
32 |
33 | user_reviews %>%
34 | ggplot(aes(grade)) +
35 | geom_histogram()
36 |
37 | head(user_reviews$text)
38 | ```
39 |
40 | ```{r}
41 | library(tidytext)
42 | library(lubridate)
43 |
44 | user_review_words <- user_reviews %>%
45 | unnest_tokens(word, text) %>%
46 | anti_join(stop_words, by = "word") %>%
47 | count(user_name, date, grade, word)
48 | ```
49 |
50 | ```{r}
51 | by_week <- user_reviews %>%
52 | group_by(week = floor_date(date, "week", week_start = 1)) %>%
53 | summarize(nb_reviews = n(),
54 | avg_grade = mean(grade),
55 | pct_zero = mean(grade == 0),
56 | pct_ten = mean(grade == 10))
57 |
58 | by_week %>%
59 | ggplot(aes(week, avg_grade)) +
60 | geom_line() +
61 | geom_point(aes(size = nb_reviews)) +
62 | expand_limits(y = 0) +
63 | labs(x = "Time",
64 | y = "Average grade",
65 | size = "# of reviews")
66 |
67 | user_reviews %>%
68 | ggplot(aes(grade)) +
69 | geom_histogram() +
70 | labs(title = "Most reviews were very low or very high")
71 |
72 | by_week %>%
73 | gather(type, value, contains("pct")) %>%
74 | mutate(type = ifelse(type == "pct_zero", "% rated 0", "% rated 10")) %>%
75 | ggplot(aes(week, value, color = type)) +
76 | geom_line() +
77 | geom_point(aes(size = nb_reviews)) +
78 | scale_y_continuous(labels = scales::percent) +
79 | expand_limits(y = 0) +
80 | labs(x = "Time",
81 | y = "% of reviews",
82 | size = "Total reviews in week",
83 | title = "Reviews got more polarizing in middle of game")
84 | ```
85 |
86 |
87 | ```{r}
88 | # user_reviews %>%
89 | # mutate(start_text = str_sub(text, 1, pmin(30, str_length(text))),
90 | # regex = paste0(".", start_text, ".*")) %>%
91 | # mutate(new_text = map2_chr(text, regex, str_remove)) %>%
92 | # filter(str_length(new_text) != str_length(text))
93 |
94 | by_word <- user_review_words %>%
95 | group_by(word) %>%
96 | summarize(avg_grade = mean(grade),
97 | nb_reviews = n()) %>%
98 | arrange(desc(nb_reviews)) %>%
99 | filter(nb_reviews >= 25) %>%
100 | arrange(desc(avg_grade))
101 |
102 | by_word %>%
103 | filter(nb_reviews >= 75) %>%
104 | ggplot(aes(nb_reviews, avg_grade)) +
105 | geom_point() +
106 | geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) +
107 | scale_x_log10()
108 | ```
109 |
110 | ```{r}
111 | by_word %>%
112 | top_n(20, -avg_grade) %>%
113 | ggplot(aes(nb_reviews, avg_grade)) +
114 | geom_point() +
115 | geom_text(aes(label = word), vjust = 1, hjust = 1, check_overlap = TRUE) +
116 | scale_x_log10() +
117 | labs(title = "What words were associated with low-grade reviews?",
118 | subtitle = "20 most negative words; only words in at least 25 reviews")
119 | ```
120 |
121 | ```{r}
122 | library(widyr)
123 | library(stm)
124 |
125 | review_matrix <- user_review_words %>%
126 | group_by(word) %>%
127 | filter(n() >= 25) %>%
128 | cast_sparse(user_name, word, n)
129 |
130 | topic_model_6 <- stm(review_matrix,
131 | K = 6,
132 | verbose = TRUE,
133 | init.type = "Spectral",
134 | emtol = 5e-5)
135 | ```
136 |
137 | ```{r}
138 | tidy(topic_model_6) %>%
139 | group_by(topic) %>%
140 | top_n(12, beta) %>%
141 | mutate(term = reorder_within(term, beta, topic)) %>%
142 | ggplot(aes(beta, term, fill = factor(topic))) +
143 | geom_col(show.legend = FALSE) +
144 | scale_y_reordered() +
145 | facet_wrap(~ topic, scales = "free_y")
146 |
147 | tidy(topic_model_6) %>%
148 | filter(term == "progress")
149 |
150 | topic_model_gamma <- tidy(topic_model_6, matrix = "gamma") %>%
151 | mutate(user_name = rownames(review_matrix)[document]) %>%
152 | inner_join(user_reviews, by = "user_name")
153 |
154 | topic_model_gamma %>%
155 | group_by(topic) %>%
156 | top_n(1, gamma)
157 |
158 | topic_model_gamma %>%
159 | group_by(topic) %>%
160 | summarize(correlation = cor(gamma, grade),
161 | spearman_correlation = cor(gamma, grade, method = "spearman"))
162 |
163 | topic_model_gamma %>%
164 | group_by(week = floor_date(date, "week", week_start = 1),
165 | topic) %>%
166 | summarize(avg_gamma = mean(gamma)) %>%
167 | ggplot(aes(week, avg_gamma, color = factor(topic))) +
168 | geom_line() +
169 | expand_limits(y = 0) +
170 | scale_y_continuous(labels = scales::percent) +
171 | labs(x = "Time",
172 | y = "Average gamma (document-topic association)")
173 | ```
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
--------------------------------------------------------------------------------
/beach-volleyball.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Beach Volleyball"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(scales)
13 | library(lubridate)
14 | theme_set(theme_light())
15 |
16 | tuesdata <- tidytuesdayR::tt_load('2020-05-19')
17 |
18 | vb_matches <- tuesdata$vb_matches %>%
19 | mutate(match_id = row_number())
20 | ```
21 |
22 | ```{r}
23 | vb_matches %>%
24 | count(circuit, tournament, date, sort = TRUE)
25 |
26 | vb_matches %>%
27 | count(gender, sort = TRUE)
28 |
29 | vb_matches %>%
30 |   count(year)
31 | 
32 | ```
33 |
34 | ```{r}
35 | vb_long <- vb_matches %>%
36 | rename(w_p1_name = w_player1, w_p2_name = w_player2,
37 | l_p1_name = l_player1, l_p2_name = l_player2,
38 | w_team_rank = w_rank, l_team_rank = l_rank) %>%
39 | mutate_at(vars(starts_with("w_"), starts_with("l_")), as.character) %>%
40 | pivot_longer(cols = c(starts_with("w_"), starts_with("l_"))) %>%
41 | separate(name, c("winner_loser", "player", "name"),
42 | sep = "_",
43 | extra = "merge",
44 | fill = "right") %>%
45 | mutate(winner_loser = str_to_upper(winner_loser))
46 |
47 | vb_player_matches <- vb_long %>%
48 | filter(name != "rank") %>%
49 | spread(name, value) %>%
50 | type_convert()
51 | ```
52 |
53 | ```{r}
54 | vb_sets <- vb_matches %>%
55 | select(match_id, circuit:match_num, score) %>%
56 | separate_rows(score, sep = ", ") %>%
57 | mutate(score = str_remove(score, " retired")) %>%
58 | mutate(score = na_if(score, "Forfeit or other")) %>%
59 | separate(score, c("w_score", "l_score"), convert = TRUE)
60 | ```
61 |
62 | ## Look at players
63 |
64 | ```{r}
65 | by_player <- vb_player_matches %>%
66 | group_by(name, gender) %>%
67 | summarize(n_matches = n(),
68 | pct_winner = mean(winner_loser == "W"),
69 | first_game = min(date),
70 | last_game = max(date)) %>%
71 | arrange(desc(n_matches)) %>%
72 | ungroup()
73 |
74 | by_player %>%
75 | filter(n_matches >= 200) %>%
76 | ggplot(aes(n_matches, pct_winner, color = gender)) +
77 | geom_point() +
78 | scale_x_log10() +
79 | scale_y_continuous(labels = percent) +
80 | labs(x = "# of matches since 2000 (log scale)",
81 | y = "% of matches won")
82 |
83 | by_player %>%
84 | filter(n_matches >= 200) %>%
85 | arrange(desc(pct_winner))
86 | ```
87 |
88 | ```{r}
89 | vb_player_matches %>%
90 | summarize_all(~ mean(!is.na(.))) %>%
91 | gather() %>%
92 | View()
93 | ```
94 |
95 | ```{r}
96 | vb_player_matches %>%
97 | group_by(tournament) %>%
98 | summarize(pct_has_attacks = mean(!is.na(tot_attacks)),
99 | n = n()) %>%
100 | arrange(desc(n))
101 | ```
102 |
103 | ### How would we predict if a player will win in 2019?
104 |
105 | ```{r}
106 | summarize_players <- . %>%
107 | summarize(n_matches = n(),
108 | pct_winner = mean(winner_loser == "W"),
109 | avg_attacks = mean(tot_attacks, na.rm = TRUE),
110 | avg_errors = mean(tot_errors, na.rm = TRUE),
111 | avg_serve_errors = mean(tot_serve_errors, na.rm = TRUE),
112 | avg_kills = mean(tot_kills, na.rm = TRUE),
113 | avg_aces = mean(tot_aces, na.rm = TRUE),
114 | n_with_data = sum(!is.na(tot_attacks))) %>%
115 | ungroup() %>%
116 | arrange(desc(n_matches))
117 |
118 | players_before_2019 <- vb_player_matches %>%
119 | filter(year < 2019) %>%
120 | group_by(name, gender, hgt, birthdate, country) %>%
121 | summarize_players() %>%
122 | filter(!is.na(avg_attacks))
123 |
124 | players_2019 <- vb_player_matches %>%
125 | filter(year == 2019) %>%
126 | group_by(name, gender, hgt, birthdate, country, year,
127 | age = year - year(birthdate)) %>%
128 | summarize_players()
129 | ```
130 |
131 | ```{r}
132 | performance_joined <- players_before_2019 %>%
133 | inner_join(players_2019 %>%
134 | select(name, n_matches, pct_winner),
135 | by = "name",
136 | suffix = c("", "_2019"))
137 |
138 | performance_joined %>%
139 | filter(n_matches >= 10,
140 | n_matches_2019 >= 5) %>%
141 | ggplot(aes(pct_winner, pct_winner_2019)) +
142 | geom_point() +
143 | geom_abline(color = "red") +
144 | geom_smooth(method = "lm")
145 |
146 | performance_joined %>%
147 | mutate(n_wins_2019 = n_matches_2019 * pct_winner_2019,
148 | country = fct_lump(country, 3)) %>%
149 | glm(cbind(n_wins_2019, n_matches_2019 - n_wins_2019) ~
150 | pct_winner + avg_errors + avg_serve_errors,
151 | data = .,
152 | family = "binomial") %>%
153 | summary()
154 | ```
155 |
156 |
157 | ```{r}
158 | players_before_2019 %>%
159 | filter(n_with_data >= 20) %>%
160 | ggplot(aes(avg_serve_errors, avg_aces, size = n_with_data)) +
161 | geom_point() +
162 | labs(size = "Matches",
163 | title = "DON'T TRUST THIS")
164 | ```
165 |
166 | ```{r, eval=FALSE}
167 | # Leftover from the live session: `player_first_year_summarized` was never
168 | # defined above, so this chunk is not evaluated
169 | player_first_year_summarized %>%
170 |   filter(!is.nan(avg_attacks))
171 | ```
181 |
182 |
183 |
184 |
--------------------------------------------------------------------------------
/bike_traffic.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Bike Traffic"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(lubridate)
13 | library(scales)
14 | theme_set(theme_light())
15 |
16 | bike_traffic_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-02/bike_traffic.csv")
17 |
18 | bike_traffic <- bike_traffic_raw %>%
19 | mutate(date = mdy_hms(date)) %>%
20 |   filter(is.na(bike_count) | bike_count < 2000) %>%  # keep NA counts; missingness is examined below
21 | select(-ped_count)
22 | ```
23 |
24 | ```{r}
25 | bike_traffic %>%
26 | count(crossing, direction)
27 | ```
28 |
29 | ```{r}
30 | bike_traffic %>%
31 | ggplot(aes(date, fill = is.na(bike_count))) +
32 | geom_histogram() +
33 | facet_grid(crossing ~ direction)
34 | ```
35 |
36 | ### When in the day do we see bikers?
37 |
38 | ```{r}
39 | bike_traffic %>%
40 | group_by(crossing,
41 | hour = hour(date)) %>%
42 | summarize(bike_count = sum(bike_count, na.rm = TRUE)) %>%
43 | mutate(pct_bike = bike_count / sum(bike_count)) %>%
44 | ggplot(aes(hour, pct_bike, color = crossing)) +
45 | geom_line() +
46 | geom_point() +
47 | scale_y_continuous(labels = percent_format()) +
48 | labs(title = "When in the day do people bike through these Seattle crossings?",
49 | subtitle = "Based on crossings from 2014-February 2019",
50 | color = "Crossing",
51 | x = "Time of day (local time)",
52 | y = "% of bike crossings that happen in this hour")
53 | ```
54 |
55 | ```{r}
56 | bike_by_time_window <- bike_traffic %>%
57 | mutate(hour = hour(date)) %>%
58 | mutate(time_window = case_when(
59 | between(hour, 7, 10) ~ "Morning Commute",
60 | between(hour, 11, 15) ~ "Midday",
61 | between(hour, 16, 18) ~ "Evening Commute",
62 | TRUE ~ "Night"
63 | )) %>%
64 | group_by(crossing,
65 | time_window) %>%
66 | summarize(number_missing = sum(is.na(bike_count)),
67 | bike_count = sum(bike_count, na.rm = TRUE)) %>%
68 | mutate(pct_bike = bike_count / sum(bike_count))
69 |
70 | bike_by_time_window %>%
71 | select(-number_missing, -bike_count) %>%
72 | spread(time_window, pct_bike) %>%
73 | mutate(TotalCommute = `Evening Commute` + `Morning Commute`) %>%
74 | arrange(desc(TotalCommute))
75 |
76 | bike_by_time_window %>%
77 | ggplot(aes(time_window, pct_bike)) +
78 | geom_col() +
79 | coord_flip() +
80 | facet_wrap(~ crossing)
81 |
82 | bike_by_time_window %>%
83 | group_by(crossing) %>%
84 | summarize(total_bikes = sum(bike_count),
85 | pct_commute = sum(bike_count[str_detect(time_window, "Commute")]) / total_bikes) %>%
86 | ggplot(aes(total_bikes, pct_commute)) +
87 | geom_point() +
88 | scale_x_log10()
89 | ```
90 |
91 | ```{r}
92 | bike_traffic %>%
93 | group_by(crossing,
94 | weekday = wday(date, label = TRUE),
95 | hour = hour(date)) %>%
96 | summarize(total_bikes = sum(bike_count, na.rm = TRUE)) %>%
97 | group_by(crossing) %>%
98 | mutate(pct_bike = total_bikes / sum(total_bikes)) %>%
99 | ggplot(aes(hour, pct_bike, color = crossing)) +
100 | geom_line(show.legend = FALSE) +
101 | facet_grid(crossing ~ weekday) +
102 | scale_y_continuous(labels = percent_format()) +
103 |   labs(x = "Time of day",
104 | y = "% of bike crossings happening in this hour",
105 | title = "When in the week do people in Seattle bike?",
106 | subtitle = "Based on crossings from 2014-February 2019")
107 | ```
108 |
109 | ```{r}
110 | bike_traffic %>%
111 | filter(date < "2018-01-01") %>%
112 | group_by(crossing,
113 | month = fct_relevel(month.name[month(date)], month.name)) %>%
114 | summarize(total_bikes = sum(bike_count, na.rm = TRUE)) %>%
115 | mutate(pct_bike = total_bikes / sum(total_bikes)) %>%
116 | ggplot(aes(month, pct_bike, color = crossing, group = crossing)) +
117 | geom_line() +
118 | expand_limits(y = 0) +
119 | scale_y_continuous(labels = percent_format()) +
120 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
121 | labs(title = "What time of year do people bike?",
122 | subtitle = "Based on 2014-2017 bike crossings",
123 | y = "% of yearly trips in this month",
124 | x = "")
125 | ```
126 |
127 | ### What directions do people commute by bike?
128 |
129 | ```{r}
130 | bike_by_direction_hour_crossing <- bike_traffic %>%
131 | filter(crossing != "MTS Trail",
132 | !wday(date, label = TRUE) %in% c("Sat", "Sun"),
133 | direction %in% c("North", "South")) %>%
134 | mutate(hour = hour(date)) %>%
135 | group_by(crossing,
136 | direction,
137 | hour) %>%
138 | summarize(bike_count = sum(bike_count, na.rm = TRUE)) %>%
139 | mutate(pct_bike = bike_count / sum(bike_count))
140 |
141 | bike_by_direction_hour_crossing %>%
142 | group_by(crossing) %>%
143 | mutate(average_hour = sum((hour * pct_bike)[direction == "North"])) %>%
144 | ungroup() %>%
145 | mutate(crossing = fct_reorder(crossing, average_hour)) %>%
146 | ggplot(aes(hour, pct_bike, color = direction)) +
147 | geom_line() +
148 | facet_grid(crossing ~ .) +
149 | scale_y_continuous(labels = percent_format()) +
150 | labs(x = "Time of day",
151 | y = "% of bike crossings happening in this hour",
152 | title = "In which directions do people commute by bike?",
153 | subtitle = "Based on weekday crossings at six Seattle locations from 2014-February 2019",
154 | color = "Direction")
155 | ```
156 |
157 |
158 |
159 |
160 |
161 |
162 |
--------------------------------------------------------------------------------
/bird-collisions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Bird Collisions"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
12 | ```{r}
13 | library(tidyverse)
14 | theme_set(theme_light())
15 |
16 | mp_light <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-30/mp_light.csv")
17 |
18 | bird_collisions <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-04-30/bird_collisions.csv") %>%
19 | left_join(mp_light, by = "date")
20 | ```
21 |
22 | ```{r}
23 | bird_collisions %>%
24 | ggplot(aes(date, fill = locality)) +
25 | geom_histogram()
26 | ```
27 |
28 | ```{r}
29 | bird_collisions %>%
30 | gather(category, value, -date, -light_score) %>%
31 | count(category, value, light_score_missing = is.na(light_score)) %>%
32 | group_by(category) %>%
33 | top_n(16, n) %>%
34 | ungroup() %>%
35 | mutate(value = fct_reorder(value, n, sum),
36 | category = fct_reorder(category, n, length)) %>%
37 | ggplot(aes(value, n, fill = light_score_missing)) +
38 | geom_col() +
39 | facet_wrap(~ category, scales = "free_y") +
40 | coord_flip() +
41 | labs(x = "# of collisions",
42 | y = "",
43 | fill = "Light score missing",
44 | title = "Category breakdowns of collisions")
45 | ```
46 |
47 | ```{r}
48 | bird_collisions %>%
49 | filter(!is.na(light_score)) %>%
50 | count(date, locality) %>%
51 | ggplot(aes(n, color = locality)) +
52 | geom_density() +
53 | scale_x_log10() +
54 | labs(x = "# of collisions per night")
55 |
56 | bird_collisions %>%
57 | filter(!is.na(light_score)) %>%
58 | distinct(date, light_score) %>%
59 | ggplot(aes(light_score)) +
60 | geom_histogram()
61 |
62 | geom_mean <- function(x) {
63 |   exp(mean(log(x + 1))) - 1  # shifted geometric mean; the +1 offset handles zero-collision nights
64 | }
65 |
66 | by_day_mp <- bird_collisions %>%
67 | filter(!is.na(light_score)) %>%
68 | group_by(date, locality) %>%
69 | summarize(collisions = n()) %>%
70 | ungroup() %>%
71 | complete(date, locality, fill = list(collisions = 0)) %>%
72 | right_join(mp_light %>% crossing(locality = c("CHI", "MP")), by = c("date", "locality")) %>%
73 | filter(date <= "2016-11-13") %>%
74 | replace_na(list(collisions = 0)) %>%
75 | mutate(locality = ifelse(locality == "CHI", "Greater Chicago", "McCormick Place"))
76 | library(rsample)  # provides bootstraps()
77 | bootstrap_cis <- by_day_mp %>%
78 | bootstraps(times = 1000) %>%
79 | unnest(map(splits, as.data.frame)) %>%
80 | group_by(light_score, locality, id) %>%
81 | summarize(avg_collisions = geom_mean(collisions)) %>%
82 | summarize(bootstrap_low = quantile(avg_collisions, .025),
83 | bootstrap_high = quantile(avg_collisions, .975))
84 |
85 | by_day_mp %>%
86 | group_by(light_score, locality) %>%
87 | summarize(avg_collisions = geom_mean(collisions),
88 | nights = n()) %>%
89 | ggplot(aes(light_score, color = locality)) +
90 | geom_line(aes(y = avg_collisions)) +
91 | geom_ribbon(aes(ymin = bootstrap_low, ymax = bootstrap_high),
92 | data = bootstrap_cis,
93 | alpha = .25) +
94 | expand_limits(y = 0) +
95 | labs(x = "Light score at McCormick place (higher means more lights on)",
96 | y = "Geometric mean of the number of collisions",
97 | title = "Brighter lights at McCormick place correlate with more bird collisions there, and not with Chicago overall",
98 | subtitle = "Ribbon shows 95% bootstrapped percentile confidence interval",
99 | color = "")
100 | ```
101 |
102 | ### Look at confounders
103 |
104 | ```{r}
105 | library(lubridate)
106 |
107 | bird_collisions %>%
108 | filter(date >= "2005-01-01") %>%
109 | count(month = month(date, label = TRUE), #floor_date(date, "month"),
110 | locality) %>%
111 | ggplot(aes(month, n, color = locality, group = locality)) +
112 | geom_line()
113 | ```
114 |
115 |
116 |
117 |
118 |
--------------------------------------------------------------------------------
/board-games.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Board Games"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | board_games_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-12/board_games.csv")
15 |
16 | holdout_set <- board_games_raw %>%
17 | filter(game_id %% 5 == 0)
18 |
19 | board_games <- board_games_raw %>%
20 | filter(game_id %% 5 != 0)
21 | ```
22 |
23 | ### EDA
24 |
25 | ```{r}
26 | board_games %>%
27 | count(publisher, sort = TRUE)
28 |
29 | board_games %>%
30 | count(year_published) %>%
31 |   arrange(desc(year_published)) %>%
32 |   ggplot(aes(year_published, n)) +
33 | geom_line()
34 |
35 | board_games %>%
36 | ggplot(aes(average_rating)) +
37 | geom_histogram()
38 |
39 | board_games %>%
40 | filter(max_playtime > 5, max_playtime < 1000) %>%
41 | ggplot(aes(max_playtime / 60)) +
42 | geom_histogram(binwidth = .25) +
43 | scale_x_log10(breaks = 2 ^ seq(-2, 4))
44 |
45 | board_games %>%
46 | count(year_published) %>%
47 | ggplot(aes(year_published, n)) +
48 | geom_line()
49 | ```
50 |
51 | Categorical variables:
52 |
53 | ```{r}
54 | categorical_variables <- board_games %>%
55 | select(game_id, name, family, category, artist, designer, mechanic) %>%
56 | gather(type, value, -game_id, -name) %>%
57 | filter(!is.na(value)) %>%
58 | separate_rows(value, sep = ",") %>%
59 | arrange(game_id)
60 |
61 | categorical_counts <- categorical_variables %>%
62 | count(type, value, sort = TRUE)
63 |
64 | library(drlib)
65 |
66 | categorical_counts %>%
67 | group_by(type) %>%
68 | top_n(10, n) %>%
69 | ungroup() %>%
70 | mutate(value = reorder_within(value, n, type),
71 | type = fct_reorder(type, n, .desc = TRUE)) %>%
72 | ggplot(aes(value, n, fill = type)) +
73 | geom_col(show.legend = FALSE) +
74 | facet_wrap(~ type, scales = "free_y") +
75 | coord_flip() +
76 | scale_x_reordered() +
77 | labs(title = "Most common categories")
78 | ```
79 |
80 | ### Predict average rating
81 |
82 | ```{r}
83 | board_games %>%
84 | group_by(decade = 10 * (year_published %/% 10)) %>%
85 | summarize(average_rating = mean(average_rating)) %>%
86 | ggplot(aes(decade, average_rating)) +
87 | geom_line()
88 | ```
89 |
90 |
91 | ```{r}
92 | library(broom)
93 |
94 | lm(average_rating ~
95 | log2(max_players + 1) +
96 | log2(max_playtime + 1) +
97 | year_published, board_games) %>%
98 | tidy()
99 | ```
100 |
101 | ```{r}
102 | by_categorical <- board_games %>%
103 | inner_join(categorical_variables, by = c("game_id", "name")) %>%
104 | select(type, value, average_rating) %>%
105 | group_by(type, value) %>%
106 | summarize(games = n(),
107 | average_rating = mean(average_rating)) %>%
108 | arrange(desc(games))
109 |
110 | board_games %>%
111 | inner_join(categorical_variables, by = c("game_id", "name")) %>%
112 | filter(type == "category") %>%
113 | mutate(value = fct_lump(value, 15),
114 | value = fct_reorder(value, average_rating)) %>%
115 | ggplot(aes(value, average_rating)) +
116 | geom_boxplot() +
117 | coord_flip()
118 |
119 | board_games %>%
120 | inner_join(categorical_variables, by = c("game_id", "name")) %>%
121 | filter(type == "family") %>%
122 | mutate(value = fct_lump(value, 15),
123 | value = fct_reorder(value, average_rating)) %>%
124 | ggplot(aes(value, average_rating)) +
125 | geom_boxplot() +
126 | coord_flip()
127 | ```
128 |
129 | Conclusion: categorical variables can be correlated with higher/lower rated games.
130 |
131 | ```{r}
132 | non_categorical_features <- board_games %>%
133 | transmute(game_id,
134 | name,
135 | year = year_published - 1950,
136 | log2_max_players = log2(max_players + 1),
137 | log2_max_playtime = log2(max_playtime + 1)) %>%
138 | gather(feature, value, -game_id, -name)
139 |
140 | features <- categorical_variables %>%
141 | unite(feature, type, value, sep = ": ") %>%
142 | add_count(feature) %>%
143 | filter(n >= 20) %>%
144 | mutate(value = 1) %>%
145 | bind_rows(non_categorical_features)
146 |
147 | library(glmnet)
148 | library(tidytext)
149 | library(Matrix)
150 |
151 | # Predictor
152 | feature_matrix <- features %>%
153 | cast_sparse(game_id, feature, value)
154 |
155 | # What I'm predicting
156 | ratings <- board_games$average_rating[match(rownames(feature_matrix), board_games$game_id)]
157 |
158 | cv_lasso <- cv.glmnet(feature_matrix, ratings)
159 | plot(cv_lasso)
160 |
161 | cv_lasso$glmnet.fit %>%
162 | tidy() %>%
163 | filter(lambda == cv_lasso$lambda.1se) %>%
164 | arrange(desc(estimate)) %>%
165 | filter(term != "(Intercept)") %>%
166 | top_n(25, abs(estimate)) %>%
167 | mutate(term = fct_reorder(term, estimate)) %>%
168 | ggplot(aes(term, estimate)) +
169 | geom_col() +
170 | coord_flip() +
171 | labs(title = "Largest coefficients in our predictive model",
172 | subtitle = "Based on a LASSO regression",
173 | x = "",
174 | y = "Coefficient")
175 | ```
176 |
177 |
178 |
179 |
180 |
--------------------------------------------------------------------------------
/bob-ross.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | bob_ross <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-08-06/bob-ross.csv")
15 | ```
16 |
17 | ```{r}
18 | bob_ross_gathered <- bob_ross %>%
19 | janitor::clean_names() %>%
20 | gather(element, present, -episode, -title) %>%
21 | filter(present == 1) %>%
22 | mutate(title = str_to_title(str_remove_all(title, '"')),
23 |          element = str_to_title(str_replace_all(element, "_", " "))) %>%
24 | select(-present) %>%
25 | extract(episode, c("season", "episode_number"), "S(.*)E(.*)", convert = TRUE, remove = FALSE) %>%
26 | arrange(season, episode_number)
27 | ```
28 |
29 | ### Exploring the paintings
30 |
31 | ```{r}
32 | bob_ross_gathered %>%
33 | count(element, sort = TRUE) %>%
34 | head(25) %>%
35 | mutate(element = fct_reorder(element, n)) %>%
36 | ggplot(aes(element, n)) +
37 | geom_col() +
38 | coord_flip()
39 | ```
40 |
41 | What are the most "crowded" paintings, with the most elements in them?
42 |
43 | ```{r}
44 | bob_ross_gathered %>%
45 | add_count(episode) %>%
46 | arrange(desc(n))
47 | ```
48 |
49 | How have Ross's paintings been changing over time?
50 |
51 | ```{r}
52 | by_season_element <- bob_ross_gathered %>%
53 | filter(!element %in% c("Tree", "Trees")) %>%
54 | group_by(season) %>%
55 | mutate(number_episodes = n_distinct(episode)) %>%
56 | count(season, element, number_episodes, sort = TRUE) %>%
57 | mutate(percent_included = n / number_episodes) %>%
58 | group_by(element) %>%
59 | mutate(element_total = sum(n)) %>%
60 | ungroup()
61 |
62 | by_season_element %>%
63 | filter(element_total >= 50) %>%
64 | ggplot(aes(season, percent_included, color = element)) +
65 | geom_line() +
66 | scale_y_continuous(labels = scales::percent_format()) +
67 | expand_limits(y = 0) +
68 | facet_wrap(~ element)
69 | ```
70 |
71 | Could have used: [many models with broom](https://r4ds.had.co.nz/many-models.html)
72 |
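73 | For reference, a minimal sketch of that many-models approach (not run in the
74 | original screencast; `element_trends` is a hypothetical name): nest the
75 | per-season data for each element, fit a linear trend to each, and compare slopes.
76 | 
77 | ```{r}
78 | library(broom)
79 | 
80 | # Slope of percent_included across seasons, one model per element
81 | element_trends <- by_season_element %>%
82 |   filter(element_total >= 50) %>%
83 |   group_by(element) %>%
84 |   nest() %>%
85 |   ungroup() %>%
86 |   mutate(model = map(data, ~ lm(percent_included ~ season, data = .x)),
87 |          tidied = map(model, tidy)) %>%
88 |   unnest(tidied) %>%
89 |   filter(term == "season") %>%
90 |   arrange(desc(estimate))
91 | ```
92 | 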
73 | ### Clustering
74 |
75 | What tends to appear together?
76 |
77 | ```{r}
78 | library(widyr)
79 |
80 | correlations <- bob_ross_gathered %>%
81 | add_count(element) %>%
82 | filter(n >= 5) %>%
83 | pairwise_cor(element, episode, sort = TRUE)
84 |
85 | correlations %>%
86 | filter(item1 == "River") %>%
87 | mutate(item2 = fct_reorder(item2, correlation)) %>%
88 | ggplot(aes(item2, correlation)) +
89 | geom_col() +
90 | coord_flip() +
91 | labs(title = "What tends to appear with a river?",
92 |        subtitle = "Among elements that appeared in at least 5 paintings")
93 |
94 | correlations %>%
95 | filter(item1 == "Snow") %>%
96 | mutate(item2 = fct_reorder(item2, correlation)) %>%
97 | ggplot(aes(item2, correlation)) +
98 | geom_col() +
99 | coord_flip() +
100 | labs(title = "What tends to appear with snow?",
101 |        subtitle = "Among elements that appeared in at least 5 paintings")
102 | ```
103 |
104 | ```{r}
105 | library(ggraph)
106 | library(igraph)
107 |
108 | set.seed(2019)
109 |
110 | correlations %>%
111 | head(100) %>%
112 | graph_from_data_frame() %>%
113 | ggraph() +
114 | geom_edge_link(aes(alpha = correlation)) +
115 | geom_node_point() +
116 | geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
117 | theme_void()
118 | ```
119 |
120 | ### Principal Component Analysis
121 |
122 | What dimensions drive a lot of the variation among paintings?
123 |
124 | ```{r}
125 | library(reshape2)
126 | library(broom)
127 | library(tidytext)
128 |
129 | binary_matrix <- bob_ross_gathered %>%
130 | acast(title ~ element)
131 |
132 | # Center the columns
133 | centered_matrix <- t(t(binary_matrix) - colMeans(binary_matrix))
134 |
135 | svd_result <- svd(centered_matrix)
136 |
137 | element_weights <- tidy(svd_result, matrix = "v") %>%
138 | mutate(element = colnames(binary_matrix)[column])
139 |
140 | element_weights %>%
141 | filter(PC <= 4) %>%
142 | group_by(PC) %>%
143 | top_n(16, abs(value)) %>%
144 | ungroup() %>%
145 | mutate(element = reorder_within(element, value, PC)) %>%
146 | ggplot(aes(element, value, fill = factor(PC))) +
147 | geom_col(show.legend = FALSE) +
148 | facet_wrap(~ PC, scales = "free") +
149 | scale_x_reordered() +
150 | coord_flip() +
151 | labs(title = "First four principal components of elements in Bob Ross paintings")
152 | ```
153 |
154 | 1. Mountains/Conifer vs Ocean/Beach and deciduous trees
155 | 2. Trees, especially deciduous, vs Ocean
156 | 3. Spring/Summer vs Winter
157 | 4. Lake vs River
158 |
159 | ```{r}
160 | painting_weights <- broom::tidy(svd_result, matrix = "u") %>%
161 | mutate(painting = rownames(binary_matrix)[row])
162 | ```
163 |
164 | ```{r}
165 | painting_weights %>%
166 | filter(PC == 1) %>%
167 |   arrange(value)
168 |
169 | bob_ross_gathered %>%
170 | filter(title == "Frozen Solitude")
171 |
172 | painting_weights %>%
173 | filter(PC <= 4) %>%
174 | group_by(PC) %>%
175 | top_n(20, abs(value)) %>%
176 | ungroup() %>%
177 | mutate(painting = reorder_within(painting, value, PC)) %>%
178 | ggplot(aes(painting, value, fill = factor(PC))) +
179 | geom_col(show.legend = FALSE) +
180 | facet_wrap(~ PC, scales = "free") +
181 | scale_x_reordered() +
182 | coord_flip() +
183 | labs(title = "First four principal components of Bob Ross paintings")
184 |
185 | ```
186 |
187 | ```{r}
188 | broom::tidy(svd_result, matrix = "d") %>%
189 | ggplot(aes(PC, percent)) +
190 | geom_point()
191 | ```
192 |
193 |
194 |
--------------------------------------------------------------------------------
/broadway-shinybones/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 |
--------------------------------------------------------------------------------
/broadway-shinybones/README.md:
--------------------------------------------------------------------------------
1 | # broadway-shinybones
2 |
3 |
--------------------------------------------------------------------------------
/broadway-shinybones/_site.yml:
--------------------------------------------------------------------------------
1 | name: Broadway Shows
2 | sidebar:
3 | - text: Revenue
4 | icon: dollar-sign
5 | module:
6 | metric_panel:
7 | metric: !expr DATA$broadway_revenue_usd_gross
8 | - text: Price
9 | icon: ticket-alt
10 | module:
11 | metric_panel:
12 | metric: !expr DATA$broadway_revenue_avg_ticket_price
13 | - text: Capacity
14 | icon: users
15 | module:
16 | metric_panel:
17 | metric: !expr DATA$broadway_revenue_pct_capacity
18 |
--------------------------------------------------------------------------------
/broadway-shinybones/app.R:
--------------------------------------------------------------------------------
1 | # Load Libraries -----
2 | library(shiny)
3 | library(shinydashboard)
4 | library(shinybones)
5 | library(shinymetrics)
6 |
7 | # Load Utilities -----
8 | source_dirs('utils')
9 | source_dirs('components')
10 | source_dirs('pages')
11 |
12 | # Global Data ----
13 | # This is passed to all page modules as an argument named data_global
14 | DATA <- readRDS("show_metrics.rds")
15 |
16 | # Configuration
17 | options("yaml.eval.expr" = TRUE)
18 | CONFIG <- yaml::read_yaml('_site.yml')
19 |
20 | # UI ----
21 | ui <- function(request){
22 | dashboardPage(
23 | # Header ----
24 | dashboardHeader(title = CONFIG$name),
25 |
26 | # Sidebar ----
27 | dashboardSidebar(
28 | sb_create_sidebar(CONFIG, DATA)
29 | ),
30 |
31 | # Body -----
32 | dashboardBody(
33 | sb_create_tab_items(CONFIG, DATA)
34 | )
35 | )
36 | }
37 |
38 | # Server -----
39 | server <- function(input, output, session){
40 | sb_call_modules(CONFIG, DATA)
41 | }
42 |
43 | # Run App ----
44 | shinyApp(ui = ui, server = server, enableBookmarking = 'url')
45 |
--------------------------------------------------------------------------------
/broadway-shinybones/broadway-shinybones.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 4
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------
/broadway-shinybones/components/README.md:
--------------------------------------------------------------------------------
1 | # Components
2 |
3 | Add all shared components to this directory.
4 |
--------------------------------------------------------------------------------
/broadway-shinybones/pages/README.md:
--------------------------------------------------------------------------------
1 | # Pages
2 |
3 | Add all page modules to this directory.
4 |
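5 | For reference, a page module here is an ordinary Shiny module sourced by
6 | `source_dirs('pages')` in `app.R`. A minimal sketch (hypothetical names; the
7 | app passes the global data to each module as the `data_global` argument):
8 | 
9 | ```r
10 | # pages/example_page.R (hypothetical)
11 | example_page_ui <- function(id) {
12 |   ns <- shiny::NS(id)
13 |   shiny::verbatimTextOutput(ns("summary"))
14 | }
15 | 
16 | # Server half: shinybones supplies the app-wide data as `data_global`
17 | example_page <- function(input, output, session, data_global, ...) {
18 |   output$summary <- shiny::renderPrint(names(data_global))
19 | }
20 | ```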
--------------------------------------------------------------------------------
/broadway-shinybones/show_metrics.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tacookson/data-screencasts/388a9cfa794a67d5cbaa3b9b860520c0a3e0868e/broadway-shinybones/show_metrics.rds
--------------------------------------------------------------------------------
/broadway.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Broadway shows"
3 | name: metrics_broadway_revenue
4 | owner: admiral.david@gmail.com
5 | metrics:
6 | usd_gross:
7 | title: Total Gross (USD)
8 | description: Not adjusted for inflation.
9 | avg_ticket_price:
10 | title: Average Ticket Price
11 | description: Not adjusted for inflation.
12 | pct_capacity:
13 | title: Percent capacity
14 | description: Averaged across weeks in the time period.
15 | dimensions:
16 | show:
17 | title: Show
18 | description: Show's title
19 | ---
20 |
21 | ```{r setup, include=FALSE}
22 | knitr::opts_chunk$set(echo = TRUE)
23 | ```
24 |
25 | ```{r}
26 | library(tidyverse)
27 | library(scales)
28 | theme_set(theme_light())
29 |
30 | tuesdata <- tidytuesdayR::tt_load('2020-04-28')
31 | ```
32 |
33 | ```{r}
34 | grosses <- tuesdata$grosses
35 |
36 | grosses %>%
37 | filter(show %in% c("Hamilton", "The Lion King")) %>%
38 | ggplot(aes(week_ending, weekly_gross, color = show)) +
39 | geom_line() +
40 | scale_y_continuous(labels = scales::dollar) +
41 | expand_limits(y = 0)
42 | ```
43 |
44 | The tidymetrics package provides `cross_by_periods()` and `cross_by_dimensions()` for summarizing a metric across time periods and dimensions:
45 |
46 | ```{r}
47 | # devtools::install_github("ramnathv/tidymetrics")
48 | library(tidymetrics)
49 |
50 | shows_summarized <- grosses %>%
51 | filter(show %in% c("Hamilton", "The Lion King",
52 | "Les Miserables", "Rent",
53 | "The Phantom of the Opera", "Wicked",
54 | "Harry Potter and the Cursed Child, Parts One and Two",
55 | "The Book of Mormon")) %>%
56 | mutate(show = str_remove(show, "\\, Parts.*")) %>%
57 | rename(date = week_ending) %>%
58 | cross_by_dimensions(show) %>%
59 | cross_by_periods(c("month", "quarter", "year"),
60 | windows = 28) %>%
61 | summarize(usd_gross = sum(weekly_gross),
62 | avg_ticket_price = mean(avg_ticket_price),
63 | pct_capacity = mean(pct_capacity)) %>%
64 | ungroup()
65 |
66 | show_metrics <- create_metrics(shows_summarized)
67 | ```
68 |
69 | The shinymetrics package can preview each metric interactively:
70 |
71 | ```{r}
72 | # devtools::install_github("ramnathv/shinymetrics")
73 | library(shinymetrics)
74 | saveRDS(show_metrics, "broadway-shinybones/show_metrics.rds")
75 |
76 | preview_metric(show_metrics$broadway_revenue_usd_gross)
77 | preview_metric(show_metrics$broadway_revenue_avg_ticket_price)
78 | preview_metric(show_metrics$broadway_revenue_pct_capacity)
79 | ```
80 |
81 | ```{r}
82 | shows_summarized %>%
83 | filter(period == "quarter",
84 | show != "All") %>%
85 | ggplot(aes(date, usd_gross, fill = show)) +
86 | geom_col()
87 |
88 | shows_summarized %>%
89 | filter(period == "quarter",
90 | show != "All") %>%
91 | ggplot(aes(date, avg_ticket_price, col = show)) +
92 | geom_line() +
93 | expand_limits(y = 0)
94 | ```
95 |
96 |
97 |
98 |
99 |
--------------------------------------------------------------------------------
/car-economy.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Car Fuel Economy"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 | ```
14 |
15 | ```{r}
16 | big_epa_cars <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-15/big_epa_cars.csv") %>%
17 | mutate(uses_electricity = ifelse(highwayE > 0, "Uses Electricity", "Doesn't Use Electricity"))
18 | ```
19 |
20 | ```{r}
21 | big_epa_cars_alphabetical <- big_epa_cars %>%
22 | select(sort(colnames(big_epa_cars)))
23 | ```
24 |
25 | ```{r}
26 | big_epa_cars %>%
27 | ggplot(aes(highway08, city08)) +
28 | geom_point() +
29 | geom_abline(color = "red") +
30 | facet_wrap(~ uses_electricity, scales = "free") +
31 | expand_limits(x = 0, y = 0) +
32 | labs(x = "Highway MPG",
33 | y = "City MPG",
34 | title = "How does fuel efficiency differ between city + highway?")
35 | ```
36 |
37 | ```{r}
38 | big_epa_cars %>%
39 | select(city08, highway08, make, model, cylinders, displ, drive, engId, eng_dscr)
40 | ```
41 |
42 | ```{r}
43 | big_epa_cars %>%
44 | filter(cityE == 0) %>%
45 | mutate(VClass = fct_lump(VClass, 8),
46 | VClass = fct_reorder(VClass, city08)) %>%
47 | ggplot(aes(VClass, city08)) +
48 | geom_boxplot() +
49 | coord_flip()
50 |
51 | big_epa_cars %>%
52 | filter(cityE == 0) %>%
53 | mutate(drive = fct_reorder(drive, city08)) %>%
54 | ggplot(aes(drive, city08)) +
55 | geom_boxplot() +
56 | coord_flip()
57 |
58 | big_epa_cars %>%
59 | filter(cityE == 0) %>%
60 | ggplot(aes(cylinders, city08, group = cylinders)) +
61 | geom_boxplot()
62 |
63 | big_epa_cars %>%
64 | filter(cityE == 0) %>%
65 | ggplot(aes(displ, city08)) +
66 | geom_point() +
67 | expand_limits(x = 0, y = 0)
68 | ```
69 |
70 | ## Goal: Predict city fuel efficiency for single-fuel, non-electric cars
71 |
72 | ```{r}
73 | # Shuffle the rows; every fifth row is reserved as a holdout set
74 | non_electric_cars <- big_epa_cars %>%
75 | filter(cityA08 == 0,
76 | cityE == 0) %>%
77 | sample_frac(1)
78 |
79 | training_set <- non_electric_cars %>%
80 | filter(row_number() %% 5 != 0)
81 | ```
82 |
83 | ```{r}
84 | library(broom)
85 |
86 | training_set %>%
87 | ggplot(aes(displ, city08)) +
88 | geom_point() +
89 | geom_smooth(method = "lm")
90 |
91 | library(splines)
92 | augmented_data <- lm(city08 ~ ns(displ, 2), data = training_set) %>%
93 | augment(data = training_set)
94 |
95 | augmented_data %>%
96 | ggplot(aes(displ, city08)) +
97 | geom_point() +
98 | geom_line(aes(y = .fitted), color = "red", size = 2)
99 |
100 | models <- tibble(df = 1:10) %>%
101 | mutate(lm_model = map(df, ~ lm(city08 ~ ns(displ, df = .), data = training_set)))
102 |
103 | augmented_unnested <- models %>%
104 | mutate(augmented = map(lm_model, augment, data = training_set)) %>%
105 | unnest(augmented)
106 |
107 | augmented_unnested %>%
108 | ggplot(aes(displ, city08)) +
109 | geom_point(data = training_set) +
110 | geom_line(aes(y = .fitted, color = factor(df)), size = 2) +
111 | labs(x = "Engine volume (L)",
112 | y = "City MPG",
113 | color = "# of degrees of freedom") +
114 | expand_limits(x = 0, y = 0)
115 |
116 | augmented_unnested %>%
117 | ggplot(aes(displ, .resid)) +
118 | geom_point() +
119 | facet_wrap(~ df)
120 |
121 | glanced_models <- models %>%
122 | rename(spline_df = df) %>%
123 |   mutate(glanced = map(lm_model, glance)) %>%  # glance() takes no data argument; it was being silently ignored
124 | unnest(glanced)
125 |
126 | glanced_models %>%
127 | ggplot(aes(spline_df, adj.r.squared)) +
128 | geom_line()
129 | ```
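
Adjusted R² is computed on the training data, so it keeps creeping upward with extra degrees of freedom. A hedged sketch of an out-of-sample check, reusing the `test_set` sketched earlier (an illustrative name, not from the original screencast):

```{r}
# Out-of-sample RMSE for each spline df, evaluated on the held-out rows
models %>%
  mutate(rmse = map_dbl(lm_model,
                        ~ sqrt(mean((predict(., newdata = test_set) - test_set$city08)^2,
                                    na.rm = TRUE)))) %>%
  ggplot(aes(df, rmse)) +
  geom_line() +
  labs(x = "Spline degrees of freedom",
       y = "RMSE on held-out rows")
```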
130 |
131 | ```{r}
132 | lm(city08 ~ ns(displ, 4), data = training_set) %>%
133 | anova() %>%
134 | tidy() %>%
135 | mutate(pct_variation = sumsq / sum(sumsq))
136 |
137 | ggplot(training_set, aes(cylinders, displ, group = cylinders)) +
138 | geom_boxplot()
139 | ```
140 |
141 | ```{r}
142 | training_set %>%
143 | ggplot(aes(year, city08)) +
144 | geom_point() +
145 | geom_smooth(method = "loess")
146 |
147 | efficiency_time <- training_set %>%
148 | mutate(VClass = fct_lump(VClass, 6),
149 | guzzler = !is.na(guzzler)) %>%
150 | group_by(year = 2 * floor(year / 2), VClass) %>%
151 | summarize_at(vars(city08, cylinders, displ, guzzler), mean)
152 |
153 | efficiency_time %>%
154 | ggplot(aes(year, city08, color = VClass)) +
155 | geom_line() +
156 | expand_limits(y = 0)
157 |
158 | efficiency_time %>%
159 | ggplot(aes(year, displ, color = VClass)) +
160 | geom_line() +
161 | expand_limits(y = 0)
162 |
163 | efficiency_time %>%
164 | ggplot(aes(year, guzzler, color = VClass)) +
165 | geom_line() +
166 | expand_limits(y = 0)
167 | ```
168 |
169 | Hypothesis: Engine size started shrinking around 2009 (probably regulatory pressure), especially in large and midsize cars, and this led to an increase in city fuel efficiency.
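
A rough sketch to eyeball that story (a descriptive check on the training data, not a causal test):

```{r}
# Average engine displacement and city MPG, before vs after the hypothesized 2009 shift
non_electric_cars %>%
  group_by(era = ifelse(year < 2009, "Before 2009", "2009 and later")) %>%
  summarize(avg_displacement = mean(displ, na.rm = TRUE),
            avg_city_mpg = mean(city08, na.rm = TRUE))
```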
170 |
171 |
172 |
--------------------------------------------------------------------------------
/caribou-locations.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | tuesdata <- tidytuesdayR::tt_load('2020-06-23')
15 | ```
16 |
17 | ```{r}
18 | individuals <- tuesdata$individuals
19 | locations <- tuesdata$locations
20 | ```
21 |
22 | ```{r}
23 | individuals %>%
24 | summarize(across(sex:study_site, list(~ mean(!is.na(.)))))
25 |
26 | individuals %>%
27 | filter(deploy_off_type == "dead") %>%
28 | count(death_cause, sort = TRUE)
29 | ```
30 |
31 | ```{r}
32 | library(sf)
33 | province_sf <- read_sf("~/Downloads/province")
34 |
35 | bc <- province_sf %>%
36 | filter(PROV == "BC")
37 | ```
38 |
39 | ```{r}
40 | individuals %>%
41 | filter(deploy_on_latitude > 40) %>%
42 | count(study_site, deploy_on_longitude, deploy_on_latitude, sort = TRUE) %>%
43 | ggplot() +
44 | # geom_sf(data = bc) +
45 | # borders("world", regions = "canada") +
46 | geom_point(aes(deploy_on_longitude, deploy_on_latitude,
47 | size = n,
48 | color = study_site)) +
49 | scale_size_continuous(guide = FALSE)
50 | ```
51 |
52 | ```{r}
53 | individuals %>%
54 | count(animal_id, sort = TRUE)
55 |
56 | individuals %>%
57 | filter(animal_id == "MO_car150") %>%
58 | View()
59 | ```
60 |
61 | ### Looking at locations
62 |
63 | ```{r}
64 | locations %>%
65 | ggplot(aes(longitude, latitude, color = study_site)) +
66 | geom_point()
67 | ```
68 |
69 | ```{r}
70 | by_animal <- locations %>%
71 | group_by(animal_id, study_site) %>%
72 | summarize(start = min(timestamp),
73 | end = max(timestamp),
74 | num_points = n()) %>%
75 | ungroup() %>%
76 | arrange(desc(num_points))
77 | ```
78 |
79 | ```{r}
80 | library(lubridate)
81 |
82 | example_animal <- locations %>%
83 | arrange(timestamp) %>%
84 | filter(animal_id == sample(unique(animal_id), 1))
85 |
86 | example_animal %>%
87 | mutate(quarter = as.Date(floor_date(timestamp, "quarter"))) %>%
88 | ggplot(aes(longitude, latitude, color = timestamp)) +
89 | geom_point(alpha = .5) +
90 | geom_path(alpha = .5) +
91 | facet_wrap(~ quarter) +
92 | labs(title = "One caribou over time")
93 | ```
94 |
95 | ```{r}
96 | library(geosphere)
97 |
98 | locations_with_gaps <- locations %>%
99 | group_by(animal_id) %>%
100 | mutate(last_lon = lag(longitude),
101 | last_lat = lag(latitude),
102 |          hours = as.numeric(difftime(timestamp, lag(timestamp), units = "hours")),
103 | km = distHaversine(cbind(longitude, latitude), cbind(last_lon, last_lat)) / 1000,
104 | kph = km / hours) %>%
105 | ungroup()
106 |
107 | locations_with_gaps %>%
108 | filter(hours <= 8) %>%
109 | ggplot(aes(kph)) +
110 | geom_histogram() +
111 | scale_x_log10(labels = scales::comma) +
112 | labs(title = "On average, how fast do caribou travel?")
113 |
114 | by_animal <- locations_with_gaps %>%
115 | filter(hours <= 8,
116 | hours >= .5) %>%
117 | group_by(animal_id, study_site) %>%
118 | summarize(start = min(timestamp),
119 | end = max(timestamp),
120 | num_points = n(),
121 | avg_speed = mean(kph, na.rm = TRUE)) %>%
122 | ungroup() %>%
123 | arrange(desc(num_points)) %>%
124 | filter(num_points >= 10)
125 | ```
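
As a units sanity check on the haversine step above (distHaversine() returns meters, hence the division by 1000), a sketch with made-up coordinates:

```{r}
# 0.1 degrees of latitude should come out to roughly 11.1 km
distHaversine(c(-122.0, 54.0), c(-122.0, 54.1)) / 1000
```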
126 |
127 | ```{r}
128 | by_animal %>%
129 | ggplot(aes(num_points, avg_speed)) +
130 | geom_point() +
131 | scale_x_log10() +
132 | expand_limits(y = 0)
133 |
134 | by_animal %>%
135 | arrange(desc(avg_speed))
136 |
137 | locations_with_gaps %>%
138 | filter(animal_id == "QU_car107") %>%
139 | mutate(quarter = as.Date(floor_date(timestamp, "quarter"))) %>%
140 | ggplot(aes(longitude, latitude, color = kph)) +
141 | geom_point(alpha = .5) +
142 | geom_path(alpha = .5) +
143 | facet_wrap(~ quarter) +
144 | labs(title = "One caribou over time")
145 |
146 | locations_with_gaps %>%
147 | filter(animal_id == "SC_car171") %>%
148 | arrange(desc(kph)) %>%
149 | View()
150 | ```
151 |
152 | ```{r}
153 | locations_with_gaps %>%
154 | filter(study_site != "Hart Ranges") %>%
155 | filter(hours <= 8,
156 | hours >= .5) %>%
157 | group_by(month = month(timestamp, label = TRUE),
158 | study_site) %>%
159 | summarize(avg_speed = median(kph),
160 | n = n()) %>%
161 | ggplot(aes(month, avg_speed, group = study_site,
162 | color = study_site)) +
163 | geom_line() +
164 | geom_point(aes(size = n)) +
165 | expand_limits(y = 0) +
166 | facet_wrap(~ study_site) +
167 | theme(legend.position = "none") +
168 |   labs(title = "Seasonal trend in caribou speed",
169 | y = "Average speed (kph)")
170 | ```
171 |
172 |
173 | ```{r}
174 | by_animal %>%
175 | filter(num_points >= 10) %>%
176 | arrange(desc(avg_speed))
177 | ```
178 |
179 |
180 | ```{r}
181 | locations %>%
182 | arrange(timestamp) %>%
183 | group_by(animal_id) %>%
184 |   mutate(gap = round(difftime(timestamp, lag(timestamp), units = "hours"))) %>%
185 | ungroup() %>%
186 | filter(!is.na(gap)) %>%
187 | filter(gap <= 24) %>%
188 | ggplot(aes(gap)) +
189 | geom_histogram(binwidth = 2)
190 | ```
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
--------------------------------------------------------------------------------
/cetaceans.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(lubridate)  # loaded up front because year() is used in this chunk
13 | theme_set(theme_light())
13 |
14 | cetaceans_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-12-18/allCetaceanData.csv")
15 |
16 | cetaceans <- cetaceans_raw %>%
17 | select(-X1) %>%
18 | mutate(birthYear = as.integer(birthYear),
19 | age = if_else(status == "Died", year(statusDate) - birthYear, NA_real_))
20 | ```
21 |
22 | ```{r}
23 | library(lubridate)
24 |
25 | cetaceans %>%
26 | count(acquisition, sort = TRUE)
27 |
28 | cetaceans %>%
29 | count(status, sort = TRUE)
30 |
31 | cetaceans %>%
32 | filter(status == "Died") %>%
33 | select(birthYear, statusDate) %>%
34 | filter(!is.na(birthYear), !is.na(statusDate)) %>%
35 | mutate(age = year(statusDate) - birthYear) %>%
36 | ggplot(aes(age)) +
37 | geom_histogram()
38 |
39 | cetaceans %>%
40 | count(species = fct_lump(species, 5), sort = TRUE) %>%
41 | mutate(species = fct_reorder(species, n)) %>%
42 | ggplot(aes(species, n)) +
43 | geom_col() +
44 | coord_flip()
45 |
46 | cetaceans %>%
47 |   mutate(species = fct_lump(species, 5)) %>%  # sort = TRUE belongs to count(), not mutate()
48 | filter(!is.na(birthYear), !is.na(statusDate)) %>%
49 | mutate(age = year(statusDate) - birthYear) %>%
50 | ggplot(aes(species, age)) +
51 | geom_boxplot() +
52 | coord_flip()
53 | ```
54 |
55 | ```{r}
56 | cetaceans %>%
57 | count(acquisition, originLocation, sort = TRUE) %>%
58 | filter(originLocation != "Unknown") %>%
59 | View()
60 |
61 | cetaceans %>%
62 | ggplot(aes(originDate, fill = acquisition)) +
63 | geom_histogram()
64 |
65 | library(scales)
66 |
67 | cetaceans %>%
68 | filter(originDate >= "1960-01-01") %>%
69 | count(acquisition,
70 | decade = 5 * (year(originDate) %/% 5)) %>%
71 | complete(acquisition, decade, fill = list(n = 0)) %>%
72 | mutate(acquisition = fct_reorder(acquisition, n, sum)) %>%
73 | group_by(decade) %>%
74 | mutate(percent = n / sum(n)) %>%
75 | ggplot(aes(decade, percent, fill = acquisition)) +
76 | geom_area() +
77 | scale_y_continuous(labels = percent_format()) +
78 | theme_minimal() +
79 | labs(x = "year",
80 | y = "% of dolphins recorded")
81 | ```
82 |
83 | ```{r}
84 | library(fuzzyjoin)
85 |
86 | regexes <- tribble(
87 | ~ regex, ~ category,
88 | "Unknown", "Unknown",
89 | "Gulf of Mexico", "Gulf of Mexico",
90 | "Florida|FL", "Florida",
91 | "Texas|TX", "Texas",
92 | "SeaWorld", "SeaWorld",
93 | "Pacific", "Pacific Ocean",
94 | "Atlantic", "Atlantic Ocean"
95 | )
96 |
97 | cetaceans_annotated <- cetaceans %>%
98 | mutate(unique_id = row_number()) %>%
99 | regex_left_join(regexes, c(originLocation = "regex")) %>%
100 | distinct(unique_id, .keep_all = TRUE) %>%
101 | mutate(category = coalesce(category, originLocation))
102 |
103 | cetaceans_annotated %>%
104 | filter(acquisition == "Capture") %>%
105 | mutate(category = fct_lump(category, 4),
106 | category = fct_reorder(category, category, length)) %>%
107 | ggplot(aes(originDate, fill = category)) +
108 | geom_histogram()
109 |
110 |
111 | cetaceans_annotated %>%
112 |   # data source restored; the filter is inferred from the plot title below
113 |   filter(acquisition == "Born") %>%
114 |   count(category,
112 | decade = 5 * (year(originDate) %/% 5)) %>%
113 | complete(category, decade, fill = list(n = 0)) %>%
114 | mutate(category = fct_lump(category, 7),
115 | category = fct_reorder(category, n, sum)) %>%
116 | group_by(decade) %>%
117 | mutate(percent = n / sum(n)) %>%
118 | ggplot(aes(decade, percent, fill = category)) +
119 | geom_area() +
120 | scale_y_continuous(labels = percent_format()) +
121 | theme_minimal() +
122 | labs(x = "year",
123 | y = "% of dolphins recorded",
124 | title = "Dolphins born in captivity")
125 | ```
126 |
127 | ### Survival analysis
128 |
129 | ```{r}
130 | library(survival)
131 |
132 | dolphin_survival <- cetaceans %>%
133 | filter(status %in% c("Alive", "Died")) %>%
134 | mutate(deathYear = ifelse(status == "Alive", 2017, year(statusDate)),
135 | status = ifelse(status == "Alive", 0, 1),
136 | age = deathYear - birthYear) %>%
137 | filter(!is.na(deathYear)) %>%
138 | select(birthYear, deathYear, status, sex, age, acquisition, species) %>%
139 | filter(deathYear >= birthYear) %>%
140 | filter(sex != "U")
141 |
142 | model <- survfit(Surv(age, status) ~ sex, dolphin_survival)
143 |
144 | library(broom)
145 |
146 | tidy(model) %>%
147 | ggplot(aes(time, estimate, color = strata)) +
148 | geom_line() +
149 | geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
150 | scale_y_continuous(labels = percent_format()) +
151 | labs(y = "Estimated % survival")
152 |
153 | coxph(Surv(age, status) ~ sex, dolphin_survival) %>%
154 | tidy()
155 | ```
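
The coxph() coefficient above is on the log-hazard scale. A small follow-up sketch to read it as a hazard ratio instead, using broom's exponentiate option:

```{r}
# Hazard ratio for sex (exponentiated log-hazard coefficient) with a 95% CI
coxph(Surv(age, status) ~ sex, dolphin_survival) %>%
  tidy(exponentiate = TRUE, conf.int = TRUE)
```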
156 |
157 | ```{r}
158 | model <- survfit(Surv(age, status) ~ acquisition, dolphin_survival)
159 |
160 | library(broom)
161 |
162 | tidy(model) %>%
163 | filter(strata != "acquisition=Unknown") %>%
164 | ggplot(aes(time, estimate, color = strata)) +
165 | geom_line() +
166 | geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .2) +
167 | scale_y_continuous(labels = percent_format()) +
168 | labs(y = "Estimated % survival")
169 | ```
170 |
171 |
--------------------------------------------------------------------------------
/coffee-ratings.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Coffee Ratings"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidytuesdayR)
12 | library(tidyverse)
13 | theme_set(theme_light())
14 |
15 | coffee <- tt_load("2020-07-07")
16 |
17 | coffee_ratings <- coffee$coffee_ratings %>%
18 | mutate(coffee_id = row_number()) %>%
19 | filter(total_cup_points > 0)
20 | ```
21 |
22 | ```{r}
23 | View(coffee_ratings)
24 |
25 | coffee_ratings %>%
26 | count(species, sort = TRUE)
27 |
28 | coffee_lumped <- coffee_ratings %>%
29 | filter(!is.na(variety)) %>%
30 |   mutate(variety = fct_lump(variety, 12))  # sort = TRUE belongs to count(), not mutate()
31 |
32 | coffee_lumped %>%
33 | mutate(variety = fct_reorder(variety, total_cup_points)) %>%
34 | ggplot(aes(total_cup_points, variety)) +
35 | geom_boxplot()
36 |
37 | coffee_lumped %>%
38 | ggplot(aes(total_cup_points, fill = variety)) +
39 | geom_histogram(binwidth = 2) +
40 | facet_wrap(~ variety, scale = "free_y") +
41 | theme(legend.position = "none")
42 | ```
43 |
44 | ```{r}
45 | coffee_ratings %>%
46 | summarize(across(everything(), ~ mean(!is.na(.)))) %>%
47 | gather() %>%
48 | View()
49 |
50 | coffee_ratings %>%
51 | count(producer, sort = TRUE)
52 |
53 | coffee_ratings %>%
54 | count(company, sort = TRUE)
55 |
56 | coffee_ratings %>%
57 | count(color, sort = TRUE)
58 |
59 | coffee_ratings %>%
60 | count(country = fct_lump(country_of_origin, 12), sort = TRUE) %>%
61 | filter(!is.na(country)) %>%
62 | mutate(country = fct_reorder(country, n)) %>%
63 | ggplot(aes(n, country)) +
64 | geom_col()
65 |
66 | coffee_ratings %>%
67 | filter(!is.na(country_of_origin)) %>%
68 | mutate(country = fct_lump(country_of_origin, 12),
69 | country = fct_reorder(country, total_cup_points)) %>%
70 | ggplot(aes(total_cup_points, country)) +
71 | geom_boxplot()
72 | ```
73 |
74 | Interesting dimensions:
75 |
76 | * Country
77 | * Variety
78 | * Company??
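
A quick look at how well populated the first two of those candidate dimensions are before committing to any of them (a throwaway sketch; company is left out since it's mostly free text):

```{r}
coffee_ratings %>%
  count(country = fct_lump(country_of_origin, 8),
        variety = fct_lump(variety, 8),
        sort = TRUE)
```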
79 |
80 | ```{r}
81 | library(ggridges)
82 |
83 | coffee_metrics <- coffee_ratings %>%
84 | select(coffee_id, total_cup_points, variety, company,
85 | country_of_origin,
86 | altitude_mean_meters,
87 | aroma:moisture) %>%
88 | pivot_longer(aroma:cupper_points, names_to = "metric", values_to = "value")
89 |
90 | coffee_metrics %>%
91 | mutate(metric = fct_reorder(metric, value)) %>%
92 | ggplot(aes(value, metric)) +
93 | geom_density_ridges()
94 |
95 | coffee_metrics %>%
96 | group_by(metric) %>%
97 | summarize(average = mean(value),
98 | sd = sd(value)) %>%
99 | arrange(desc(average))
100 | ```
101 |
102 | ```{r}
103 | library(widyr)
104 | library(ggraph)
105 | library(igraph)
106 | library(tidytext)
107 |
108 | correlations <- coffee_metrics %>%
109 | pairwise_cor(metric, coffee_id, value, sort = TRUE)
110 |
111 | correlations %>%
112 | head(50) %>%
113 | graph_from_data_frame() %>%
114 | ggraph() +
115 | geom_edge_link(aes(edge_alpha = correlation)) +
116 | geom_node_point() +
117 | geom_node_text(aes(label = name), repel = TRUE)
118 |
119 | coffee_metrics %>%
120 | filter(!metric %in% c("sweetness", "clean_cup", "uniformity")) %>%
121 | group_by(metric) %>%
122 | mutate(centered = value - mean(value)) %>%
123 | ungroup() %>%
124 |   widely_svd(metric, coffee_id, centered) %>%  # use the centered values computed above
125 | filter(between(dimension, 2, 5)) %>%
126 | mutate(metric = reorder_within(metric, value, dimension)) %>%
127 | ggplot(aes(value, metric)) +
128 | geom_col() +
129 | scale_y_reordered() +
130 | facet_wrap(~ dimension, scales = "free_y")
131 | ```
132 |
133 | ```{r}
134 |
135 | coffee_ratings %>%
136 | filter(altitude_mean_meters < 10000,
137 | altitude != 1) %>%
138 | mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>%
139 | ggplot(aes(altitude_mean_meters, total_cup_points)) +
140 | geom_point() +
141 | geom_smooth(method = "lm")
142 |
143 | coffee_metrics %>%
144 | filter(altitude_mean_meters < 10000) %>%
145 | mutate(altitude_mean_meters = pmin(altitude_mean_meters, 3000)) %>%
146 | mutate(km = altitude_mean_meters / 1000) %>%
147 | group_by(metric) %>%
148 | summarize(correlation = cor(altitude_mean_meters, value),
149 | model = list(lm(value ~ km))) %>%
150 | mutate(tidied = map(model, broom::tidy, conf.int = TRUE)) %>%
151 | unnest(tidied) %>%
152 | filter(term == "km") %>%
153 | ungroup() %>%
154 | mutate(metric = fct_reorder(metric, estimate)) %>%
155 | ggplot(aes(estimate, metric, color = p.value < .05)) +
156 | geom_point() +
157 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high), height = .1) +
158 | labs(y = "Evaluation of coffee",
159 | x = "Each kilometer of altitude contributes this much to score (95% confidence interval)")
160 | ```
161 |
162 |
163 |
164 |
165 |
--------------------------------------------------------------------------------
/crop-yields-shiny/app.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: 'Our World in Data: Crop Yields'
3 | runtime: shiny
4 | output: html_document
5 | ---
6 |
7 | ```{r setup, include=FALSE}
8 | knitr::opts_chunk$set(echo = FALSE, message = FALSE, warning = FALSE)
9 | ```
10 |
11 | ```{r}
12 | library(dplyr)
13 | library(ggplot2)
14 | library(forcats)
15 | library(shiny)
16 | library(stringr)
17 | library(plotly)
18 | theme_set(theme_light())
19 |
20 | # Data cleaning and processing
21 | yields_tidy <- readRDS("yields_tidy.rds")
22 |
23 | top_crops <- yields_tidy %>%
24 | count(crop, sort = TRUE) %>%
25 | head(9) %>%
26 | pull(crop)
27 | ```
28 |
29 | ```{r}
30 | plot_yields <- function(tbl, facet_scales = "fixed") {
31 | g <- tbl %>%
32 | mutate(crop = fct_reorder(crop, -yield)) %>%
33 | mutate(entity = fct_reorder(entity, -yield)) %>%
34 | ggplot(aes(year, yield, color = entity)) +
35 | geom_line() +
36 | expand_limits(y = 0) +
37 | facet_wrap(~ crop, scales = facet_scales) +
38 | labs(x = "Year",
39 | y = "Yield (tonnes per hectare)",
40 | title = "Crop yields over time",
41 | color = "Country")
42 |
43 | ggplotly(g)
44 | }
45 |
46 | # yields_tidy %>%
47 | # filter(code %in% c("USA", "IND")) %>%
48 | # plot_yields()
49 | ```
50 |
51 |
52 | ```{r inputs, echo=FALSE}
53 | inputPanel(
54 | selectInput("entity",
55 | label = "Country/Continent/Region:",
56 | choices = unique(yields_tidy$entity),
57 | selected = c("United States", "India"),
58 | selectize = TRUE,
59 | multiple = TRUE),
60 | selectInput("crop",
61 | label = "Crops:",
62 | choices = unique(yields_tidy$crop),
63 | selected = top_crops,
64 | selectize = TRUE,
65 | multiple = TRUE),
66 | radioButtons("facet_scales",
67 | label = "",
68 | choices = c("Free y-axis" = "free_y",
69 | "Shared y-axis" = "fixed"))
70 | )
71 |
72 | renderPlotly({
73 | yields_tidy %>%
74 | filter(entity %in% input$entity,
75 | crop %in% input$crop) %>%
76 | plot_yields(facet_scales = input$facet_scales)
77 | })
78 | ```
79 |
--------------------------------------------------------------------------------
/crop-yields-shiny/yields_tidy.rds:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tacookson/data-screencasts/388a9cfa794a67d5cbaa3b9b860520c0a3e0868e/crop-yields-shiny/yields_tidy.rds
--------------------------------------------------------------------------------
/data-screencasts.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: Default
4 | SaveWorkspace: Default
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 |
--------------------------------------------------------------------------------
/french-trains.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "French Trains"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(scales)
13 | theme_set(theme_light())
14 |
15 | full_trains <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-02-26/full_trains.csv") %>%
16 | mutate(pct_late_at_departure = num_late_at_departure / total_num_trips,
17 | arrival_station = str_to_title(arrival_station),
18 | departure_station = str_to_title(departure_station),
19 | date = as.Date(sprintf("%d-%02d-01", year, month))) %>%
20 | arrange(departure_station, arrival_station, month) %>%
21 | fill(service)
22 | ```
23 |
24 | ```{r}
25 | november_2018 <- full_trains %>%
26 | filter(year == 2018, month == 11)
27 |
28 | november_2018 %>%
29 | ggplot(aes(pct_late_at_departure)) +
30 | geom_histogram(binwidth = .05) +
31 | scale_x_continuous(labels = percent_format())
32 |
33 | november_2018 %>%
34 | mutate(departure_station = fct_lump(departure_station, 3)) %>%
35 | ggplot(aes(departure_station, pct_late_at_departure)) +
36 | geom_boxplot() +
37 | scale_y_continuous(labels = percent_format())
38 |
39 | november_2018 %>%
40 | # mutate(arrival_station = fct_infreq(fct_lump(arrival_station, prop = .01))) %>%
41 | # mutate(departure_station = fct_infreq(fct_lump(departure_station, prop = .01))) %>%
42 | mutate(arrival_station = fct_reorder(fct_lump(arrival_station, prop = .01), pct_late_at_departure)) %>%
43 | mutate(departure_station = fct_reorder(fct_lump(departure_station, prop = .01), pct_late_at_departure)) %>%
44 | group_by(arrival_station, departure_station) %>%
45 | summarize(pct_late_at_departure = sum(num_late_at_departure) / sum(total_num_trips)) %>%
46 | ggplot(aes(arrival_station, departure_station, fill = pct_late_at_departure)) +
47 | geom_tile() +
48 | scale_fill_gradient2(low = "blue", high = "red", midpoint = .25, labels = percent_format()) +
49 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
50 | labs(x = "Arrival station",
51 | y = "Departure station",
52 | fill = "% late at departure",
53 | title = "Which routes have the most delayed trains in November 2018?",
54 | subtitle = "Stations with only one arriving/departing route were lumped into 'Other'")
55 | ```
56 |
57 | ### Over time
58 |
59 | ```{r}
60 | full_trains %>%
61 | filter(departure_station == "Lyon Part Dieu") %>%
62 | ggplot(aes(date, pct_late_at_departure, color = arrival_station)) +
63 | geom_line() +
64 | scale_y_continuous(labels = percent_format()) +
65 | expand_limits(y = 0)
66 |
67 | by_departure_station_month <- full_trains %>%
68 | group_by(departure_station = fct_lump(departure_station, prop = .01),
69 | date) %>%
70 | summarize_at(vars(contains("num")), sum) %>%
71 | ungroup() %>%
72 | mutate(pct_late_at_departure = num_late_at_departure / total_num_trips)
73 |
74 | by_departure_station_month %>%
75 | mutate(departure_station = fct_reorder(departure_station, -pct_late_at_departure, last)) %>%
76 | ggplot(aes(date, pct_late_at_departure, color = departure_station)) +
77 | geom_line() +
78 | scale_y_continuous(labels = percent_format()) +
79 | labs(x = "Month",
80 | y = "% late at departure",
81 | color = "Departure station")
82 | ```
83 |
84 | ```{r}
85 | by_departure_station_month <- full_trains %>%
86 | group_by(departure_station = ifelse(service == "International",
87 | paste0(departure_station, " (International)"),
88 | departure_station),
89 | service,
90 | year,
91 | month = fct_reorder(month.name[month], month)) %>%
92 | summarize_at(vars(contains("num")), sum) %>%
93 | ungroup() %>%
94 | mutate(pct_late_at_departure = num_late_at_departure / total_num_trips)
95 |
96 | by_departure_station_month %>%
97 | mutate(departure_station = fct_reorder(departure_station, (service != "International") + pct_late_at_departure, mean)) %>%
98 | ggplot(aes(month, departure_station, fill = pct_late_at_departure)) +
99 | geom_tile() +
100 | scale_fill_gradient2(low = "blue", high = "red", midpoint = .25, labels = percent_format()) +
101 | facet_wrap(~ year, nrow = 1, scales = "free_x") +
102 | theme(axis.text.x = element_text(angle = 90, hjust = 1),
103 | axis.ticks = element_blank(),
104 | panel.grid = element_blank()) +
105 | labs(fill = "% late at departure") +
106 | labs(x = "Month",
107 | y = "Departure station",
108 | title = "Which stations had delays in which months?",
109 | subtitle = "Ordered by the average delay, with international routes on the bottom")
110 | ```
111 |
112 |
--------------------------------------------------------------------------------
/gdpr.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | name: metrics_gdpr_violations
3 | owner: admiral.david@gmail.com
4 | metrics:
5 | nb_violations:
6 | title: Number of GDPR Violations
7 | description: Number of GDPR Violations in this time period.
8 | nb_total_fine:
9 | title: Total Fine (Euros)
10 | description: Total fine across violations in Euros. (TODO)
11 | dimensions:
12 | country:
13 | title: Country
14 | description: France/Germany/etc
15 | article_title:
16 | title: Article
17 | description: GDPR Article
18 | type:
19 | title: Type
20 | description: Type of violation
21 | ---
22 |
23 | ```{r setup, include=FALSE}
24 | knitr::opts_chunk$set(echo = TRUE)
25 | ```
26 |
27 | ```{r}
28 | library(tidyverse)
29 | library(lubridate)
30 | library(scales)
31 | theme_set(theme_light())
32 |
33 |
34 | tuesdata <- tidytuesdayR::tt_load('2020-04-21')
35 |
36 | gdpr_violations <- readr::read_tsv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv') %>%
37 |   mutate(date = na_if(mdy(date), as.Date("1970-01-01"))) %>%
38 | rename(country = name)
39 | ```
40 |
41 | ```{r}
42 | gdpr_violations %>%
43 | summarize(total = sum(price))
44 |
45 | gdpr_violations %>%
46 | count(country = fct_lump(country, 8, w = price),
47 | sort = TRUE, wt = price, name = "total_price") %>%
48 | mutate(country = fct_reorder(country, total_price)) %>%
49 | ggplot(aes(total_price, country)) +
50 | geom_col() +
51 | scale_x_continuous(labels = dollar_format())
52 | ```
53 |
54 | ```{r}
55 | gdpr_violations %>%
56 | count(month = floor_date(date, "month"),
57 | country = fct_lump(country, 6, w = price),
58 | sort = TRUE, wt = price, name = "total_price") %>%
59 | mutate(country = fct_reorder(country, -total_price, sum)) %>%
60 | ggplot(aes(month, total_price, fill = country)) +
61 | geom_col() +
62 | scale_y_continuous(labels = dollar_format()) +
63 | labs(x = "Time",
64 | y = "Total fines",
65 | fill = "Country")
66 | ```
67 |
68 | ```{r}
69 | gdpr_violations %>%
70 | count(source, country, sort = TRUE)
71 |
72 | gdpr_violations %>%
73 | select(controller, date, article_violated, type, summary, price) %>%
74 | mutate(summary = str_trunc(summary, 140)) %>%
75 | arrange(desc(price)) %>%
76 | mutate(price = dollar(price)) %>%
77 | head(10)
78 | ```
79 |
80 | ```{r}
81 | gdpr_violations %>%
82 | mutate(type = fct_lump(type, 8, w = price),
83 | type = fct_reorder(type, price),
84 | country = fct_lump(country, 5)) %>%
85 | ggplot(aes(price, type)) +
86 | geom_boxplot() +
87 | geom_jitter(aes(color = country), width = 0, height = .25) +
88 | scale_x_log10(labels = dollar_format())
89 | ```
90 |
91 | ### Which article was violated?
92 |
93 | ```{r}
94 | gdpr_text <- tuesdata$gdpr_text  # moved up from a later chunk; needed before its first use here
95 |
96 | article_titles <- gdpr_text %>%
95 | distinct(article, article_title)
96 |
97 | separated_articles <- gdpr_violations %>%
98 | separate_rows(article_violated, sep = "\\|") %>%
99 | extract(article_violated, "article", "Art\\. ?(\\d+)", convert = TRUE, remove = FALSE) %>%
100 | left_join(article_titles, by = "article") %>%
101 | mutate(article_title = paste0(article, ". ", str_trunc(article_title, 50)),
102 | article_title = ifelse(is.na(article), "Unknown", article_title)) %>%
103 | add_count(id) %>%
104 | mutate(price_per_article = price / n)
105 |
106 | separated_articles %>%
107 | group_by(article = fct_lump(article_title, 8, w = price)) %>%
108 | summarize(total_fine = sum(price_per_article),
109 | violations = n()) %>%
110 | arrange(desc(total_fine)) %>%
111 | mutate(article = fct_reorder(article, total_fine)) %>%
112 | ggplot(aes(total_fine, article)) +
113 | geom_col() +
114 | scale_x_continuous(labels = dollar_format()) +
115 | labs(title = "What articles got the most fines?",
116 | y = "",
117 | x = "Total fine")
118 | ```
119 |
124 | ```{r}
125 | gdpr_violations %>%
126 | filter(str_detect(controller, "Vodafone")) %>%
127 | group_by(date, country) %>%
128 | summarize(violations = n(),
129 | total_fine = sum(price)) %>%
130 | ggplot(aes(date, total_fine, size = violations, color = country)) +
131 | geom_point() +
132 | scale_y_continuous(labels = dollar_format()) +
133 | scale_size_continuous(guide = FALSE) +
134 | labs(title = "Vodafone's GDPR violations",
135 | color = "",
136 | x = "Time",
137 | y = "Total fine on this day")
138 | ```
139 |
140 | ```{r}
141 | library(tidymetrics)
142 |
143 | summarized <- separated_articles %>%
144 | filter(!is.na(date)) %>%
145 | mutate(country = fct_lump(country, 6, w = price_per_article),
146 | article_title = fct_lump(article_title, 6, w = price_per_article),
147 | type = fct_lump(type, 6, w = price_per_article)) %>%
148 | cross_by_dimensions(country, article_title, type) %>%
149 | cross_by_periods(c("month", "quarter")) %>%
150 | summarize(nb_violations = n_distinct(id),
151 | nb_total_fine = sum(price_per_article)) %>%
152 | ungroup()
153 |
154 | gdpr_metrics <- create_metrics(summarized)
155 | ```
156 |
157 | ```{r}
158 | library(shinymetrics)
159 |
160 | preview_metric(gdpr_metrics$gdpr_violations_nb_violations)
161 | ```
162 |
163 |
164 |
165 |
166 |
--------------------------------------------------------------------------------
/golden-age-tv.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | tv_ratings <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-08/IMDb_Economist_tv_ratings.csv")
15 | ```
16 |
17 | ```{r}
18 | tv_ratings %>%
19 | filter(title == "Buffy the Vampire Slayer")
20 | ```
21 |
22 | ## Do most shows get better or worse over time?
23 |
24 | ```{r}
25 | tv_ratings %>%
26 | count(seasonNumber) %>%
27 | ggplot(aes(seasonNumber, n)) +
28 | geom_line()
29 |
30 | tv_ratings %>%
31 | filter(seasonNumber <= 7) %>%
32 | group_by(seasonNumber) %>%
33 | summarize(av_rating = mean(av_rating)) %>%
34 | ggplot(aes(seasonNumber, av_rating)) +
35 | geom_line()
36 |
37 | tv_ratings_full_show <- tv_ratings %>%
38 | group_by(title) %>%
39 | filter(1 %in% seasonNumber,
40 | all(seasonNumber == dplyr::row_number()))
41 |
42 | tv_ratings_full_show %>%
43 | filter(n() >= 5) %>%
44 | filter(seasonNumber <= 5) %>%
45 | group_by(seasonNumber) %>%
46 | ggplot(aes(factor(seasonNumber), av_rating)) +
47 | geom_boxplot()
48 | ```
49 |
50 | What are the highest rated seasons of TV?
51 |
52 | ```{r}
53 | by_show <- tv_ratings %>%
54 | group_by(title) %>%
55 | summarize(avg_rating = mean(av_rating),
56 | sd = sd(av_rating),
57 | seasons = n()) %>%
58 | arrange(desc(avg_rating))
59 |
60 | most_variable_shows <- by_show %>%
61 | filter(seasons >= 5) %>%
62 | top_n(6, sd)
63 |
64 | tv_ratings %>%
65 | inner_join(most_variable_shows, by = "title") %>%
66 | ggplot(aes(seasonNumber, av_rating, color = title)) +
67 | geom_line() +
68 | geom_point() +
69 | scale_x_continuous(breaks = function(lim) seq(floor(lim[1]), ceiling(lim[2]), 2)) +
70 | facet_wrap(~ title) +
71 | theme(legend.position = "none")
72 | ```
73 |
74 | ### Has TV been getting better or worse over time?
75 |
76 | ```{r}
77 | library(lubridate)
78 |
79 | tv_ratings %>%
80 | mutate(year = year(date)) %>%
81 | ggplot(aes(av_rating)) +
82 | geom_histogram() +
83 | facet_wrap(~ year)
84 |
85 | by_year <- tv_ratings %>%
86 | group_by(year = 2 * (year(date) %/% 2)) %>%
87 | summarize(mean_rating = mean(av_rating),
88 | median_rating = median(av_rating),
89 | mean_season_1 = mean(av_rating[seasonNumber == 1]),
90 | avg_season = mean(seasonNumber),
91 | sd = sd(av_rating),
92 | observations = n())
93 |
94 | by_year %>%
95 | gather(metric, value, -year, -observations, -avg_season, -sd) %>%
96 | ggplot(aes(year, value, color = metric)) +
97 | geom_line() +
98 | geom_point()
99 |
100 | tv_ratings %>%
101 | group_by(year = 5 * (year(date) %/% 5),
102 | seasonNumber = ifelse(seasonNumber >= 4, "4+", seasonNumber)) %>%
103 | summarize(mean_rating = mean(av_rating),
104 | observations = n()) %>%
105 | ggplot(aes(year, mean_rating, color = seasonNumber)) +
106 | geom_line() +
107 | labs(color = "Season",
108 | y = "Average rating in each 5 year period")
109 |
110 | tv_ratings %>%
111 | ggplot(aes(date, av_rating)) +
112 | geom_point() +
113 | geom_smooth(method = "loess")
114 |
115 | tv_ratings %>%
116 | mutate(year = 2 * (year(date) %/% 2)) %>%
117 | ggplot(aes(year, av_rating, group = year)) +
118 | geom_boxplot()
119 |
120 | tv_ratings %>%
121 | filter(seasonNumber == 1) %>%
122 | mutate(year = 2 * (year(date) %/% 2)) %>%
123 | ggplot(aes(year, av_rating, group = year)) +
124 | geom_boxplot()
125 | ```
126 |
127 | ### Show survival
128 |
129 | If season 1 is good, what's the probability they get a season 2?
130 |
131 | ```{r}
132 | first_three_seasons <- tv_ratings %>%
133 | filter(seasonNumber <= 3) %>%
134 | group_by(title) %>%
135 | mutate(date_first_season = min(date)) %>%
136 | ungroup() %>%
137 | transmute(titleId,
138 | title,
139 | date_first_season,
140 | seasonNumber = paste0("season", seasonNumber),
141 | av_rating) %>%
142 | distinct(title, seasonNumber, .keep_all = TRUE) %>%
143 | spread(seasonNumber, av_rating) %>%
144 | filter(!is.na(season1)) %>%
145 | filter(date_first_season <= "2017-01-01")
146 |
147 | library(broom)
148 |
149 | glm(!is.na(season2) ~ season1, data = first_three_seasons) %>%
150 | summary()
151 | ```
152 |
153 | ```{r}
154 | first_three_seasons %>%
155 | group_by(season1_bin = cut(season1, c(0, 7, 7.5, 8, 8.5, 10)),
156 | time_bin = ifelse(date_first_season < "2000-01-01", "Before 2000", "After 2000")) %>%
157 | summarize(had_second_season = mean(!is.na(season2)),
158 | observations = n()) %>%
159 | ggplot(aes(season1_bin,
160 | had_second_season,
161 | color = time_bin,
162 | group = time_bin)) +
163 | geom_line() +
164 | geom_point() +
165 | scale_y_continuous(labels = scales::percent_format())
166 | ```
167 |
168 | ```{r}
169 | library(broom)
170 |
171 | new_data <- crossing(
172 | year = 1990:2018,
173 | season1 = seq(6, 9)
174 | )
175 |
176 | library(splines)
177 |
178 | mod <- first_three_seasons %>%
179 | mutate(year = year(date_first_season),
180 | had_second_season = !is.na(season2)) %>%
181 | glm(had_second_season ~ season1 * year, data = ., family = "binomial")
182 |
183 | summary(mod)
184 |
185 | mod %>%
186 | augment(newdata = new_data, type.predict = "response") %>%
187 | ggplot(aes(year, .fitted, color = factor(season1))) +
188 | geom_line() +
189 | scale_y_continuous(labels = scales::percent_format()) +
190 | labs(title = "Probability of getting season 2 has become more dependent on ratings of season 1",
191 | color = "Season 1 rating",
192 | x = "Year show premiered",
193 | y = "Predicted probability of getting season 2")
194 | ```
195 |
196 |
197 |
198 | Conclusion: whether a show gets a second season depends on the ratings of its first season, and that dependence has grown stronger for shows that premiered more recently.
199 |
200 |
201 |
202 |
--------------------------------------------------------------------------------
/honeycomb-puzzle.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 |
13 | words <- tibble(word = read_lines("https://norvig.com/ngrams/enable1.txt")) %>%
14 | mutate(word_length = str_length(word)) %>%
15 | filter(word_length >= 4,
16 | !str_detect(word, "s")) %>%
17 | mutate(letters = str_split(word, ""),
18 | letters = map(letters, unique),
19 | unique_letters = lengths(letters)) %>%
20 | mutate(points = ifelse(word_length == 4, 1, word_length) +
21 | 15 * (unique_letters == 7)) %>%
22 | filter(unique_letters <= 7) %>%
23 | arrange(desc(points))
24 |
25 | center_letter <- "g"
26 | other_letters <- c("a", "p", "x", "m", "e", "l")
27 |
28 | get_words <- function(center_letter, other_letters) {
29 | words %>%
30 | filter(str_detect(word, center_letter)) %>%
31 | mutate(invalid_letters = map(letters, setdiff, c(center_letter, other_letters))) %>%
32 | filter(lengths(invalid_letters) == 0) %>%
33 | arrange(desc(points))
34 | }
35 |
36 | library(tidytext)
37 |
38 | letters_unnested <- words %>%
39 | select(word, points) %>%
40 | unnest_tokens(letter, word, token = "characters", drop = FALSE) %>%
41 | distinct(word, letter, .keep_all = TRUE)
42 |
43 | letters_summarized <- letters_unnested %>%
44 | group_by(letter) %>%
45 | summarize(n_words = n(),
46 | n_points = sum(points)) %>%
47 | arrange(desc(n_points))
48 | ```
49 |
50 | ```{r}
51 | word_matrix <- letters_unnested %>%
52 | reshape2::acast(word ~ letter, fun.aggregate = length)
53 |
54 | # Points per word (lines up with rows of word matrix)
55 | points_per_word <- words$points
56 | names(points_per_word) <- words$word
57 | points_per_word <- points_per_word[rownames(word_matrix)]
58 |
59 | get_score <- function(honeycomb_letters) {
60 |   center_letter <- honeycomb_letters[1]  # the first element is the center letter, required in every word
61 |
62 | permitted_letters <- colnames(word_matrix) %in% honeycomb_letters
63 |
64 |   num_forbidden <- word_matrix %*% (1L - permitted_letters)  # counts forbidden letters in each word
65 | word_permitted <- num_forbidden == 0L & word_matrix[, center_letter] == 1L
66 |
67 | sum(points_per_word[word_permitted])
68 | }
69 |
70 | get_score(c("e", "i", "a", "r", "n", "t", "l"))
71 |
72 | get_words("e", c("i", "a", "r", "n", "t", "l"))
73 | ```
74 |
75 | ```{r}
76 | center_letter <- "e"
77 |
78 | find_best_combination <- function(center_letter, possible_letters) {
79 | good_letter_combinations <- combn(possible_letters, 6)
80 |
81 | # Every column is one of the possible honeycombs
82 | forbidden_matrix <- 1L - apply(good_letter_combinations,
83 | 2,
84 | function(.) colnames(word_matrix) %in% c(center_letter, .))
85 |
86 | filtered_word_matrix <- word_matrix[word_matrix[, center_letter] == 1, ]
87 | word_allowed_matrix <- filtered_word_matrix %*% forbidden_matrix == 0
88 | scores <- t(word_allowed_matrix) %*% points_per_word[rownames(word_allowed_matrix)]
89 |
90 | list(center_letter = center_letter,
91 | other_letters = good_letter_combinations[, which.max(scores)],
92 | score = max(scores))
93 | }
94 |
95 | pool <- head(letters_summarized$letter, 16)
96 |
97 | find_best_combination("e", setdiff(pool, "e"))
98 | find_best_combination("i", setdiff(pool, "i"))
99 | find_best_combination("a", setdiff(pool, "a"))
100 | find_best_combination("r", setdiff(pool, "r"))
101 | find_best_combination("n", setdiff(pool, "n"))
102 | find_best_combination("t", setdiff(pool, "t"))
103 | find_best_combination("g", setdiff(pool, "g"))
104 |
105 | get_score(c("r", "e", "i", "a", "n", "t", "g"))
106 | ```
107 |
108 | ```{r}
109 | # Stepping through get_score() manually for one example honeycomb (center letter first)
110 | honeycomb_letters <- c(center_letter, other_letters)
111 |
112 | permitted_letters <- colnames(word_matrix) %in% honeycomb_letters
110 |
111 | num_forbidden <- word_matrix %*% (1L - permitted_letters)
112 | word_permitted <- num_forbidden == 0L & word_matrix[, center_letter] == 1L
113 |
114 | sum(points_per_word[word_permitted])
115 | ```
116 |
117 |
118 | ```{r}
119 | words %>%
120 | unnest(letters) %>%
121 | group_by(letters) %>%
122 |   summarize(total_points = sum(points)) %>%
123 |   arrange(desc(total_points))
124 | ```
125 |
126 | ```{r}
127 | words
128 | ```
129 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
/horror-movie-ratings.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | horror_movies_raw <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-22/horror_movies.csv")
15 | ```
16 |
17 | ```{r}
18 | horror_movies <- horror_movies_raw %>%
19 | arrange(desc(review_rating)) %>%
20 | extract(title, "year", "\\((\\d\\d\\d\\d)\\)$", remove = FALSE, convert = TRUE) %>%
21 | mutate(budget = parse_number(budget)) %>%
22 | separate(plot, c("director", "cast_sentence", "plot"), extra = "merge", sep = "\\. ", fill = "right") %>%
23 | distinct(title, .keep_all = TRUE)
24 | ```
25 |
26 | Most of the movies are from 2012 onward.
27 |
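A one-line check of that claim, counting titles by the year extracted above:

```{r}
# Distribution of release years; most titles cluster from 2012 on
horror_movies %>%
  count(year, sort = TRUE)
```
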
28 | ```{r}
29 | horror_movies %>%
30 | count(genres, sort = TRUE)
31 |
32 | horror_movies %>%
33 | count(language, sort = TRUE)
34 |
35 | horror_movies %>%
36 | ggplot(aes(budget)) +
37 | geom_histogram() +
38 | scale_x_log10(labels = scales::dollar)
39 | ```
40 |
41 | Do higher budget movies end up higher rated?
42 |
43 | ```{r}
44 | horror_movies %>%
45 | ggplot(aes(budget, review_rating)) +
46 | geom_point() +
47 | scale_x_log10(labels = scales::dollar) +
48 | geom_smooth(method = "lm")
49 | ```
50 |
51 | No relationship between budget and rating. How about movie rating and review?
52 |
53 | ```{r}
54 | horror_movies %>%
55 | mutate(movie_rating = fct_lump(movie_rating, 5),
56 | movie_rating = fct_reorder(movie_rating, review_rating, na.rm = TRUE)) %>%
57 | ggplot(aes(movie_rating, review_rating)) +
58 | geom_boxplot() +
59 | coord_flip()
60 |
61 | horror_movies %>%
62 | filter(!is.na(movie_rating)) %>%
63 | mutate(movie_rating = fct_lump(movie_rating, 5)) %>%
64 | lm(review_rating ~ movie_rating, data = .) %>%
65 | anova()
66 | ```
67 |
68 | ```{r}
69 | horror_movies %>%
70 | separate_rows(genres, sep = "\\| ") %>%
71 | mutate(genre = fct_lump(genres, 5)) %>%
72 | ggplot(aes(genre, review_rating)) +
73 | geom_boxplot()
74 | ```
75 |
76 | ```{r}
77 | library(tidytext)
78 |
79 | horror_movies_unnested <- horror_movies %>%
80 | unnest_tokens(word, plot) %>%
81 | anti_join(stop_words, by = "word") %>%
82 | filter(!is.na(word))
83 |
84 | horror_movies_unnested %>%
85 | filter(!is.na(review_rating)) %>%
86 | group_by(word) %>%
87 | summarize(movies = n(),
88 | avg_rating = mean(review_rating)) %>%
89 | arrange(desc(movies)) %>%
90 | filter(movies >= 100) %>%
91 | mutate(word = fct_reorder(word, avg_rating)) %>%
92 | ggplot(aes(avg_rating, word)) +
93 | geom_point()
94 | ```
95 |
96 | ### Lasso regression for predicting review rating based on words in plot
97 |
98 | ```{r}
99 | library(glmnet)
100 | library(Matrix)
101 |
102 | movie_word_matrix <- horror_movies_unnested %>%
103 | filter(!is.na(review_rating)) %>%
104 | add_count(word) %>%
105 | filter(n >= 20) %>%
106 | count(title, word) %>%
107 | cast_sparse(title, word, n)
108 |
109 | rating <- horror_movies$review_rating[match(rownames(movie_word_matrix), horror_movies$title)]
110 |
111 | lasso_model <- cv.glmnet(movie_word_matrix, rating)
112 | ```
113 |
114 | ```{r}
115 | library(broom)
116 |
117 | tidy(lasso_model$glmnet.fit) %>%
118 | filter(term %in% c("quickly", "seek", "army", "teacher", "unexpected", "friends", "evil")) %>%
119 | ggplot(aes(lambda, estimate, color = term)) +
120 | geom_line() +
121 | scale_x_log10() +
122 | geom_vline(xintercept = lasso_model$lambda.min) +
123 | geom_hline(yintercept = 0, lty = 2)
124 | ```
125 |
126 | ```{r}
127 | plot(lasso_model)
128 |
129 | tidy(lasso_model$glmnet.fit) %>%
130 | filter(lambda == lasso_model$lambda.min,
131 | term != "(Intercept)") %>%
132 | mutate(term = fct_reorder(term, estimate)) %>%
133 | ggplot(aes(term, estimate)) +
134 | geom_col() +
135 | coord_flip()
136 | ```
137 |
138 | Throwing everything into a linear model: director, cast, genre, rating, plot words.
139 |
140 | ```{r}
141 | features <- horror_movies %>%
142 | filter(!is.na(review_rating)) %>%
143 | select(title, genres, director, cast, movie_rating, language, release_country) %>%
144 | mutate(director = str_remove(director, "Directed by ")) %>%
145 | gather(type, value, -title) %>%
146 | filter(!is.na(value)) %>%
147 | separate_rows(value, sep = "\\| ?") %>%
148 | unite(feature, type, value, sep = ": ") %>%
149 | mutate(n = 1)
150 |
151 | movie_feature_matrix <- horror_movies_unnested %>%
152 | filter(!is.na(review_rating)) %>%
153 | count(title, feature = paste0("word: ", word)) %>%
154 | bind_rows(features) %>%
155 |   add_count(feature, name = "feature_count") %>%  # named explicitly; a column called n already exists here
156 |   filter(feature_count >= 10) %>%
157 | cast_sparse(title, feature)
158 |
159 | rating <- horror_movies$review_rating[match(rownames(movie_feature_matrix), horror_movies$title)]
160 |
161 | feature_lasso_model <- cv.glmnet(movie_feature_matrix, rating)
162 | ```
163 |
164 | ```{r}
165 | plot(feature_lasso_model)
166 |
167 | tidy(feature_lasso_model$glmnet.fit) %>%
168 | filter(lambda == feature_lasso_model$lambda.1se,
169 | term != "(Intercept)") %>%
170 | mutate(term = fct_reorder(term, estimate)) %>%
171 | ggplot(aes(term, estimate)) +
172 | geom_col() +
173 | coord_flip() +
174 | labs(x = "",
175 | y = "Coefficient for predicting movie rating",
176 | title = "What affects a horror movie rating?",
177 | subtitle = "Based on a lasso regression to predict IMDb ratings of ~3000 movies")
178 | ```
179 |
180 | What am I going to watch?
181 |
182 | ```{r}
183 | horror_movies %>%
184 | filter(str_detect(genres, "Comedy"),
185 | !is.na(movie_rating),
186 | !is.na(budget),
187 | movie_rating != "PG") %>%
188 | arrange(desc(review_rating)) %>%
189 | select(title, review_rating, movie_rating, plot, director, budget, language) %>%
190 | View()
191 | ```
192 |
193 |
194 |
195 |
196 |
--------------------------------------------------------------------------------
/malaria.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Malaria Atlas"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | library(malariaAtlas)
15 | ```
16 |
17 | ```{r}
18 | kenya_pr <- as_tibble(malariaAtlas::getPR(ISO = "KEN", species = "BOTH")) %>%  # tbl_df() is deprecated
19 | filter(!is.na(pr))
20 | ```
21 |
22 | ```{r}
23 | kenya_pr %>%
24 | group_by(year_start) %>%
25 | summarize(examined = sum(examined),
26 | positive = sum(positive),
27 | studies = n()) %>%
28 | mutate(pr = positive / examined) %>%
29 | ggplot(aes(year_start, pr)) +
30 | geom_line()
31 |
32 | kenya_pr %>%
33 | mutate(decade = 10 * (year_start %/% 10)) %>%
34 | arrange(pr) %>%
35 | ggplot(aes(longitude, latitude, color = pr)) +
36 | borders("world", regions = "Kenya") +
37 | geom_point() +
38 | scale_color_gradient2(low = "blue", high = "red", midpoint = .5, labels = scales::percent_format()) +
39 | facet_wrap(~ decade) +
40 | theme_void() +
41 | coord_map() +
42 | labs(color = "Prevalence")
43 | ```
44 |
45 | ### Aggregated across countries
46 |
47 | ```{r}
48 | malaria_inc <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-13/malaria_inc.csv")
49 |
50 | malaria_inc_processed <- malaria_inc %>%
51 | setNames(c("country", "code", "year", "incidence")) %>%
52 | mutate(incidence = incidence / 1000)
53 | ```
54 |
55 | ```{r}
56 | malaria_inc_processed %>%
57 | filter(country %in% sample(unique(country), 6)) %>%
58 | ggplot(aes(year, incidence, color = country)) +
59 | geom_line() +
60 | scale_y_continuous(labels = scales::percent_format())
61 | ```
62 |
63 | Looking at 2015 levels and the change from 2000 to 2015
64 |
65 | ```{r}
66 | malaria_spread <- malaria_inc_processed %>%
67 | mutate(year = paste0("Y", year)) %>%
68 | spread(year, incidence)
69 |
70 | malaria_spread %>%
71 | filter(country != "Turkey",
72 | !is.na(code)) %>%
73 | mutate(current = Y2015,
74 | change = Y2015 - Y2000) %>%
75 | ggplot(aes(current, change)) +
76 | geom_point() +
77 | geom_text(aes(label = code), vjust = 1, hjust = 1)
78 | ```
79 |
80 | ```{r}
81 | world <- map_data("world") %>%
82 | filter(region != "Antarctica")
83 |
84 | malaria_inc_processed %>%
85 | filter(incidence < 1) %>%
86 | inner_join(maps::iso3166 %>%
87 | select(a3, mapname), by = c(code = "a3")) %>%
88 | inner_join(world, by = c(mapname = "region")) %>%
89 | ggplot(aes(long, lat, group = group, fill = incidence)) +
90 | geom_polygon() +
91 | scale_fill_gradient2(low = "blue", high = "red", midpoint = .20, labels = scales::percent_format()) +
92 | coord_map() +
93 | facet_wrap(~ year) +
94 | theme_void() +
95 | labs(title = "Malaria incidence over time around the world")
96 | ```
97 |
98 | ### Malaria deaths over time
99 |
100 |
101 | ```{r}
102 | library(tidyverse)
103 |
104 | malaria_deaths <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-13/malaria_deaths.csv")
105 |
106 | malaria_deaths_processed <- malaria_deaths %>%
107 | setNames(c("country", "code", "year", "deaths"))
108 | ```
109 |
110 | ```{r}
111 | malaria_deaths_processed %>%
112 | filter(country %in% sample(unique(country), 6)) %>%
113 | ggplot(aes(year, deaths, color = country)) +
114 | geom_line() +
115 | labs(y = "Deaths per 100,000")
116 | ```
117 |
118 | ```{r}
119 | library(fuzzyjoin)
120 | library(stringr)
121 |
122 | malaria_country_data <- malaria_deaths_processed %>%
123 | inner_join(maps::iso3166 %>%
124 | select(a3, mapname), by = c(code = "a3")) %>%
125 | mutate(mapname = str_remove(mapname, "\\(.*"))
126 |
127 | malaria_map_data <- map_data("world") %>%
128 | filter(region != "Antarctica") %>%
129 |   as_tibble() %>%
130 | inner_join(malaria_country_data, by = c(region = "mapname"))
131 |
132 | malaria_map_data %>%
133 | ggplot(aes(long, lat, group = group, fill = deaths)) +
134 | geom_polygon() +
135 | scale_fill_gradient2(low = "blue", high = "red", midpoint = 100) +
136 | theme_void() +
137 | labs(title = "Malaria deaths over time around the world",
138 | fill = "Deaths per 100,000")
139 | ```
140 |
141 | ```{r}
142 | library(gganimate)
143 | library(countrycode)
144 |
145 | malaria_map_data %>%
146 | mutate(continent = countrycode(code, "iso3c", "continent")) %>%
147 | filter(continent == "Africa") %>%
148 | ggplot(aes(long, lat, group = group, fill = deaths)) +
149 | geom_polygon() +
150 | scale_fill_gradient2(low = "blue", high = "red", midpoint = 100) +
151 | theme_void() +
152 | labs(title = "Malaria deaths over time in Africa ({ current_frame })",
153 | fill = "Deaths per 100,000") +
154 | transition_manual(year)
155 |
156 | anim_save("malaria_map.gif")
157 | ```
158 |
159 |
--------------------------------------------------------------------------------
/media-franchises.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Media Franchise Revenues"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | media_franchises <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-07-02/media_franchises.csv")
15 | ```
16 |
17 | ```{r}
18 | media_franchises %>%
19 | count(franchise, sort = TRUE)
20 |
21 | franchises <- media_franchises %>%
22 | group_by(franchise, original_media, year_created, creators, owners) %>%
23 | summarize(categories = n(),
24 | total_revenue = sum(revenue),
25 | most_profitable = revenue_category[which.max(revenue)]) %>%
26 | ungroup()
27 |
28 | franchises
29 |
30 | media_franchises
31 | ```
32 |
33 | ### Franchise-level
34 |
35 | What are the biggest franchises?
36 |
37 | ```{r}
38 | library(glue)
39 |
40 | top_franchises <- franchises %>%
41 | mutate(franchise = glue("{ franchise } ({ year_created })")) %>%
42 | top_n(20, total_revenue)
43 |
44 | media_franchises %>%
45 | mutate(franchise = glue("{ franchise } ({ year_created })")) %>%
46 | semi_join(top_franchises, by = "franchise") %>%
47 | mutate(franchise = fct_reorder(franchise, revenue, sum),
48 | revenue_category = fct_reorder(revenue_category, revenue, sum)) %>%
49 | ggplot(aes(franchise, revenue)) +
50 | geom_col(aes(fill = revenue_category)) +
51 | geom_text(aes(y = total_revenue,
52 | label = paste0(scales::dollar(total_revenue, accuracy = 1), "B")),
53 | data = top_franchises,
54 | hjust = 0) +
55 | scale_y_continuous(labels = scales::dollar) +
56 | expand_limits(y = 100) +
57 | coord_flip() +
58 | theme(panel.grid.major.y = element_blank()) +
59 | guides(fill = guide_legend(reverse = TRUE)) +
60 | labs(title = "What are the most profitable franchises of all time?",
61 | fill = "Category",
62 | x = "",
63 | y = "Revenue (Billions)")
64 | ```
65 |
66 | ```{r}
67 | media_franchises %>%
68 | group_by(owners) %>%
69 | filter(n_distinct(franchise) > 2) %>%
70 | ungroup() %>%
71 | mutate(franchise = fct_reorder(franchise, revenue, sum),
72 | owners = fct_reorder(owners, -revenue, sum),
73 | revenue_category = fct_reorder(revenue_category, revenue, sum)) %>%
74 | ggplot(aes(franchise, revenue, fill = revenue_category)) +
75 | geom_col() +
76 | facet_wrap(~ owners, scales = "free_y") +
77 | guides(fill = guide_legend(reverse = TRUE)) +
78 | coord_flip() +
79 | labs(title = "What companies own at least 3 franchises?",
80 | fill = "Category",
81 | x = "",
82 | y = "Revenue (Billions)")
83 | ```
84 |
85 | ```{r}
86 | franchises %>%
87 | ggplot(aes(year_created, total_revenue)) +
88 | geom_point(aes(size = total_revenue, color = original_media)) +
89 | geom_text(aes(label = franchise), check_overlap = TRUE, vjust = 1, hjust = 1) +
90 | expand_limits(x = 1910) +
91 | labs(title = "When were the 'great' franchises created?")
92 | ```
93 |
94 | ```{r}
95 | media_franchises %>%
96 | group_by(original_media) %>%
97 | summarize(revenue = sum(revenue))
98 |
99 | original_media_revenue_categories <- media_franchises %>%
100 | group_by(original_media) %>%
101 | filter(sum(revenue) >= 45) %>%
102 | group_by(original_media, revenue_category) %>%
103 | summarize(revenue = sum(revenue)) %>%
104 | ungroup() %>%
105 | mutate(revenue_category = fct_reorder(revenue_category, revenue, sum),
106 | original_media = fct_reorder(original_media, -revenue, sum))
107 |
108 | original_media_revenue_categories %>%
109 | ggplot(aes(revenue_category, revenue)) +
110 | geom_col() +
111 | scale_y_continuous(labels = scales::dollar) +
112 | coord_flip() +
113 | facet_wrap(~ original_media) +
114 | labs(x = "Revenue category",
115 | y = "Revenue (Billions)",
116 | title = "What kinds of media lead to what types of revenue?")
117 | ```
118 |
119 | ```{r}
120 | original_media_revenue_categories %>%
121 | mutate(revenue_category = fct_rev(revenue_category),
122 | original_media = fct_rev(original_media)) %>%
123 | ggplot(aes(revenue_category, original_media, fill = revenue)) +
124 | geom_tile() +
125 | scale_fill_gradient2(low = "white", high = "red", labels = scales::dollar) +
126 | theme(panel.grid = element_blank(),
127 | axis.text.x = element_text(angle = 90, hjust = 1)) +
128 | labs(fill = "Revenue (billions)")
129 | ```
130 |
131 |
132 |
--------------------------------------------------------------------------------
/medium-datasci.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 |
13 | medium_datasci <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-12-04/medium_datasci.csv")
14 |
15 | theme_set(theme_light())
16 | ```
17 |
18 | ```{r}
19 | medium_processed <- medium_datasci %>%
20 | select(-x1) %>%
21 | mutate(post_id = row_number())
22 | ```
23 |
24 | ```{r}
25 | medium_processed %>%
26 | count(author, sort = TRUE)
27 |
28 | medium_processed %>%
29 | summarize_at(vars(starts_with("tag_")), sum)
30 |
31 | medium_gathered <- medium_processed %>%
32 | gather(tag, value, starts_with("tag")) %>%
33 | mutate(tag = str_remove(tag, "tag_")) %>%
34 | filter(value == 1)
35 |
36 | medium_gathered %>%
37 | count(tag, sort = TRUE)
38 |
39 | medium_gathered %>%
40 | group_by(tag) %>%
41 | summarize(median_claps = median(claps)) %>%
42 | arrange(desc(median_claps))
43 |
44 | medium_processed %>%
45 | ggplot(aes(claps)) +
46 | geom_histogram() +
47 | scale_x_log10(labels = scales::comma_format())
48 |
49 | medium_processed %>%
50 | mutate(reading_time = pmin(10, reading_time)) %>%
51 | ggplot(aes(reading_time)) +
52 | geom_histogram(binwidth = .5) +
53 | scale_x_continuous(breaks = seq(2, 10, 2),
54 | labels = c(seq(2, 8, 2), "10+")) +
55 | labs(x = "Medium reading time")
56 |
57 | medium_gathered %>%
58 | group_by(tag) %>%
59 | summarize(reading_time = mean(reading_time)) %>%
60 | arrange(desc(reading_time))
61 | ```
62 |
63 | ### Text mining
64 |
65 | ```{r}
66 | library(tidytext)
67 |
68 | medium_words <- medium_processed %>%
69 | filter(!is.na(title)) %>%
70 | select(post_id, title, subtitle, year, reading_time, claps) %>%
71 | unnest_tokens(word, title) %>%
72 | anti_join(stop_words, by = "word") %>%
73 | filter(!(word %in% c("de", "en", "la", "para")),
74 | str_detect(word, "[a-z]"))
75 |
76 | medium_words %>%
77 | count(word, sort = TRUE) %>%
78 | mutate(word = fct_reorder(word, n)) %>%
79 | head(20) %>%
80 | ggplot(aes(word, n)) +
81 | geom_col() +
82 | coord_flip() +
83 | labs(title = "Common words in Medium post titles")
84 | ```
85 |
86 | ```{r}
87 | medium_words_filtered <- medium_words %>%
88 | add_count(word) %>%
89 | filter(n >= 250)
90 |
91 | tag_claps <- medium_words_filtered %>%
92 | group_by(word) %>%
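93 | # geometric mean damps outlier posts; the +1 shift handles zero-clap posts (my added note)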
93 | summarize(median_claps = median(claps),
94 | geometric_mean_claps = exp(mean(log(claps + 1))) - 1,
95 | occurences = n()) %>%
96 | arrange(desc(median_claps))
97 |
98 | library(widyr)
99 | library(ggraph)
100 | library(igraph)
101 |
102 | top_word_cors <- medium_words_filtered %>%
103 | select(post_id, word) %>%
104 | pairwise_cor(word, post_id, sort = TRUE) %>%
105 | head(150)
106 |
107 | vertices <- tag_claps %>%
108 | filter(word %in% top_word_cors$item1 |
109 | word %in% top_word_cors$item2)
110 |
111 | set.seed(2018)
112 |
113 | top_word_cors %>%
114 | graph_from_data_frame(vertices = vertices) %>%
115 | ggraph() +
116 | geom_edge_link() +
117 | geom_node_point(aes(size = occurences * 1.1)) +
118 | geom_node_point(aes(size = occurences,
119 | color = geometric_mean_claps)) +
120 | geom_node_text(aes(label = name), repel = TRUE) +
121 | scale_color_gradient2(low = "blue",
122 | high = "red",
123 | midpoint = 10) +
124 | theme_void() +
125 | labs(title = "What gets claps in Medium article titles?",
126 | subtitle = "Color shows the geometric mean of # of claps on articles with this word in the title",
127 | size = "# of occurrences",
128 | color = "Claps")
129 | ```
130 |
131 | ### Predicting # of claps based on title + tag
132 |
133 | ```{r}
134 | # turn into a sparse matrix
135 | post_word_matrix <- medium_words_filtered %>%
136 | distinct(post_id, word, claps) %>%
137 | cast_sparse(post_id, word)
138 |
139 | # Fit a LASSO model
140 | library(glmnet)
141 |
142 | claps <- medium_processed$claps[match(rownames(post_word_matrix), medium_processed$post_id)]
143 |
144 | lasso_model <- cv.glmnet(post_word_matrix, log(claps + 1))
145 | ```
146 |
147 | ```{r}
148 | library(broom)
149 |
150 | tidy(lasso_model$glmnet.fit) %>%
151 | filter(term %in% c("hadoop", "learning", "gdpr", "deep", "startup", "marketing")) %>%
152 | ggplot(aes(lambda, estimate, color = term)) +
153 | geom_line() +
154 | scale_x_log10()
155 |
156 | tidy(lasso_model$glmnet.fit) %>%
157 | filter(lambda == lasso_model$lambda.min) %>%
158 | arrange(desc(estimate)) %>%
159 | View()
160 | ```
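161 | 
162 | One way to use the fitted model (a sketch I'm adding, not from the original analysis) is to predict log-claps for the training posts at the cross-validated lambda and back-transform:
163 | 
164 | ```{r}
165 | # Predicted claps at the CV-chosen lambda (my added sketch)
166 | predictions <- predict(lasso_model, newx = post_word_matrix, s = "lambda.min")
167 | summary(exp(predictions) - 1)
168 | ```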
161 |
162 |
--------------------------------------------------------------------------------
/nyc-pizza.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Pizza"
3 | output: html_document
4 | editor_options:
5 | chunk_output_type: console
6 | ---
7 |
8 | ```{r setup, include=FALSE}
9 | knitr::opts_chunk$set(echo = TRUE)
10 | ```
11 |
12 | ## R Markdown
13 |
14 | ```{r}
15 | library(tidyverse)
16 | theme_set(theme_light())
17 | ```
18 |
19 | ```{r}
20 | pizza_jared <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_jared.csv")
21 | pizza_barstool <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_barstool.csv")
22 | pizza_datafiniti <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-01/pizza_datafiniti.csv")
23 | ```
24 |
25 | ```{r}
26 | answer_orders <- c("Never Again", "Poor", "Average", "Good", "Excellent")
27 |
28 | by_place_answer <- pizza_jared %>%
29 | mutate(time = as.POSIXct(time, origin = "1970-01-01"),
30 | date = as.Date(time),
31 | answer = fct_relevel(answer, answer_orders)) %>%
32 | group_by(place, answer) %>%
33 | summarize(votes = sum(votes)) %>%
34 | mutate(total = sum(votes),
35 | percent = votes / total,
36 | answer_integer = as.integer(answer),
37 | average = sum(answer_integer * percent)) %>%
38 | ungroup()
39 |
40 | by_place <- by_place_answer %>%
41 | distinct(place, total, average)
42 | ```
43 |
44 | ```{r}
45 | by_place_answer %>%
46 | filter(as.integer(fct_reorder(place, total, .desc = TRUE)) <= 16,
47 | answer != "Fair") %>%
48 | mutate(place = glue::glue("{ place } ({ total })"),
49 | place = fct_reorder(place, average)) %>%
50 | ggplot(aes(answer, percent)) +
51 | geom_col() +
52 | facet_wrap(~ place) +
53 | scale_y_continuous(labels = scales::percent) +
54 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
55 | labs(x = "",
56 | y = "% of respondents",
57 | title = "What is the most popular pizza place in Open Stats meetup?",
58 | subtitle = "Only the 16 pizza places with the most respondents. # respondents shown in parentheses.")
59 | ```
60 |
61 | ```{r}
62 | library(broom)
63 |
64 | # THIS TRICK DOESN'T WORK
65 | # tidy(lm(c(1, 2, 3, 4, 5) ~ 1, weights = c(100, 300, 100, 200, 150)), conf.int = TRUE)
66 |
67 | t_test_repeated <- function(x, frequency) {
68 | tidy(t.test(rep(x, frequency)))
69 | }
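70 | 
71 | # Quick check (my addition): expanding the counts by hand gives the
72 | # same result, since rep(c(1, 2, 3), c(2, 1, 2)) is c(1, 1, 2, 3, 3)
73 | t_test_repeated(c(1, 2, 3), c(2, 1, 2))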
70 |
71 | by_place_answer %>%
72 | filter(total >= 3) %>%
73 | group_by(place, total) %>%
74 | summarize(t_test_result = list(t_test_repeated(answer_integer, votes))) %>%
75 | ungroup() %>%
76 | unnest(t_test_result) %>%
77 | select(place, total, average = estimate, low = conf.low, high = conf.high) %>%
78 | top_n(16, total) %>%
79 | mutate(place = fct_reorder(place, average)) %>%
80 | ggplot(aes(average, place)) +
81 | geom_point(aes(size = total)) +
82 | geom_errorbarh(aes(xmin = low, xmax = high)) +
83 | labs(x = "Average score (1-5 Likert Scale)",
84 | y = "",
85 | title = "What is the most popular pizza place in Open Stats meetup?",
86 | subtitle = "Only the 16 pizza places with the most respondents.",
87 | size = "# of respondents")
88 | ```
89 |
90 | ```{r}
91 | # Don't bother comparing them, this is a bad graph
92 | pizza_barstool %>%
93 | select(place = name,
94 | barstool_total = review_stats_all_count,
95 | barstool_average = review_stats_all_average_score) %>%
96 | inner_join(by_place, by = "place") %>%
97 | group_by(place) %>%
98 | filter(n() == 1) %>%
99 | ungroup() %>%
100 | filter(barstool_total >= 5,
101 | total >= 5) %>%
102 | ggplot(aes(average, barstool_average)) +
103 | geom_point() +
104 | labs(x = "Meetup",
105 | y = "Barstool")
106 | ```
107 |
108 | ```{r}
109 | pizza_barstool %>%
110 | top_n(50, review_stats_all_count) %>%
111 | ggplot(aes(price_level, review_stats_all_average_score, group = price_level)) +
112 | geom_boxplot()
113 |
114 | pizza_barstool %>%
115 | filter(review_stats_all_count >= 50) %>%
116 | mutate(name = fct_reorder(name, review_stats_all_average_score)) %>%
117 | ggplot(aes(review_stats_all_average_score, name, size = review_stats_all_count)) +
118 | geom_point() +
119 | labs(x = "Average rating",
120 | y = "",
121 | size = "# of reviews",
122 | title = "Barstool Sports ratings of pizza places",
123 | subtitle = "Only places with at least 50 reviews")
124 | ```
125 |
126 | ```{r}
127 | pizza_barstool %>%
128 | filter(review_stats_all_count >= 20) %>%
129 | mutate(city = fct_lump(city, 3)) %>%
130 | add_count(city) %>%
131 | mutate(city = glue::glue("{ city } ({ n })")) %>%
132 | ggplot(aes(city, review_stats_all_average_score)) +
133 | geom_boxplot() +
134 | labs(title = "Do pizza ratings differ across cities?",
135 | subtitle = "Only pizza places with at least 20 reviews")
136 | ```
137 |
138 | ```{r}
139 | pizza_cleaned <- pizza_barstool %>%
140 | select(place = name,
141 | price_level,
142 | contains("review")) %>%
143 | rename_all(~ str_remove(., "review_stats_")) %>%
144 | select(-contains("provider"))
145 |
146 | pizza_cleaned %>%
147 | filter(critic_count > 0) %>%
148 | ggplot(aes(critic_average_score, dave_average_score)) +
149 | geom_point() +
150 | geom_abline(color = "red") +
151 | geom_smooth(method = "lm") +
152 | labs(title = "Does Barstool Sports' Dave agree with the critics?",
153 | x = "Critic average score",
154 | y = "Dave score")
155 | ```
156 |
157 | ```{r}
158 | pizza_cleaned %>%
159 | filter(community_count >= 20) %>%
160 | ggplot(aes(community_average_score, dave_average_score)) +
161 | geom_point(aes(size = community_count)) +
162 | geom_abline(color = "red") +
163 | geom_smooth(method = "lm") +
164 | labs(size = "# of community reviews",
165 | x = "Community score",
166 | y = "Dave score")
167 | ```
168 |
--------------------------------------------------------------------------------
/nyc-restaurants.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "NYC Restaurants"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(lubridate)
13 |
14 | theme_set(theme_light())
15 |
16 | # You can use this url to download the data directly into R (will take a few seconds)
17 | restaurant_inspections_raw <- read_csv("https://data.cityofnewyork.us/api/views/43nn-pn8j/rows.csv")
18 |
19 | restaurant_inspections <- restaurant_inspections_raw %>%
20 | janitor::clean_names() %>%
21 | select(-phone, -grade_date, -record_date, -building, -street) %>%
22 | mutate(inspection_date = mdy(inspection_date)) %>%
23 | separate(inspection_type, c("inspection_program", "inspection_type"), sep = " / ")
24 | ```
25 |
26 | ```{r}
27 | restaurant_inspections %>%
28 | count(dba, camis, sort = TRUE)
29 |
30 | restaurant_inspections %>%
31 | count(year = year(inspection_date))
32 |
33 | restaurant_inspections %>%
34 | count(grade, sort = TRUE)
35 |
36 | restaurant_inspections %>%
37 | count(violation_code, violation_description, sort = TRUE)
38 |
39 | restaurant_inspections %>%
40 | filter(camis == 41297769, inspection_date == "2018-09-25") %>%
41 | count(camis, dba, inspection_date, sort = TRUE)
42 |
43 | restaurant_inspections %>%
44 | count(cuisine_description, sort = TRUE)
45 |
46 | restaurant_inspections %>%
47 | filter(action == "No violations were recorded at the time of this inspection.") %>%
48 | count(critical_flag)
49 |
50 | inspections <- restaurant_inspections %>%
51 | group_by(camis,
52 | dba,
53 | boro,
54 | zipcode,
55 | cuisine_description,
56 | inspection_date,
57 | action,
58 | score,
59 | grade,
60 | inspection_type,
61 | inspection_program) %>%
62 | summarize(critical_violations = sum(critical_flag == "Critical", na.rm = TRUE),
63 | non_critical_violations = sum(critical_flag == "Not Critical", na.rm = TRUE)) %>%
64 | ungroup()
65 |
66 | most_recent_cycle_inspection <- inspections %>%
67 | filter(inspection_program == "Cycle Inspection",
68 | inspection_type == "Initial Inspection") %>%
69 | arrange(desc(inspection_date)) %>%
70 | distinct(camis, .keep_all = TRUE)
71 | ```
72 |
73 | ```{r}
74 | by_dba <- most_recent_cycle_inspection %>%
75 | group_by(dba, cuisine = cuisine_description) %>%
76 | summarize(locations = n(),
77 | avg_score = mean(score),
78 | median_score = median(score)) %>%
79 | ungroup() %>%
80 | arrange(desc(locations))
81 |
82 | by_dba %>%
83 | mutate(locations_bin = cut(locations, c(0, 1, 3, 10, Inf), labels = c("1", "2-3", "4-10", ">10"))) %>%
84 | ggplot(aes(locations_bin, avg_score + 1)) +
85 | geom_boxplot() +
86 | scale_y_log10()
87 | ```
88 |
89 | ```{r}
90 | by_cuisine <- by_dba %>%
91 | group_by(cuisine) %>%
92 | summarize(avg_score = mean(avg_score),
93 | median_score = median(avg_score),
94 | restaurants = n()) %>%
95 | arrange(desc(restaurants))
96 |
97 | library(broom)
98 |
99 | cuisine_conf_ints <- by_dba %>%
100 | add_count(cuisine) %>%
101 | filter(n > 100) %>%
102 | nest(-cuisine) %>%
103 | mutate(model = map(data, ~ t.test(.$avg_score))) %>%
104 | unnest(map(model, tidy))
105 |
106 | cuisine_conf_ints %>%
107 | mutate(cuisine = str_remove(cuisine, " \\(.*"),
108 | cuisine = fct_reorder(cuisine, estimate)) %>%
109 | ggplot(aes(estimate, cuisine)) +
110 | geom_point() +
111 | geom_errorbarh(aes(xmin = conf.low,
112 | xmax = conf.high)) +
113 | labs(x = "Average inspection score (higher means more violations)",
114 | y = "Type of cuisine",
115 | title = "Average inspection score by type of cuisine in NYC",
116 | subtitle = "Each restaurant chain was counted once based on its average score")
117 | ```
118 |
119 | ```{r}
120 | violation_cuisine_counts <- restaurant_inspections %>%
121 | semi_join(most_recent_cycle_inspection, by = c("camis", "inspection_date")) %>%
122 | count(critical_flag, violation_code, violation_description, cuisine = cuisine_description) %>%
123 | group_by(violation_code) %>%
124 | mutate(violation_total = sum(n)) %>%
125 | group_by(cuisine) %>%
126 | mutate(cuisine_total = sum(n)) %>%
127 | ungroup() %>%
128 | filter(violation_total >= 1000,
129 | cuisine_total >= 2000) %>%
130 | group_by(violation_description) %>%
131 | mutate(fraction = n / cuisine_total,
132 | avg_fraction = mean(fraction)) %>%
133 | ungroup()
134 |
135 | violation_cuisine_counts %>%
136 | mutate(relative_risk = fraction / avg_fraction) %>%
137 | arrange(desc(relative_risk)) %>%
138 | filter(str_detect(violation_description, "mice"))
139 | ```
140 |
141 | ### What violations tend to occur together?
142 |
143 | ```{r}
144 | library(widyr)
145 |
146 | violations <- restaurant_inspections %>%
147 | semi_join(most_recent_cycle_inspection, by = c("camis", "inspection_date")) %>%
148 | filter(!is.na(violation_description))
149 |
150 | violations %>%
151 | pairwise_cor(violation_description, camis, sort = TRUE)
152 |
153 | principal_components <- violations %>%
154 | mutate(value = 1) %>%
155 | widely_svd(violation_description, camis, value, nv = 6)
156 |
157 | principal_components %>%
158 | filter(dimension == 2) %>%
159 | top_n(10, abs(value)) %>%
160 | mutate(violation_description = str_sub(violation_description, 1, 60),
161 | violation_description = fct_reorder(violation_description, value)) %>%
162 | ggplot(aes(violation_description, value)) +
163 | geom_col() +
164 | coord_flip()
165 | ```
166 |
167 |
168 |
169 |
--------------------------------------------------------------------------------
/nyc-squirrels-app/app.R:
--------------------------------------------------------------------------------
1 | #
2 | # This is a Shiny web application. You can run the application by clicking
3 | # the 'Run App' button above.
4 | #
5 | # Find out more about building applications with Shiny here:
6 | #
7 | # http://shiny.rstudio.com/
8 | #
9 |
10 | library(shiny)
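11 | 
12 | # Note (my addition): this app assumes `by_hectare` and `central_park_sf`
13 | # from nyc-squirrels.Rmd already exist in the global environment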
11 |
12 | squirrel_variables <- by_hectare %>%
13 | select(-(hectare:lat)) %>%
14 | colnames()
15 |
16 | names(squirrel_variables) <- squirrel_variables %>%
17 | str_replace_all("_", " ") %>%
18 | str_to_title()
19 |
20 | # Define UI for the Central Park squirrel map
21 | ui <- fluidPage(
22 |
23 | # Application title
24 | titlePanel("Central Park Squirrels"),
25 |
26 | # Sidebar with a variable selector and a minimum-squirrel slider
27 | sidebarLayout(
28 | sidebarPanel(
29 | selectInput("variable",
30 | "Variable:",
31 | choices = squirrel_variables),
32 | sliderInput("min_squirrels",
33 | "Minimum squirrels:",
34 | min = 1,
35 | max = 30,
36 | value = 10)
37 | ),
38 |
39 | # Show the park map
40 | mainPanel(
41 | plotOutput("park_plot")
42 | )
43 | )
44 | )
45 |
46 | # Define server logic to render the park map
47 | server <- function(input, output) {
48 |
49 | output$park_plot <- renderPlot({
50 | # turn the selected column name into a symbol for tidy evaluation
51 | var <- sym(input$variable)
52 |
53 | filtered <- by_hectare %>%
54 | filter(n >= input$min_squirrels)
55 |
56 | midpoint <- mean(filtered[[input$variable]])
57 |
58 | filtered %>%
59 | ggplot() +
60 | geom_sf(data = central_park_sf) +
61 | geom_point(aes(long, lat, size = n, color = !!var)) +
62 | theme_void() +
63 | scale_color_gradient2(low = "blue", high = "red", mid = "pink",
64 | midpoint = midpoint, labels = scales::percent) +
65 | labs(color = paste("%", input$variable),
66 | size = "# of squirrels") +
67 | coord_sf(datum = NA)
68 | })
69 | }
70 |
71 | # Run the application
72 | shinyApp(ui = ui, server = server)
73 |
--------------------------------------------------------------------------------
/nyc-squirrels.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | nyc_squirrels <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-10-29/nyc_squirrels.csv")
15 | ```
16 |
17 | ```{r}
18 | nyc_squirrels %>%
19 | count(zip_codes, sort = TRUE)
20 | ```
21 |
22 | ```{r}
23 | nyc_squirrels %>%
24 | ggplot(aes(long, lat)) +
25 | geom_point()
26 |
27 | by_hectare <- nyc_squirrels %>%
28 | filter(!is.na(primary_fur_color)) %>%
29 | group_by(hectare) %>%
30 | summarize(long = mean(long),
31 | lat = mean(lat),
32 | pct_gray = mean(primary_fur_color == "Gray", na.rm = TRUE),
33 | n = n())
34 |
35 | by_hectare %>%
36 | filter(n >= 10) %>%
37 | ggplot(aes(long, lat, size = n, color = pct_gray)) +
38 | geom_point() +
39 | theme_void()
40 |
41 | by_hectare %>%
42 | filter(n >= 10) %>%
43 | ggplot(aes(lat, pct_gray)) +
44 | geom_point() +
45 | geom_smooth()
46 |
47 | by_hectare %>%
48 | mutate(n_gray = round(pct_gray * n)) %>%
49 | glm(cbind(n_gray, n - n_gray) ~ lat, data = ., family = "binomial") %>%
50 | summary()
51 | ```
52 |
53 | Squirrels may be more likely to be gray the farther north in the park you go, and more likely to be cinnamon toward the southern end.
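54 | 
55 | One rough way to read that model (a sketch I'm adding, not part of the original analysis) is to exponentiate the latitude coefficient into an odds ratio:
56 | 
57 | ```{r}
58 | # Sketch (my addition): refit the same binomial model and exponentiate
59 | # the latitude coefficient to get the odds ratio of "gray" per degree north
60 | gray_model <- by_hectare %>%
61 | mutate(n_gray = round(pct_gray * n)) %>%
62 | glm(cbind(n_gray, n - n_gray) ~ lat, data = ., family = "binomial")
63 | 
64 | exp(coef(gray_model)["lat"])
65 | ```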
54 |
55 | ```{r}
56 | nyc_squirrels %>%
57 | count(highlight_fur_color, sort = TRUE)
58 |
59 | nyc_squirrels %>%
60 | count(approaches, indifferent, runs_from, sort = TRUE)
61 | ```
62 |
63 | Does a squirrel run away?
64 |
65 | ```{r}
66 | glm(runs_from ~ lat, data = nyc_squirrels, family = "binomial") %>%
67 | summary()
68 | ```
69 |
70 | ```{r}
71 | library(sf)
72 |
73 | central_park_sf <- read_sf("~/Downloads/CentralAndProspectParks/")
74 |
75 | by_hectare <- nyc_squirrels %>%
76 | add_count(hectare) %>%
77 | mutate(above_ground = !is.na(location) & location == "Above Ground") %>%
78 | group_by(hectare, n) %>%
79 | summarize_at(vars(long, lat, approaches:runs_from, ends_with("ing"), above_ground), mean) %>%
80 | ungroup()
81 |
82 | by_hectare %>%
83 | filter(n >= 10) %>%
84 | ggplot() +
85 | geom_sf(data = central_park_sf) +
86 | geom_point(aes(long, lat, size = n, color = runs_from)) +
87 | theme_void() +
88 | scale_color_gradient2(low = "blue", high = "red", mid = "pink",
89 | midpoint = .3, labels = scales::percent) +
90 | labs(color = "% of squirrels that run away",
91 | size = "# of squirrels",
92 | title = "Squirrels in the northwest corner of Central Park are more likely to run away") +
93 | coord_sf(datum = NA)
94 | ```
95 |
96 | ```{r}
97 | central_park_sf %>%
98 | count(lanes, sort = TRUE)
99 |
100 | ggplot(central_park_sf) +
101 | geom_sf() +
102 | geom_point(aes(long, lat, color = runs_from), data = by_hectare) +
103 | coord_sf(datum = NA)
104 | ```
105 |
106 | ```{r}
107 | central_park_sf %>%
108 | ggplot() +
109 | geom_sf(aes(color = bicycle)) +
110 | coord_sf(datum = NA)
111 | ```
112 |
113 |
114 |
115 |
116 |
--------------------------------------------------------------------------------
/office-transcripts.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "The Office"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(schrute)
13 | theme_set(theme_light())
14 |
15 | office_transcripts <- as_tibble(theoffice) %>%
16 | mutate(season = as.integer(season),
17 | episode = as.integer(episode)) %>%
18 | mutate(character = str_remove_all(character, '"')) %>%
19 | mutate(name = str_to_lower(str_remove_all(episode_name, "\\.| \\(Part.*")))
20 |
21 | office_ratings <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-03-17/office_ratings.csv') %>%
22 | mutate(name = str_to_lower(str_remove_all(title, "\\.| \\(Part.*|\\: Part.*")))
23 | ```
24 |
25 | ```{r}
26 | library(ggrepel)
27 |
28 | office_ratings %>%
29 | group_by(season) %>%
30 | summarize(avg_rating = mean(imdb_rating)) %>%
31 | ggplot(aes(season, avg_rating)) +
32 | geom_line() +
33 | scale_x_continuous(breaks = 1:9)
34 |
35 | office_ratings %>%
36 | mutate(title = fct_inorder(title),
37 | episode_number = row_number()) %>%
38 | ggplot(aes(episode_number, imdb_rating)) +
39 | geom_line() +
40 | geom_smooth() +
41 | geom_point(aes(color = factor(season), size = total_votes)) +
42 | geom_text(aes(label = title), check_overlap = TRUE, hjust = 1) +
43 | expand_limits(x = -10) +
44 | theme(panel.grid.major.x = element_blank(),
45 | legend.position = "none") +
46 | labs(x = "Episode number",
47 | y = "IMDB Rating",
48 | title = "Popularity of The Office episodes over time",
49 | subtitle = "Color represents season, size represents # of ratings")
50 | ```
51 |
52 | ```{r}
53 | office_ratings %>%
54 | arrange(desc(imdb_rating)) %>%
55 | mutate(title = paste0(season, ".", episode, " ", title),
56 | title = fct_reorder(title, imdb_rating)) %>%
57 | head(20) %>%
58 | ggplot(aes(title, imdb_rating, color = factor(season), size = total_votes)) +
59 | geom_point() +
60 | coord_flip() +
61 | labs(color = "Season",
62 | title = "Most popular episodes of The Office")
63 | ```
64 |
65 | ### Transcripts
66 |
67 | ```{r}
68 | library(tidytext)
69 |
70 | blacklist <- c("yeah", "hey", "uh", "gonna")
71 | blacklist_characters <- c("Everyone", "All", "Both", "Guy", "Girl", "Group")
72 |
73 | transcript_words <- office_transcripts %>%
74 | group_by(character) %>%
75 | filter(n() >= 100,
76 | n_distinct(episode_name) > 2) %>%
77 | ungroup() %>%
78 | select(-text_w_direction) %>%
79 | unnest_tokens(word, text) %>%
80 | anti_join(stop_words, by = "word") %>%
81 | filter(!word %in% blacklist,
82 | !character %in% blacklist_characters)
83 |
84 | character_tf_idf <- transcript_words %>%
85 | add_count(word) %>%
86 | filter(n >= 20) %>%
87 | count(word, character) %>%
88 | bind_tf_idf(word, character, n) %>%
89 | arrange(desc(tf_idf))
90 | ```
91 |
92 | ```{r}
93 | character_tf_idf %>%
94 | filter(character %in% c("Dwight", "Jim", "David Wallace", "Darryl", "Jan", "Holly")) %>%
95 | group_by(character) %>%
96 | top_n(10, tf_idf) %>%
97 | ungroup() %>%
98 | mutate(word = reorder_within(word, tf_idf, character)) %>%
99 | ggplot(aes(word, tf_idf)) +
100 | geom_col() +
101 | coord_flip() +
102 | scale_x_reordered() +
103 | facet_wrap(~ character, scales = "free_y") +
104 | labs(x = "",
105 | y = "TF-IDF of character-word pairs")
106 | ```
107 |
108 | ```{r}
109 | office_transcripts %>%
110 | count(character, sort = TRUE) %>%
111 | filter(character == "Dwight")
112 | ```
113 |
114 | ### Machine learning model
115 |
116 | What affects popularity of an episode:
117 |
118 | * Season/time
119 | * Director
120 | * Writer
121 | * Lines per character
122 |
123 | ```{r}
124 | ratings_summarized <- office_ratings %>%
125 | group_by(name) %>%
126 | summarize(imdb_rating = mean(imdb_rating))
127 |
128 | character_lines_ratings <- office_transcripts %>%
129 | filter(!character %in% blacklist_characters) %>%
130 | count(character, name) %>%
131 | group_by(character) %>%
132 | filter(sum(n) >= 50,
133 | n() >= 5) %>%
134 | inner_join(ratings_summarized, by = "name")
135 |
136 | character_lines_ratings %>%
137 | summarize(avg_rating = mean(imdb_rating),
138 | nb_episodes = n()) %>%
139 | arrange(desc(avg_rating)) %>%
140 | View()
141 | ```
142 |
143 | ```{r}
144 | director_writer_features <- office_transcripts %>%
145 | distinct(name, director, writer) %>%
146 | gather(type, value, director, writer) %>%
147 | separate_rows(value, sep = ";") %>%
148 | unite(feature, type, value, sep = ": ") %>%
149 | group_by(feature) %>%
150 | filter(n() >= 3) %>%
151 | mutate(value = 1) %>%
152 | ungroup()
153 |
154 | character_line_features <- character_lines_ratings %>%
155 | ungroup() %>%
156 | transmute(name, feature = character, value = log2(n))
157 |
158 | season_features <- office_ratings %>%
159 | distinct(name, season) %>%
160 | transmute(name, feature = paste("season:", season), value = 1)
161 |
162 | features <- bind_rows(director_writer_features,
163 | character_line_features,
164 | season_features) %>%
165 | semi_join(office_ratings, by = "name") %>%
166 | semi_join(office_transcripts, by = "name")
167 | ```
168 |
169 | ```{r}
170 | episode_feature_matrix <- features %>%
171 | cast_sparse(name, feature, value)
172 |
173 | ratings <- ratings_summarized$imdb_rating[match(rownames(episode_feature_matrix), ratings_summarized$name)]
174 |
175 | library(glmnet)
176 | library(broom)
177 |
178 | mod <- cv.glmnet(episode_feature_matrix, ratings)
179 |
180 | plot(mod)
181 |
182 | tidy(mod$glmnet.fit) %>%
183 | filter(lambda == mod$lambda.min,
184 | term != "(Intercept)") %>%
185 | mutate(term = fct_reorder(term, estimate)) %>%
186 | ggplot(aes(term, estimate, fill = estimate > 0)) +
187 | geom_col() +
188 | coord_flip() +
189 | labs(y = "Estimated effect on the rating of an episode") +
190 | theme(legend.position = "none")
191 | ```
192 |
--------------------------------------------------------------------------------
/pascals-triangle.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "The accumulate function"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(purrr)
12 |
13 | letters
14 |
15 | accumulate(letters, paste)
16 |
17 | accumulate(1:5, ~ . ^ 2, .init = 2)
18 | ```
19 |
20 | Pascal's Triangle
21 |
22 | ```{r}
23 | accumulate(1:6, ~ c(0, .) + c(., 0), .init = 1)
24 | ```
25 |
26 | In this short screencast I share a #tidyverse trick of the day: Use accumulate() from the purrr package to calculate Pascal's Triangle in one line of code.
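27 | 
28 | As a quick check of the trick (my addition), each row of Pascal's Triangle should sum to a power of 2:
29 | 
30 | ```{r}
31 | # Row n of Pascal's Triangle sums to 2^n (my added sanity check)
32 | rows <- accumulate(1:6, ~ c(0, .) + c(., 0), .init = 1)
33 | map_dbl(rows, sum)
34 | ```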
27 |
28 |
--------------------------------------------------------------------------------
/plastic-waste.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Plastic Waste"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(scales)
13 | theme_set(theme_light())
14 |
15 | coast_vs_waste <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-21/coastal-population-vs-mismanaged-plastic.csv")
16 |
17 | mismanaged_vs_gdp <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-21/per-capita-mismanaged-plastic-waste-vs-gdp-per-capita.csv")
18 |
19 | waste_vs_gdp <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-05-21/per-capita-plastic-waste-vs-gdp-per-capita.csv")
20 | ```
21 |
22 | ```{r}
23 | library(janitor)
24 |
25 | # Data cleaning
26 | clean_dataset <- function(tbl) {
27 | tbl %>%
28 | clean_names() %>%
29 | rename(country = entity,
30 | country_code = code) %>%
31 | filter(year == 2010) %>%
32 | select(-year)
33 | }
34 |
35 | plastic_waste <- coast_vs_waste %>%
36 | clean_dataset() %>%
37 | select(-total_population_gapminder) %>%
38 | inner_join(clean_dataset(mismanaged_vs_gdp) %>%
39 | select(-total_population_gapminder), by = c("country", "country_code")) %>%
40 | inner_join(clean_dataset(waste_vs_gdp), by = c("country", "country_code")) %>%
41 | select(country,
42 | country_code,
43 | mismanaged_waste = mismanaged_plastic_waste_tonnes,
44 | coastal_population,
45 | total_population = total_population_gapminder,
46 | mismanaged_per_capita = per_capita_mismanaged_plastic_waste_kilograms_per_person_per_day,
47 | gdp_per_capita = gdp_per_capita_ppp_constant_2011_international_rate) %>%
48 | filter(!is.na(mismanaged_waste))
49 | ```
50 |
51 | ```{r}
52 | g1 <- plastic_waste %>%
53 | arrange(-total_population) %>%
54 | mutate(pct_population_coastal = pmin(1, coastal_population / total_population),
55 | high_coastal_pop = ifelse(pct_population_coastal >= .8, ">=80%", "<80%")) %>%
56 | ggplot(aes(gdp_per_capita, mismanaged_per_capita)) +
57 | geom_point(aes(size = total_population)) +
58 | geom_text(aes(label = country), vjust = 1, hjust = 1, check_overlap = TRUE) +
59 | scale_x_log10(labels = dollar_format()) +
60 | scale_y_log10() +
61 | scale_size_continuous(guide = FALSE) +
62 | labs(x = "GDP per capita",
63 | y = "Mismanaged plastic waste (kg per person per day)",
64 | color = "Coastal population",
65 | title = "How plastic waste mismanagement correlates with country income",
66 | subtitle = "Based in Our World in Data 2010 numbers. Size represents total population")
67 |
68 | g1
69 | ```
70 |
71 | ```{r}
72 | plastic_waste %>%
73 | mutate(pct_population_coastal = pmin(1, coastal_population / total_population)) %>%
74 | arrange(-total_population) %>%
75 | ggplot(aes(pct_population_coastal, mismanaged_per_capita)) +
76 | geom_point(aes(size = total_population)) +
77 | geom_text(aes(label = country), vjust = 1, hjust = 1, check_overlap = TRUE)
78 | ```
79 |
80 | ```{r}
81 | library(maps)  # provides the iso3166 country-code table
82 | 
83 | tbl_df(iso3166)
82 |
83 | library(fuzzyjoin)
84 |
85 | plastic_data <- plastic_waste %>%
86 | inner_join(iso3166, by = c("country_code" = "a3"))
87 |
88 | map_data("world") %>%
89 | tbl_df() %>%
90 | filter(region != "Antarctica") %>%
91 | regex_left_join(plastic_data, by = c("region" = "mapname")) %>%
92 | ggplot(aes(long, lat, group = group, fill = mismanaged_per_capita)) +
93 | geom_polygon() +
94 | scale_fill_gradient2(trans = "log10",
95 | low = "blue",
96 | high = "red",
97 | mid = "pink",
98 | midpoint = log10(.02)) +
99 | coord_fixed(2) +
100 | ggthemes::theme_map() +
101 | labs(fill = "Mismanaged plastic waste per-cap",
102 | title = "Where in the world is waste mismanaged?")
103 | ```
104 |
105 | ### Comparing to other country stats
106 |
107 | ```{r}
108 | library(WDI)
109 |
110 | indicators <- c("co2_emissions_per_capita" = "EN.ATM.CO2E.PC",
111 | "cpia_transparency" = "IQ.CPA.TRAN.XQ")
112 |
113 | other_data <- WDI(indicator = indicators, start = 2010, end = 2010) %>%
114 | tbl_df() %>%
115 | select(-country)
116 |
117 | plastic_with_indicators <- other_data %>%
118 | inner_join(plastic_data, by = c(iso2c = "a2")) %>%
119 | arrange(desc(total_population))
120 |
121 | g2 <- plastic_with_indicators %>%
122 | ggplot(aes(gdp_per_capita, co2_emissions_per_capita)) +
123 | geom_point(aes(size = total_population)) +
124 | geom_text(aes(label = country), vjust = 1, hjust = 1, check_overlap = TRUE) +
125 | scale_size_continuous(guide = FALSE) +
126 | scale_x_log10() +
127 | scale_y_log10() +
128 | labs(x = "GDP per capita",
129 | y = "CO2 emissions (tons per capita)",
130 | color = "Coastal population",
131 | title = "How plastic waste mismanagement correlates with country income",
132 | subtitle = "Based in Our World in Data 2010 numbers. Size represents total population")
133 |
134 | library(patchwork)
135 |
136 | g2 +
137 | labs(title = "CO2 emissions are correlated with country income, but not plastic waste") +
138 | g1 +
139 | labs(title = "",
140 | subtitle = "")
141 | ```
142 |
143 | ```{r}
144 | # Looking at the CPIA transparency rating (a measure of institutional trust)
145 | plastic_with_indicators %>%
146 | filter(!is.na(cpia_transparency)) %>%
147 | ggplot(aes(cpia_transparency, mismanaged_per_capita, group = cpia_transparency)) +
148 | geom_boxplot()
149 |
150 | plastic_with_indicators %>%
151 | arrange(desc(cpia_transparency)) %>%
152 | View()
153 | ```
154 |
155 |
156 |
157 |
158 |
159 |
160 |
--------------------------------------------------------------------------------
/r-downloads.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "R Downloads"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(scales)
13 | library(countrycode)
14 | theme_set(theme_light())
15 |
16 | r_downloads_year_raw <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-10-30/r_downloads_year.csv")
17 |
18 | r_downloads_year <- r_downloads_year_raw %>%
19 | select(-X1) %>%
20 | mutate(country_name = countrycode(country, "iso2c", "country.name"))
21 | ```
22 |
23 | ```{r}
24 | r_downloads_year %>%
25 | count(date) %>%
26 | ggplot(aes(date, n)) +
27 | geom_line() +
28 | expand_limits(y = 0) +
29 | labs(y = "# of R downloads per day")
30 |
31 | library(lubridate)
32 |
33 | r_downloads_year %>%
34 | count(date) %>%
35 | group_by(weekday = wday(date, label = TRUE)) %>%
36 | summarize(average = mean(n)) %>%
37 | ggplot(aes(weekday, average)) +
38 | geom_line(group = 1) +
39 | expand_limits(y = 0) +
40 | labs(y = "Average downloads per weekday")
41 | ```
42 |
43 | R is downloaded about 1500 times each weekend day and around 3000 times on a typical weekday (averaged over the last year).
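44 | 
45 | Those two numbers come from splitting days into weekend and weekday groups; a minimal version of that computation (my addition):
46 | 
47 | ```{r}
48 | # Weekend vs. weekday average downloads (my added sketch);
49 | # lubridate's wday() returns 1 for Sunday and 7 for Saturday by default
50 | r_downloads_year %>%
51 | count(date) %>%
52 | mutate(weekend = wday(date) %in% c(1, 7)) %>%
53 | group_by(weekend) %>%
54 | summarize(average = mean(n))
55 | ```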
44 |
45 | ```{r}
46 | r_downloads_year %>%
47 | group_by(week = floor_date(date, "week")) %>%
48 | summarize(n = n_distinct(ip_id)) %>%
49 | filter(week > min(week)) %>%
50 | ggplot(aes(week, n)) +
51 | geom_line() +
52 | expand_limits(y = 0) +
53 | labs(y = "# of R downloads per week (distinct IPs)")
54 | ```
55 |
56 | What time of day were people installing R?
57 |
58 | ```{r}
59 | r_downloads_year %>%
60 | mutate(country = countrycode(country, "iso2c", "country.name")) %>%
61 | filter(!is.na(country)) %>%
62 | count(hour = hour(time),
63 | country = fct_lump(country, 8)) %>%
64 | ggplot(aes(hour, n)) +
65 | geom_line() +
66 | expand_limits(y = 0) +
67 | facet_wrap(~ country, scales = "free_y")
68 | ```
69 |
70 |
71 | ```{r}
72 | library(countrycode)
73 |
74 | r_downloads_year %>%
75 | count(country = countrycode(country, "iso2c", "country.name"), sort = TRUE) %>%
76 | mutate(percent = n / sum(n)) %>%
77 | filter(!is.na(country)) %>%
78 | head(16) %>%
79 | mutate(country = fct_reorder(country, percent)) %>%
80 | ggplot(aes(country, percent)) +
81 | geom_col() +
82 | coord_flip() +
83 | scale_y_continuous(labels = percent_format()) +
84 | labs(title = "What countries install the most R?")
85 | ```
86 |
87 | More than a third of R installations come from the US.
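88 | 
89 | That share can be read directly off the data (my added check):
90 | 
91 | ```{r}
92 | # US share of downloads with a known country (my addition)
93 | r_downloads_year %>%
94 | filter(!is.na(country_name)) %>%
95 | summarize(us_share = mean(country_name == "United States"))
96 | ```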
88 |
89 | ```{r}
90 | r_downloads_year %>%
91 | mutate(version = fct_lump(version, 8)) %>%
92 | count(date, version) %>%
93 | ggplot(aes(date, n, color = version)) +
94 | geom_line()
95 | ```
96 |
97 | How do R downloads from different countries change over time?
98 |
99 | ```{r}
100 | r_downloads_year %>%
101 | count(country = fct_lump(country, 8),
102 | week = floor_date(date, "week")) %>%
103 | filter(week > min(week)) %>%
104 | ggplot(aes(week, n, color = country)) +
105 | geom_line()
106 | ```
107 |
108 | ### R package downloads
109 |
110 | ```{r}
111 | package_downloads <- read_csv("http://cran-logs.rstudio.com/2018/2018-10-27.csv.gz")
112 | ```
113 |
114 | ```{r}
115 | package_downloads %>%
116 | filter(country %in% c("US", "IN")) %>%
117 | group_by(country, package) %>%
118 | summarize(n = n_distinct(ip_id)) %>%
119 | spread(country, n, fill = 0) %>%
120 | ungroup() %>%
121 | mutate(total = US + IN,
122 | IN = (IN + 1) / sum(IN + 1),
123 | US = (US + 1) / sum(US + 1),
124 | ratio = US / IN) %>%
125 | filter(total >= 1000) %>%
126 | arrange((ratio)) %>%
127 | View()
128 | ```
129 |
130 | ```{r}
131 | library(cranlogs)
132 |
133 | cranlogs::cran_downloads(packages = c("tidyverse", "broom"), when = "last-week")
134 | ```
135 |
136 |
137 | ### Appendix: Why count only distinct IPs?
138 |
139 | ```{r}
140 | r_download_gaps <- r_downloads_year %>%
141 | mutate(datetime = as.POSIXlt(date) + time) %>%
142 | arrange(datetime) %>%
143 | group_by(ip_id) %>%
144 | mutate(gap = as.numeric(datetime - lag(datetime))) %>%
145 | filter(!is.na(gap))
146 | ```
147 |
148 | ```{r}
149 | ip_counts <- r_downloads_year %>%
150 | count(ip_id, sort = TRUE)
151 | ```
152 |
153 | A majority (`r percent(mean(ip_counts$n >= 100))`) of IP addresses that installed R did so at least 100 times in a year.
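154 | 
155 | To see how concentrated the raw counts are (my added sketch), look at the share of all downloads coming from the busiest IPs:
156 | 
157 | ```{r}
158 | # Share of all downloads from the five busiest IP ids (my addition)
159 | ip_counts %>%
160 | mutate(share = n / sum(n)) %>%
161 | head(5)
162 | ```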
154 |
155 | ```{r}
156 | r_download_gaps %>%
157 | ggplot(aes(gap)) +
158 | geom_histogram() +
159 | geom_vline(color = "red", lty = 2, xintercept = 86400) +
160 | scale_x_log10(breaks = 60 ^ (0:4),
161 | labels = c("Second", "Minute", "Hour", "2.5 Days", "120 Days"))
162 | ```
163 |
164 | ```{r}
165 | r_download_gaps
166 | ```
167 |
168 |
169 |
--------------------------------------------------------------------------------
/ramen-ratings.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Ramen Ratings"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ## R Markdown
11 |
12 | ```{r}
13 | library(tidyverse)
14 | theme_set(theme_light())
15 |
16 | ramen_ratings <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-06-04/ramen_ratings.csv")
17 | ```
18 |
19 | ```{r}
20 | library(drlib)
21 |
22 | ramen_ratings_processed <- ramen_ratings %>%
23 | mutate(style = fct_lump(style, 4),
24 | country = fct_lump(country, 12),
25 | brand = fct_lump(brand, 20)) %>%
26 | replace_na(list(style = "Other")) %>%
27 | mutate(brand = fct_relevel(brand, "Other"),
28 | country = fct_relevel(country, "Other"),
29 | style = fct_relevel(style, "Pack"))
30 |
31 | ramen_ratings_processed %>%
32 | gather(category, value, -review_number, -stars) %>%
33 | count(category, value) %>%
34 | group_by(category) %>%
35 | top_n(20, n) %>%
36 | ungroup() %>%
37 | mutate(value = reorder_within(value, n, category)) %>%
38 | ggplot(aes(value, n)) +
39 | geom_col() +
40 | facet_wrap(~ category, scales = "free_y") +
41 | scale_x_reordered() +
42 | coord_flip() +
43 | labs(title = "Categorical predictors (after processing)",
44 | x = "Predictor",
45 | y = "Count")
46 | ```
47 |
48 | ```{r}
49 | library(broom)
50 |
51 | lm(stars ~ brand + country + style, ramen_ratings_processed) %>%
52 | tidy(conf.int = TRUE) %>%
53 | filter(term != "(Intercept)") %>%
54 | arrange(desc(estimate)) %>%
55 | extract(term, c("category", "term"), "^([a-z]+)([A-Z].*)") %>%
56 | mutate(term = fct_reorder(term, estimate)) %>%
57 | ggplot(aes(estimate, term, color = category)) +
58 | geom_point() +
59 | geom_errorbarh(aes(xmin = conf.low, xmax = conf.high)) +
60 | geom_vline(lty = 2, xintercept = 0) +
61 | facet_wrap(~ category, ncol = 1, scales = "free_y") +
62 | theme(legend.position = "none") +
63 | labs(x = "Estimated effect on ramen rating",
64 | y = "",
65 | title = "Coefficients that predict ramen ratings",
66 | subtitle = "Less common brands and countries were lumped together as the reference level")
67 | ```
68 |
69 | ```{r}
70 | library(tidytext)
71 |
72 | ramen_ratings_processed %>%
73 | filter(!is.na(stars)) %>%
74 | unnest_tokens(word, variety) %>%
75 | group_by(word) %>%
76 | summarize(avg_rating = mean(stars),
77 | n = n()) %>%
78 | arrange(desc(n))
79 | ```
80 |
81 | ### Web scraping
82 |
83 | ```{r}
84 | library(rvest)
85 |
86 | ramen_list <- read_html("https://www.theramenrater.com/resources-2/the-list/")
87 |
88 | # How the original data was (probably) created
89 | ramen_reviews <- ramen_list %>%
90 | html_node("#myTable") %>%
91 | html_table() %>%
92 | tbl_df() %>%
93 | janitor::clean_names() %>%
94 | select(-t)
95 | ```
96 |
97 | ```{r}
98 | review_links <- read_html("https://www.theramenrater.com/resources-2/the-list/") %>%
99 | html_nodes("#myTable a")
100 |
101 | reviews <- tibble(review_number = parse_number(html_text(review_links)),
102 | link = html_attr(review_links, "href"))
103 | ```
104 |
105 | See here for more about possibly and other "dealing with failure" functions: https://r4ds.had.co.nz/iteration.html#dealing-with-failure
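106 | 
107 | A minimal illustration of `possibly()` (my addition): a failing input returns the default instead of aborting the whole map.
108 | 
109 | ```{r}
110 | # possibly() wraps a function so errors return `otherwise` (my added demo)
111 | safe_log <- possibly(log, otherwise = NA_real_)
112 | map_dbl(list(10, "oops"), safe_log)
113 | ```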
106 |
107 | ```{r}
108 | page <- read_html("https://www.theramenrater.com/2019/05/23/3180-yum-yum-moo-deng/")
109 |
110 | get_review_text <- function(url) {
111 | message(url)
112 |
113 | read_html(url) %>%
114 | html_nodes(".entry-content > p") %>%
115 | html_text() %>%
116 | str_subset(".")
117 | }
118 |
119 | review_text <- reviews %>%
120 | head(250) %>%
121 | mutate(text = map(link, possibly(get_review_text, character(0), quiet = FALSE)))
122 | ```
123 |
124 | More on correlation graphs: https://www.tidytextmining.com/ngrams.html
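125 | 
126 | A toy example of `pairwise_cor()` (my addition), correlating items by co-occurrence across groups:
127 | 
128 | ```{r}
129 | library(widyr)
130 | 
131 | # "a" and "b" co-occur in two of four groups (made-up data)
132 | tibble(group = c(1, 1, 2, 2, 3, 4),
133 | item = c("a", "b", "a", "b", "a", "c")) %>%
134 | pairwise_cor(item, group)
135 | ```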
125 |
126 | ```{r}
127 | library(tidytext)
128 |
129 | review_paragraphs <- review_text %>%
130 | filter(!map_lgl(text, is.null)) %>%
131 | unnest() %>%
132 | filter(str_detect(text, "Finished")) %>%
133 | mutate(text = str_remove(text, "Finished.*?\\. "))
134 |
135 | review_paragraphs_tokenized <- review_paragraphs %>%
136 | unnest_tokens(word, text) %>%
137 | anti_join(stop_words, by = "word") %>%
138 | filter(str_detect(word, "[a-z]")) %>%
139 | inner_join(ramen_ratings, by = "review_number")
140 |
141 | review_words <- review_paragraphs_tokenized %>%
142 | filter(!is.na(stars)) %>%
143 | group_by(word) %>%
144 | summarize(number = n(),
145 | reviews = n_distinct(review_number),
146 | avg_rating = mean(stars)) %>%
147 | arrange(desc(reviews))
148 |
149 | review_words_filtered <- review_words %>%
150 | filter(reviews < 200, reviews >= 10)
151 |
152 | library(widyr)
153 | word_cors <- review_paragraphs_tokenized %>%
154 | semi_join(review_words_filtered, by = "word") %>%
155 | distinct(review_number, word) %>%
156 | pairwise_cor(word, review_number, sort = TRUE)
157 | ```
158 |
159 | ```{r}
160 | library(igraph)
161 | library(ggraph)
162 |
163 | set.seed(2019)
164 |
165 | filtered_cors <- word_cors %>%
166 | head(300)
167 |
168 | nodes <- review_words_filtered %>%
169 | filter(word %in% filtered_cors$item1 | word %in% filtered_cors$item2)
170 |
171 | filtered_cors %>%
172 | graph_from_data_frame(vertices = nodes) %>%
173 | ggraph() +
174 | geom_edge_link() +
175 | geom_node_point(aes(size = reviews * 1.1)) +
176 | geom_node_point(aes(size = reviews, color = avg_rating)) +
177 | geom_node_text(aes(label = name), repel = TRUE) +
178 | scale_color_gradient2(low = "red", high = "blue", midpoint = 4) +
179 | theme_void() +
180 | labs(color = "Average rating",
181 | size = "# of reviews",
182 | title = "Network of words used together in ramen reviews",
183 | subtitle = "Based on 250 ramen reviews and their star ratings")
184 | ```
185 |
186 |
--------------------------------------------------------------------------------
/riddler-die-reroll.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "The Riddler: Rerolling a die"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 |
13 | start <- 1:6
14 |
15 | next_rolls <- sample(start, replace = TRUE)
16 |
17 | simulate_roll <- function(num_sides = 6) {
18 | rolls <- accumulate(1:1000, function(x, ...) {
19 | result <- sample(x, replace = TRUE)
20 |
21 | if (length(unique(result)) == 1) {
22 | done(result)
23 | } else {
24 | result
25 | }
26 | }, .init = 1:num_sides)
27 |
28 | length(rolls) - 1
29 | }
30 |
31 | simulations <- replicate(1e6, simulate_roll())
32 |
33 | mean(simulations)
34 |
35 | tibble(simulations) %>%
36 | count(simulations, sort = TRUE)
37 | ```
38 |
39 | On average, it takes about 9.666 rolls before all sides of the die show the same number.
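40 | 
41 | A quick uncertainty check on that simulated mean (my addition):
42 | 
43 | ```{r}
44 | # Normal-approximation 95% CI for the simulated mean (my added check)
45 | mean(simulations) + c(-1.96, 1.96) * sd(simulations) / sqrt(length(simulations))
46 | ```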
40 |
41 | ```{r simulations_crossed}
42 | simulations_crossed <- crossing(trial = 1:20000,
43 | num_sides = 2:10) %>%
44 | mutate(simulation = map_dbl(num_sides, simulate_roll))
45 | ```
46 |
47 | ```{r}
48 | summarized <- simulations_crossed %>%
49 | group_by(num_sides) %>%
50 | summarize(average_roll = mean(simulation),
51 | variance_roll = var(simulation),
52 | sd_roll = sd(simulation),
53 | cov = sd_roll / average_roll)
54 |
55 | summarized %>%
56 | ggplot(aes(num_sides, average_roll)) +
57 | geom_point() +
58 | geom_line() +
59 | expand_limits(y = 0)
60 |
61 | lm(average_roll ~ num_sides, summarized)
62 |
63 | simulations_crossed %>%
64 | ggplot(aes(simulation)) +
65 | geom_histogram(binwidth = 1) +
66 | facet_wrap(~ num_sides, scales = "free_y")
67 | ```
68 |
--------------------------------------------------------------------------------
/riddler-die-roll-low.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Riddler: How Low Can You Roll: Cumulative window functions"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 |
13 | set.seed(2020)
14 |
15 | # A 0 ends each simulated game; keep rolls at the running minimum and
16 | # read them off as successive decimal digits of the score
17 | scores <- tibble(roll = sample(0:9, 1e7, replace = TRUE)) %>%
18 | group_by(group = lag(cumsum(roll == 0), default = 0)) %>%
19 | filter(roll <= cummin(roll)) %>%
20 | mutate(decimal = roll * 10 ^ -row_number()) %>%
21 | summarize(score = sum(decimal))
20 | ```
21 |
22 | ```{r}
23 | scores %>%
24 | summarize(mean(score))
25 |
26 | scores %>%
27 | ggplot(aes(score)) +
28 | geom_histogram(binwidth = .001) +
29 | scale_x_continuous(breaks = seq(0, 1, .1))
30 |
31 | mean(scores$score)
32 | ```
33 |
34 |
35 |
--------------------------------------------------------------------------------
/riddler-prisoner-coin-flip.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Riddler: Flip your way to freedom"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(scales)
13 | theme_set(theme_light())
14 |
15 | sim <- crossing(trial = 1:50000,
16 | probability = seq(.01, 1, .01)) %>%
17 | mutate(num_flips = rbinom(n(), 4, probability),
18 | num_tails = rbinom(n(), num_flips, .5),
19 | set_free = num_flips != 0 & num_tails == 0)
20 |
21 | probability_exact <- function(p, n = 4) {
22 | sum(dbinom(0:n, n, p) * c(0, dbinom(1:n, 1:n, .5)))
23 | }
24 |
25 | probabilities <- sim %>%
26 | group_by(probability) %>%
27 | summarize(pct_free = mean(set_free)) %>%
28 | mutate(exact = map_dbl(probability, probability_exact))
29 |
30 | probabilities %>%
31 | ggplot(aes(probability, pct_free)) +
32 | geom_line() +
33 | geom_line(aes(y = exact), color = "red", lty = 2)
34 |
35 | opt <- optimise(probability_exact, c(0, 1), maximum = TRUE)
36 | ```
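37 | 
38 | In symbols, `probability_exact` computes (my reading of the code): if each of $n$ prisoners flips with probability $p$, then
39 | 
40 | $$P(\text{escape}) = \sum_{k=1}^{n} \binom{n}{k} p^k (1-p)^{n-k} \left(\frac{1}{2}\right)^k$$
41 | 
42 | since exactly $k$ prisoners flip with binomial probability and all $k$ coins must then land heads.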
37 |
38 | The highest chance of escape is `r percent(opt$objective)`, achieved when the prisoners use the random number generator to give each a `r percent(opt$maximum)` chance of flipping the coin.
39 |
40 | ### Extra credit
41 |
42 | ```{r}
43 | sim_n <- crossing(trial = 1:100000,
44 | probability = seq(.02, 1, .02),
45 | n = 2:8) %>%
46 | mutate(num_flips = rbinom(n(), n, probability),
47 | num_tails = rbinom(n(), num_flips, .5),
48 | set_free = num_flips != 0 & num_tails == 0)
49 |
50 | probabilities_n <- sim_n %>%
51 | group_by(probability, n) %>%
52 | summarize(simulated = mean(set_free)) %>%
53 | ungroup() %>%
54 | mutate(exact = map2_dbl(probability, n, probability_exact))
55 |
56 | probabilities_n %>%
57 | ggplot(aes(probability, exact, color = factor(n))) +
58 | geom_line() +
59 | geom_point(aes(y = simulated), size = .4) +
60 | scale_x_continuous(labels = percent) +
61 | scale_y_continuous(labels = percent) +
62 | labs(x = "Probability of flipping the coin",
63 | y = "Probability of escape",
64 | color = "# of prisoners",
65 | title = "What's the chance of escaping with n prisoners?",
66 | subtitle = "Points show simulations of 100,000 prisoners each")
67 | ```
68 |
69 | ```{r}
70 | optimize_for_n <- function(n) {
71 | optimise(function(p) probability_exact(p, n), c(0, 1), maximum = TRUE)
72 | }
73 |
74 | optimal_n <- tibble(n = 2:60) %>%
75 | mutate(optimal = map(n, optimize_for_n)) %>%
76 | unnest_wider(optimal)
77 |
78 | optimal_n %>%
79 | gather(metric, value, -n) %>%
80 | mutate(metric = ifelse(metric == "maximum", "Optimal probability to flip", "Probability of escape")) %>%
81 | ggplot(aes(n, value, color = metric)) +
82 | geom_line() +
83 | scale_y_continuous(labels = percent) +
84 | expand_limits(y = 0) +
85 | labs(x = "# of prisoners",
86 | y = "Probability",
87 | color = "")
88 |
89 | optimal_n %>%
90 | arrange(desc(n)) %>%
91 | mutate(expected_coins_flipped = n * maximum) %>%
92 | ggplot(aes(n, expected_coins_flipped)) +
93 | geom_line()
94 | ```
95 |
96 |
--------------------------------------------------------------------------------
/riddler-spam-comments.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Riddler: Spam Comments"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 |
13 | sim <- replicate(1e6, which(cumsum(rexp(250, 1:250)) > 3)[1] - 1)
14 |
15 | mean(sim, na.rm = TRUE)
16 | ```
17 |
18 | ```{r}
19 | comment_counts <- tibble(n_comments = sim) %>%
20 | count(n_comments) %>%
21 | mutate(density = n / sum(n))
22 |
23 | library(broom)
24 |
25 | augmented_exp <- nls(density ~ lambda * exp(-lambda * n_comments),
26 | data = comment_counts,
27 | start = list(lambda = 1)) %>%
28 | augment(comment_counts)
29 |
30 | augmented_geometric <- nls(density ~ (1 - p) ^ n_comments * p,
31 | data = comment_counts,
32 | start = list(p = .05)) %>%
33 | augment(comment_counts)
34 |
35 | augmented_geometric %>%
36 | ggplot(aes(n_comments, density)) +
37 | geom_line() +
38 | geom_line(aes(y = .fitted), color = "red")
39 | ```
40 |
41 | The number of comments after 3 days is described by a geometric distribution.
42 |
43 | What determines the parameter $p$ (and therefore the expected value) of the geometric distribution?
44 |
45 |
46 | ```{r}
47 | tidy_sim <- crossing(trial = 1:5e4,
48 | step = 1:250) %>%
49 | mutate(waiting = rexp(n(), step)) %>%
50 | group_by(trial) %>%
51 | mutate(cumulative = cumsum(waiting)) %>%
52 | ungroup()
53 |
54 | ncomments <- tidy_sim %>%
55 | mutate(within_3 = cumulative < 3) %>%
56 | group_by(trial) %>%
57 | summarize(n_comments = sum(within_3))
58 | ```
59 |
60 | ```{r}
61 | comments_by_threshold <- tidy_sim %>%
62 | crossing(threshold = seq(.25, 3, .25)) %>%
63 | mutate(within = cumulative < threshold) %>%
64 | group_by(threshold, trial) %>%
65 | summarize(n_comments = sum(within))
66 |
67 | comments_by_threshold %>%
68 | summarize(expected_value = mean(n_comments)) %>%
69 | ggplot(aes(threshold, expected_value)) +
70 | geom_line() +
71 | geom_line(aes(y = exp(threshold) - 1), color = "red") +
72 | labs(x = "# of days",
73 | y = "Expected number of comments",
74 | title = "How many comments cumulatively over time?",
75 | subtitle = "Red line shows exp(x) - 1")
76 |
77 | comment_counts %>%
78 | filter(!is.na(n_comments)) %>%
79 | ggplot(aes(n_comments, density)) +
80 | geom_line() +
81 | geom_line(aes(y = (1 - 1 / exp(3)) ^ n_comments / exp(3)), color = "red")
82 | ```
83 |
84 | The number of comments after $n$ days is described by a geometric distribution with expected value $e^{n}-1$ (that is, with success probability $e^{-n}$).
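85 | 
86 | As a final spot check (my addition), the formula at $n = 3$ days should roughly match the first simulation above:
87 | 
88 | ```{r}
89 | # Expected comments after 3 days: formula vs. simulation (my addition)
90 | exp(3) - 1
91 | mean(sim, na.rm = TRUE)
92 | ```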
85 |
--------------------------------------------------------------------------------
/seattle-pets.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(lubridate)
13 | theme_set(theme_light())
14 |
15 | seattle_pets <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-26/seattle_pets.csv") %>%
16 | mutate(license_issue_date = mdy(license_issue_date)) %>%
17 | rename(animal_name = animals_name)
18 | ```
19 |
20 | ```{r}
21 | seattle_pets %>%
22 | filter(license_issue_date >= "2017-01-01") %>%
23 | count(species, primary_breed, sort = TRUE) %>%
24 | filter(species %in% c("Cat", "Dog")) %>%
25 | mutate(percent = n / sum(n)) %>%
26 | group_by(species) %>%
27 | top_n(10, percent) %>%
28 | ungroup() %>%
29 | mutate(primary_breed = fct_reorder(primary_breed, percent)) %>%
30 | ggplot(aes(primary_breed, percent, fill = species)) +
31 | geom_col(show.legend = FALSE) +
32 | scale_y_continuous(labels = scales::percent_format()) +
33 | facet_wrap(~ species, scales = "free_y", ncol = 1) +
34 | coord_flip() +
35 | labs(x = "Primary breed",
36 | y = "% of this species",
37 | title = "Most common cat and dog breeds",
38 | subtitle = "Of licensed pets in Seattle 2017-2018")
39 | ```
40 |
41 | ```{r}
42 | dogs <- seattle_pets %>%
43 | filter(species == "Dog")
44 |
45 | name_counts <- dogs %>%
46 | group_by(animal_name) %>%
47 | summarize(name_total = n()) %>%
48 | filter(name_total >= 100)
49 |
50 | breed_counts <- dogs %>%
51 | group_by(primary_breed) %>%
52 | summarize(breed_total = n()) %>%
53 | filter(breed_total >= 200)
54 |
55 | total_dogs <- nrow(dogs)
56 |
57 | name_breed_counts <- dogs %>%
58 | count(primary_breed, animal_name) %>%
59 | complete(primary_breed, animal_name, fill = list(n = 0)) %>%
60 | inner_join(name_counts, by = "animal_name") %>%
61 | inner_join(breed_counts, by = "primary_breed")
62 |
63 | # One-sided hypergeometric p-value
64 | hypergeom_test <- name_breed_counts %>%
65 | mutate(percent_of_breed = n / breed_total,
66 | percent_overall = name_total / total_dogs) %>%
67 | mutate(overrepresented_ratio = percent_of_breed / percent_overall) %>%
68 | arrange(desc(overrepresented_ratio)) %>%
69 | mutate(hypergeom_p_value = 1 - phyper(n, name_total, total_dogs - name_total, breed_total),
70 | holm_p_value = p.adjust(hypergeom_p_value),
71 | fdr = p.adjust(hypergeom_p_value, method = "fdr"))
72 |
73 | hypergeom_test %>%
74 | filter(fdr < .05)
75 |
76 | ggplot(hypergeom_test, aes(hypergeom_p_value)) +
77 | geom_histogram(binwidth = .1) +
78 | labs(x = "One-sided hypergeometric p-values for overrepresented name")
79 | ```
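80 | 
81 | For intuition (my addition; all numbers below are hypothetical, not taken from the data), the same one-sided tail probability for a single name/breed pair:
82 | 
83 | ```{r}
84 | # P(more than 20 "Lucy"s in a breed of 800 by chance), assuming 500
85 | # Lucys among 30000 dogs overall -- all numbers hypothetical
86 | 1 - phyper(20, 500, 30000 - 500, 800)
87 | ```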
80 |
81 | ```{r}
82 | crossing(name_total = c(100, 200, 300),
83 | breed_total = seq(200, 1000, 25)) %>%
84 | mutate(max_p_value = 1 - phyper(0, name_total, total_dogs - name_total, breed_total)) %>%
85 | ggplot(aes(breed_total, max_p_value, color = factor(name_total))) +
86 | geom_line() +
87 | labs(x = "Total # of dogs in breed",
88 | y = "Maximum one-sided p-value",
89 | color = "# of dogs with name")
90 | ```
91 |
92 | ```{r}
93 | library(scales)
94 |
95 | hypergeom_test %>%
96 | filter(fdr <= .05) %>%
97 | arrange(fdr) %>%
98 | transmute(`Breed` = primary_breed,
99 | `Name` = animal_name,
100 | `# of dogs with name` = n,
101 | `% of breed` = percent(percent_of_breed),
102 | `% overall` = percent(percent_overall),
103 | `FDR-adjusted one-sided p-value` = fdr) %>%
104 | knitr::kable()
105 | ```
106 |
107 |
--------------------------------------------------------------------------------
/simpsons-guests.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Simpsons Guests"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | simpsons <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-08-27/simpsons-guests.csv") %>%
15 | mutate(self = str_detect(role, "self|selves"),
16 | season = parse_number(season))
17 | ```
18 |
19 | ```{r}
20 | simpsons %>%
21 | filter(self) %>%
22 | count(guest_star, sort = TRUE) %>%
23 | filter(n > 1) %>%
24 | mutate(guest_star = fct_reorder(guest_star, n)) %>%
25 | ggplot(aes(guest_star, n)) +
26 | geom_col() +
27 | coord_flip() +
28 | labs(title = "Who has played themselves in multiple Simpsons episodes?")
29 | ```
30 |
31 | ```{r}
32 | simpsons %>%
33 | separate_rows(role, sep = ";\\s+") %>%
34 | add_count(role) %>%
35 | filter(n >= 8) %>%
36 | count(season, role) %>%
37 | mutate(role = fct_reorder(role, -n, sum)) %>%
38 | ggplot(aes(season, n)) +
39 | geom_col() +
40 | facet_wrap(~ role)
41 | ```
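
As a reminder of what `separate_rows()` is doing above, a minimal illustration on a made-up row:

```{r}
# separate_rows() turns one delimited cell into one row per value
tibble(guest_star = "Example Guest", role = "Himself; Barney; Moe") %>%
  separate_rows(role, sep = ";\\s+")
```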
42 |
43 | ```{r}
44 | simpsons
45 | ```
46 |
47 | ### Bringing in Simpsons dialogue
48 |
49 | ```{r}
50 | dialogue <- read_csv("~/Downloads/simpsons_dataset.csv") %>%
51 | select(role = raw_character_text, line = spoken_words)
52 |
53 | guests_processed <- simpsons %>%
54 | separate_rows(role, sep = ";\\s+") %>%
55 | mutate(role = ifelse(self, guest_star, role),
56 | role = ifelse(role == "Edna Krabappel", "Edna Krabappel-Flanders", role))
57 |
58 | guests_summarized <- guests_processed %>%
59 | filter(season <= 27) %>%
60 | group_by(guest_star, role, self) %>%
61 | summarize(nb_episodes = n(),
62 | first_season = min(season),
63 | last_season = max(season)) %>%
64 | arrange(desc(nb_episodes)) %>%
65 | group_by(role) %>%
66 | filter(n() == 1) %>%
67 | ungroup() %>%
68 | filter(!is.na(role))
69 |
70 | dialogue_summarized <- dialogue %>%
71 | group_by(role) %>%
72 | summarize(nb_lines = n(),
73 | random_line = sample(line, 1)) %>%
74 | arrange(desc(nb_lines))
75 |
76 | guest_roles <- guests_summarized %>%
77 | inner_join(dialogue_summarized, by = "role") %>%
78 | mutate(lines_per_episode = nb_lines / nb_episodes)
79 |
80 | guest_roles %>%
81 | mutate(self = ifelse(self, "Playing Themselves", "Playing a Character")) %>%
82 | ggplot(aes(lines_per_episode)) +
83 | geom_histogram(binwidth = 2, center = 1) +
84 | facet_wrap(~ self, ncol = 1) +
85 | labs(x = "Average # of lines per episode",
86 | title = "Most guest stars, especially those playing themselves, have relatively few lines per episode")
87 |
88 | guest_roles %>%
89 | arrange(desc(lines_per_episode))
90 | ```
91 |
92 | ```{r}
93 | library(tidytext)
94 |
95 | role_words <- dialogue %>%
96 | filter(!is.na(line), !is.na(role)) %>%
97 | mutate(line_number = row_number()) %>%
98 | unnest_tokens(word, line) %>%
99 | anti_join(stop_words, by = "word") %>%
100 | distinct(role, line_number, word) %>%
101 | count(role, word, sort = TRUE)
102 |
103 | role_word_tf_idf <- role_words %>%
104 | group_by(role) %>%
105 | mutate(total_words = sum(n)) %>%
106 | ungroup() %>%
107 | bind_tf_idf(word, role, n) %>%
108 | arrange(desc(tf_idf))
109 |
110 | role_word_tf_idf %>%
111 | filter(total_words >= 500) %>%
112 | distinct(role, .keep_all = TRUE) %>%
113 | mutate(role_word = paste0(role, ": ", word)) %>%
114 | head(20) %>%
115 | mutate(role_word = fct_reorder(role_word, tf_idf)) %>%
116 | ggplot(aes(role_word, tf_idf)) +
117 | geom_col() +
118 | coord_flip() +
119 | labs(title = "Using TF-IDF as a Simpsons catchphrase detector",
120 | subtitle = "Only the 53 characters that speak at least 500 words in 27 seasons",
121 | x = "",
122 | y = "TF-IDF")
123 | ```
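
For reference, `bind_tf_idf()` scores word $w$ for character $c$ as term frequency times inverse document frequency, with each character's dialogue treated as one document:

$$\text{tf-idf}(w, c) = \underbrace{\frac{n_{w,c}}{\sum_{w'} n_{w',c}}}_{\text{tf}} \times \underbrace{\ln\frac{N}{\left|\{c' : n_{w,c'} > 0\}\right|}}_{\text{idf}}$$

where $N$ is the number of characters. A word used heavily by one character but by few others (a catchphrase) gets a high score.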
124 |
125 | ```{r}
126 | guests_summarized %>%
127 | filter(nb_episodes > 1) %>%
128 | inner_join(role_word_tf_idf, by = "role") %>%
129 | filter(total_words >= 100) %>%
130 | arrange(desc(tf_idf)) %>%
131 | distinct(role, .keep_all = TRUE) %>%
132 | select(guest_star, role, word, tf_idf)
133 | ```
134 |
135 |
136 |
137 |
138 |
139 |
--------------------------------------------------------------------------------
/space-launches.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Space Launches"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | agencies <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/agencies.csv")
15 | launches <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-15/launches.csv")
16 | ```
17 |
18 | ```{r}
19 | launches %>%
20 | count(launch_year, agency_type) %>%
21 | ggplot(aes(launch_year, n, color = agency_type)) +
22 | geom_line() +
23 | labs(x = "Time",
24 | y = "# of launches this year",
25 | color = "Agency type")
26 | ```
27 |
28 | ```{r}
29 | library(countrycode)
30 |
31 | agencies %>%
32 | View()
33 |
34 | launches %>%
35 | count(agency_type, agency, sort = TRUE)
36 |
37 | agencies %>%
38 | count(state_code, wt = count, sort = TRUE) %>%
39 | View()
40 |
41 | launches_processed <- launches %>%
42 | filter(launch_date <= Sys.Date()) %>%
43 | mutate(state_code_cleaned = fct_collapse(
44 | state_code,
45 | "RU" = c("SU", "RU"),
46 | "FR" = "F",
47 | "JP" = "J",
48 | "IT" = "I"
49 | )) %>%
50 | mutate(state_name = countrycode(state_code_cleaned, "iso2c", "country.name"),
51 | state_name = fct_lump(state_name, 6)) %>%
52 | replace_na(list(state_name = "Other"))
53 |
54 | launches_processed %>%
55 | count(launch_year, state_name) %>%
56 | mutate(state_name = fct_reorder(state_name, -n, sum)) %>%
57 | ggplot(aes(launch_year, n, color = state_name)) +
58 | geom_line() +
59 | labs(x = "Time",
60 | y = "Launches per year",
61 | color = "Responsible state",
62 | title = "Launches per year per country",
63 | subtitle = "Combines Soviet Union (pre-1990) with Russia")
64 | ```
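
`countrycode()` handles the code-to-name translation; the `fct_collapse()` above first normalizes the dataset's non-ISO codes (like `SU` and `F`) so the lookup succeeds. In miniature:

```{r}
# countrycode() maps ISO 3166-1 alpha-2 codes to country names
countrycode(c("RU", "FR", "JP", "IT"), "iso2c", "country.name")
```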
65 |
66 | ### Focus on private + startup launches
67 |
68 | ```{r}
69 | agencies %>%
70 | filter(agency_type %in% c("private", "startup")) %>%
71 | View()
72 |
73 | private_startup_launches <- launches_processed %>%
74 | filter(agency_type %in% c("private", "startup")) %>%
75 | inner_join(agencies %>%
76 | select(agency, agency_name = name, short_name, parent), by = "agency") %>%
77 | mutate(agency_name_lumped = fct_lump(agency_name, 6),
78 | agency_name_lumped = if_else(agency_name_lumped == "Other" & state_name == "United States",
79 | "Other US", as.character(agency_name_lumped)))
80 |
81 | private_startup_launches %>%
82 | count(agency_name_lumped, state_name, sort = TRUE) %>%
83 | mutate(agency_name_lumped = fct_reorder(agency_name_lumped, n, sum)) %>%
84 | ggplot(aes(agency_name_lumped, n, fill = state_name)) +
85 | geom_col() +
86 | coord_flip() +
87 | labs(x = "",
88 | y = "# of launches overall",
89 | title = "What private/startup agencies have had the most launches?",
90 | fill = "Country")
91 |
92 | private_startup_launches %>%
93 | count(agency_name_lumped,
94 | decade = 5 * (launch_year %/% 5)) %>%
95 | complete(agency_name_lumped, decade, fill = list(n = 0)) %>%
96 | mutate(agency_name_lumped = fct_reorder(agency_name_lumped, -n, sum)) %>%
97 | ggplot(aes(decade, n, color = agency_name_lumped)) +
98 | geom_line() +
99 | facet_wrap(~ agency_name_lumped) +
100 | theme(legend.position = "none") +
101 | labs(x = "Time",
102 | y = "# of launches in 5 year period")
103 | ```
104 |
105 | ```{r}
106 | vehicles <- launches_processed %>%
107 | group_by(type, state_name) %>%
108 | summarize(first_launch = min(launch_year),
109 | last_launch = max(launch_year),
110 | launches = n()) %>%
111 | ungroup()
112 |
113 | russian_vehicles <- vehicles %>%
114 | filter(state_name == "Russia") %>%
115 | arrange(desc(launches)) %>%
116 | filter(launches >= 30)
117 |
118 | launches_processed %>%
119 | semi_join(russian_vehicles, by = "type") %>%
120 | mutate(type = fct_reorder(type, launch_date, min)) %>%
121 | ggplot(aes(launch_date, type)) +
122 | geom_jitter(color = "blue", alpha = .25, width = 0, height = .2) +
123 | theme(legend.position = "none") +
124 | labs(title = "Timeline of Soviet/Russian space vehicles",
125 | x = "Launch date",
126 | y = "Vehicle type",
127 | subtitle = "Only vehicles with at least 30 launches")
128 |
129 | launches_processed %>%
130 | filter(state_code == "US") %>%
131 | add_count(type) %>%
132 | filter(n >= 20) %>%
133 | mutate(type = fct_reorder(type, launch_date, min),
134 | agency_type = str_to_title(agency_type)) %>%
135 | ggplot(aes(launch_date, type, color = agency_type)) +
136 | geom_jitter(alpha = .25, width = 0, height = .2) +
137 | labs(title = "Timeline of US space vehicles",
138 | x = "Launch date",
139 | y = "Vehicle type",
140 | color = "Agency type",
141 | subtitle = "Only vehicles with at least 20 launches")
142 |
143 |
144 | vehicles %>%   # per-vehicle-type summary created above
145 | arrange(desc(launches)) %>%
146 | View()
147 | launches_processed %>% count(type, sort = TRUE)
148 | ```
149 |
150 |
151 |
--------------------------------------------------------------------------------
/thanksgiving.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | thanksgiving_survey <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-20/thanksgiving_meals.csv") %>%
15 | mutate(family_income = fct_reorder(family_income, parse_number(family_income)))
16 | ```
17 |
18 | ```{r}
19 | thanksgiving_survey %>%
20 | count(celebrate)
21 | ```
22 |
23 | Almost everyone serves turkey as the main dish
24 |
25 | ```{r}
26 | thanksgiving_survey %>%
27 | count(main_dish, sort = TRUE)
28 | ```
29 |
30 | ```{r}
31 | thanksgiving_survey %>%
32 | count(main_dish, main_prep, sort = TRUE)
33 | ```
34 |
35 | ```{r}
36 | thanksgiving_survey %>%
37 | count(cranberry, sort = TRUE)
38 |
39 | thanksgiving_survey %>%
40 | count(gravy, sort = TRUE)
41 | ```
42 |
43 | ### Relationship with income
44 |
45 | ```{r}
46 | thanksgiving_survey %>%
47 | group_by(family_income) %>%
48 | summarize(celebrate = sum(celebrate == "Yes"),
49 | total = n(),
50 | low = qbeta(0.025, celebrate + .5, total - celebrate + .5),
51 | high = qbeta(0.975, celebrate + .5, total - celebrate + .5)) %>%
52 | ggplot(aes(family_income, celebrate / total, group = 1)) +
53 | geom_line() +
54 | geom_ribbon(aes(ymin = low, ymax = high), alpha = .2) +
55 | scale_y_continuous(labels = scales::percent_format()) +
56 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
57 | labs(x = "Family income",
58 | y = "% celebrating Thanksgiving")
59 | ```
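
The ribbon above is a Jeffreys interval: with $x$ celebrants out of $n$ respondents, the `qbeta()` calls compute the 2.5% and 97.5% quantiles of a $\text{Beta}(x + \tfrac12,\, n - x + \tfrac12)$ posterior:

$$\left[\,F^{-1}_{\text{Beta}}\!\left(0.025;\ x + \tfrac12,\ n - x + \tfrac12\right),\ \ F^{-1}_{\text{Beta}}\!\left(0.975;\ x + \tfrac12,\ n - x + \tfrac12\right)\,\right]$$

which behaves sensibly even when $x$ is near $0$ or $n$.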
60 |
61 | ```{r}
62 | thanksgiving_survey %>%
63 | filter(cranberry %in% c("Canned", "Homemade")) %>%
64 | group_by(family_income) %>%
65 | summarize(homemade = sum(cranberry == "Homemade"),
66 | total = n(),
67 | low = qbeta(0.025, homemade + .5, total - homemade + .5),
68 | high = qbeta(0.975, homemade + .5, total - homemade + .5)) %>%
69 | ggplot(aes(family_income, homemade / total, group = 1)) +
70 | geom_line() +
71 | geom_ribbon(aes(ymin = low, ymax = high), alpha = .2) +
72 | scale_y_continuous(labels = scales::percent_format()) +
73 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
74 | labs(x = "Family income",
75 | y = "% serving homemade")
76 | ```
77 |
78 | ```{r}
79 | food_gathered <- thanksgiving_survey %>%
80 | select(id, starts_with("side"),
81 | starts_with("pie"),
82 | starts_with("dessert")) %>%
83 | select(-side15, -pie13, -dessert12) %>%
84 | gather(type, value, -id) %>%
85 | filter(!is.na(value),
86 | !value %in% c("None", "Other (please specify)")) %>%
87 | mutate(type = str_remove(type, "\\d+"))
88 |
89 | n_respondents <- n_distinct(food_gathered$id)
90 | ```
91 |
92 | What are the most common pies, sides, and desserts?
93 |
94 | ```{r}
95 | food_gathered %>%
96 | count(type, value, sort = TRUE) %>%
97 | mutate(value = fct_reorder(value, n)) %>%
98 | ggplot(aes(value, n / n_respondents, fill = type)) +
99 | geom_col(show.legend = FALSE) +
100 | coord_flip() +
101 | scale_y_continuous(labels = scales::percent_format()) +
102 | facet_wrap(~ type, scales = "free_y", ncol = 1) +
103 | labs(x = "",
104 | y = "% of respondents")
105 | ```
106 |
107 | How do the servings differ by age and region?
108 |
109 | ```{r}
110 | food_gathered %>%
111 | inner_join(thanksgiving_survey, by = "id") %>%
112 | mutate(age_number = parse_number(age)) %>%
113 | group_by(value) %>%
114 | summarize(average_age = mean(age_number, na.rm = TRUE),
115 | total = n()) %>%
116 | arrange(desc(average_age))
117 |
118 | food_by_region <- food_gathered %>%
119 | inner_join(thanksgiving_survey, by = "id") %>%
120 | group_by(us_region) %>%
121 | mutate(respondents = n_distinct(id)) %>%
122 | count(us_region, respondents, type, value) %>%
123 | ungroup() %>%
124 | mutate(percent = n / respondents)
125 |
126 | food_by_region %>%
127 | filter(value == "Cornbread") %>%
128 | arrange(desc(percent))
129 | ```
130 |
131 | ```{r}
132 | library(ebbr)
133 |
134 | food_gathered %>%
135 | inner_join(thanksgiving_survey, by = "id") %>%
136 | filter(!is.na(prayer)) %>%
137 | group_by(type, value) %>%
138 | summarize(prayer = sum(prayer == "Yes"),
139 | total = n()) %>%
140 | add_ebb_estimate(prayer, total) %>%
141 | arrange(desc(.fitted)) %>%
142 | View()
143 | ```
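
`add_ebb_estimate()` (from ebbr) fits a common $\text{Beta}(\alpha_0, \beta_0)$ prior across all foods by maximum likelihood, then shrinks each food's raw prayer proportion toward it; the `.fitted` column is the shrunken estimate

$$\hat{p} = \frac{x + \alpha_0}{n + \alpha_0 + \beta_0}$$

so foods chosen by only a handful of respondents move furthest toward the overall average.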
144 |
145 | ### Network of foods
146 |
147 | What sides, pies and desserts are eaten together?
148 |
149 | ```{r}
150 | library(widyr)
151 |
152 | food_cors <- food_gathered %>%
153 | pairwise_cor(value, id, sort = TRUE)
154 | ```
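
`pairwise_cor()` returns one row per pair of foods with the phi coefficient of their co-occurrence across respondents. For instance, to see what tends to accompany a single dish (using `"Cornbread"` as the example value):

```{r}
food_cors %>%
  filter(item1 == "Cornbread")
```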
155 |
156 | ```{r}
157 | library(ggraph)
158 | library(igraph)
159 |
160 | set.seed(2018)
161 |
162 | food_types <- food_gathered %>%
163 | count(value, type, sort = TRUE)
164 |
165 | food_cors %>%
166 | head(75) %>%
167 | graph_from_data_frame(vertices = food_types) %>%
168 | ggraph() +
169 | geom_edge_link() +
170 | geom_node_point(aes(color = type, size = n / n_respondents)) +
171 | geom_node_text(aes(label = name), vjust = 1, hjust = 1, repel = TRUE) +
172 | scale_size_continuous(labels = scales::percent_format()) +
173 | theme_void() +
174 | labs(title = "What foods get served together at Thanksgiving?",
175 | color = "",
176 | size = "% of respondents")
177 | ```
178 |
179 |
180 |
181 |
--------------------------------------------------------------------------------
/tour-de-france.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Tour de France"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | # devtools::install_github("thebioengineer/tidytuesdayR")
12 |
13 | library(tidytuesdayR)
14 | library(tidyverse)
15 | theme_set(theme_light())
16 | ```
17 |
18 | ```{r}
19 | library(lubridate)
20 |
21 | tuesdata <- tidytuesdayR::tt_load('2020-04-07')
22 |
23 | tdf_winners <- tuesdata$tdf_winners %>%
24 | mutate(year = year(start_date),
25 | speed = distance / time_overall)
26 |
27 | stage_data <- tuesdata$stage_data
28 | tdf_stages <- tuesdata$tdf_stages %>%
29 | janitor::clean_names() %>%
30 | mutate(year = year(date))
31 | ```
32 |
33 | ```{r}
34 | View(tuesdata$tdf_winners)
35 | ```
36 |
37 | ```{r}
38 | tdf_winners %>%
39 | count(birth_country, sort = TRUE) %>%
40 | mutate(birth_country = fct_reorder(birth_country, n)) %>%
41 | ggplot(aes(n, birth_country)) +
42 | geom_col() +
43 | labs(y = "",
44 | title = "What countries were the most Tour de France winners born in?")
45 |
46 | tdf_winners %>%
47 | count(winner_name, birth_country, sort = TRUE)
48 | ```
49 |
50 | ```{r}
51 | by_decade <- tdf_winners %>%
52 | group_by(decade = 10 * (year %/% 10)) %>%
53 | summarize(winner_age = mean(age),
54 | winner_height = mean(height, na.rm = TRUE),
55 | winner_weight = mean(weight, na.rm = TRUE),
56 | winner_margin = mean(time_margin, na.rm = TRUE),
57 | winner_speed = mean(speed, na.rm = TRUE))
58 |
59 | by_decade %>%
60 | filter(decade >= 1910) %>%
61 | ggplot(aes(decade, winner_margin * 60)) +
62 | geom_line() +
63 | expand_limits(y = 0) +
64 | labs(x = "Decade",
65 | y = "Average margin of winner (minutes)",
66 | title = "Tour de France races have been getting closer")
67 |
68 | by_decade %>%
69 | ggplot(aes(decade, winner_speed)) +
70 | geom_line() +
71 | expand_limits(y = 0) +
72 | labs(x = "Decade",
73 | y = "Average speed of winner (km/h)",
74 | title = "Tour de France winners have been getting faster")
75 | ```
76 |
77 | Life expectancy of Tour de France winners, estimated with a survival model that treats winners still alive in 2020 as censored:
78 |
79 | ```{r}
80 | library(survival)
81 | library(broom)
82 |
83 | surv_model <- tdf_winners %>%
84 | distinct(winner_name, .keep_all = TRUE) %>%
85 | transmute(winner_name,
86 | birth_year = year(born),
87 | death_year = year(died),
88 | dead = as.integer(!is.na(death_year))) %>%
89 | mutate(age_at_death = coalesce(death_year, 2020) - birth_year) %>%
90 | survfit(Surv(age_at_death, dead) ~ 1, data = .)
91 |
92 | glance(surv_model)
93 | ```
94 |
95 | Median life expectancy of a Tour de France winner is 77.
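
One extra step not in the original analysis: the Kaplan-Meier curve behind that median can be drawn directly with base `plot.survfit()`.

```{r}
# Survival curve for Tour de France winners (right-censored at 2020 for
# winners still alive)
plot(surv_model, xlab = "Age", ylab = "Proportion of winners still alive")
```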
96 |
97 | ```{r}
98 | stages_joined <- stage_data %>%
99 | extract(stage_results_id, "stage", "stage-(.*)") %>%
100 | inner_join(tdf_stages, by = c("year", "stage")) %>%
101 | mutate(rank = as.integer(rank)) %>%
102 | group_by(year, stage) %>%
103 | mutate(finishers = sum(!is.na(rank))) %>%
104 | ungroup() %>%
105 | mutate(percentile = 1 - rank / finishers)
106 |
107 | total_points <- stages_joined %>%
108 | group_by(year, rider) %>%
109 | summarize(total_points = sum(points, na.rm = TRUE)) %>%
110 | mutate(points_rank = percent_rank(total_points)) %>%
111 | ungroup()
112 |
113 | stages_joined %>%
114 | filter(stage == "1") %>%
115 | group_by(winner_country) %>%
116 | summarize(stages = n(),
117 | median_percentile = median(percentile, na.rm = TRUE)) %>%
118 | arrange(desc(stages))
119 | ```
120 |
121 | Does the winner of the first stage predict their final point ranking?
122 |
123 | ```{r}
124 | stages_joined %>%
125 | filter(stage == "1") %>%
126 | inner_join(total_points, by = c("year", "rider")) %>%
127 | select(year, rider,
128 | percentile_first_stage = percentile,
129 | points_rank) %>%
130 | mutate(first_stage_bin = cut(percentile_first_stage, seq(0, 1, .1),
131 | include.lowest = TRUE)) %>%
132 | filter(!is.na(first_stage_bin)) %>%
133 | ggplot(aes(first_stage_bin, points_rank)) +
134 | geom_boxplot() +
135 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
136 | scale_y_continuous(labels = scales::percent) +
137 | labs(x = "Decile performance in the first stage",
138 | y = "Overall points percentile")
139 | ```
140 |
141 | ```{r}
142 | library(gganimate)
143 | library(tidytext)
144 |
145 | top_10_2017 <- total_points %>%
146 | filter(year == max(year)) %>%
147 | top_n(10, total_points)
148 |
149 | stages_joined %>%
150 | filter(year == max(year)) %>%
151 | semi_join(top_10_2017, by = "rider") %>%
152 | mutate(stage = as.integer(stage),
153 | points = coalesce(points, 0)) %>%
154 | arrange(stage) %>%
155 | group_by(rider) %>%
156 | mutate(cumulative_points = cumsum(points)) %>%
157 | ungroup() %>%
158 | # mutate(rider = reorder_within(rider, cumulative_points, stage)) %>%
159 | ggplot(aes(cumulative_points, rider, fill = cumulative_points)) +
160 | geom_col() +
161 | transition_time(stage) +
162 | theme(legend.position = "none") +
163 | labs(title = "The 2017 Tour de France. Stage: { frame_time }",
164 | x = "Cumulative points at this stage",
165 | y = "")
166 | ```
167 |
168 |
169 |
170 |
171 |
172 |
--------------------------------------------------------------------------------
/trees.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Trees in NYC"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | This Rmd (unlike others in this repository) isn't a screencast, but rather created in a [live session of the New York Open Statistical Programming Meetup](https://www.meetup.com/nyhackr/events/260545903/).
11 |
12 | Tree data here: https://data.cityofnewyork.us/Environment/2015-Street-Tree-Census-Tree-Data/pi5s-9p35
13 | Zip code data here: https://data.cityofnewyork.us/widgets/i8iw-xf4u
14 |
15 | ```{r}
16 | library(tidyverse)
17 | theme_set(theme_light())
18 |
19 | tree_dataset_raw <- read_csv("~/Desktop/dataset_ideas/2015_Street_Tree_Census_-_Tree_Data.csv")
20 |
21 | trees <- tree_dataset_raw %>%
22 | mutate(health = fct_relevel(health, c("Poor", "Fair", "Good"))) %>%
23 | mutate(postcode = as.character(postcode)) %>%
24 | mutate(maple = str_detect(str_to_lower(spc_common), "maple"))
25 | ```
26 |
27 | ```{r}
28 | View(trees)
29 |
30 | three_boroughs <- trees %>%   # despite the name, these are three Manhattan zip codes
31 | filter(postcode %in% c("10023", "10002", "10009"))
32 |
33 | three_boroughs %>%
34 | filter(!is.na(health)) %>%
35 | count(postcode, health, sort = TRUE) %>%
36 | ggplot(aes(health, n)) +
37 | geom_col() +
38 | facet_wrap(~ postcode)
39 |
40 | summarize_trees <- function(data) {
41 | data %>%
42 | summarize(average_health = mean(as.numeric(health), na.rm = TRUE),
43 | percent_good_health = mean(health == "Good", na.rm = TRUE),
44 | percent_maple = mean(maple, na.rm = TRUE),
45 | trees = n())
46 | }
47 |
48 | trees_by_zipcode <- trees %>%
49 | group_by(postcode, borough) %>%
50 | summarize_trees() %>%
51 | filter(trees >= 100)
52 |
53 | trees_by_zipcode %>%
54 | arrange(desc(average_health))
55 |
56 | trees %>%
57 | group_by(borough) %>%
58 | summarize_trees() %>%
59 | arrange(average_health)
60 | ```
61 |
62 | ```{r}
63 | library(sf)
64 |
65 | zip_codes <- read_sf("~/Downloads/ZIP_CODE_040114/")
66 |
67 | class(zip_codes)
68 |
69 | joined_trees <- zip_codes %>%
70 | left_join(trees_by_zipcode, by = c("ZIPCODE" = "postcode")) %>%
71 | mutate(population_density = POPULATION / AREA,
72 | tree_density = trees / AREA,
73 | people_per_tree = POPULATION / trees)
74 | ```
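
Because the sf object is on the left side of the join, the result keeps its geometry column and `sf` class, so it can go straight into `geom_sf()`:

```{r}
# Sanity check: the join preserved the sf class
class(joined_trees)
```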
75 |
76 | ```{r}
77 | ggplot(joined_trees, aes(fill = percent_good_health)) +
78 | geom_sf() +
79 | scale_fill_gradient2(low = "brown",
80 | high = "darkgreen",
81 | midpoint = .8,
82 | labels = scales::percent) +
83 | theme_void() +
84 | coord_sf(datum = NA) +
85 | labs(fill = "% trees in good health",
86 | title = "Where are the healthiest (and unhealthiest) trees in NYC?",
87 | subtitle = "Based on a 2015 survey of 600,000 trees")
88 | ```
89 |
90 | ### Species
91 |
92 | ```{r}
93 | trees %>%
94 | filter(!is.na(spc_common)) %>%
95 | count(spc_common = fct_lump(spc_common, n = 19),
96 | borough,
97 | sort = TRUE) %>%
98 | mutate(spc_common = fct_reorder(spc_common, n, sum)) %>%
99 | ggplot(aes(spc_common, n, fill = borough)) +
100 | geom_col() +
101 | coord_flip() +
102 | labs(x = "",
103 | y = "# of trees in NYC",
104 | fill = "Borough",
105 | title = "What species of tree do you find in NYC?")
106 | ```
107 |
108 | ```{r}
109 | trees %>%
110 | group_by(spc_common, maple) %>%
111 | summarize_trees() %>%
112 | filter(trees >= 500) %>%
113 | arrange(percent_good_health) %>%
114 | ggplot(aes(trees, percent_good_health)) +
115 | geom_point(aes(color = maple)) +
116 | geom_text(aes(label = spc_common), vjust = 1, hjust = 1, check_overlap = TRUE) +
117 | scale_x_log10(labels = scales::comma) +
118 | scale_y_continuous(labels = scales::percent) +
119 | labs(x = "# of trees in NYC",
120 | y = "% marked as 'Good' health")
121 | ```
122 |
123 | ```{r}
124 | trees %>%
125 | filter(!is.na(maple)) %>%
126 | group_by(borough, maple) %>%
127 | summarize_trees() %>%
128 | mutate(maple = ifelse(maple, "Maple", "Other")) %>%
129 | select(borough, maple, percent_good_health) %>%
130 | spread(maple, percent_good_health) %>%
131 | ggplot(aes(Other, Maple)) +
132 | geom_point() +
133 | geom_text(aes(label = borough)) +
134 | geom_abline(color = "red")
135 | ```
136 |
137 |
138 | ```{r}
139 | ggplot(joined_trees, aes(fill = percent_maple)) +
140 | geom_sf() +
141 | scale_fill_gradient2(low = "brown",
142 | high = "darkgreen",
143 | midpoint = .1,
144 | labels = scales::percent) +
145 | theme_void() +
146 | coord_sf(datum = NA) +
147 | labs(fill = "% trees that are maple",
148 | title = "Where are the maple trees in NYC?",
149 | subtitle = "Based on a 2015 survey of 600,000 trees")
150 | ```
151 |
152 | ### Looking at tree and population density
153 |
154 | ```{r}
155 | processed_zipcodes <- joined_trees %>%
156 | select(ZIPCODE, POPULATION, AREA, trees) %>%
157 | rename_all(str_to_lower)
158 |
159 | processed_zipcodes %>%
160 | ggplot(aes(population / area)) +
161 | geom_histogram()
162 |
163 | processed_zipcodes %>%
164 | filter(population >= 1000) %>%
165 | mutate(people_per_tree = population / trees) %>% arrange(desc(people_per_tree))
166 | ```
167 |
168 | ```{r}
169 | ggplot(joined_trees, aes(fill = tree_density * (5280 ^ 2))) +
170 | geom_sf() +
171 | scale_fill_gradient2(low = "brown",
172 | high = "darkgreen",
173 | midpoint = log10(3000),
174 | trans = "log10") +
175 | theme_void() +
176 | coord_sf(datum = NA) +
177 | labs(fill = "# of trees per square mile",
178 | title = "Where are the densest trees?",
179 | subtitle = "Based on a 2015 survey of 600,000 trees")
180 | ```
181 |
182 |
--------------------------------------------------------------------------------
/umbrella-week.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | Louie walks to and from work every day. In his city, there is a 50 percent chance of rain each morning and an independent 40 percent chance each evening. His habit is to bring (and use) an umbrella if it’s raining when he leaves the house or office, but to leave them all behind if not. Louie owns three umbrellas.
11 |
12 | On Sunday night, two are with him at home and one is at his office. Assuming it never starts raining during his walk to his home or office, what is the probability that he makes it through the work week without getting wet?
13 |
14 | ```{r}
15 | library(tidyverse)
16 |
17 | set.seed(2018)
18 |
19 | simulations <- crossing(trial = 1:1e5,
20 | weekday = 1:5,
21 | commute = c("Morning", "Evening")) %>%
22 | arrange(trial, weekday, desc(commute)) %>%
23 | mutate(rain = rbinom(n(), 1, ifelse(commute == "Morning", .5, .4)),
24 | home_change = case_when(
25 | commute == "Morning" & rain ~ -1,
26 | commute == "Evening" & rain ~ 1,
27 | TRUE ~ 0),
28 | office_change = -home_change) %>%
29 | group_by(trial) %>%
30 | mutate(home = 2 + cumsum(home_change),
31 | office = 1 + cumsum(office_change))
32 |
33 | simulations %>%
34 | summarize(dry = !any(home < 0 | office < 0)) %>%
35 | summarize(dry = mean(dry))
36 |
37 | days <- c("Mon", "Tue", "Wed", "Thu", "Fri")
38 |
39 | simulations %>%
40 | ungroup() %>%
41 | filter(home < 0 | office < 0) %>%
42 | distinct(trial, .keep_all = TRUE) %>%
43 | count(weekday, commute, sort = TRUE) %>%
44 | mutate(weekday = factor(days[weekday], levels = days),
45 | commute = fct_relevel(commute, "Morning")) %>%
46 | ggplot(aes(weekday, n / 1e5, fill = commute)) +
47 | geom_col(position = "dodge") +
48 | scale_y_continuous(labels = scales::percent_format()) +
49 | labs(title = "When does Louie first get wet?",
50 | y = "Probability overall")
51 | ```
52 |
53 | Answer to the riddle: ~69.3%.
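
As a check on the simulation (not part of the original solution), the week is small enough to enumerate exactly: there are only $2^{10} = 1024$ rain patterns over five mornings and five evenings, each with a known probability.

```{r}
# Exact answer by enumerating all 2^10 rain patterns, weighting each by
# its probability (rain is 0.5 per morning, 0.4 per evening); this should
# agree with the simulated ~69.3%
rain_patterns <- expand.grid(rep(list(c(FALSE, TRUE)), 10))

rain_patterns %>%
  mutate(pattern = row_number()) %>%
  gather(slot, rain, -pattern) %>%
  arrange(pattern) %>%
  group_by(pattern) %>%
  mutate(commute = rep(c("Morning", "Evening"), 5),
         p = case_when(commute == "Morning" ~ .5,
                       rain ~ .4,
                       TRUE ~ .6),
         home_change = case_when(commute == "Morning" & rain ~ -1,
                                 commute == "Evening" & rain ~ 1,
                                 TRUE ~ 0),
         home = 2 + cumsum(home_change),
         office = 1 + cumsum(-home_change)) %>%
  summarize(weight = prod(p),
            dry = !any(home < 0 | office < 0)) %>%
  summarize(prob_dry = sum(weight[dry]))
```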
54 |
55 |
--------------------------------------------------------------------------------
/uncanny-xmen.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Uncanny X-Men: Claremont Run"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidytuesdayR)
12 | library(tidyverse)
13 | theme_set(theme_light())
14 |
15 | xmen <- tidytuesdayR::tt_load('2020-06-30')
16 | characters <- tt_load(2018, week = 9)$week9_comic_characters
17 | ```
18 |
19 | ```{r}
20 | character_visualization <- xmen$character_visualization %>%
21 | separate(character, c("superhero", "secret_identity"), sep = " = ", fill = "right")
22 | ```
23 |
24 | ```{r}
25 | character_visualization %>%
26 | group_by(superhero)
27 | ```
28 |
29 | ```{r}
30 | by_character %>%   # by_character is created in a later chunk; run that chunk first
31 | left_join(characters, by = c(superhero = "name")) %>%
32 | View()
33 |
34 | characters %>%
35 | filter(str_detect(name, "Angel")) %>%
36 | distinct(name)
37 | ```
38 |
39 |
40 | ```{r}
41 | library(ggrepel)
42 |
43 | by_character <- character_visualization %>%
44 | group_by(superhero) %>%
45 | summarize(across(speech:depicted,
46 | list(total = sum,
47 | issues = ~ sum(. > 0),
48 | avg = ~ mean(.[depicted > 0]))))
49 |
50 | by_character %>%
51 | arrange(desc(speech_issues)) %>%
52 | filter(speech_issues > 0) %>%
53 | mutate(superhero = fct_reorder(superhero, speech_issues)) %>%
54 | ggplot(aes(speech_issues, superhero)) +
55 | geom_col() +
56 | labs(title = "Which X-Men speak in the most issues?",
57 | x = "# of issues",
58 | y = "")
59 |
60 | by_character %>%
61 | ggplot(aes(depicted_issues, speech_avg)) +
62 | geom_point() +
63 | geom_text_repel(aes(label = superhero)) +
64 | labs(x = "# of issues in which character appears",
65 | y = "Average lines per issue depicted") +
66 | expand_limits(y = 0)
67 |
68 | by_character %>%
69 | ggplot(aes(depicted_issues, thought_avg)) +
70 | geom_point() +
71 | geom_text_repel(aes(label = superhero)) +
72 | scale_x_log10() +
73 | labs(x = "# of issues in which character appears",
74 | y = "Average thoughts per issue") +
75 | expand_limits(y = 0)
76 |
77 | by_character %>%
78 | ggplot(aes(depicted_issues, speech_avg / thought_avg)) +
79 | geom_point() +
80 | geom_text_repel(aes(label = superhero)) +
81 | expand_limits(y = 1) +
82 | scale_y_log10(breaks = c(1, 3, 10, 30),
83 | labels = c("Same", "3X", "10X", "30X")) +
84 | labs(x = "# of issues depicted",
85 | y = "Speech to thought ratio (log scale)",
86 | title = "Which X-Men think, and which speak?")
87 | ```
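
The `across()` call above applies a named list of functions to every column from `speech` through `depicted`, producing suffixed columns like `speech_total`, `speech_issues`, and `speech_avg`. A minimal illustration on toy data:

```{r}
# Named-list across(): one output column per variable/function pair
tibble(g = c("a", "a", "b"), speech = c(0, 3, 5)) %>%
  group_by(g) %>%
  summarize(across(speech, list(total = sum, issues = ~ sum(. > 0))))
```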
88 |
89 | ```{r}
90 | by_character_costume <- character_visualization %>%
91 | group_by(superhero, costume) %>%
92 | summarize(across(speech:depicted,
93 | list(total = sum,
94 | issues = ~ sum(. > 0),
95 | avg = ~ mean(.[depicted > 0])))) %>%
96 | ungroup() %>%
97 | mutate(speech_thought_ratio = speech_avg / thought_avg)
98 |
99 | by_character_costume %>%
100 | mutate(superhero = fct_reorder(superhero, speech_total)) %>%
101 | filter(speech_avg > 0) %>%
102 | ggplot(aes(speech_total,
103 | superhero,
104 | color = costume,
105 | size = depicted_issues)) +
106 | geom_point() +
107 | labs(x = "# of lines (in/out of costume)")
108 |
109 | costume_ratios <- by_character_costume %>%
110 | filter(speech_avg > 0) %>%
111 | group_by(superhero) %>%
112 | summarize(costume_ratio = speech_total[1] / speech_total[2]) %>%
113 | inner_join(by_character, by = "superhero")
114 |
115 | costume_ratios %>%
116 | ggplot(aes(speech_total,
117 | costume_ratio)) +
118 | geom_point() +
119 | geom_text_repel(aes(label = superhero)) +
120 | scale_y_log10() +
121 | labs(x = "Total # of lines",
122 | y = "Lines in costume / lines out of costume")
123 |
124 | costume_ratios %>%
125 | mutate(superhero = fct_reorder(superhero, costume_ratio)) %>%
126 | ggplot(aes(costume_ratio, y = superhero)) +
127 | geom_errorbarh(aes(xmin = 1, xmax = costume_ratio), height = 0) +
128 | geom_point(aes(size = speech_total, color = costume_ratio > 1)) +
129 | scale_x_log10() +
130 | scale_color_discrete(guide = FALSE) +
131 | labs(size ="# of lines",
132 | x = "Lines in costume / lines out of costume",
133 | y = "",
134 | title = "Which X-Men tend to speak in costume/out of costume?")
135 | ```
136 |
137 | ```{r}
138 | xmen$locations %>%
139 | group_by(location) %>%
140 | summarize(total = n(),
141 | issues = n_distinct(issue)) %>%
142 | arrange(desc(issues))
143 | ```
144 |
145 | ```{r}
146 | xmen$comic_bechdel %>%
147 | bind_rows(xmen$xmen_bechdel %>% mutate(writer = "Chris Claremont")) %>%
148 | filter(!is.na(pass_bechdel),
149 | !is.na(writer)) %>%
150 | group_by(writer = fct_lump(writer, 5)) %>%
151 | summarize(n_issues = n(),
152 | pct_bechdel = mean(pass_bechdel == "yes")) %>%
153 | arrange(desc(n_issues))
154 | ```
155 |
156 | ```{r}
157 | xmen$xmen_bechdel %>%
158 | filter(!is.na(pass_bechdel)) %>%
159 | group_by(issue_group = 20 * (issue %/% 20)) %>%
160 | summarize(pct_bechdel = mean(pass_bechdel == "yes"),
161 | n = n()) %>%
162 | ggplot(aes(issue_group, pct_bechdel)) +
163 | geom_line() +
164 | scale_y_continuous(labels = scales::percent)
165 | ```
166 |
167 | ```{r}
168 | by_issue_character <- character_visualization %>%
169 | group_by(issue_group = 20 * (issue %/% 20), superhero) %>%
170 | summarize(across(speech:depicted, sum)) %>%
171 | ungroup()
172 |
173 | by_issue_character %>%
174 | filter(fct_lump(superhero, 9, w = speech) != "Other") %>%
175 | mutate(superhero = fct_reorder(superhero, -speech, sum)) %>%
176 | ggplot(aes(issue_group, speech)) +
177 | geom_col() +
178 | facet_wrap(~ superhero)
179 | ```
180 |
181 |
182 |
183 |
--------------------------------------------------------------------------------
/us-dairy.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | milk_products_facts <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-29/milk_products_facts.csv")
15 | ```
16 |
17 | ```{r}
18 | milk_products_tidied <- milk_products_facts %>%
19 | gather(product, lbs_per_person, -year) %>%
20 | separate(product, c("category", "product"), sep = "_",
21 | extra = "merge", fill = "right") %>%
22 | mutate(product = coalesce(product, category),
23 | product = str_to_title(str_replace_all(product, "_", " ")),
24 | category = str_to_title(category),
25 | product = ifelse(product == "Other", paste(product, category), product))
26 |
27 | milk_products_tidied %>%
28 | group_by(category, year) %>%
29 | summarize(lbs_per_person = sum(lbs_per_person)) %>%
30 | ggplot(aes(year, lbs_per_person, color = category)) +
31 | geom_line() +
32 | labs(title = "Dairy consumption by category",
33 | subtitle = "Based on US consumption (source: USDA)",
34 | x = "Year",
35 | y = "Lbs per person")
36 |
37 | milk_products_tidied %>%
38 | group_by(product = fct_lump(product, 6, w = lbs_per_person),
39 | year) %>%
40 | summarize(lbs_per_person = sum(lbs_per_person)) %>%
41 | ggplot(aes(year, lbs_per_person, color = product)) +
42 | geom_line()
43 |
44 | milk_products_tidied %>%
45 | ggplot(aes(year, lbs_per_person)) +
46 | geom_line() +
47 | facet_wrap(~ product, scales = "free") +
48 | expand_limits(y = 0)
49 |
50 | milk_products_tidied %>%
51 | filter(year == max(year)) %>%
52 | mutate(product = fct_reorder(product, lbs_per_person, sum)) %>%
53 | ggplot(aes(product, lbs_per_person, fill = category)) +
54 | geom_col() +
55 | coord_flip() +
56 | labs(x = "",
57 | y = "Pounds consumed per US person in 2017")
58 | ```
59 |
60 | ```{r}
61 | library(sweep)
62 | library(timetk)
63 | library(lubridate)
64 | library(forecast)   # provides ets() and auto.arima()
64 |
65 | milk_product_ts <- milk_products_tidied %>%
66 | mutate(year = as.Date("0001-01-01") + years(year - 1)) %>%
67 | nest(-category, -product) %>%
68 | mutate(ts = map(data, tk_ts, start = 1975, freq = 1))
69 |
70 | milk_product_ets <- milk_product_ts %>%
71 | mutate(model = map(ts, ets))
72 |
73 | milk_product_ets %>%
74 | unnest(map(model, sw_glance))
75 |
76 | milk_product_ts %>%
77 | crossing(model_name = c("auto.arima", "ets")) %>%
78 | mutate(model = map2(model_name, ts, ~ invoke(.x, list(.y))),
79 | forecast = map(model, forecast, h = 10)) %>%
80 | unnest(map(forecast, sw_sweep)) %>%
81 | ggplot(aes(index, lbs_per_person, color = model_name, lty = key)) +
82 | geom_line() +
83 | geom_ribbon(aes(ymin = lo.80, ymax = hi.80), alpha = .5) +
84 | facet_wrap(~ product, scales = "free_y") +
85 | expand_limits(y = 0) +
86 | scale_x_continuous(breaks = c(1980, 2000, 2020)) +
87 | scale_linetype_discrete(guide = FALSE) +
88 | labs(x = "Year",
89 | y = "Average US consumption (lbs per person)",
90 | title = "Forecasted consumption of dairy products",
91 | subtitle = "Based on USDA data 1975-2017. Showing 80% prediction intervals.",
92 | color = "Model")
93 | ```
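
The `invoke(.x, list(.y))` idiom calls a model function given as a string, which is what lets `"ets"` and `"auto.arima"` live in a column. A minimal illustration (newer purrr spells this `exec()`):

```{r}
# invoke("sum", list(1:10)) is sum(1:10); exec() is the modern equivalent
invoke("sum", list(1:10))
exec("sum", 1:10)
```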
94 |
95 | ```{r}
96 | cheese <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-01-29/clean_cheese.csv")
97 |
98 | cheese_tidied <- cheese %>%
99 | gather(type, lbs_per_person, -Year) %>%
100 | rename(year = Year) %>%
101 | mutate(type = str_to_title(type),
102 | type = fct_recode(type, "Total American Cheese" = "Total American Chese"))
103 |
104 | cheese_tidied %>%
105 | ggplot(aes(year, lbs_per_person)) +
106 | geom_line() +
107 | facet_wrap(~ type, scales = "free_y") +
108 | expand_limits(y = 0)
109 |
110 | cheese_ts <- cheese_tidied %>%
111 | mutate(year = as.Date("0001-01-01") + years(year - 1)) %>%
112 | nest(-type) %>%
113 | mutate(ts = map(data, tk_ts, start = 1970, freq = 1))
114 |
115 | cheese_ts %>%
116 | crossing(model_name = c("auto.arima", "ets")) %>%
117 | mutate(model = map2(model_name, ts, ~ invoke(.x, list(.y))),
118 | forecast = map(model, forecast, h = 10)) %>%
119 | unnest(map(forecast, sw_sweep)) %>%
120 | ggplot(aes(index, lbs_per_person, color = model_name, lty = key)) +
121 | geom_line() +
122 | geom_ribbon(aes(ymin = lo.80, ymax = hi.80), alpha = .5) +
123 | facet_wrap(~ type, scales = "free_y") +
124 | expand_limits(y = 0) +
125 | scale_x_continuous(breaks = c(1980, 2000, 2020)) +
126 | scale_linetype_discrete(guide = FALSE) +
127 | labs(x = "Year",
128 | y = "Average US consumption (lbs per person)",
129 | title = "Forecasted consumption of dairy products",
130 | subtitle = "Based on USDA data 1975-2017. Showing 80% prediction intervals.",
131 | color = "Model")
132 | ```
133 |
134 |
135 |
--------------------------------------------------------------------------------
/us-wind.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Untitled"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 |
13 | us_wind <- read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2018-11-06/us_wind.csv")
14 | ```
15 |
16 | Look at the continental US (48 states, District of Columbia)
17 |
18 | ```{r}
19 | us_wind_processed <- us_wind %>%
20 | filter(!t_state %in% c("AK", "HI", "GU", "PR")) %>%
21 | mutate(t_cap = ifelse(t_cap < 0, NA, t_cap)) %>%
22 | mutate_if(is.numeric, ~ ifelse(. == -9999, NA, .))
23 |
24 | wind_projects <- us_wind_processed %>%
25 | group_by(p_name, t_state) %>%
26 | summarize(year = min(p_year, na.rm = TRUE),
27 | turbines = n(),
28 | total_capacity = sum(t_cap),
29 | lon = mean(xlong),
30 | lat = mean(ylat),
31 | lon_sd = sd(xlong),
32 | lat_sd = sd(ylat)) %>%
33 | ungroup()
34 | ```
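
The `-9999` values are this dataset's missing-data sentinel. For reference, the same cleanup in current dplyr (>= 1.0) can be spelled more idiomatically with `na_if()` and `across()`:

```{r}
# Equivalent sentinel handling with na_if()
us_wind %>%
  mutate(across(where(is.numeric), ~ na_if(., -9999)))
```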
35 |
36 | How does turbine capacity relate to turbine size, and how has it changed over time?
37 |
38 | ```{r}
39 | turbine_models <- us_wind_processed %>%
40 | group_by(t_model) %>%
41 | summarize(t_cap = median(t_cap),
42 | t_hh = median(t_hh),
43 | t_rd = median(t_rd),
44 | t_rsa = median(t_rsa),
45 | t_ttlh = median(t_ttlh),
46 | turbines = n(),
47 | projects = n_distinct(p_name)) %>%
48 | arrange(desc(projects))
49 |
50 | turbine_models %>%
51 | ggplot(aes(t_ttlh, t_cap)) +
52 | geom_point() +
53 | labs(title = "When it comes to turbines, bigger is better!",
54 | x = "Turbine total height (meters)",
55 | y = "Turbine capacity (kW)")
56 |
57 | turbine_models %>%
58 | ggplot(aes(t_rsa, t_cap)) +
59 | geom_point() +
60 | labs(title = "When it comes to turbines, bigger is better!",
61 | x = "Turbine rotor swept area (meters ^ 2)",
62 | y = "Turbine capacity (kW)")
63 |
64 | ```
65 |
66 |
67 | ```{r}
68 | wind_projects %>%
69 | ggplot(aes(year, total_capacity)) +
70 | geom_point()
71 |
72 | wind_projects %>%
73 | ggplot(aes(year, total_capacity / turbines)) +
74 | geom_point()
75 | ```
76 |
77 | ```{r}
78 | wind_projects %>%
79 | ggplot(aes(lon, lat, size = turbines, color = year)) +
80 | borders("state") +
81 | geom_point() +
82 | coord_map() +
83 | theme_void()
84 | ```
85 |
86 | ### Animation
87 |
88 | ```{r}
89 | library(gganimate)
90 |
91 | ggplot(mtcars, aes(factor(cyl), mpg)) +
92 | geom_boxplot() +
93 | # Here comes the gganimate code
94 | transition_states(
95 | gear,
96 | transition_length = 2,
97 | state_length = 1
98 | ) +
99 | enter_fade() +
100 | exit_shrink() +
101 | ease_aes('sine-in-out')
102 | ```
103 |
104 | ```{r}
105 | wind_projects
106 |
107 | p <- wind_projects %>%   # assign so the animation can be printed and saved below
108 | filter(!is.na(year), !is.infinite(year)) %>%
109 | crossing(time = 1981:2018) %>%
110 | filter(year <= time) %>%
111 | ggplot(aes(lon, lat, size = turbines, color = year)) +
112 | borders("state") +
113 | geom_point() +
114 | transition_manual(time) +
115 | scale_color_continuous(guide = FALSE) +
116 | labs(title = "Locations of wind turbine projects in continental US (1981-2018)") +
117 | coord_map() +
118 | theme_void()
119 |
120 | p
121 | anim_save("turbines.gif")
122 | ```
123 |
124 |
125 |
126 | ```{r}
127 | us_wind_processed
128 | ```
129 |
130 |
131 |
--------------------------------------------------------------------------------
/us_phds.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "US PhDs"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | theme_set(theme_light())
13 |
14 | # Major field of study
15 | major_fields_raw <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab012.xlsx",
16 | skip = 3)
17 |
18 | major_fields <- major_fields_raw %>%
19 | rename(field = `Field of study`) %>%
20 | gather(key, value, -field) %>%
21 | mutate(year = as.numeric(ifelse(str_detect(key, "X__"), NA, key)),
22 | type = ifelse(!str_detect(value, "Number|Percent"), NA, value),
23 | value = as.numeric(value)) %>%
24 | fill(year, type) %>%
25 | select(-key) %>%
26 | filter(!is.na(value)) %>%
27 | spread(type, value)
28 | ```
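
The key step above is `fill()`, which carries the last non-missing value downward; that's how the year and the Number/Percent header rows get attached to every observation. In miniature:

```{r}
# tidyr::fill() propagates the last non-NA value down the column
tibble(year = c(2015, NA, NA, 2016, NA), value = 1:5) %>%
  fill(year)
```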
29 |
30 | ```{r}
31 | fine_fields <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab013.xlsx",
32 | skip = 3) %>%
33 | rename(field = 1) %>%
34 | gather(year, number, -field) %>%
35 | mutate(year = as.numeric(year),
36 | number = as.numeric(number)) %>%
37 | filter(!is.na(number))
38 |
39 | fine_fields %>%
40 | filter(field %in% sample(unique(field), 6)) %>%
41 | ggplot(aes(year, number, color = field)) +
42 | geom_line()
43 | ```
44 |
45 | ```{r}
46 | # get the broad field names and the major field names
47 |
48 | sex <- c("All", "Male", "Female", "All doctorate recipientsa", "All fieldsa")
49 |
50 | broad_fields <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab014.xlsx", skip = 4) %>%
51 | rename(field = 1) %>%
52 | filter(!field %in% sex) %>%
53 | mutate(field = fct_recode(field,
54 | "Life sciences" = "Life sciencesb",
55 | "Other" = "Otherc")) %>%
56 | pull(field) %>%
57 | as.character()
58 | ```
59 |
60 |
61 | ```{r}
62 | recipients_year_field_sex <- readxl::read_xlsx("~/Downloads/data_tables/sed17-sr-tab015.xlsx", skip = 3) %>%
63 | rename(field = 1) %>%
64 | select(-contains("change")) %>%
65 | mutate(field = as.character(fct_recode(field, "All" = "All doctorate recipientsa",
66 | "Other" = "Otherb")),
67 | sex = if_else(field %in% sex, field, NA_character_),
68 | broad_field = ifelse(field %in% broad_fields, field, NA)) %>%
69 | fill(sex, broad_field) %>%
70 | gather(year, number, -sex, -broad_field, -field) %>%
71 | mutate(year = as.numeric(year),
72 | number = as.numeric(number)) %>%
73 | filter(!field %in% sex) %>%
74 | filter(!is.na(number))
75 |
76 | recipients_year_field_sex %>%
77 | filter(sex != "All",
78 | broad_field == "Mathematics and computer sciences") %>%
79 | ggplot(aes(year, number, color = sex)) +
80 | geom_line() +
81 | expand_limits(y = 0) +
82 | facet_wrap(~ field)
83 |
84 | recipients_year_field_sex %>%
85 | spread(sex, number) %>%
86 | mutate(pct_male = Male / All) %>%
87 | filter(broad_field == "Engineering") %>%
88 | mutate(field = fct_reorder(field, -pct_male)) %>%
89 | ggplot(aes(year, pct_male, color = field)) +
90 | geom_line() +
91 | scale_y_continuous(labels = scales::percent_format()) +
92 | labs(x = "Year",
93 | y = "% of PhD recipients reporting as male",
94 | color = "Major field",
95 | title = "Breakdown by sex over time within Engineering major fields")
96 |
97 | recipients_year_field_sex %>%
98 | spread(sex, number) %>%
99 | mutate(pct_male = Male / All) %>%
100 | filter(broad_field == "Humanities and arts") %>%
101 | mutate(field = fct_reorder(field, -pct_male)) %>%
102 | ggplot(aes(year, pct_male, color = field)) +
103 | geom_line() +
104 | scale_y_continuous(labels = scales::percent_format()) +
105 | labs(x = "Year",
106 | y = "% of PhD recipients reporting as male",
107 | color = "Major field",
108 | title = "Breakdown by sex over time within Humanities & Arts major fields")
109 |
110 | recipients_year_field_sex %>%
111 | spread(sex, number) %>%
112 | mutate(pct_male = Male / All) %>%
113 | filter(broad_field == "Education") %>%
114 | mutate(field = fct_reorder(field, -pct_male)) %>%
115 | ggplot(aes(year, pct_male, color = field)) +
116 | geom_line() +
117 | scale_y_continuous(labels = scales::percent_format()) +
118 | labs(x = "Year",
119 | y = "% of PhD recipients reporting as male",
120 | color = "Major field",
121 | title = "Breakdown by sex over time within Education major fields")
122 | ```
123 |
124 | Three levels:
125 |
126 | * Broad field (Life sciences)
127 | * Major field (Biological and biomedical sciences)
128 | * Subfield (Computational biology)
129 |
130 |
131 |
--------------------------------------------------------------------------------
/volcano-eruptions.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Volcanoes"
3 | output: html_document
4 | ---
5 |
6 | ```{r}
7 | library(tidyverse)
8 | theme_set(theme_light())
9 |
10 | tuesdata <- tidytuesdayR::tt_load('2020-05-12')
11 | ```
12 |
13 | ```{r}
14 | volcano <- tuesdata$volcano %>%
15 | mutate(last_eruption_year = as.numeric(last_eruption_year))
16 |
17 | eruptions <- tuesdata$eruptions
18 | ```
19 |
20 | ```{r}
21 | volcano %>%
22 | count(evidence_category, sort = TRUE)
23 |
24 | volcano %>%
25 | mutate(years_ago = 2020 - last_eruption_year) %>%
26 | ggplot(aes(years_ago + 1, fill = evidence_category)) +
27 | geom_histogram() +
28 | scale_x_log10()
29 | ```
30 |
31 | ```{r}
32 | volcano %>%
33 | count(region, sort = TRUE)
34 |
35 | volcano %>%
36 | count(primary_volcano_type, sort = TRUE)
37 | ```
38 |
39 | ```{r}
40 | library(ggthemes)
41 |
42 | volcano %>%
43 | mutate(primary_volcano_type = str_remove(primary_volcano_type, "\\(.*\\)"),
44 | primary_volcano_type = fct_lump(primary_volcano_type, 6)) %>%
45 | ggplot(aes(longitude, latitude)) +
46 | borders() +
47 | geom_point(aes(color = primary_volcano_type), size = .5) +
48 | theme_map() +
49 | labs(title = "Volcanoes of the World",
50 | color = "Type")
51 | ```
52 |
53 | ```{r}
54 | library(leaflet)
55 | library(glue)
56 | library(htmlwidgets)
57 | library(DT)
58 |
59 | template <- "<>{ volcano_name }
{ primary_volcano_type }
"
60 |
61 | volcano %>%
62 | mutate(transformed_pop = log2(population_within_5_km + 1),
63 | pop_color = colorNumeric(c("blue", "red"), transformed_pop)(transformed_pop)) %>%
64 | gather(key, value,
65 | volcano_name, primary_volcano_type, last_eruption_year,
66 | country,
67 | tectonic_settings,
68 | population_within_5_km) %>%
69 | mutate(key = str_to_title(str_replace_all(key, "_", " ")),
70 | key = paste0("", key, "")) %>%
71 | replace_na(list(value = "Unknown")) %>%
72 | nest(data = c(key, value)) %>%
73 | mutate(html = map(data,
74 | knitr::kable,
75 | format = "html",
76 | escape = FALSE,
77 | col.names = c("", ""))) %>%
78 | leaflet() %>%
79 | addTiles() %>%
80 | addCircleMarkers(lat = ~ latitude,
81 | lng = ~ longitude,
82 | color = ~ pop_color,
83 | popup = ~ html,
84 | radius = 1) %>%
85 | addMeasure()
86 | ```
87 |
88 | ```{r}
89 | volcano %>%
90 | ggplot(aes(population_within_5_km)) +
91 | geom_histogram() +
92 | scale_x_log10()
93 | ```
94 |
95 | ```{r}
96 | library(gganimate)
97 |
98 | eruptions %>%
99 | ggplot(aes(2020 - start_year)) +
100 | geom_histogram() +
101 | scale_x_log10()
102 |
103 | eruptions %>%
104 | filter(start_year >= 1900,
105 | eruption_category != "Discredited Eruption") %>%
106 | ggplot(aes(2020 - start_year)) +
107 | geom_histogram() +
108 | scale_x_log10()
109 | ```
110 |
111 | ```{r}
112 | library(gganimate)
113 |
114 | anim_graph <- eruptions %>%
115 | filter(start_year >= 1900) %>%
116 | mutate(volume = .00001 * 10 ^ vei) %>%
117 | ggplot(aes(longitude, latitude)) +
118 | borders() +
119 | geom_point(aes(size = volume,
120 | color = volume)) +
121 | theme_map() +
122 | scale_color_gradient2(low = "blue", high = "red",
123 | midpoint = log10(.01),
124 | trans = "log10",
125 | guide = FALSE) +
126 | scale_size_continuous(range = c(1, 6)) +
127 | # transition_reveal(start_year) +
128 | transition_time(start_year) +
129 | labs(title = "Eruptions: Year { round(frame_time) }") +
130 | theme(legend.position = "none")
131 |
132 | animate(anim_graph, 150, fps = 4)
133 |
134 | anim_save("eruptions.gif")
135 | ```
136 |
137 |
138 |
139 |
140 |
141 | ```{r}
142 | eruptions
143 | ```
144 |
145 |
--------------------------------------------------------------------------------
/women-workplace-app/app.R:
--------------------------------------------------------------------------------
1 | library(shiny)
2 | library(ggplot2)
3 | library(plotly)
4 | library(dplyr)
5 | library(scales)
6 | theme_set(theme_light())
7 |
8 | jobs_gender <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/jobs_gender.csv")
9 |
10 | # Define UI for the occupation and gender explorer
11 | ui <- fluidPage(
12 |
13 | # Application title
14 | titlePanel("Occupation and gender explorer"),
15 |
16 | # Sidebar with a dropdown for the occupation category
17 | sidebarLayout(
18 | sidebarPanel(
19 | selectInput("major_category",
20 | "Occupation category:",
21 | choices = unique(jobs_gender$major_category))
22 | ),
23 |
24 | # Show an interactive scatterplot of occupations
25 | mainPanel(
26 | plotlyOutput("jobs_scatter", height = "700px")
27 | )
28 | )
29 | )
30 |
31 | # Define server logic for the interactive scatterplot
32 | server <- function(input, output) {
33 |
34 | output$jobs_scatter <- renderPlotly({
35 | p <- jobs_gender %>%
36 | filter(year == 2016,
37 | total_workers >= 20000) %>%
38 | filter(major_category == input$major_category) %>%
39 | arrange(desc(wage_percent_of_male)) %>%
40 | mutate(percent_female = workers_female / total_workers,
41 | wage_percent_female = total_earnings_female / total_earnings_male) %>%
42 | ggplot(aes(percent_female,
43 | wage_percent_female,
44 | color = minor_category,
45 | size = total_workers,
46 | label = occupation)) +
47 | geom_point() +
48 | scale_size_continuous(range = c(1, 10), guide = FALSE) +
49 | labs(x = "% of workforce reported as female",
50 | y = "% of median female salary / median male",
51 | title = "Gender disparity and pay gap in 2016",
52 | subtitle = "Only occupations with at least 20,000 workers total",
53 | color = "Minor category") +
54 | scale_x_continuous(labels = percent_format()) +
55 | scale_y_continuous(labels = percent_format())
56 |
57 | ggplotly(p)
58 | })
59 | }
60 |
61 | # Run the application
62 | shinyApp(ui = ui, server = server)
63 |
64 |
--------------------------------------------------------------------------------
/women-workplace.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Women in the Workplace"
3 | output: html_document
4 | ---
5 |
6 | ```{r setup, include=FALSE}
7 | knitr::opts_chunk$set(echo = TRUE)
8 | ```
9 |
10 | ```{r}
11 | library(tidyverse)
12 | library(scales)
13 | theme_set(theme_light())
14 |
15 | jobs_gender <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/jobs_gender.csv")
16 | earnings_female <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/earnings_female.csv")
17 | employed_gender <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/employed_gender.csv")
18 | ```
19 |
20 | ```{r}
21 | summarize_jobs_gender <- function(tbl) {
22 | tbl %>%
23 | summarize(total_earnings = sum(total_earnings * total_workers) / sum(total_workers),
24 | total_earnings_male = sum(total_earnings_male * workers_male, na.rm = TRUE) /
25 | sum(workers_male[!is.na(total_earnings_male)]),
26 | total_earnings_female = sum(total_earnings_female * workers_female, na.rm = TRUE) /
27 | sum(workers_female[!is.na(total_earnings_female)]),
28 | total_workers = sum(total_workers),
29 | workers_male = sum(workers_male),
30 | workers_female = sum(workers_female)) %>%
31 | mutate(wage_percent_of_male = total_earnings_female / total_earnings_male)
32 | }
33 |
34 | by_year_major_category <- jobs_gender %>%
35 | filter(complete.cases(.)) %>%
36 | group_by(year, major_category) %>%
37 | summarize_jobs_gender()
38 |
39 | by_year_major_category %>%
40 | mutate(major_category = fct_reorder(major_category, -total_earnings)) %>%
41 | ggplot(aes(year, total_earnings, color = major_category)) +
42 | geom_line() +
43 | expand_limits(y = 0)
44 |
45 | by_year_major_category %>%
46 | mutate(major_category = fct_reorder(major_category, -wage_percent_of_male)) %>%
47 | ggplot(aes(year, wage_percent_of_male, color = major_category)) +
48 | geom_line()
49 | ```
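
The `sum(x * w) / sum(w)` pattern in `summarize_jobs_gender()` is a worker-weighted mean, so large occupations dominate each category's average. The difference from an unweighted mean, in miniature:

```{r}
# Weighted vs unweighted mean of two occupations' earnings
earnings <- c(50000, 100000)
workers <- c(900, 100)
sum(earnings * workers) / sum(workers)   # 55000 (weighted)
mean(earnings)                           # 75000 (unweighted)
```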
50 |
51 | ```{r}
52 | by_minor_category_2016 <- jobs_gender %>%
53 | filter(year == 2016) %>%
54 | group_by(major_category, minor_category) %>%
55 | summarize_jobs_gender() %>%
56 | ungroup()
57 | ```
58 |
59 | ```{r}
60 | by_minor_category_2016 %>%
61 | mutate(minor_category = fct_reorder(minor_category, wage_percent_of_male)) %>%
62 | ggplot(aes(minor_category, wage_percent_of_male, fill = major_category)) +
63 | geom_col() +
64 | coord_flip()
65 | ```
66 |
67 | ```{r}
68 | library(plotly)
69 |
70 | p <- jobs_gender %>%
71 | filter(year == 2016) %>%
72 | filter(major_category == "Healthcare Practitioners and Technical") %>%
73 | arrange(desc(wage_percent_of_male)) %>%
74 | ggplot(aes(workers_female / total_workers,
75 | total_earnings,
76 | size = total_workers,
77 | label = occupation)) +
78 | geom_point() +
79 | scale_size_continuous(range = c(1, 10)) +
80 | labs(size = "Total # of workers",
81 | x = "% of workforce reported as female",
82 | y = "Median salary in the occupation") +
83 | scale_x_continuous(labels = percent_format()) +
84 | scale_y_continuous(labels = dollar_format()) +
85 | expand_limits(y = 0)
86 |
87 | ggplotly(p)
88 | ```
89 |
90 | ```{r}
91 | p <- jobs_gender %>%
92 | filter(year == 2016,
93 | total_workers >= 20000) %>%
94 | filter(major_category == "Computer, Engineering, and Science") %>%
95 | arrange(desc(wage_percent_of_male)) %>%
96 | ggplot(aes(workers_female / total_workers,
97 | total_earnings_female / total_earnings_male,
98 | color = minor_category,
99 | size = total_workers,
100 | label = occupation)) +
101 | geom_point() +
102 | scale_size_continuous(range = c(1, 10)) +
103 | labs(size = "Total # of workers",
104 | x = "% of workforce reported as female",
105 | y = "% of median female salary / median male") +
106 | scale_x_continuous(labels = percent_format()) +
107 | scale_y_continuous(labels = percent_format())
108 |
109 | ggplotly(p)
110 | ```
111 |
112 |
--------------------------------------------------------------------------------