├── .gitignore
├── 2-imdb.Rproj
├── README.md
├── imdb-0.png
├── imdb-1.png
├── imdb-10.png
├── imdb-11.png
├── imdb-12.png
├── imdb-2.png
├── imdb-2a.png
├── imdb-2b.png
├── imdb-3.png
├── imdb-3b.png
├── imdb-4.png
├── imdb-5.png
├── imdb-6.png
├── imdb-7.png
├── imdb-8.png
├── imdb-9.png
├── imdb_analysis.Rmd
└── imdb_analysis_livestream.Rmd


/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | .Ruserdata
5 | 


--------------------------------------------------------------------------------
/2-imdb.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # imdb-data-analysis
 2 | ![](imdb-4.png)
 3 | 
 4 | R Code + R Notebook on how to process and visualize the official IMDb datasets.
 5 | 
 6 | This R Notebook is the complement to my blog post [Analyzing IMDb Data The Intended Way, with R and ggplot2](http://minimaxir.com/2018/07/imdb-data-analysis/).
 7 | 
 8 | ## Maintainer
 9 | Max Woolf ([@minimaxir](http://minimaxir.com))
10 | 
11 | *Max's open-source projects are supported by his [Patreon](https://www.patreon.com/minimaxir). If you found this project helpful, any monetary contributions to the Patreon are appreciated and will be put to good creative use.*
12 | 
13 | ## License
14 | MIT


--------------------------------------------------------------------------------
/imdb-0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-0.png


--------------------------------------------------------------------------------
/imdb-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-1.png


--------------------------------------------------------------------------------
/imdb-10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-10.png


--------------------------------------------------------------------------------
/imdb-11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-11.png


--------------------------------------------------------------------------------
/imdb-12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-12.png


--------------------------------------------------------------------------------
/imdb-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-2.png


--------------------------------------------------------------------------------
/imdb-2a.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-2a.png


--------------------------------------------------------------------------------
/imdb-2b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-2b.png


--------------------------------------------------------------------------------
/imdb-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-3.png


--------------------------------------------------------------------------------
/imdb-3b.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-3b.png


--------------------------------------------------------------------------------
/imdb-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-4.png


--------------------------------------------------------------------------------
/imdb-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-5.png


--------------------------------------------------------------------------------
/imdb-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-6.png


--------------------------------------------------------------------------------
/imdb-7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-7.png


--------------------------------------------------------------------------------
/imdb-8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-8.png


--------------------------------------------------------------------------------
/imdb-9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-9.png


--------------------------------------------------------------------------------
/imdb_analysis.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Analyzing IMDb Data The Intended Way, with R and ggplot2"
  3 | author: "Max Woolf (@minimaxir)"
  4 | date: "2018-07-15"
  5 | output:
  6 |   html_notebook:
  7 |     highlight: tango
  8 |     mathjax: null
  9 |     number_sections: yes
 10 |     theme: spacelab
 11 |     toc: True
 12 | ---
 13 | 
 14 | This R Notebook is the complement to my blog post [Analyzing IMDb Data The Intended Way, with R and ggplot2](http://minimaxir.com/2018/07/imdb-data-analysis/).
 15 | 
 16 | This notebook is licensed under the MIT License. If you use the code or data visualization designs contained within this notebook, it would be greatly appreciated if proper attribution is given back to this notebook and/or myself. Thanks! :)
 17 | 
 18 | IMDb data retrieved on July 4th 2018.
 19 | 
 20 | 
 21 | **Information courtesy of
 22 | IMDb
 23 | (http://www.imdb.com).
 24 | Used with permission.**
 25 | 
 26 | 
 27 | ```{r}
 28 | library(tidyverse)
 29 | library(ggridges)   # unused in final blog post
 30 | library(tidytext)   # unused in final blog post
 31 | library(scales)
 32 | 
 33 | sessionInfo()
 34 | ```
 35 | 
 36 | Helper function to read IMDB files given filename.
 37 | 
 38 | ```{r}
 39 | read_imdb <- function(data_path, path = "/Volumes/Extreme 510/Data/imdb/") {
 40 |   # `path` is prepended verbatim (keep the trailing slash); "\\N" is read as NA and quote handling is disabled.
 41 |   read_tsv(paste0(path, data_path), na = "\\N", quote = "", progress = FALSE)
 42 | }
 43 | ```
 44 | 
 45 | Helper function to pretty print the size of a dataframe for charts/notebook.
 46 | 
 47 | ```{r}
 48 | ppdf <- function(df) {
 49 |   comma(nrow(df))  # row count of `df`, formatted with comma() for inline text and chart subtitles
 50 | }
 51 | ```
 52 | 
 53 | 
 54 | # Ratings
 55 | 
 56 | ```{r}
 57 | df_ratings <- read_imdb("title.ratings.tsv")
 58 | df_ratings %>% head()
 59 | ```
 60 | 
 61 | There are **`r df_ratings %>% ppdf()`** ratings in the dataset.
 62 | 
 63 | Plot every point. (note: very slow!)
 64 | 
 65 | ```{r eval=FALSE}
 66 | plot <- ggplot(df_ratings, aes(x = numVotes, y = averageRating)) +
 67 |           geom_point()
 68 | 
 69 | ggsave("imdb-0.png", plot, width=4, height=3)
 70 | ```
 71 | 
 72 | ![](imdb-0.png)
 73 | 
 74 | Plot a 2D histogram and clean up axes.
 75 | 
 76 | ```{r}
 77 | plot <- ggplot(df_ratings, aes(x = numVotes, y = averageRating)) +
 78 |           geom_bin2d() +
 79 |           scale_x_log10(labels = comma) +
 80 |           scale_y_continuous(breaks = 1:10) +
 81 |           scale_fill_viridis_c(labels = comma)
 82 | 
 83 | ggsave("imdb-1.png", plot, width=4, height=3)
 84 | ```
 85 | 
 86 | ![](imdb-1.png)
 87 | 
 88 | 
 89 | 
 90 | # Title Basics
 91 | 
 92 | ```{r}
 93 | df_basics <- read_imdb("title.basics.tsv")
 94 | df_basics %>% head()
 95 | ```
 96 | 
 97 | There are **`r df_basics %>% ppdf()`** titles in the dataset.
 98 | 
 99 | Merge `df_ratings` and `df_basics` to perform ratings/vote analysis using more metadata.
100 | 
101 | ```{r}
102 | df_ratings <- df_ratings %>% left_join(df_basics)
103 | 
104 | df_ratings %>% head()
105 | ```
106 | 
107 | ```{r}
108 | plot <- ggplot(df_ratings, aes(x = runtimeMinutes, y = averageRating)) +
109 |           geom_bin2d() +
110 |           scale_x_continuous(labels = comma) +
111 |           scale_y_continuous(breaks = 1:10) +
112 |           scale_fill_viridis_c()
113 | 
114 | ggsave("imdb-2a.png", plot, width=4, height=3)
115 | ```
116 | 
117 | ![](imdb-2a.png)
118 | 
119 | Which movies have the superhigh runtimes?
120 | 
121 | ```{r}
122 | df_ratings %>% arrange(desc(runtimeMinutes)) %>% head(10)
123 | ```
124 | 
125 | 
126 | ```{r}
127 | plot <- ggplot(df_ratings %>% filter(runtimeMinutes < 180, titleType=="movie", numVotes >= 10), aes(x = runtimeMinutes, y = averageRating)) +
128 |           geom_bin2d() +
129 |           scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +
130 |           scale_y_continuous(breaks = 0:10) +
131 |           scale_fill_viridis_c(option = "inferno", labels = comma) +
132 |           theme_minimal(base_family = "Source Sans Pro", base_size=8) +
133 |           labs(title="Relationship between Movie Runtime and Average Movie Rating",
134 |                subtitle="Data from IMDb retrieved July 4th, 2018",
135 |                x="Runtime (Hours)",
136 |                y="Average User Rating",
137 |                caption="Max Woolf — minimaxir.com",
138 |                fill="# Movies")
139 | 
140 | ggsave("imdb-2b.png", plot, width=4, height=3)
141 | ```
142 | 
143 | ![](imdb-2b.png)
144 | ## Unnesting Genres
145 | 
146 | How to facet by genre; this was removed from the post since it is a little complicated.
147 | 
148 | NB: you cannot use default tokenization on `unnest_tokens` since some tokens have dashes. (e.g. `film-noir`)
149 | 
150 | ```{r}
151 | df_ratings_unnest <- df_ratings %>%
152 |                         filter(runtimeMinutes < 180, titleType=="movie", numVotes >= 10) %>%
153 |                         select(runtimeMinutes, averageRating, genres) %>%
154 |                         unnest_tokens(genre, genres, token = str_split, pattern = ",")
155 | 
156 | df_ratings_unnest %>% head(10)
157 | ```
158 | 
159 | ```{r}
160 | plot <- ggplot(df_ratings_unnest, aes(x = runtimeMinutes, y = averageRating)) +  # one 2D histogram per genre
161 |   geom_bin2d() +
162 |   scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +  # breaks are in minutes, labelled as hours
163 |   scale_y_continuous(breaks = 1:10) +
164 |   scale_fill_viridis_c(option = "inferno", labels = comma) +
165 |   theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
166 |   labs(title = "Relationship between Movie Runtime and Average Movie Rating",
167 |        subtitle = "Data from IMDb retrieved July 4th, 2018",
168 |        x = "Runtime (Hours)",
169 |        y = "Average User Rating",
170 |        caption = "Max Woolf — minimaxir.com",
171 |        fill = "# Movies") +
172 |   facet_wrap(~ genre)
173 | 
174 | ggsave("imdb-3.png", plot, width=6, height=6)
175 | ```
176 | 
177 | ![](imdb-3.png)
178 | 
179 | Normalize by facet. There are two approaches:
180 | 
181 | 1. Do a weighted sum of the points in the spatial area, where the weight is the reciprocal of the # of points in the facet.
182 | 2. Manually calculate bins/counts and scale to `[0, 1]`
183 | 
184 | Option 1 is a bit easier to implement. Additionally, remove facets with little data.
185 | 
186 | `squish` trick via https://stackoverflow.com/a/23655697/9314418
187 | 
188 | ```{r}
189 | df_temp <- df_ratings_unnest %>%
190 |             filter(!(genre %in% c("game-show", "reality-tv", "short", "talk-show", NA))) %>%
191 |             group_by(genre) %>%
192 |             mutate(prop = 1/n())
193 | 
194 | plot <- ggplot(df_temp, aes(x = runtimeMinutes, y = averageRating, z = prop)) +
195 |   stat_summary_2d(fun = sum) +  # summing prop (1/n() per genre) in a bin gives that bin's share of its genre
196 |   scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +  # breaks are in minutes, labelled as hours
197 |   scale_y_continuous(breaks = 1:10) +
198 |   scale_fill_viridis_c(option = "inferno", labels = comma, limits = c(0, 0.02), oob = squish, guide = "none") +  # squish clamps out-of-range shares to the limits
199 |   theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
200 |   labs(title = "Relationship between Movie Runtime and Average Movie Rating",
201 |        subtitle = "Data from IMDb retrieved July 4th, 2018",
202 |        x = "Runtime (Hours)",
203 |        y = "Average User Rating",
204 |        caption = "Max Woolf — minimaxir.com") +
205 |   facet_wrap(~ genre)
206 | 
207 | ggsave("imdb-3b.png", plot, width=6, height=6)
208 | ```
209 | 
210 | ![](imdb-3b.png)
211 | 
212 | # Rating vs. Movie Year
213 | 
214 | Set theme to custom theme based on `theme_minimal` for the rest of the notebook.
215 | 
216 | ```{r}
217 | theme_set(theme_minimal(base_size=9, base_family="Source Sans Pro") +
218 |             theme(plot.title = element_text(size=8, family="Source Sans Pro Bold", margin=margin(t = -0.1, b = 0.1, unit='cm')),
219 |                   axis.title.x = element_text(size=8),
220 |                   axis.title.y = element_text(size=8),
221 |                   plot.subtitle = element_text(family="Source Sans Pro Semibold", color="#969696", size=6),
222 |                   plot.caption = element_text(size=6, color="#969696"),
223 |                   legend.title = element_text(size=8),
224 |                   legend.key.width = unit(0.25, unit='cm')))
225 | ```
226 | 
227 | 
228 | ```{r}
229 | plot <- ggplot(df_ratings %>% filter(titleType=="movie", numVotes >= 10), aes(x = startYear, y = averageRating)) +
230 |           geom_bin2d() +
231 |           geom_smooth(color="black") +
232 |           scale_x_continuous() +
233 |           scale_y_continuous(breaks = 1:10) +
234 |           scale_fill_viridis_c(option = "plasma", labels = comma, trans='log10') +
235 |           labs(title="Relationship between Movie Release Year and Average Rating",
236 |                subtitle=sprintf("For %s Movies/Ratings. Data from IMDb retrieved 7/4/2018", df_ratings %>% filter(titleType=="movie", numVotes >= 10) %>% ppdf),
237 |                x="Year Movie was Released",
238 |                y="Average User Rating For Movie",
239 |                caption="Max Woolf — minimaxir.com",
240 |                fill="# Movies")
241 | 
242 | ggsave("imdb-4.png", plot, width=4, height=3)
243 | ```
244 | 
245 | ![](imdb-4.png)
246 | 
247 | Work with Ridge plots; not included in post because it doesn't offer much insight different from the chart above.
248 | 
249 | NB: For ridge plots, the y-axis must be a `factor`, not a `numeric`; this is what tripped me up in the stream.
250 | 
251 | ```{r}
252 | plot <- ggplot(df_ratings %>% filter(startYear >= 2000, titleType=="movie",  numVotes >= 10) %>% mutate(startYear = factor(startYear)), aes(x = averageRating, y = startYear, fill=startYear)) +
253 |           geom_density_ridges() +
254 |           scale_fill_hue(guide=F) +
255 |           scale_x_continuous(breaks = 1:10) +
256 |           theme_minimal(base_family = "Source Sans Pro", base_size=9)
257 | 
258 | ggsave("imdb-5.png", plot, width=4, height=3)
259 | ```
260 | 
261 | ![](imdb-5.png)
262 | 
263 | Bucket by decades.
264 | 
265 | ```{r}
266 | df_ratings_decades <- df_ratings %>%  # movies from 1950 on with >= 10 votes, tagged with a decade factor
267 |   filter(startYear >= 1950, titleType == "movie", numVotes >= 10) %>%
268 |   mutate(decade = fct_rev(factor(paste0(startYear %/% 10 * 10, "s"))))  # label derived from the year, so 1960 is "1960s" (right-closed bins put it in the 1950s)
269 | 
270 | df_ratings_decades %>% head()
271 | ```
272 | 
273 | ```{r}
274 | plot <- ggplot(df_ratings_decades, aes(x = averageRating, y = decade, fill=0.5 - abs(0.5-..ecdf..))) +
275 |           geom_density_ridges_gradient(calc_ecdf=T, quantile_lines=T) +
276 |           scale_fill_viridis_c(option = "plasma", guide=F) +
277 |           scale_x_continuous(breaks = 1:10) +
278 |           #scale_y_discrete(expand = c(0,01, 0)) +
279 |           theme_minimal(base_family = "Source Sans Pro", base_size=9)
280 | 
281 | ggsave("imdb-6.png", plot, width=4, height=3)
282 | ```
283 | 
284 | ![](imdb-6.png)
285 | 
286 | ```{r}
287 | plot <- ggplot(df_ratings_decades, aes(x = runtimeMinutes, y = decade, fill=0.5 - abs(0.5-..ecdf..))) +
288 |           geom_density_ridges_gradient(calc_ecdf=T, quantile_lines=T) +
289 |           scale_fill_viridis_c(option = "plasma", guide=F) +
290 |           scale_x_continuous(breaks = seq(0, 180, 60), limits=c(0,180), labels = 0:3) +
291 |           theme_minimal(base_family = "Source Sans Pro", base_size=9)
292 | 
293 | ggsave("imdb-7.png", plot, width=4, height=3)
294 | ```
295 | 
296 | ![](imdb-7.png)
297 | 
298 | ## Episode Analysis
299 | 
300 | For reference; not included in post. (Too much bad data to clean.)
301 | 
302 | ```{r}
303 | df_episode <- read_imdb("title.episode.tsv") %>% filter(!is.na(seasonNumber))
304 | df_episode %>% head()
305 | ```
306 | 
307 | There are `r df_episode %>% ppdf()` episodes in the dataset.
308 | 
309 | ```{r}
310 | df_episode_count <- df_episode %>%
311 |                 group_by(parentTconst, seasonNumber) %>%
312 |                 tally() %>%
313 |                 left_join(df_basics,  c("parentTconst" = "tconst"))
314 | 
315 | df_episode_count %>% head()
316 | ```
317 | 
318 | 
319 | # Actor Information
320 | 
321 | `str_detect` is vectorized and *much* faster than using a loop/`lapply`. Using a regular expression to search for actor *or* actress is another speed increase.
322 | 
323 | ```{r}
324 | df_actors <- read_imdb("name.basics.tsv") %>%
325 |                 filter(str_detect(primaryProfession, "actor|actress"))  %>%
326 |                 select(nconst, primaryName, birthYear)
327 | 
328 | df_actors %>% head()
329 | ```
330 | 
331 | There are **`r df_actors %>% ppdf()`** actors in the dataset.
332 | 
333 | ```{r}
334 | df_principals <- read_imdb("title.principals.tsv") %>%
335 |   filter(str_detect(category, "actor|actress")) %>%
336 |   select(tconst, ordering, nconst, category) %>%
337 |   group_by(tconst) %>%
338 |   filter(ordering == min(ordering)) %>% ungroup()  # keep the lowest-`ordering` actor/actress row per title, then drop the grouping
339 | 
340 | df_principals %>% head()
341 | ```
342 | 
343 | There are **`r df_principals %>% ppdf()`** principals/rows in the dataset.
344 | 
345 | Join the 2 dataframes.  (onto `principals`, since Many-to-One)
346 | 
347 | ```{r}
348 | df_principals <- df_principals %>% left_join(df_actors)
349 | 
350 | df_principals %>% head()
351 | ```
352 | 
353 | # Putting It All Together
354 | 
355 | Merge actor information onto the full ratings dataframe.
356 | 
357 | ```{r}
358 | df_ratings <- df_ratings %>% left_join(df_principals)
359 | 
360 | df_ratings %>% head()
361 | ```
362 | 
363 | Filter down to movies w/ actor info. (only if the birth year is present in the data)
364 | 
365 | ```{r}
366 | df_ratings_movies <- df_ratings %>%
367 |                         filter(titleType == "movie", !is.na(birthYear), numVotes >= 10) %>%
368 |                         mutate(age_lead = startYear - birthYear) %>%
369 |                         arrange(desc(numVotes))
370 | 
371 | df_ratings_movies %>% head(100)
372 | ```
373 | 
374 | Aggregate lead-actor/actress ages by movie year w/ percentiles.
375 | 
376 | ```{r}
377 | df_actor_ages <- df_ratings_movies %>%
378 |                   group_by(startYear) %>%
379 |                   summarize(low_age = quantile(age_lead, 0.25, na.rm=T),
380 |                             med_age = quantile(age_lead, 0.50, na.rm=T),
381 |                             high_age = quantile(age_lead, 0.75, na.rm=T)) %>%
382 |                   arrange(startYear)
383 | 
384 | df_actor_ages %>% head()
385 | ```
386 | 
387 | Create a ribbon plot.
388 | 
389 | NB: Plot the ribbon before the line, so the line is on top.
390 | 
391 | ```{r}
392 | plot <- ggplot(df_actor_ages %>% filter(startYear >= 1920) , aes(x = startYear)) +
393 |           geom_ribbon(aes(ymin=low_age, ymax=high_age), alpha=0.2) +
394 |           geom_line(aes(y=med_age)) +
395 |           labs(title="Change in Ages of Movie Lead Actors/Actress Over Time",
396 |                subtitle=sprintf("For %s Actors. Line represents median age.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",df_ratings_movies %>% filter(startYear >= 1920) %>% ppdf()),
397 |                x="Year Movie was Released",
398 |                y="Age of Lead Actor/Actress",
399 |                caption="Max Woolf — minimaxir.com",
400 |                fill="# Movies")
401 | 
402 | ggsave("imdb-8.png", plot, width=4, height=3)
403 | ```
404 | 
405 | ![](imdb-8.png)
406 | 
407 | Create a plot comparing actors/actresses. Same code, except adding an aggregation and aesthetic on `category`.
408 | 
409 | ```{r}
410 | df_actor_ages_lead <- df_ratings_movies %>%
411 |                   group_by(startYear, category) %>%
412 |                   summarize(low_age = quantile(age_lead, 0.25, na.rm=T),
413 |                             med_age = quantile(age_lead, 0.50, na.rm=T),
414 |                             high_age = quantile(age_lead, 0.75, na.rm=T)) %>%
415 |                   arrange(startYear)
416 | 
417 | df_actor_ages_lead %>% head()
418 | 
419 | plot <- ggplot(df_actor_ages_lead %>% filter(startYear >= 1920), aes(x = startYear, fill=category, color=category)) +
420 |           geom_ribbon(aes(ymin=low_age, ymax=high_age), alpha=0.2, size=0) +
421 |           geom_line(aes(y=med_age)) +
422 |           scale_fill_brewer(palette="Set1") +
423 |           scale_color_brewer(palette="Set1") +
424 |           labs(title="Change in Ages of Movie Lead Actors/Actress Over Time",
425 |                subtitle=sprintf("For %s Actors. Line represents median age.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",df_ratings_movies %>% filter(startYear >= 1920) %>% ppdf()),
426 |                x="Year Movie was Released",
427 |                y="Age of Lead Actor/Actress",
428 |                caption="Max Woolf — minimaxir.com",
429 |                fill='',
430 |                color='')
431 | 
432 | ggsave("imdb-9.png", plot, width=4, height=3)
433 | ```
434 | 
435 | ![](imdb-9.png)
436 | 
437 | Same plot, but facet. (unused in final post since may not be enough data/similar across all genres)
438 | 
439 | ```{r}
440 | df_actor_ages_lead <- df_ratings_movies %>%
441 |                   select(startYear, category, genres, age_lead) %>%
442 |                   unnest_tokens(genre, genres, token = str_split, pattern = ",") %>%
443 |                   filter(!(genre %in% c("game-show", "reality-tv", "short", "talk-show", "film-noir", NA))) %>%
444 |                   group_by(startYear, category, genre) %>%
445 |                   summarize(low_age = quantile(age_lead, 0.25, na.rm=T),
446 |                             med_age = quantile(age_lead, 0.50, na.rm=T),
447 |                             high_age = quantile(age_lead, 0.75, na.rm=T)) %>%
448 |                   arrange(startYear)
449 | 
450 | df_actor_ages_lead %>% head()
451 | 
452 | plot <- ggplot(df_actor_ages_lead %>% filter(startYear >= 1950), aes(x = startYear, fill=category, color=category)) +
453 |           geom_ribbon(aes(ymin=low_age, ymax=high_age), alpha=0.2, size=0) +
454 |           geom_line(aes(y=med_age), size=0.5) +
455 |           theme_minimal(base_family = "Source Sans Pro", base_size=9) +
456 |           scale_fill_brewer(palette="Set1") +
457 |           scale_color_brewer(palette="Set1") +
458 |           facet_wrap(~ genre)
459 | 
460 | ggsave("imdb-10.png", plot, width=6, height=6)
461 | ```
462 | 
463 | ![](imdb-10.png)
464 | 
465 | # Lead Gender Balance
466 | 
467 | Unused in post since a bit more complicated to explain and results need double-checking.
468 | 
469 | ```{r}
470 | plot <- ggplot(df_ratings_movies %>% filter(startYear >= 1950), aes(x = startYear, fill=category)) +
471 |           geom_bar(position="fill", width=1) +
472 |           theme_minimal(base_family = "Source Sans Pro", base_size=9) +
473 |           scale_fill_brewer(palette="Set1") +
474 |           scale_color_brewer(palette="Set1")
475 | 
476 | ggsave("imdb-11.png", plot, width=4, height=3)
477 | ```
478 | 
479 | ![](imdb-11.png)
480 | 
481 | # nth time lead
482 | 
483 | ```{r}
484 | df_ratings_movies_nth <- df_ratings_movies %>%
485 |                       group_by(nconst) %>%
486 |                       arrange(startYear) %>%
487 |                       mutate(nth_lead = row_number()) %>%
488 |                       ungroup() %>%
489 |                       arrange(desc(startYear), desc(numVotes))
490 | 
491 | df_ratings_movies_nth %>% select(primaryTitle, primaryName, nth_lead) %>% head(100)
492 | ```
493 | 
494 | ```{r}
495 | df_actor_ages <- df_ratings_movies_nth %>%
496 |                   group_by(startYear) %>%
497 |                   summarize(low_nth = quantile(nth_lead, 0.25),
498 |                             med_nth = quantile(nth_lead, 0.50),
499 |                             high_nth = quantile(nth_lead, 0.75)) %>%
500 |                   arrange(startYear)
501 | 
502 | df_actor_ages %>% head()
503 | 
504 | plot <- ggplot(df_actor_ages %>% filter(startYear >= 1950) , aes(x = startYear)) +
505 |           geom_ribbon(aes(ymin=low_nth, ymax=high_nth), alpha=0.2) +
506 |           geom_line(aes(y=med_nth)) +
507 |           scale_y_continuous(breaks=c(1:5, 10)) +
508 |           labs(title="#th Time Lead Actor of Movie Was A Lead Actor, Over Time",
509 |                subtitle=sprintf("For %s Lead Actors. Line represents median #.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",df_ratings_movies_nth %>% filter(startYear >= 1950) %>% ppdf()),
510 |                x="Year",
511 |                y="#th Time Lead Actor was a Lead Actor",
512 |                caption="Max Woolf — minimaxir.com",
513 |                fill="# Movies") +
514 |           theme(panel.grid.minor = element_blank())
515 | 
516 | ggsave("imdb-12.png", plot, width=4, height=3)
517 | ```
518 | 
519 | ![](imdb-12.png)
520 | 
521 | # LICENSE
522 | 
523 | The MIT License (MIT)
524 | 
525 | Copyright (c) 2018 Max Woolf
526 | 
527 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
528 | 
529 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
530 | 
531 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


--------------------------------------------------------------------------------
/imdb_analysis_livestream.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Analyzing IMDb Data The Intended Way, with R and ggplot2"
  3 | author: "Max Woolf (@minimaxir)"
  4 | date: "2018-07-04"
  5 | output:
  6 |   html_notebook:
  7 |     highlight: tango
  8 |     mathjax: null
  9 |     number_sections: yes
 10 |     theme: spacelab
 11 |     toc: True
 12 | ---
 13 | 
 14 | 
 15 | **Information courtesy of
 16 | IMDb
 17 | (http://www.imdb.com).
 18 | Used with permission.**
 19 | 
 20 | 
 21 | ```{r}
 22 | library(tidyverse)
 23 | library(ggrepel)
 24 | library(gghighlight)
 25 | library(ggridges)
 26 | library(tidytext)
 27 | library(scales)
 28 | 
 29 | sessionInfo()
 30 | ```
 31 | 
 32 | # Ratings
 33 | 
 34 | ```{r}
 35 | read_imdb <- function(data_path, path = "/Volumes/Extreme 510/Data/imdb/") {
 36 |   # quote = "" disables quote handling, matching read_imdb in imdb_analysis.Rmd; `path` is prepended verbatim.
 37 |   read_tsv(paste0(path, data_path), na = "\\N", quote = "", progress = FALSE)
 38 | }
 39 | 
 40 | df_ratings <- read_imdb("title.ratings.tsv")
 41 | df_ratings %>% head()
 42 | ```
 43 | 
 44 | ```{r}
 45 | plot <- ggplot(df_ratings, aes(x = numVotes, y = averageRating)) +
 46 |           geom_bin2d() +
 47 |           scale_x_log10(labels = comma) +
 48 |           scale_y_continuous(breaks = 0:10) +
 49 |           scale_fill_viridis_c(option = "inferno")
 50 | 
 51 | ggsave("imdb-1.png", plot, width=4, height=3)
 52 | ```
 53 | 
 54 | ![](imdb-1.png)
 55 | 
 56 | # Title Basics
 57 | 
 58 | ```{r}
 59 | df_basics <- read_imdb("title.basics.tsv")
 60 | ```
 61 | 
 62 | ```{r}
 63 | df_basics %>% head()
 64 | ```
 65 | 
 66 | ```{r}
 67 | df_ratings <- df_ratings %>% left_join(df_basics)
 68 | 
 69 | df_ratings %>% head()
 70 | ```
 71 | 
 72 | ```{r}
 73 | plot <- ggplot(df_ratings %>% filter(runtimeMinutes < 180, runtimeMinutes > 20), aes(x = runtimeMinutes, y = averageRating)) +
 74 |   geom_bin2d() +
 75 |   scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +  # breaks are in minutes, labelled as hours
 76 |   scale_y_continuous(breaks = 0:10) +
 77 |   scale_fill_viridis_c(option = "inferno", labels = comma) +
 78 |   theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
 79 |   labs(title = "Relationship between Movie Runtime and Average Movie Rating",
 80 |        subtitle = "Data from IMDb retrieved July 4th, 2018",
 81 |        x = "Runtime (Hours)",
 82 |        y = "Average User Rating",
 83 |        caption = "Max Woolf — minimaxir.com",
 84 |        fill = "")
 85 | 
 86 | ggsave("imdb-2.png", plot, width=6, height=3)
 87 | ```
 88 | 
 89 | ```{r}
 90 | df_ratings_unnest <- df_ratings %>%  # one row per (title, genre) pair
 91 |   select(runtimeMinutes, averageRating, genres) %>%
 92 |   unnest_tokens(genre, genres, token = str_split, pattern = ",")  # split on commas only so hyphenated genres (e.g. film-noir) stay whole
 93 | 
 94 | df_ratings_unnest %>% head(10)
 95 | ```
 96 | 
 97 | ```{r}
 98 | plot <- ggplot(df_ratings_unnest %>% filter(runtimeMinutes < 180, runtimeMinutes > 20), aes(x = runtimeMinutes, y = averageRating)) +
 99 |   geom_bin2d() +
100 |   scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +  # breaks are in minutes, labelled as hours
101 |   scale_y_continuous(breaks = 0:10) +
102 |   scale_fill_viridis_c(option = "inferno", labels = comma, trans = "log10") +  # log-scaled fill
103 |   theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
104 |   facet_wrap(~ genre) +
105 |   labs(title = "Relationship between Movie Runtime and Average Movie Rating",
106 |        subtitle = "Data from IMDb retrieved July 4th, 2018",
107 |        x = "Runtime (Hours)",
108 |        y = "Average User Rating",
109 |        caption = "Max Woolf — minimaxir.com",
110 |        fill = "")
111 | 
112 | ggsave("imdb-3.png", plot, width=6, height=6)
113 | ```
114 | 
115 | # Rating vs. Movie Year
116 | 
117 | ```{r}
118 | plot <- ggplot(df_ratings, aes(x = startYear, y = averageRating)) +  # release year vs. rating, binned
119 |   geom_bin2d() +
120 |   scale_x_continuous() +
121 |   scale_y_continuous(breaks = 0:10) +
122 |   scale_fill_viridis_c(option = "inferno", labels = comma, trans = "log10") +  # log-scaled fill
123 |   theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
124 |   labs(title = "Relationship between Movie Release Year and Average Movie Rating",
125 |        subtitle = "Data from IMDb retrieved July 4th, 2018",
126 |        x = "Year of Release",
127 |        y = "Average User Rating",
128 |        caption = "Max Woolf — minimaxir.com",
129 |        fill = "")
130 | 
131 | ggsave("imdb-4.png", plot, width=6, height=3)
132 | ```
133 | 
134 | ```{r}
135 | # plot <- ggplot(df_ratings %>% filter(startYear >= 1950, !is.na(startYear)), aes(x = averageRating, y = startYear)) +
136 | #           geom_density_ridges()
137 | #           #theme_ridges() +
138 | #           scale_y_discrete() +
139 | #           #scale_y_continuous(breaks = 0:10) +
140 | #           #scale_fill_viridis_c(option = "inferno", labels = comma, trans='log10') +
141 | #           #theme_minimal(base_family = "Source Sans Pro", base_size=9) +
142 | #           # labs(title="Relationship between Movie Runtime and Average Mobie Rating",
143 | #           #      subtitle="Data from IMDb retrieved July 4th, 2018",
144 | #           #      x="Year of Release",
145 | #           #      y="Average User Rating",
146 | #           #      caption="Max Woolf — minimaxir.com",
147 | #           #      fill="")
148 | # 
149 | # ggsave("imdb-5.png", plot, width=6, height=3)
150 | ```
151 | 
152 | ```{r}
153 | plot <- ggplot(df_ratings, aes(x = startYear, y = averageRating)) +  # NOTE(review): same plot as the earlier startYear chunk in this notebook
154 |   geom_bin2d() +
155 |   scale_x_continuous() +
156 |   scale_y_continuous(breaks = 0:10) +
157 |   scale_fill_viridis_c(option = "inferno", labels = comma, trans = "log10") +  # log-scaled fill
158 |   theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
159 |   labs(title = "Relationship between Movie Release Year and Average Movie Rating",
160 |        subtitle = "Data from IMDb retrieved July 4th, 2018",
161 |        x = "Year of Release",
162 |        y = "Average User Rating",
163 |        caption = "Max Woolf — minimaxir.com",
164 |        fill = "")
165 | 
166 | ggsave("imdb-4.png", plot, width=6, height=3)
167 | ```
168 | 
169 | 
170 | ```{r}
171 | # df_ratings <- df_ratings %>% mutate(decade = factor(cut_interval(startYear, 10, boundary=0), labels = seq(1950, 2010, 10)))
172 | # 
173 | # df_ratings %>% head()
174 | ```
175 | 
176 | ```{r}
177 | df_episode <- read_imdb("title.episode.tsv") %>% filter(!is.na(seasonNumber))
178 | df_episode %>% head()
179 | ```
180 | 
181 | ```{r}
182 | df_episode_count <- df_episode %>%  # row count per (parentTconst, seasonNumber) pair
183 |   group_by(parentTconst, seasonNumber) %>%  # column is `parentTconst`, as used in imdb_analysis.Rmd
184 |   tally()
185 | ```
186 | 
187 | 


--------------------------------------------------------------------------------