├── .gitignore ├── 2-imdb.Rproj ├── README.md ├── imdb-0.png ├── imdb-1.png ├── imdb-10.png ├── imdb-11.png ├── imdb-12.png ├── imdb-2.png ├── imdb-2a.png ├── imdb-2b.png ├── imdb-3.png ├── imdb-3b.png ├── imdb-4.png ├── imdb-5.png ├── imdb-6.png ├── imdb-7.png ├── imdb-8.png ├── imdb-9.png ├── imdb_analysis.Rmd └── imdb_analysis_livestream.Rmd /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | -------------------------------------------------------------------------------- /2-imdb.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # imdb-data-analysis 2 | ![](imdb-4.png) 3 | 4 | R Code + R Notebook on how to process and visualize the official IMDb datasets. 5 | 6 | This R Notebook is the complement to my blog post [Analyzing IMDb Data The Intended Way, with R and ggplot2](http://minimaxir.com/2018/07/imdb-data-analysis/). 7 | 8 | ## Maintainer 9 | Max Woolf ([@minimaxir](http://minimaxir.com)) 10 | 11 | *Max's open-source projects are supported by his [Patreon](https://www.patreon.com/minimaxir). 
If you found this project helpful, any monetary contributions to the Patreon are appreciated and will be put to good creative use.* 12 | 13 | ## License 14 | MIT -------------------------------------------------------------------------------- /imdb-0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-0.png -------------------------------------------------------------------------------- /imdb-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-1.png -------------------------------------------------------------------------------- /imdb-10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-10.png -------------------------------------------------------------------------------- /imdb-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-11.png -------------------------------------------------------------------------------- /imdb-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-12.png -------------------------------------------------------------------------------- /imdb-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-2.png -------------------------------------------------------------------------------- 
/imdb-2a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-2a.png -------------------------------------------------------------------------------- /imdb-2b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-2b.png -------------------------------------------------------------------------------- /imdb-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-3.png -------------------------------------------------------------------------------- /imdb-3b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-3b.png -------------------------------------------------------------------------------- /imdb-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-4.png -------------------------------------------------------------------------------- /imdb-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-5.png -------------------------------------------------------------------------------- /imdb-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-6.png 
-------------------------------------------------------------------------------- /imdb-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-7.png -------------------------------------------------------------------------------- /imdb-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-8.png -------------------------------------------------------------------------------- /imdb-9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/minimaxir/imdb-data-analysis/567fbbb4b79117a1e81eb0bf639fecdd944757f7/imdb-9.png -------------------------------------------------------------------------------- /imdb_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Analyzing IMDb Data The Intended Way, with R and ggplot2" 3 | author: "Max Woolf (@minimaxir)" 4 | date: "2018-07-15" 5 | output: 6 | html_notebook: 7 | highlight: tango 8 | mathjax: null 9 | number_sections: yes 10 | theme: spacelab 11 | toc: True 12 | --- 13 | 14 | This R Notebook is the complement to my blog post [Analyzing IMDb Data The Intended Way, with R and ggplot2](http://minimaxir.com/2018/07/imdb-data-analysis/). 15 | 16 | This notebook is licensed under the MIT License. If you use the code or data visualization designs contained within this notebook, it would be greatly appreciated if proper attribution is given back to this notebook and/or myself. Thanks! :) 17 | 18 | IMDb data retrieved on July 4th 2018. 19 | 20 | 21 | **Information courtesy of 22 | IMDb 23 | (http://www.imdb.com). 
Used with permission.**


```{r}
library(tidyverse)
library(ggridges) # unused in final blog post
library(tidytext) # unused in final blog post
library(scales)

sessionInfo()
```

Helper function to read IMDb files given filename.

```{r}
# Read one IMDb TSV file into a data frame with readr::read_tsv().
#
# data_path: file name inside the data directory, e.g. "title.ratings.tsv".
# path:      directory that holds the IMDb files. The default is the location
#            used when the notebook was written; pass another directory to run
#            the notebook on a different machine.
#
# The literal text \N is read as NA, and quote handling is switched off
# (quote = "") so quote characters in the data are kept as plain text.
read_imdb <- function(data_path, path = "/Volumes/Extreme 510/Data/imdb") {
  read_tsv(file.path(path, data_path), na = "\\N", quote = "", progress = FALSE)
}
```

Helper function to pretty print the size of a dataframe for charts/notebook.

```{r}
# Return the row count of `df` formatted by comma() (thousands separators),
# for use in inline text and chart subtitles.
ppdf <- function(df) {
  comma(nrow(df))
}
```


# Ratings

```{r}
df_ratings <- read_imdb("title.ratings.tsv")
df_ratings %>% head()
```

There are **`r df_ratings %>% ppdf()`** ratings in the dataset.

Plot every point. (note: very slow!)

```{r eval=FALSE}
# Not evaluated (eval=FALSE): one point per title.
plot <- ggplot(df_ratings, aes(x = numVotes, y = averageRating)) +
  geom_point()

ggsave("imdb-0.png", plot, width = 4, height = 3)
```

![](imdb-0.png)

Plot a 2D histogram and clean up axes.

```{r}
# Binned counts instead of raw points; vote counts go on a log10 axis.
plot <- ggplot(df_ratings, aes(x = numVotes, y = averageRating)) +
  geom_bin2d() +
  scale_x_log10(labels = comma) +
  scale_y_continuous(breaks = 1:10) +
  scale_fill_viridis_c(labels = comma)

ggsave("imdb-1.png", plot, width = 4, height = 3)
```

![](imdb-1.png)



# Title Basics

```{r}
df_basics <- read_imdb("title.basics.tsv")
df_basics %>% head()
```

There are **`r df_basics %>% ppdf()`** titles in the dataset.

Merge `df_ratings` and `df_basics` to perform ratings/vote analysis using more metadata.

```{r}
# Join on the title id explicitly rather than relying on a natural join over
# whatever columns the two tables happen to share.
# NOTE(review): assumes `tconst` is the only column common to both tables, as
# in the published IMDb schema; confirm against the files in use.
df_ratings <- df_ratings %>% left_join(df_basics, by = "tconst")

df_ratings %>% head()
```

```{r}
plot <- ggplot(df_ratings, aes(x = runtimeMinutes, y = averageRating)) +
  geom_bin2d() +
  scale_x_continuous(labels = comma) +
  scale_y_continuous(breaks = 1:10) +
  scale_fill_viridis_c()

ggsave("imdb-2a.png", plot, width = 4, height = 3)
```

![](imdb-2a.png)

Which movies have the superhigh runtimes?

```{r}
df_ratings %>% arrange(desc(runtimeMinutes)) %>% head(10)
```


```{r}
# Restrict to movies under 3 hours with at least 10 votes, then label the
# x axis in hours (breaks every 60 minutes, labelled 0 to 3).
plot <- df_ratings %>%
  filter(runtimeMinutes < 180, titleType == "movie", numVotes >= 10) %>%
  ggplot(aes(x = runtimeMinutes, y = averageRating)) +
  geom_bin2d() +
  scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +
  scale_y_continuous(breaks = 0:10) +
  scale_fill_viridis_c(option = "inferno", labels = comma) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 8) +
  labs(
    title = "Relationship between Movie Runtime and Average Movie Rating",
    subtitle = "Data from IMDb retrieved July 4th, 2018",
    x = "Runtime (Hours)",
    y = "Average User Rating",
    caption = "Max Woolf — minimaxir.com",
    fill = "# Movies"
  )

ggsave("imdb-2b.png", plot, width = 4, height = 3)
```

![](imdb-2b.png)

## Unnesting Genres

How to facet by genre; this was removed from the post since it was a little complicated.

NB: you cannot use default tokenization on `unnest_tokens` since some tokens have dashes. (e.g.
`film-noir`)

```{r}
# One row per (movie, genre): the comma-separated `genres` string is split on
# "," with str_split instead of the default word tokenizer.
df_ratings_unnest <- df_ratings %>%
  filter(runtimeMinutes < 180, titleType == "movie", numVotes >= 10) %>%
  select(runtimeMinutes, averageRating, genres) %>%
  unnest_tokens(genre, genres, token = str_split, pattern = ",")

df_ratings_unnest %>% head(10)
```

```{r}
plot <- ggplot(df_ratings_unnest, aes(x = runtimeMinutes, y = averageRating)) +
  geom_bin2d() +
  scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +
  scale_y_continuous(breaks = 1:10) +
  scale_fill_viridis_c(option = "inferno", labels = comma) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
  labs(
    title = "Relationship between Movie Runtime and Average Movie Rating",
    subtitle = "Data from IMDb retrieved July 4th, 2018",
    x = "Runtime (Hours)",
    y = "Average User Rating",
    caption = "Max Woolf — minimaxir.com",
    fill = "# Movies"
  ) +
  facet_wrap(~ genre)

ggsave("imdb-3.png", plot, width = 6, height = 6)
```

![](imdb-3.png)

Normalize by facet. There are two approaches:

1. Do a weighted sum of the points in the spatial area, where the weight is the reciprocal of the # of points in the facet.
2. Manually calculate bins/counts and scale to `[0, 1]`

Option 1 is a bit easier to implement. Additionally, remove facets with little data.

`squish` trick via https://stackoverflow.com/a/23655697/9314418

```{r}
# Give every row the weight 1 / (rows in its genre). Summing the weights per
# bin then yields the share of that genre's rows falling in the bin, which
# makes facets of very different sizes comparable.
df_temp <- df_ratings_unnest %>%
  filter(!(genre %in% c("game-show", "reality-tv", "short", "talk-show", NA))) %>%
  group_by(genre) %>%
  mutate(prop = 1 / n()) %>%
  ungroup()

# Fill values above 0.02 are squished to the top of the colour scale
# (oob = squish) and the legend is hidden.
plot <- ggplot(df_temp, aes(x = runtimeMinutes, y = averageRating, z = prop)) +
  stat_summary_2d(fun = sum) +
  scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +
  scale_y_continuous(breaks = 1:10) +
  scale_fill_viridis_c(
    option = "inferno", labels = comma,
    limits = c(0, 0.02), oob = squish, guide = "none"
  ) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
  labs(
    title = "Relationship between Movie Runtime and Average Movie Rating",
    subtitle = "Data from IMDb retrieved July 4th, 2018",
    x = "Runtime (Hours)",
    y = "Average User Rating",
    caption = "Max Woolf — minimaxir.com"
  ) +
  facet_wrap(~ genre)

ggsave("imdb-3b.png", plot, width = 6, height = 6)
```

![](imdb-3b.png)

# Rating vs. Movie Year

Set theme to custom theme based on `theme_minimal` for the rest of the notebook.

```{r}
# Install the notebook-wide default theme: theme_minimal plus smaller titles,
# grey subtitle/caption text and a narrow legend key.
theme_set(
  theme_minimal(base_size = 9, base_family = "Source Sans Pro") +
    theme(
      plot.title = element_text(
        size = 8, family = "Source Sans Pro Bold",
        margin = margin(t = -0.1, b = 0.1, unit = "cm")
      ),
      axis.title.x = element_text(size = 8),
      axis.title.y = element_text(size = 8),
      plot.subtitle = element_text(
        family = "Source Sans Pro Semibold", color = "#969696", size = 6
      ),
      plot.caption = element_text(size = 6, color = "#969696"),
      legend.title = element_text(size = 8),
      # grid::unit() names this argument `units`; spell it out rather than
      # relying on partial matching of `unit`.
      legend.key.width = unit(0.25, units = "cm")
    )
)
```


```{r}
# The same filtered rows feed both the plot and the count in the subtitle, so
# filter once.
df_movies_by_year <- df_ratings %>%
  filter(titleType == "movie", numVotes >= 10)

plot <- ggplot(df_movies_by_year, aes(x = startYear, y = averageRating)) +
  geom_bin2d() +
  geom_smooth(color = "black") +
  scale_x_continuous() +
  scale_y_continuous(breaks = 1:10) +
  scale_fill_viridis_c(option = "plasma", labels = comma, trans = "log10") +
  labs(
    title = "Relationship between Movie Release Year and Average Rating",
    subtitle = sprintf(
      "For %s Movies/Ratings. Data from IMDb retrieved 7/4/2018",
      ppdf(df_movies_by_year)
    ),
    x = "Year Movie was Released",
    y = "Average User Rating For Movie",
    caption = "Max Woolf — minimaxir.com",
    fill = "# Movies"
  )

ggsave("imdb-4.png", plot, width = 4, height = 3)
```

![](imdb-4.png)

Work with Ridge plots; not included in post because it doesn't offer much insight different from the chart above.

NB: For ridge plots, the y-axis must be a `factor`, not a `numeric`; this is what tripped me up in the stream.

```{r}
# startYear is converted to a factor so each year gets its own ridge.
plot <- df_ratings %>%
  filter(startYear >= 2000, titleType == "movie", numVotes >= 10) %>%
  mutate(startYear = factor(startYear)) %>%
  ggplot(aes(x = averageRating, y = startYear, fill = startYear)) +
  geom_density_ridges() +
  scale_fill_hue(guide = "none") +
  scale_x_continuous(breaks = 1:10) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9)

ggsave("imdb-5.png", plot, width = 4, height = 3)
```

![](imdb-5.png)

Bucket by decades.

```{r}
# Decade = release year rounded down to a multiple of 10 ("1950s", "1960s", ...).
# The previous cut_width(startYear, 10, boundary = 0) bins are closed on the
# right by default, which put 1960, 1970, ... into the preceding decade, and
# its hard-coded labels only worked for exactly seven bins.
# fct_rev() reverses the level order so the latest decade is the first level.
df_ratings_decades <- df_ratings %>%
  filter(startYear >= 1950, titleType == "movie", numVotes >= 10) %>%
  mutate(decade = fct_rev(factor(paste0((startYear %/% 10) * 10, "s"))))

df_ratings_decades %>% head()
```

```{r}
# Fill encodes distance from the median: 0.5 - |0.5 - ecdf| peaks at the
# median and falls to 0 in both tails.
plot <- ggplot(
  df_ratings_decades,
  aes(x = averageRating, y = decade, fill = 0.5 - abs(0.5 - ..ecdf..))
) +
  geom_density_ridges_gradient(calc_ecdf = TRUE, quantile_lines = TRUE) +
  scale_fill_viridis_c(option = "plasma", guide = "none") +
  scale_x_continuous(breaks = 1:10) +
  # scale_y_discrete(expand = c(0.01, 0)) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9)

ggsave("imdb-6.png", plot, width = 4, height = 3)
```

![](imdb-6.png)

```{r}
# Same ridge plot for runtime; the x axis is limited to 0-180 minutes and
# labelled in hours.
plot <- ggplot(
  df_ratings_decades,
  aes(x = runtimeMinutes, y = decade, fill = 0.5 - abs(0.5 - ..ecdf..))
) +
  geom_density_ridges_gradient(calc_ecdf = TRUE, quantile_lines = TRUE) +
  scale_fill_viridis_c(option = "plasma", guide = "none") +
  scale_x_continuous(breaks = seq(0, 180, 60), limits = c(0, 180), labels = 0:3) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9)

ggsave("imdb-7.png", plot, width = 4, height = 3)
```

![](imdb-7.png)

## Episode Analysis

For reference; not included in post. (Too much bad data to clean.)

```{r}
# Keep only episode rows that have a season number.
df_episode <- read_imdb("title.episode.tsv") %>%
  filter(!is.na(seasonNumber))
df_episode %>% head()
```

There are `r df_episode %>% ppdf()` episodes in the dataset.

```{r}
# Episodes per (parent title, season), with the parent title's metadata
# attached from df_basics.
df_episode_count <- df_episode %>%
  group_by(parentTconst, seasonNumber) %>%
  tally() %>%
  ungroup() %>%
  left_join(df_basics, by = c("parentTconst" = "tconst"))

df_episode_count %>% head()
```


# Actor Information

`str_detect` is vectorized and *much* faster than using a loop/`lapply`. Using a regular expression to search for actor *or* actress is another speed increase.

```{r}
# People whose primaryProfession mentions actor or actress.
df_actors <- read_imdb("name.basics.tsv") %>%
  filter(str_detect(primaryProfession, "actor|actress")) %>%
  select(nconst, primaryName, birthYear)

df_actors %>% head()
```

There are **`r df_actors %>% ppdf()`** actors in the dataset.

```{r}
# For each title keep the actor/actress row(s) with the lowest `ordering`.
# The grouping is dropped afterwards so later joins and counts operate on a
# plain, ungrouped table.
df_principals <- read_imdb("title.principals.tsv") %>%
  filter(str_detect(category, "actor|actress")) %>%
  select(tconst, ordering, nconst, category) %>%
  group_by(tconst) %>%
  filter(ordering == min(ordering)) %>%
  ungroup()

df_principals %>% head()
```

There are **`r df_principals %>% ppdf()`** principals/rows in the dataset.

Join the 2 dataframes. (onto `principals`, since Many-to-One)

```{r}
# `nconst` is the only column both tables keep after the select() calls above.
df_principals <- df_principals %>% left_join(df_actors, by = "nconst")

df_principals %>% head()
```

# Putting It All Together

Merge actor information onto the full ratings dataframe.

```{r}
# NOTE(review): assumes `tconst` is the only column shared by df_ratings and
# df_principals, as in the published IMDb schema; confirm against the data.
df_ratings <- df_ratings %>% left_join(df_principals, by = "tconst")

df_ratings %>% head()
```

Filter down to movies w/ actor info.
(only if the birth year is present in the data)

```{r}
# age_lead = release year minus the lead's birth year; most-voted movies first.
df_ratings_movies <- df_ratings %>%
  filter(titleType == "movie", !is.na(birthYear), numVotes >= 10) %>%
  mutate(age_lead = startYear - birthYear) %>%
  arrange(desc(numVotes))

df_ratings_movies %>% head(100)
```

Aggregate lead-actor/actress ages by movie year w/ percentiles.

```{r}
# 25th / 50th / 75th percentile of lead age for each release year.
df_actor_ages <- df_ratings_movies %>%
  group_by(startYear) %>%
  summarize(
    low_age = quantile(age_lead, 0.25, na.rm = TRUE),
    med_age = quantile(age_lead, 0.50, na.rm = TRUE),
    high_age = quantile(age_lead, 0.75, na.rm = TRUE)
  ) %>%
  arrange(startYear)

df_actor_ages %>% head()
```

Create a ribbon plot.

NB: Plot the ribbon before the line, so the line is on top.

```{r}
# No fill aesthetic is mapped in this plot, so the unused `fill` label that
# used to be passed to labs() has been dropped.
plot <- ggplot(df_actor_ages %>% filter(startYear >= 1920), aes(x = startYear)) +
  geom_ribbon(aes(ymin = low_age, ymax = high_age), alpha = 0.2) +
  geom_line(aes(y = med_age)) +
  labs(
    title = "Change in Ages of Movie Lead Actors/Actress Over Time",
    subtitle = sprintf(
      "For %s Actors. Line represents median age.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",
      df_ratings_movies %>% filter(startYear >= 1920) %>% ppdf()
    ),
    x = "Year Movie was Released",
    y = "Age of Lead Actor/Actress",
    caption = "Max Woolf — minimaxir.com"
  )

ggsave("imdb-8.png", plot, width = 4, height = 3)
```

![](imdb-8.png)

Create a plot comparing actors/actresses. Same code, except adding an aggregation and aesthetic on `category`.

```{r}
# Same percentiles as above, split by `category` as well as release year.
# ungroup() drops the grouping that summarize() would otherwise leave behind.
df_actor_ages_lead <- df_ratings_movies %>%
  group_by(startYear, category) %>%
  summarize(
    low_age = quantile(age_lead, 0.25, na.rm = TRUE),
    med_age = quantile(age_lead, 0.50, na.rm = TRUE),
    high_age = quantile(age_lead, 0.75, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  arrange(startYear)

df_actor_ages_lead %>% head()

# One ribbon and one median line per category; size = 0 removes the ribbon
# outline, and the empty fill/color labels blank the legend title.
plot <- ggplot(
  df_actor_ages_lead %>% filter(startYear >= 1920),
  aes(x = startYear, fill = category, color = category)
) +
  geom_ribbon(aes(ymin = low_age, ymax = high_age), alpha = 0.2, size = 0) +
  geom_line(aes(y = med_age)) +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1") +
  labs(
    title = "Change in Ages of Movie Lead Actors/Actress Over Time",
    subtitle = sprintf(
      "For %s Actors. Line represents median age.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",
      df_ratings_movies %>% filter(startYear >= 1920) %>% ppdf()
    ),
    x = "Year Movie was Released",
    y = "Age of Lead Actor/Actress",
    caption = "Max Woolf — minimaxir.com",
    fill = "",
    color = ""
  )

ggsave("imdb-9.png", plot, width = 4, height = 3)
```

![](imdb-9.png)

Same plot, but facet.
(unused in final post since there may not be enough data/results are similar across all genres)

```{r}
# Percentiles of lead age per (release year, category, genre).
# Stored under its own name so the year/category table built by the previous
# chunk (df_actor_ages_lead) is not overwritten with a differently-shaped one.
df_actor_ages_genre <- df_ratings_movies %>%
  select(startYear, category, genres, age_lead) %>%
  unnest_tokens(genre, genres, token = str_split, pattern = ",") %>%
  filter(!(genre %in% c("game-show", "reality-tv", "short", "talk-show", "film-noir", NA))) %>%
  group_by(startYear, category, genre) %>%
  summarize(
    low_age = quantile(age_lead, 0.25, na.rm = TRUE),
    med_age = quantile(age_lead, 0.50, na.rm = TRUE),
    high_age = quantile(age_lead, 0.75, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  arrange(startYear)

df_actor_ages_genre %>% head()

plot <- ggplot(
  df_actor_ages_genre %>% filter(startYear >= 1950),
  aes(x = startYear, fill = category, color = category)
) +
  geom_ribbon(aes(ymin = low_age, ymax = high_age), alpha = 0.2, size = 0) +
  geom_line(aes(y = med_age), size = 0.5) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1") +
  facet_wrap(~ genre)

ggsave("imdb-10.png", plot, width = 6, height = 6)
```

![](imdb-10.png)

# Lead Gender Balance

Unused in post since a bit more complicated to explain and results need double-checking.

```{r}
# Share of movies per release year by lead category (bars stacked to 100%).
plot <- ggplot(
  df_ratings_movies %>% filter(startYear >= 1950),
  aes(x = startYear, fill = category)
) +
  geom_bar(position = "fill", width = 1) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
  scale_fill_brewer(palette = "Set1") +
  scale_color_brewer(palette = "Set1")

ggsave("imdb-11.png", plot, width = 4, height = 3)
```

![](imdb-11.png)

# nth time lead

```{r}
# nth_lead numbers each person's rows 1, 2, 3, ... in order of release year,
# i.e. how many times that person has been the lead so far.
df_ratings_movies_nth <- df_ratings_movies %>%
  group_by(nconst) %>%
  arrange(startYear) %>%
  mutate(nth_lead = row_number()) %>%
  ungroup() %>%
  arrange(desc(startYear), desc(numVotes))

df_ratings_movies_nth %>% select(primaryTitle, primaryName, nth_lead) %>% head(100)
```

```{r}
# Quartiles of nth_lead per release year. Kept in its own variable: this was
# previously assigned to df_actor_ages, which replaced the age-percentile
# table built earlier with unrelated columns.
df_nth_lead <- df_ratings_movies_nth %>%
  group_by(startYear) %>%
  summarize(
    low_nth = quantile(nth_lead, 0.25),
    med_nth = quantile(nth_lead, 0.50),
    high_nth = quantile(nth_lead, 0.75)
  ) %>%
  arrange(startYear)

df_nth_lead %>% head()

# No fill aesthetic is mapped here, so the unused `fill` label was dropped.
plot <- ggplot(df_nth_lead %>% filter(startYear >= 1950), aes(x = startYear)) +
  geom_ribbon(aes(ymin = low_nth, ymax = high_nth), alpha = 0.2) +
  geom_line(aes(y = med_nth)) +
  scale_y_continuous(breaks = c(1:5, 10)) +
  labs(
    title = "#th Time Lead Actor of Movie Was A Lead Actor, Over Time",
    subtitle = sprintf(
      "For %s Lead Actors. Line represents median #.\nRibbon bounds represent 25th — 75th Percentiles. Data from IMDb retrieved 7/4/2018",
      df_ratings_movies_nth %>% filter(startYear >= 1950) %>% ppdf()
    ),
    x = "Year",
    y = "#th Time Lead Actor was a Lead Actor",
    caption = "Max Woolf — minimaxir.com"
  ) +
  theme(panel.grid.minor = element_blank())

ggsave("imdb-12.png", plot, width = 4, height = 3)
```

![](imdb-12.png)

# LICENSE

The MIT License (MIT)

Copyright (c) 2018 Max Woolf

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/imdb_analysis_livestream.Rmd:
--------------------------------------------------------------------------------
---
title: "Analyzing IMDb Data The Intended Way, with R and ggplot2"
author: "Max Woolf (@minimaxir)"
date: "2018-07-04"
output:
  html_notebook:
    highlight: tango
    mathjax: null
    number_sections: yes
    theme: spacelab
    toc: True
---


**Information courtesy of
IMDb
(http://www.imdb.com).
Used with permission.**


```{r}
library(tidyverse)
library(ggrepel)
library(gghighlight)
library(ggridges)
library(tidytext)
library(scales)

sessionInfo()
```

# Ratings

```{r}
# Read one IMDb TSV file into a data frame with readr::read_tsv().
#
# data_path: file name inside the data directory, e.g. "title.ratings.tsv".
# path:      directory that holds the IMDb files; override it to run the
#            notebook on a different machine.
#
# The literal text \N is read as NA. quote = "" matches the helper in
# imdb_analysis.Rmd, so quote characters in the data are kept as plain text
# instead of being parsed as field quoting.
read_imdb <- function(data_path, path = "/Volumes/Extreme 510/Data/imdb") {
  read_tsv(file.path(path, data_path), na = "\\N", quote = "", progress = FALSE)
}

df_ratings <- read_imdb("title.ratings.tsv")
df_ratings %>% head()
```

```{r}
plot <- ggplot(df_ratings, aes(x = numVotes, y = averageRating)) +
  geom_bin2d() +
  scale_x_log10(labels = comma) +
  scale_y_continuous(breaks = 0:10) +
  scale_fill_viridis_c(option = "inferno")

ggsave("imdb-1.png", plot, width = 4, height = 3)
```

![](imdb-1.png)

# Title Basics

```{r}
df_basics <- read_imdb("title.basics.tsv")
```

```{r}
df_basics %>% head()
```

```{r}
# NOTE(review): assumes `tconst` is the only column shared by the two tables,
# as in the published IMDb schema; confirm against the files in use.
df_ratings <- df_ratings %>% left_join(df_basics, by = "tconst")

df_ratings %>% head()
```

```{r}
# Titles between 20 and 180 minutes; x axis labelled in hours.
plot <- df_ratings %>%
  filter(runtimeMinutes < 180, runtimeMinutes > 20) %>%
  ggplot(aes(x = runtimeMinutes, y = averageRating)) +
  geom_bin2d() +
  scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +
  scale_y_continuous(breaks = 0:10) +
  scale_fill_viridis_c(option = "inferno", labels = comma) +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
  labs(
    title = "Relationship between Movie Runtime and Average Movie Rating",
    subtitle = "Data from IMDb retrieved July 4th, 2018",
    x = "Runtime (Hours)",
    y = "Average User Rating",
    caption = "Max Woolf — minimaxir.com",
    fill = ""
  )

ggsave("imdb-2.png", plot, width = 6, height = 3)
```

```{r}
# Split the comma-separated `genres` string on "," rather than using the
# default word tokenizer, which would break hyphenated genres into two tokens.
df_ratings_unnest <- df_ratings %>%
  select(runtimeMinutes, averageRating, genres) %>%
  unnest_tokens(genre, genres, token = str_split, pattern = ",")

df_ratings_unnest %>% head(10)
```

```{r}
plot <- df_ratings_unnest %>%
  filter(runtimeMinutes < 180, runtimeMinutes > 20) %>%
  ggplot(aes(x = runtimeMinutes, y = averageRating)) +
  geom_bin2d() +
  scale_x_continuous(breaks = seq(0, 180, 60), labels = 0:3) +
  scale_y_continuous(breaks = 0:10) +
  scale_fill_viridis_c(option = "inferno", labels = comma, trans = "log10") +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
  facet_wrap(~ genre) +
  labs(
    title = "Relationship between Movie Runtime and Average Movie Rating",
    subtitle = "Data from IMDb retrieved July 4th, 2018",
    x = "Runtime (Hours)",
    y = "Average User Rating",
    caption = "Max Woolf — minimaxir.com",
    fill = ""
  )

ggsave("imdb-3.png", plot, width = 6, height = 6)
```

# Rating vs.
Movie Year

```{r}
# x is the release year here, so the title says so (it previously reused the
# runtime plot's title).
plot <- ggplot(df_ratings, aes(x = startYear, y = averageRating)) +
  geom_bin2d() +
  scale_x_continuous() +
  scale_y_continuous(breaks = 0:10) +
  scale_fill_viridis_c(option = "inferno", labels = comma, trans = "log10") +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
  labs(
    title = "Relationship between Movie Release Year and Average Movie Rating",
    subtitle = "Data from IMDb retrieved July 4th, 2018",
    x = "Year of Release",
    y = "Average User Rating",
    caption = "Max Woolf — minimaxir.com",
    fill = ""
  )

ggsave("imdb-4.png", plot, width = 6, height = 3)
```

```{r}
# plot <- ggplot(df_ratings %>% filter(startYear >= 1950, !is.na(startYear)), aes(x = averageRating, y = startYear)) +
# geom_density_ridges()
# #theme_ridges() +
# scale_y_discrete() +
# #scale_y_continuous(breaks = 0:10) +
# #scale_fill_viridis_c(option = "inferno", labels = comma, trans='log10') +
# #theme_minimal(base_family = "Source Sans Pro", base_size=9) +
# # labs(title="Relationship between Movie Runtime and Average Mobie Rating",
# # subtitle="Data from IMDb retrieved July 4th, 2018",
# # x="Year of Release",
# # y="Average User Rating",
# # caption="Max Woolf — minimaxir.com",
# # fill="")
#
# ggsave("imdb-5.png", plot, width=6, height=3)
```

```{r}
# Repeats the year plot above and writes the same file again.
plot <- ggplot(df_ratings, aes(x = startYear, y = averageRating)) +
  geom_bin2d() +
  scale_x_continuous() +
  scale_y_continuous(breaks = 0:10) +
  scale_fill_viridis_c(option = "inferno", labels = comma, trans = "log10") +
  theme_minimal(base_family = "Source Sans Pro", base_size = 9) +
  labs(
    title = "Relationship between Movie Release Year and Average Movie Rating",
    subtitle = "Data from IMDb retrieved July 4th, 2018",
    x = "Year of Release",
    y = "Average User Rating",
    caption = "Max Woolf — minimaxir.com",
    fill = ""
  )

ggsave("imdb-4.png", plot, width = 6, height = 3)
```


```{r}
# df_ratings <- df_ratings %>% mutate(decade = factor(cut_interval(startYear, 10, boundary=0), labels = seq(1950, 2010, 10)))
#
# df_ratings %>% head()
```

```{r}
df_episode <- read_imdb("title.episode.tsv") %>%
  filter(!is.na(seasonNumber))
df_episode %>% head()
```

```{r}
# Episodes per (parent title, season). The grouping column is `parentTconst`,
# the name used for the same step in imdb_analysis.Rmd; the earlier spelling
# `parentTcont` names no column and makes group_by() fail.
df_episode_count <- df_episode %>%
  group_by(parentTconst, seasonNumber) %>%
  tally()
```

--------------------------------------------------------------------------------