├── .Rprofile ├── Dockerfile ├── src.Rproj ├── packrat ├── packrat.opts ├── init.R └── packrat.lock ├── README.md ├── .gitignore ├── LICENSE └── genre_analysis.Rmd /.Rprofile: -------------------------------------------------------------------------------- 1 | #### -- Packrat Autoloader (version 0.5.0) -- #### 2 | source("packrat/init.R") 3 | #### -- End Packrat Autoloader -- #### 4 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rocker/rstudio:latest 2 | 3 | # Install system packages 4 | RUN apt-get update && \ 5 | apt-get install -y --no-install-recommends git zlib1g-dev libxml2-dev && \ 6 | apt-get clean && \ 7 | rm -rf /var/lib/apt/lists/* -------------------------------------------------------------------------------- /src.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: No 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | LineEndingConversion: Posix 18 | -------------------------------------------------------------------------------- /packrat/packrat.opts: -------------------------------------------------------------------------------- 1 | auto.snapshot: FALSE 2 | use.cache: FALSE 3 | print.banner.on.startup: auto 4 | vcs.ignore.lib: TRUE 5 | vcs.ignore.src: TRUE 6 | external.packages: 7 | local.repos: 8 | load.external.packages.on.startup: TRUE 9 | ignored.packages: 10 | ignored.directories: 11 | data 12 | inst 13 | quiet.package.installation: TRUE 14 | snapshot.recommended.packages: FALSE 15 | snapshot.fields: 16 | Imports 17 | Depends 18 | LinkingTo 19 | symlink.system.packages: TRUE 20 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # movies_dataset_eda 2 | 3 | A toy project mainly for practicing rstudio/gt 4 | 5 | [Blog post](https://medium.com/the-artificial-impostor/playing-with-rstudio-gt-r-package-2f37a340c23f). 6 | 7 | ## Docker Instructions 8 | 9 | Go into the project folder and build the Docker image: 10 | 11 | ``` 12 | docker build -t rstudio . 13 | ``` 14 | 15 | Start a container (replace the container name and password with your own): 16 | 17 | ``` 18 | docker run -d -v $(pwd):/home/rstudio/src -e USERID=1000 --name movies-eda -e PASSWORD=your_password_here -p 8787:8787 rstudio 19 | ``` 20 | 21 | Finally, visit http://localhost:8787 to access the RStudio web interface. 22 | 23 | Reference: [More Portable, Reproducible R Development Environment](https://medium.com/the-artificial-impostor/more-portable-reproducible-r-development-environment-c3074df7a6a8) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # Example code in package build process 9 | *-Ex.R 10 | 11 | # Output files from R CMD build 12 | /*.tar.gz 13 | 14 | # Output files from R CMD check 15 | /*.Rcheck/ 16 | 17 | # RStudio files 18 | .Rproj.user/ 19 | 20 | # produced vignettes 21 | vignettes/*.html 22 | vignettes/*.pdf 23 | 24 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 25 | .httr-oauth 26 | 27 | # knitr and R markdown default cache directories 28 | /*_cache/ 29 | /cache/ 30 | 31 | # Temporary files created by R markdown 32 | *.utf8.md 33 | *.knit.md 34 | 35 | # Shiny token, see https://shiny.rstudio.com/articles/shinyapps.html 36 | rsconnect/ 37 | 38 | # packrat 39 | packrat/lib*/ 40 | packrat/src/ 41 | 42 | .Rproj.user 43 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 CeShine Lee 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /packrat/init.R: -------------------------------------------------------------------------------- 1 | local({ 2 | 3 | ## Helper function to get the path to the library directory for a 4 | ## given packrat project. 
5 | getPackratLibDir <- function(projDir = NULL) { 6 | path <- file.path("packrat", "lib", R.version$platform, getRversion()) 7 | 8 | if (!is.null(projDir)) { 9 | 10 | ## Strip trailing slashes if necessary 11 | projDir <- sub("/+$", "", projDir) 12 | 13 | ## Only prepend path if different from current working dir 14 | if (!identical(normalizePath(projDir), normalizePath(getwd()))) 15 | path <- file.path(projDir, path) 16 | } 17 | ## Relative to the working directory unless 'projDir' was prepended above 18 | path 19 | } 20 | 21 | ## Ensure that we set the packrat library directory relative to the 22 | ## project directory. Normally, this should be the working directory, 23 | ## but we also use '.rs.getProjectDirectory()' if necessary (e.g. we're 24 | ## rebuilding a project while within a separate directory) 25 | libDir <- if (exists(".rs.getProjectDirectory")) 26 | getPackratLibDir(.rs.getProjectDirectory()) 27 | else 28 | getPackratLibDir() 29 | 30 | ## Unload packrat in case it's loaded -- this ensures packrat _must_ be 31 | ## loaded from the private library. Note that `requireNamespace` will 32 | ## succeed if the package is already loaded, regardless of lib.loc! 33 | if ("packrat" %in% loadedNamespaces()) 34 | try(unloadNamespace("packrat"), silent = TRUE) 35 | 36 | if (suppressWarnings(requireNamespace("packrat", quietly = TRUE, lib.loc = libDir))) { 37 | # NOTE(review): any option value other than "auto" (including an explicit TRUE) takes the else branch and disables the banner -- confirm this is intended 38 | # Check 'print.banner.on.startup' -- print the banner only when the option is "auto" and the RSTUDIO environment variable is unset 39 | print.banner <- packrat::get_opts("print.banner.on.startup") 40 | if (print.banner == "auto" && is.na(Sys.getenv("RSTUDIO", unset = NA))) { 41 | print.banner <- TRUE 42 | } else { 43 | print.banner <- FALSE 44 | } 45 | return(packrat::on(print.banner = print.banner)) 46 | } 47 | 48 | ## Escape hatch to allow RStudio to handle bootstrapping. This 49 | ## enables RStudio to provide print output when automagically 50 | ## restoring a project from a bundle on load. 
51 | if (!is.na(Sys.getenv("RSTUDIO", unset = NA)) && 52 | is.na(Sys.getenv("RSTUDIO_PACKRAT_BOOTSTRAP", unset = NA))) { 53 | Sys.setenv("RSTUDIO_PACKRAT_BOOTSTRAP" = "1") 54 | setHook("rstudio.sessionInit", function(...) { 55 | # Ensure that, on sourcing 'packrat/init.R', we are 56 | # within the project root directory 57 | if (exists(".rs.getProjectDirectory")) { 58 | owd <- getwd() 59 | setwd(.rs.getProjectDirectory()) 60 | on.exit(setwd(owd), add = TRUE) 61 | } 62 | source("packrat/init.R") 63 | }) 64 | return(invisible(NULL)) 65 | } 66 | 67 | ## Bootstrapping -- only performed in interactive contexts, 68 | ## or when explicitly asked for on the command line 69 | if (interactive() || "--bootstrap-packrat" %in% commandArgs(TRUE)) { 70 | 71 | needsRestore <- "--bootstrap-packrat" %in% commandArgs(TRUE) 72 | 73 | message("Packrat is not installed in the local library -- ", 74 | "attempting to bootstrap an installation...") 75 | 76 | ## We need utils for the following to succeed -- there are calls to functions 77 | ## in 'restore' that are contained within utils. 
utils gets loaded at the 78 | ## end of start-up anyhow, so this should be fine 79 | library("utils", character.only = TRUE) 80 | 81 | ## Install packrat into local project library 82 | packratSrcPath <- list.files(full.names = TRUE, 83 | file.path("packrat", "src", "packrat") 84 | ) 85 | 86 | ## No packrat tarballs available locally -- try some other means of installation 87 | if (!length(packratSrcPath)) { 88 | 89 | message("> No source tarball of packrat available locally") 90 | 91 | ## There are no packrat sources available -- try using a version of 92 | ## packrat installed in the user library to bootstrap 93 | if (requireNamespace("packrat", quietly = TRUE) && packageVersion("packrat") >= "0.2.0.99") { 94 | message("> Using user-library packrat (", 95 | packageVersion("packrat"), 96 | ") to bootstrap this project") 97 | } 98 | 99 | ## Couldn't find a user-local packrat -- try finding and using devtools 100 | ## to install 101 | else if (requireNamespace("devtools", quietly = TRUE)) { 102 | message("> Attempting to use devtools::install_github to install ", 103 | "a temporary version of packrat") 104 | library(stats) ## for setNames 105 | devtools::install_github("rstudio/packrat") 106 | } 107 | 108 | ## Try downloading packrat from CRAN if available 109 | else if ("packrat" %in% rownames(available.packages())) { 110 | message("> Installing packrat from CRAN") 111 | install.packages("packrat") 112 | } 113 | 114 | ## Fail -- couldn't find an appropriate means of installing packrat 115 | else { 116 | stop("Could not automatically bootstrap packrat -- try running ", 117 | "\"'install.packages('devtools'); devtools::install_github('rstudio/packrat')\"", 118 | "and restarting R to bootstrap packrat.") 119 | } 120 | 121 | # Restore the project, unload the temporary packrat, and load the private packrat 122 | if (needsRestore) 123 | packrat::restore(prompt = FALSE, restart = TRUE) 124 | 125 | ## This code path only reached if we didn't restart earlier 126 | 
unloadNamespace("packrat") 127 | requireNamespace("packrat", lib.loc = libDir, quietly = TRUE) 128 | return(packrat::on()) 129 | 130 | } 131 | 132 | ## Multiple packrat tarballs available locally -- try to choose one 133 | ## TODO: read lock file and infer most appropriate from there; low priority because 134 | ## after bootstrapping packrat a restore should do the right thing 135 | if (length(packratSrcPath) > 1) { 136 | warning("Multiple versions of packrat available in the source directory;", 137 | "using packrat source:\n- ", shQuote(packratSrcPath)) 138 | packratSrcPath <- packratSrcPath[[1]] 139 | } 140 | 141 | 142 | lib <- file.path("packrat", "lib", R.version$platform, getRversion()) 143 | if (!file.exists(lib)) { 144 | dir.create(lib, recursive = TRUE) 145 | } 146 | 147 | message("> Installing packrat into project private library:") 148 | message("- ", shQuote(lib)) 149 | 150 | surround <- function(x, with) { 151 | if (!length(x)) return(character()) 152 | paste0(with, x, with) 153 | } 154 | 155 | 156 | ## Invoke install.packages() in clean R session 157 | peq <- function(x, y) paste(x, y, sep = " = ") 158 | installArgs <- c( 159 | peq("pkgs", surround(packratSrcPath, with = "'")), 160 | peq("lib", surround(lib, with = "'")), 161 | peq("repos", "NULL"), 162 | peq("type", surround("source", with = "'")) 163 | ) 164 | 165 | fmt <- "utils::install.packages(%s)" 166 | installCmd <- sprintf(fmt, paste(installArgs, collapse = ", ")) 167 | 168 | ## Write script to file (avoid issues with command line quoting 169 | ## on R 3.4.3) 170 | installFile <- tempfile("packrat-bootstrap", fileext = ".R") 171 | writeLines(installCmd, con = installFile) 172 | on.exit(unlink(installFile), add = TRUE) 173 | 174 | fullCmd <- paste( 175 | surround(file.path(R.home("bin"), "R"), with = "\""), 176 | "--vanilla", 177 | "--slave", 178 | "-f", 179 | surround(installFile, with = "\"") 180 | ) 181 | system(fullCmd) 182 | 183 | ## Tag the installed packrat so we know it's managed by 
packrat 184 | ## TODO: should this be taking information from the lockfile? this is a bit awkward 185 | ## because we're taking an un-annotated packrat source tarball and simply assuming it's now 186 | ## an 'installed from source' version 187 | 188 | ## -- InstallAgent -- ## 189 | installAgent <- "InstallAgent: packrat 0.5.0" 190 | 191 | ## -- InstallSource -- ## 192 | installSource <- "InstallSource: source" 193 | 194 | packratDescPath <- file.path(lib, "packrat", "DESCRIPTION") 195 | DESCRIPTION <- readLines(packratDescPath) 196 | DESCRIPTION <- c(DESCRIPTION, installAgent, installSource) 197 | cat(DESCRIPTION, file = packratDescPath, sep = "\n") 198 | 199 | # Otherwise, continue on as normal 200 | message("> Attaching packrat") 201 | library("packrat", character.only = TRUE, lib.loc = lib) 202 | 203 | message("> Restoring library") 204 | if (needsRestore) 205 | packrat::restore(prompt = FALSE, restart = FALSE) 206 | 207 | # If the environment allows us to restart, do so with a call to restore 208 | restart <- getOption("restart") 209 | if (!is.null(restart)) { 210 | message("> Packrat bootstrap successfully completed. ", 211 | "Restarting R and entering packrat mode...") 212 | return(restart()) 213 | } 214 | 215 | # Callers (source-erers) can define this hidden variable to make sure we don't enter packrat mode 216 | # Primarily useful for testing 217 | if (!exists(".__DONT_ENTER_PACKRAT_MODE__.") && interactive()) { 218 | message("> Packrat bootstrap successfully completed. 
Entering packrat mode...") 219 | packrat::on() 220 | } 221 | 222 | Sys.unsetenv("RSTUDIO_PACKRAT_BOOTSTRAP") 223 | 224 | } 225 | 226 | }) 227 | -------------------------------------------------------------------------------- /packrat/packrat.lock: -------------------------------------------------------------------------------- 1 | PackratFormat: 1.4 2 | PackratVersion: 0.5.0 3 | RVersion: 3.5.1 4 | Repos: CRAN=https://cran.rstudio.com/ 5 | 6 | Package: BH 7 | Source: CRAN 8 | Version: 1.66.0-1 9 | Hash: 4cc8883584b955ed01f38f68bc03af6d 10 | 11 | Package: DBI 12 | Source: CRAN 13 | Version: 1.0.0 14 | Hash: 6abedd7919c4457604c0aa44529a6683 15 | 16 | Package: R6 17 | Source: CRAN 18 | Version: 2.3.0 19 | Hash: 8eccabbf292b5aba632985cde6406fc3 20 | 21 | Package: RColorBrewer 22 | Source: CRAN 23 | Version: 1.1-2 24 | Hash: c0d56cd15034f395874c870141870c25 25 | 26 | Package: Rcpp 27 | Source: CRAN 28 | Version: 1.0.0 29 | Hash: c7273c0f0bc9f5e41f4c52a8cf571f0f 30 | 31 | Package: assertthat 32 | Source: CRAN 33 | Version: 0.2.0 34 | Hash: e8805df54c65ac96d50235c44a82615c 35 | 36 | Package: backports 37 | Source: CRAN 38 | Version: 1.1.3 39 | Hash: a0b8191e6bd2fe71aadd4678bb8f3c98 40 | 41 | Package: base64enc 42 | Source: CRAN 43 | Version: 0.1-3 44 | Hash: c590d29e555926af053055e23ee79efb 45 | 46 | Package: bindr 47 | Source: CRAN 48 | Version: 0.1.1 49 | Hash: 76578c5f543a6ecbc1365d6445f9ebf7 50 | 51 | Package: bindrcpp 52 | Source: CRAN 53 | Version: 0.2.2 54 | Hash: 8ce499301f0dc5c7ff69f0b42e33f5c1 55 | Requires: Rcpp, bindr, plogr 56 | 57 | Package: bitops 58 | Source: CRAN 59 | Version: 1.0-6 60 | Hash: 67d0775189fd0041d95abca618c5c07e 61 | 62 | Package: broom 63 | Source: CRAN 64 | Version: 0.5.1 65 | Hash: f170bc989c523e039487511e87c1854f 66 | Requires: backports, dplyr, generics, purrr, reshape2, stringr, tibble, 67 | tidyr 68 | 69 | Package: caTools 70 | Source: CRAN 71 | Version: 1.17.1.1 72 | Hash: 1764f9cb70825aa7a1c461e7514ffb73 73 | Requires: bitops 74 | 
75 | Package: callr 76 | Source: CRAN 77 | Version: 3.1.1 78 | Hash: 461cdebafe2c1cfc23ddc37527633185 79 | Requires: R6, processx 80 | 81 | Package: cellranger 82 | Source: CRAN 83 | Version: 1.1.0 84 | Hash: be9d203e7849f73818b36f93e9273c2c 85 | Requires: rematch, tibble 86 | 87 | Package: checkmate 88 | Source: CRAN 89 | Version: 1.8.5 90 | Hash: e1bbc5228ab3da931a099208bc95ad23 91 | Requires: backports 92 | 93 | Package: cli 94 | Source: CRAN 95 | Version: 1.0.1 96 | Hash: a742a3229dbf7085c3a737af10e5065b 97 | Requires: assertthat, crayon 98 | 99 | Package: clipr 100 | Source: CRAN 101 | Version: 0.4.1 102 | Hash: caf20ae357bfa2ed50e0e7db267f69ce 103 | 104 | Package: clisymbols 105 | Source: CRAN 106 | Version: 1.2.0 107 | Hash: a76a309884277a4fd8a5d741965fbef5 108 | 109 | Package: colorspace 110 | Source: CRAN 111 | Version: 1.3-2 112 | Hash: 0bf8618b585fa98eb23414cd3ab95118 113 | 114 | Package: commonmark 115 | Source: CRAN 116 | Version: 1.7 117 | Hash: 77f4ba718e2bad1877ef26e48cf8fa43 118 | 119 | Package: crayon 120 | Source: CRAN 121 | Version: 1.3.4 122 | Hash: ff2840dd9b0d563fc80377a5a45510cd 123 | 124 | Package: curl 125 | Source: CRAN 126 | Version: 3.2 127 | Hash: 82a7cf5bb702ef52329b6d23ea6132a7 128 | 129 | Package: data.table 130 | Source: CRAN 131 | Version: 1.11.8 132 | Hash: 67c877937c790a0cadb71284febb6b34 133 | 134 | Package: dbplyr 135 | Source: CRAN 136 | Version: 1.2.2 137 | Hash: f09ea2f1a5c31d86b061d7121fab5db8 138 | Requires: DBI, R6, assertthat, dplyr, glue, purrr, rlang, tibble, 139 | tidyselect 140 | 141 | Package: desc 142 | Source: CRAN 143 | Version: 1.2.0 144 | Hash: a0a3ca939997679a52816bae4ed6aaae 145 | Requires: R6, assertthat, crayon, rprojroot 146 | 147 | Package: devtools 148 | Source: CRAN 149 | Version: 2.0.1 150 | Hash: 4da83470ae57bd2db0ac06e91b1d3a08 151 | Requires: callr, cli, digest, git2r, httr, jsonlite, memoise, pkgbuild, 152 | pkgload, rcmdcheck, remotes, rstudioapi, sessioninfo, usethis, 153 | withr 154 | 155 | 
Package: digest 156 | Source: CRAN 157 | Version: 0.6.18 158 | Hash: 65f62365ec69ddd17230d2ffe891a6ab 159 | 160 | Package: dplyr 161 | Source: CRAN 162 | Version: 0.7.8 163 | Hash: d6e576944199cba782a471015a822848 164 | Requires: BH, R6, Rcpp, assertthat, bindrcpp, glue, magrittr, 165 | pkgconfig, plogr, rlang, tibble, tidyselect 166 | 167 | Package: evaluate 168 | Source: CRAN 169 | Version: 0.12 170 | Hash: c32505adba4f6eca5aa20dd32300b019 171 | 172 | Package: fansi 173 | Source: CRAN 174 | Version: 0.4.0 175 | Hash: f147621f72b561485bfffcae78c4f5d5 176 | 177 | Package: forcats 178 | Source: CRAN 179 | Version: 0.3.0 180 | Hash: 770f3834b97a2c429bdecb7a5f27eb25 181 | Requires: magrittr, rlang, tibble 182 | 183 | Package: fs 184 | Source: CRAN 185 | Version: 1.2.6 186 | Hash: 8ffb8b293a18e26e435465307cc47f75 187 | Requires: Rcpp 188 | 189 | Package: generics 190 | Source: CRAN 191 | Version: 0.0.2 192 | Hash: 4aaf002dd434e8c854611c5d11a1d58e 193 | 194 | Package: ggplot2 195 | Source: CRAN 196 | Version: 3.1.0 197 | Hash: ef541b05dda10b209d509b5bbaf46ea3 198 | Requires: digest, gtable, lazyeval, plyr, reshape2, rlang, scales, 199 | tibble, viridisLite, withr 200 | 201 | Package: gh 202 | Source: CRAN 203 | Version: 1.0.1 204 | Hash: 3f8812accd1320227d744bca620e8c42 205 | Requires: httr, ini, jsonlite 206 | 207 | Package: git2r 208 | Source: CRAN 209 | Version: 0.23.0 210 | Hash: ea0a8f265d61f9356d17fff4d9e1613b 211 | 212 | Package: glue 213 | Source: CRAN 214 | Version: 1.3.0 215 | Hash: 1fbde6dec830370be696eee8ef31c9e4 216 | 217 | Package: gt 218 | Source: github 219 | Version: 0.1.0 220 | Hash: 63f3be25f850d24667b9d53cbc8d4971 221 | Requires: checkmate, commonmark, dplyr, ggplot2, glue, htmltools, 222 | magrittr, rlang, sass, scales, stringr, tibble, tidyr, tidyselect 223 | GithubRepo: gt 224 | GithubUsername: rstudio 225 | GithubRef: master 226 | GithubSha1: b4a75a7ac406bca0a3823bcd416b23d5a305839d 227 | RemoteHost: api.github.com 228 | RemoteRepo: gt 229 | 
RemoteUsername: rstudio 230 | RemoteRef: master 231 | RemoteSha: b4a75a7ac406bca0a3823bcd416b23d5a305839d 232 | 233 | Package: gtable 234 | Source: CRAN 235 | Version: 0.2.0 236 | Hash: cd78381a9d3fea966ac39bd0daaf5554 237 | 238 | Package: haven 239 | Source: CRAN 240 | Version: 2.0.0 241 | Hash: 3180a95083d9ef6da3fafc38e0519e2f 242 | Requires: Rcpp, forcats, hms, readr, tibble 243 | 244 | Package: highr 245 | Source: CRAN 246 | Version: 0.7 247 | Hash: 20757f5c393ed0ecf96c9e8e6d8d514c 248 | 249 | Package: hms 250 | Source: CRAN 251 | Version: 0.4.2 252 | Hash: b4096a4f6a6736138e9a825c2baaacf0 253 | Requires: pkgconfig, rlang 254 | 255 | Package: htmltools 256 | Source: CRAN 257 | Version: 0.3.6 258 | Hash: 9707abea0a9b7406e98fb1242e97e1f6 259 | Requires: Rcpp, digest 260 | 261 | Package: httr 262 | Source: CRAN 263 | Version: 1.4.0 264 | Hash: 62d62d3ffcc9a34411b6e35a813f72dd 265 | Requires: R6, curl, jsonlite, mime, openssl 266 | 267 | Package: ini 268 | Source: CRAN 269 | Version: 0.3.1 270 | Hash: 9d6de5178c1cedabfb24e7d2acc9a092 271 | 272 | Package: jsonlite 273 | Source: CRAN 274 | Version: 1.6 275 | Hash: 5f969e213e966135393e3e304abf3f49 276 | 277 | Package: knitr 278 | Source: CRAN 279 | Version: 1.21 280 | Hash: 05d92a30fe7f149fef5e82428cbbe0d7 281 | Requires: evaluate, highr, markdown, stringr, xfun, yaml 282 | 283 | Package: labeling 284 | Source: CRAN 285 | Version: 0.3 286 | Hash: ecf589b42cd284b03a4beb9665482d3e 287 | 288 | Package: lazyeval 289 | Source: CRAN 290 | Version: 0.2.1 291 | Hash: 88926ad9c43581fd0822a37c8ed09f05 292 | 293 | Package: lubridate 294 | Source: CRAN 295 | Version: 1.7.4 296 | Hash: a7c783782f0e50be33b31f859c11333e 297 | Requires: Rcpp, stringr 298 | 299 | Package: magrittr 300 | Source: CRAN 301 | Version: 1.5 302 | Hash: bdc4d48c3135e8f3b399536ddf160df4 303 | 304 | Package: markdown 305 | Source: CRAN 306 | Version: 0.9 307 | Hash: 730f688930cef3223d59bd8aef679ab9 308 | Requires: mime 309 | 310 | Package: memoise 311 | 
Source: CRAN 312 | Version: 1.1.0 313 | Hash: 410fcd334bc626db100237cc1370f2e9 314 | Requires: digest 315 | 316 | Package: mime 317 | Source: CRAN 318 | Version: 0.6 319 | Hash: 2ed8f98b8284ad733f3907fc6e2f1334 320 | 321 | Package: modelr 322 | Source: CRAN 323 | Version: 0.1.2 324 | Hash: f691854a99ac7814f3853d499408d9a3 325 | Requires: broom, dplyr, magrittr, purrr, rlang, tibble, tidyr 326 | 327 | Package: munsell 328 | Source: CRAN 329 | Version: 0.5.0 330 | Hash: 38d0efee9bb99bef143bde41c4ce715c 331 | Requires: colorspace 332 | 333 | Package: openssl 334 | Source: CRAN 335 | Version: 1.1 336 | Hash: 82c893294829c4badc3d4133b66d1428 337 | 338 | Package: packrat 339 | Source: CRAN 340 | Version: 0.5.0 341 | Hash: 498643e765d1442ba7b1160a1df3abf9 342 | 343 | Package: pacman 344 | Source: CRAN 345 | Version: 0.5.0 346 | Hash: c05b1da1943bdd4a7b255df08d0a5e3f 347 | Requires: remotes 348 | 349 | Package: pillar 350 | Source: CRAN 351 | Version: 1.3.1 352 | Hash: 9ed4c2a5d3047bfba3e852ad5e806d91 353 | Requires: cli, crayon, fansi, rlang, utf8 354 | 355 | Package: pkgbuild 356 | Source: CRAN 357 | Version: 1.0.2 358 | Hash: 770e918f4c389e4fef9a891d043bf84f 359 | Requires: R6, callr, cli, crayon, desc, prettyunits, rprojroot, withr 360 | 361 | Package: pkgconfig 362 | Source: CRAN 363 | Version: 2.0.2 364 | Hash: b0fd6ed908e150b77e5f00c6478bd58c 365 | 366 | Package: pkgload 367 | Source: CRAN 368 | Version: 1.0.2 369 | Hash: 41eb2db35be61f6f9e8864cf87a1ecb0 370 | Requires: desc, pkgbuild, rlang, rprojroot, rstudioapi, withr 371 | 372 | Package: plogr 373 | Source: CRAN 374 | Version: 0.2.0 375 | Hash: 81a8008a5e7858552503935f1abe48aa 376 | 377 | Package: plyr 378 | Source: CRAN 379 | Version: 1.8.4 380 | Hash: ec185c885aab7ec91693d78c20cb5d1a 381 | Requires: Rcpp 382 | 383 | Package: prettyunits 384 | Source: CRAN 385 | Version: 1.0.2 386 | Hash: 49286102a855640daaa38eafe8b1ec30 387 | Requires: assertthat, magrittr 388 | 389 | Package: processx 390 | Source: CRAN 391 | 
Version: 3.2.1 392 | Hash: fdc6a66626b75f96ee28ffc770d9ebd7 393 | Requires: R6, ps 394 | 395 | Package: progress 396 | Source: CRAN 397 | Version: 1.2.0 398 | Hash: 0ff5f631b66daf57857f28062f3e6f30 399 | Requires: R6, crayon, hms, prettyunits 400 | 401 | Package: ps 402 | Source: CRAN 403 | Version: 1.3.0 404 | Hash: 1d4cae95887ffe5b1a22bea5994476cd 405 | 406 | Package: purrr 407 | Source: CRAN 408 | Version: 0.2.5 409 | Hash: 8b0c16db10c7e20b70cd37779a673a8b 410 | Requires: magrittr, rlang, tibble 411 | 412 | Package: rcmdcheck 413 | Source: CRAN 414 | Version: 1.3.2 415 | Hash: 162a63e909423b7c5f44c30711135481 416 | Requires: R6, callr, cli, crayon, desc, digest, pkgbuild, prettyunits, 417 | rprojroot, sessioninfo, withr, xopen 418 | 419 | Package: readr 420 | Source: CRAN 421 | Version: 1.3.1 422 | Hash: ed1b5520a0df4007bc971658ee543b00 423 | Requires: BH, R6, Rcpp, clipr, crayon, hms, tibble 424 | 425 | Package: readxl 426 | Source: CRAN 427 | Version: 1.2.0 428 | Hash: 3d49de1375021e909463ce4bdb613327 429 | Requires: Rcpp, cellranger, progress, tibble 430 | 431 | Package: rematch 432 | Source: CRAN 433 | Version: 1.0.1 434 | Hash: ad4faf59e7611117ff165817074c50c7 435 | 436 | Package: remotes 437 | Source: CRAN 438 | Version: 2.0.2 439 | Hash: cfcf5e119ed5e6af4b47530ba59101f3 440 | 441 | Package: reprex 442 | Source: CRAN 443 | Version: 0.2.1 444 | Hash: b30c7fbcb528a71d7ffcd07c9982589d 445 | Requires: callr, clipr, fs, rlang, rmarkdown, whisker, withr 446 | 447 | Package: reshape2 448 | Source: CRAN 449 | Version: 1.4.3 450 | Hash: c950c8ac85b81209635acb3ce21b4cce 451 | Requires: Rcpp, plyr, stringr 452 | 453 | Package: rlang 454 | Source: CRAN 455 | Version: 0.3.0.1 456 | Hash: 35fb7a51d5d756c56f793ed9c381fb84 457 | 458 | Package: rmarkdown 459 | Source: CRAN 460 | Version: 1.11 461 | Hash: a30b0c41b60d981f41dc8c98f2136297 462 | Requires: base64enc, evaluate, htmltools, jsonlite, knitr, mime, 463 | stringr, tinytex, yaml 464 | 465 | Package: rprojroot 466 | 
Source: CRAN 467 | Version: 1.3-2 468 | Hash: a25c3f70c166fb3fbabc410eb32b6366 469 | Requires: backports 470 | 471 | Package: rstudioapi 472 | Source: CRAN 473 | Version: 0.8 474 | Hash: 9ba7fe76dcaf96966c449527ca04bb78 475 | 476 | Package: rvest 477 | Source: CRAN 478 | Version: 0.3.2 479 | Hash: c69f7526520bad66fd2111ebe8b1364b 480 | Requires: httr, magrittr, selectr, xml2 481 | 482 | Package: sass 483 | Source: github 484 | Version: 0.1.0.9000 485 | Hash: 2b4ab4b71b56f10605e901cae30c2dad 486 | Requires: digest 487 | GithubRepo: sass 488 | GithubUsername: rstudio 489 | GithubRef: master 490 | GithubSha1: c73867da1d7a07b65da1d175ae072417f7387404 491 | RemoteHost: api.github.com 492 | RemoteRepo: sass 493 | RemoteUsername: rstudio 494 | RemoteRef: master 495 | RemoteSha: c73867da1d7a07b65da1d175ae072417f7387404 496 | 497 | Package: scales 498 | Source: CRAN 499 | Version: 1.0.0 500 | Hash: 7d9b717abcec656ae7c5c982d72b75e5 501 | Requires: R6, RColorBrewer, Rcpp, labeling, munsell, viridisLite 502 | 503 | Package: selectr 504 | Source: CRAN 505 | Version: 0.4-1 506 | Hash: b12802c11e35dec9d16a74d30ed0f3ed 507 | Requires: R6, stringr 508 | 509 | Package: sessioninfo 510 | Source: CRAN 511 | Version: 1.1.1 512 | Hash: 9e50c8458e611f166ba702277cbb5096 513 | Requires: cli, withr 514 | 515 | Package: stringi 516 | Source: CRAN 517 | Version: 1.2.4 518 | Hash: 03ab60ef7fa4627b38ad67c95ce6b04c 519 | 520 | Package: stringr 521 | Source: CRAN 522 | Version: 1.3.1 523 | Hash: 9f417a1d899ed1f080942ab36998e8b5 524 | Requires: glue, magrittr, stringi 525 | 526 | Package: tibble 527 | Source: CRAN 528 | Version: 1.4.2 529 | Hash: 83895360ce4f8d2ce92eee00526b5b0b 530 | Requires: cli, crayon, pillar, rlang 531 | 532 | Package: tidyr 533 | Source: CRAN 534 | Version: 0.8.2 535 | Hash: 9ff92b9b3c11dfcd94d179b75b85c668 536 | Requires: Rcpp, dplyr, glue, magrittr, purrr, rlang, stringi, tibble, 537 | tidyselect 538 | 539 | Package: tidyselect 540 | Source: CRAN 541 | Version: 0.2.5 542 
| Hash: fad1cf10c5c4996fca6ca68e0716d2e6 543 | Requires: Rcpp, glue, purrr, rlang 544 | 545 | Package: tidyverse 546 | Source: CRAN 547 | Version: 1.2.1 548 | Hash: 1b090209cb20b6fc6eba75de8b7f0b53 549 | Requires: broom, cli, crayon, dbplyr, dplyr, forcats, ggplot2, haven, 550 | hms, httr, jsonlite, lubridate, magrittr, modelr, purrr, readr, 551 | readxl, reprex, rlang, rstudioapi, rvest, stringr, tibble, tidyr, 552 | xml2 553 | 554 | Package: tinytex 555 | Source: CRAN 556 | Version: 0.9 557 | Hash: a4ffe98c58eace5a95644d8fd6ca5a50 558 | Requires: xfun 559 | 560 | Package: usethis 561 | Source: CRAN 562 | Version: 1.4.0 563 | Hash: bdf0ce7802818c5dfc592dd73d62db5b 564 | Requires: clipr, clisymbols, crayon, curl, desc, fs, gh, git2r, glue, 565 | rlang, rprojroot, rstudioapi, whisker 566 | 567 | Package: utf8 568 | Source: CRAN 569 | Version: 1.1.4 570 | Hash: f3f97ce59092abc8ed3fd098a59e236c 571 | 572 | Package: viridisLite 573 | Source: CRAN 574 | Version: 0.3.0 575 | Hash: 78bb072c4f9e729a283d4c40ec93f9c6 576 | 577 | Package: whisker 578 | Source: CRAN 579 | Version: 0.3-2 580 | Hash: 803d662762e532705c2c066a82d066e7 581 | 582 | Package: withr 583 | Source: CRAN 584 | Version: 2.1.2 585 | Hash: d534108bcd5f34ec73e9eb523751ba20 586 | 587 | Package: xfun 588 | Source: CRAN 589 | Version: 0.4 590 | Hash: 76004125b45195c0330ec71904849995 591 | 592 | Package: xml2 593 | Source: CRAN 594 | Version: 1.2.0 595 | Hash: 3f00e5347a6d7ccad2237fe1bc1df6d0 596 | Requires: Rcpp 597 | 598 | Package: xopen 599 | Source: CRAN 600 | Version: 1.0.0 601 | Hash: 22c2708f177f9fd9f8a52012bac61d6a 602 | Requires: processx 603 | 604 | Package: yaml 605 | Source: CRAN 606 | Version: 2.2.0 607 | Hash: a5ad5616d83d89f8d84cbf3cf4034e13 608 | -------------------------------------------------------------------------------- /genre_analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Movie Genre Overlappings, Ratings by Genre and Year" 
3 | author: "Ceshine Lee" 4 | output: 5 | html_notebook 6 | --- 7 | 8 | ### (Exploring the Movies Dataset with rstudio/gt package) 9 | 10 | Tables can be an effective way of communicating data. Though not as powerful as charts at telling stories, tables cram a lot of numbers into limited space and can provide readers with accurate and potentially useful information that they can interpret in their own ways. 11 | 12 | I've come across this new R package [**gt**](https://github.com/rstudio/gt) ("Easily generate information-rich, publication-quality tables from R") and decided to give it a try. 13 | 14 | > With the gt package, anyone can make wonderful-looking tables using the R programming language. 15 | 16 | Admittedly, the tables in this document might not be the optimal way of presenting the data. They serve as a demonstration of what **gt** can do, and may also be helpful for analysts constructing their own stories about this dataset. 17 | 18 | (Dataset Source: [The Movies Dataset on Kaggle](https://www.kaggle.com/rounakbanik/the-movies-dataset).) 19 | 20 | ![](https://raw.githubusercontent.com/rstudio/gt/master/man/figures/gt_parts_of_a_table.svg?sanitize=true) 21 | 22 | ```{r, message=FALSE, results="hide"} 23 | # install.packages("packrat") 24 | # install.packages("pacman") 25 | # install.packages("devtools") 26 | # install.packages("tidyverse") 27 | # devtools::install_github("rstudio/gt") 28 | library(gt) 29 | pacman::p_load("tidyverse") 30 | pacman::p_load("data.table") 31 | pacman::p_load("jsonlite") 32 | pacman::p_load("lubridate") 33 | ``` 34 | 35 | ## Read and clean the Metadata 36 | 37 | Some of the fields read in here are not used in the later sections. However, they are kept as a reminder of the potentially useful information. 
38 | 39 | ```{r read_clean_metadata} 40 | metadata <- fread("data/movies_metadata.csv", select=c('adult', 'genres', 'release_date', 'original_language', 'original_title', 'id', 'imdb_id'), fill=TRUE) 41 | metadata <- metadata[!is.na(as.integer(id)) & (original_language == 'en') & (adult == "False"), ] 42 | metadata[, id := as.integer(id)] 43 | # Replace single quotes in 'genres' so 'fromJSON' can parse it; the result is stored in a new 'genre' column (refer to 'genres' by its full name rather than relying on `$` partial matching) 44 | metadata[, genre := gsub("\'","\"", genres)] 45 | # Remove redundant fields 46 | metadata[, c('original_language', 'adult', "imdb_id") := NULL] 47 | ``` 48 | 49 | ## Parse Genres 50 | 51 | Genre information is stored as JSON text. We need to parse it and do some transformation. 52 | 53 | ```{r parse_genre} 54 | genres <- metadata[, unlist(lapply(genre, fromJSON), recursive=FALSE)['name'], by=id] 55 | sorted.genres <- genres[, .N, by=name][order(-N)] 56 | head(sorted.genres, 20) 57 | ``` 58 | 59 | Assuming genres are assigned in order of representativeness, we take at most the first three genres for each movie. Furthermore, only the top 6 genres are considered. 60 | 61 | ```{r} 62 | # Dummy variable for the later dcast operation 63 | genres[, dummy := 1] 64 | # Only use the first 3 assigned genres (movies with fewer than 3 yield NA rows, which na.omit drops) 65 | encoded.genres <- dcast(na.omit(genres[,.SD[1:3], by=id])[name %in% sorted.genres[1:6, name]], 66 | id ~ name, value.var='dummy', fill=0) 67 | data.w.genre <- merge(encoded.genres, 68 | metadata[, .(id, original_title, release_date)], 69 | all.x=TRUE, by="id") 70 | ``` 71 | 72 | ## Genre Statistics 73 | 74 | Here I used a for loop to collect and calculate statistics genre by genre. I later figured out that a better way to do this is probably a *melt* call followed by several *ratio* column calculations. The later sections will use the *melt* approach instead. 

```{r}
# The six most frequent genres, computed once instead of on every iteration
top.genres <- sorted.genres[1:6, name]
tmp.raw <- vector("list", length(top.genres))
tmp.ratio <- vector("list", length(top.genres))
for (i in seq_along(top.genres)) {
    genre.name <- top.genres[i]
    # Among the movies tagged with this genre, sum the indicator column of
    # each top genre (i.e. count the co-occurrences).
    dt.tmp <- data.w.genre[get(genre.name) == 1, lapply(.SD, sum), .SDcols = top.genres]
    # Divide by the genre's own count to turn the counts into ratios. This is
    # done before the character `genre` column is added to the counts.
    ratio.tmp <- dt.tmp / dt.tmp[[genre.name]]
    dt.tmp[, genre := genre.name]
    ratio.tmp[, genre := genre.name]
    tmp.raw[[i]] <- dt.tmp
    tmp.ratio[[i]] <- ratio.tmp
}
# One row per genre: the raw counts followed by the "<Genre>.ratio" columns
genre.stats <- merge(
    rbindlist(tmp.raw), rbindlist(tmp.ratio),
    by = "genre", suffixes = c("", ".ratio"), sort = FALSE
)
rm(tmp.raw, tmp.ratio, dt.tmp, ratio.tmp, genre.name, top.genres, i)
```

The actual table declaration and rendering part. Unfortunately it requires a lot of manual labeling and assignments. But I think it is verbose and readable enough.

```{r}
gt_tbl <- gt(data = genre.stats, rowname_col = "genre")
gt_tbl %>%
    tab_header(
        title = "Movie Metadata Stats"
    ) %>%
    tab_spanner(
        label = "Genre Overlappings",
        columns = vars(Drama, Comedy, Thriller, Romance, Action, Horror)
    ) %>%
    tab_spanner(
        label = "Genre Overlappings(%)",
        columns = vars(Drama.ratio, Comedy.ratio, Thriller.ratio, Romance.ratio, Action.ratio, Horror.ratio)
    ) %>%
    fmt_percent(
        columns = vars(Drama.ratio, Comedy.ratio, Thriller.ratio, Romance.ratio, Action.ratio, Horror.ratio),
        decimals = 1,
        drop_trailing_zeros = FALSE
    ) %>%
    # The link target must be a full URL; a bare "rounakbanik/..." path is
    # rendered as a relative (broken) link.
    tab_source_note(
        source_note = md("Source: [\"The Movies Dataset\"](https://www.kaggle.com/rounakbanik/the-movies-dataset) on Kaggle")
    ) %>%
    cols_label(
        Drama.ratio = "Drama",
        Comedy.ratio = "Comedy",
        Thriller.ratio = "Thriller",
        Romance.ratio = "Romance",
        Action.ratio = "Action",
        Horror.ratio = "Horror"
    ) %>%
    tab_stubhead_label(label = "Genre") %>%
    tab_style(
        style = cells_styles(
            text_size = px(12)),
        locations = list(
            cells_column_labels(columns = 1:12),
            cells_stub(),
            cells_data()
        )) %>%
    # Emphasize the diagonal of the count section (columns 1-6). This assumes
    # the rows follow the same genre order as the columns.
    tab_style(
        style = cells_styles(
            text_decorate = "underline",
            text_weight = "bold"),
        locations = list(
            cells_data(columns = c(1), rows = c(1)),
            cells_data(columns = c(2), rows = c(2)),
            cells_data(columns = c(3), rows = c(3)),
            cells_data(columns = c(4), rows = c(4)),
            cells_data(columns = c(5), rows = c(5)),
            cells_data(columns = c(6), rows = c(6))
        )) %>%
    # Grey out the diagonal of the ratio section (columns 7-12)
    tab_style(
        style = cells_styles(
            text_color = "lightgrey"),
        locations = list(
            cells_data(columns = c(7), rows = c(1)),
            cells_data(columns = c(8), rows = c(2)),
            cells_data(columns = c(9), rows = c(3)),
            cells_data(columns = c(10), rows = c(4)),
            cells_data(columns = c(11), rows = c(5)),
            cells_data(columns = c(12), rows = c(6))
        ))
```

*How to read the table*: For example, Drama (the first row) has 13008 movies (the first column), and 2639 of them are also under Comedy, which is 20.3% of the 13008 movies.

The diagonal elements of the 'Genre Overlappings' section are emphasized because they represent the movie count of the respective genre. Similarly, the diagonal elements of the 'Genre Overlappings(%)' section are somewhat hidden since they convey almost no useful information.


## Reading Ratings

Now we turn our attention to movie ratings (from movielens). We only consider the average rating of a movie (which of course is not perfect).

```{r, read_ratings}
ratings <- fread("data/ratings.csv", select = c("movieId", "rating"))
# Collapse the individual ratings into one row per movie
ratings.stats <- ratings[, .(n_ratings = .N, avg_rating = mean(rating)), by = movieId]
# The raw ratings are not used again below
rm(ratings)
head(ratings.stats)
```

Here we remove movies with fewer than 500 ratings.

```{r}
# Inner join: only movies that have both genre information and ratings remain.
# NOTE(review): `id` comes from movies_metadata.csv and `movieId` from
# ratings.csv; confirm that both files use the same identifier scheme (the
# Kaggle dataset also ships a links.csv id mapping) - TODO verify.
data.w.genre.ratings <- merge(data.w.genre, ratings.stats, by.y = "movieId", by.x = "id")
# Only keep movies with >= 500 ratings
data.w.genre.ratings <- data.w.genre.ratings[n_ratings >= 500]
head(data.w.genre.ratings)
```

## Rating statistics by Genre

Using a *melt* call followed by a bunch of data.table operations to prepare the metrics:

```{r}
# Long format: one row per (movie, genre) indicator; `value == 1` keeps the
# genres a movie actually belongs to. The arguments inside `.()` are evaluated
# independently, so every aggregate reads the per-movie `avg_rating` and
# `n_ratings` columns even though two outputs reuse those names.
rating.by.genre <- melt(
    data.w.genre.ratings,
    id.vars = c("id", "avg_rating", "n_ratings"),
    measure.vars = c("Action", "Comedy", "Drama", "Horror", "Romance", "Thriller"),
    variable.name = "genre"
)[value == 1][
    , .(
        n_movies = .N,
        min_rating = min(avg_rating),
        avg_rating = mean(avg_rating),
        med_rating = median(avg_rating),
        max_rating = max(avg_rating),
        sd_rating = sd(avg_rating),
        n_ratings = sum(n_ratings),
        avg_n_ratings = mean(n_ratings),
        max_n_ratings = max(n_ratings),
        min_n_ratings = min(n_ratings)
    ),
    by = genre]
head(rating.by.genre)
```

The table follows roughly the same recipe (unfortunately I did not find a reasonable way to make use of the row group feature of *gt* for this dataset):

```{r}
gt_tbl <- gt(data = rating.by.genre, rowname_col = "genre")
gt_tbl %>%
    tab_header(
        title = "Movie Average Ratings by Genre",
        subtitle = "with # of Ratings >= 500"
    ) %>%
    tab_spanner(
        label = "Average Rating (1-5)",
        columns = vars(min_rating, avg_rating, med_rating, max_rating, sd_rating)
    ) %>%
    fmt_number(
        columns = vars(min_rating, avg_rating, med_rating, max_rating, sd_rating),
        decimals = 2,
        drop_trailing_zeros = FALSE
    ) %>%
    tab_spanner(
        label = "# of Ratings",
        columns = vars(n_ratings, min_n_ratings, avg_n_ratings, max_n_ratings)
    ) %>%
    fmt_number(
        columns = vars(n_ratings, min_n_ratings, avg_n_ratings, max_n_ratings),
        decimals = 0,
        drop_trailing_zeros = FALSE
    ) %>%
    # The link target must be a full URL; a bare "rounakbanik/..." path is
    # rendered as a relative (broken) link.
    tab_source_note(
        source_note = md("Source: [\"The Movies Dataset\"](https://www.kaggle.com/rounakbanik/the-movies-dataset) on Kaggle")
    ) %>%
    cols_label(
        min_rating = "Min",
        avg_rating = "Avg",
        med_rating = "Med",
        max_rating = "Max",
        n_ratings = "Total",
        min_n_ratings = "Min",
        avg_n_ratings = "Avg",
        max_n_ratings = "Max",
        n_movies = "Movies",
        sd_rating = "Stdev"
    ) %>%
    tab_stubhead_label(label = "Genre") %>%
    tab_style(
        style = cells_styles(
            text_size = px(14)),
        locations = list(
            cells_column_labels(columns = 1:10),
            cells_stub(),
            cells_data()
        ))
```

Surprisingly, the distributions of ratings are quite similar across all genres. Maybe movielens has done some normalization on the ratings?


## Rating statistics by Genre and Year

We'd also like to know if the distributions of ratings change over time. First we plot the histogram of the years in which the movies were released. This is a case where a histogram is far more readable than a table.

```{r}
data.w.genre.ratings[, release_year := year(ymd(release_date))]
# Number of movies per release year, from 1950 onwards
ggplot(
    data.w.genre.ratings[release_year >= 1950][order(release_year), .N, by = release_year],
    aes(x = release_year, y = N)
) +
    geom_bar(stat = "identity") +
    ggtitle("# of Movie by Year") +
    theme_bw() +
    scale_x_continuous(breaks = seq(1950, 2020, 10))
```

Now we only take movies released after 1979 and before 2010 into account, and put them into buckets each representing a decade.

```{r}
data.w.genre.ratings <- data.w.genre.ratings[(release_year >= 1980) & (release_year < 2010)]
# Integer division maps e.g. 1987 -> 1980
data.w.genre.ratings[, release_decade := (release_year %/% 10) * 10]
```

The same old *melt* trick:

```{r}
# Same aggregation as the per-genre table, additionally grouped by decade.
# The arguments inside `.()` are evaluated independently, so every aggregate
# reads the per-movie `avg_rating` and `n_ratings` columns.
rating.by.genre.year <- melt(
    data.w.genre.ratings,
    id.vars = c("id", "avg_rating", "n_ratings", "release_decade"),
    measure.vars = c("Action", "Comedy", "Drama", "Horror", "Romance", "Thriller"),
    variable.name = "genre"
)[value == 1][
    , .(
        n_movies = .N,
        min_rating = min(avg_rating),
        avg_rating = mean(avg_rating),
        med_rating = median(avg_rating),
        max_rating = max(avg_rating),
        sd_rating = sd(avg_rating),
        n_ratings = sum(n_ratings),
        avg_n_ratings = mean(n_ratings),
        max_n_ratings = max(n_ratings),
        min_n_ratings = min(n_ratings)
    ),
    by = .(genre, release_decade)]
```

Even more manual labeling and a *dcast* call to handle the extra *decade* dimension:

```{r}
# Spread the decades into columns named "<metric>_<decade>", then order the
# columns decade by decade so each spanner covers four adjacent columns.
gt_tbl <- gt(
    data = dcast(
        rating.by.genre.year, genre ~ release_decade,
        value.var = c("avg_rating", "sd_rating", "avg_n_ratings", "n_movies")
    )[, .(genre,
          n_movies_1980, avg_rating_1980, sd_rating_1980, avg_n_ratings_1980,
          n_movies_1990, avg_rating_1990, sd_rating_1990, avg_n_ratings_1990,
          n_movies_2000, avg_rating_2000, sd_rating_2000, avg_n_ratings_2000
    )],
    rowname_col = "genre")
gt_tbl %>%
    tab_header(
        title = "Movie Average Ratings by Genre & Year",
        subtitle = "with # of Ratings >= 500"
    ) %>%
    tab_spanner(
        label = "1980s",
        columns = vars(n_movies_1980, avg_rating_1980, sd_rating_1980, avg_n_ratings_1980)
    ) %>%
    tab_spanner(
        label = "1990s",
        columns = vars(n_movies_1990, avg_rating_1990, sd_rating_1990, avg_n_ratings_1990)
    ) %>%
    tab_spanner(
        label = "2000s",
        columns = vars(n_movies_2000, avg_rating_2000, sd_rating_2000, avg_n_ratings_2000)
    ) %>%
    fmt_number(
        columns = vars(avg_rating_1980, sd_rating_1980, avg_rating_1990, sd_rating_1990, avg_rating_2000, sd_rating_2000),
        decimals = 2,
        drop_trailing_zeros = FALSE
    ) %>%
    fmt_number(
        columns = vars(avg_n_ratings_1980, avg_n_ratings_1990, avg_n_ratings_2000, n_movies_1980, n_movies_1990, n_movies_2000),
        decimals = 0,
        drop_trailing_zeros = FALSE
    ) %>%
    # The link target must be a full URL; a bare "rounakbanik/..." path is
    # rendered as a relative (broken) link.
    tab_source_note(
        source_note = md("Source: [\"The Movies Dataset\"](https://www.kaggle.com/rounakbanik/the-movies-dataset) on Kaggle")
    ) %>%
    # Columns 1, 5, 9 are the n_movies_* columns of the three decades
    tab_footnote(
        footnote = "#: Number of movies",
        locations = cells_column_labels(columns = c(1, 5, 9))
    ) %>%
    # Columns 4, 8, 12 are the avg_n_ratings_* columns of the three decades
    tab_footnote(
        footnote = "Avg #: Average number of ratings.",
        locations = cells_column_labels(columns = c(4, 8, 12))
    ) %>%
    cols_label(
        n_movies_1980 = "#",
        n_movies_1990 = "#",
        n_movies_2000 = "#",
        avg_rating_1980 = "Avg",
        avg_rating_1990 = "Avg",
        avg_rating_2000 = "Avg",
        sd_rating_1980 = "Stdev",
        sd_rating_1990 = "Stdev",
        sd_rating_2000 = "Stdev",
        avg_n_ratings_1980 = "Avg #",
        avg_n_ratings_1990 = "Avg #",
        avg_n_ratings_2000 = "Avg #"
    ) %>%
    tab_stubhead_label(label = "Genre") %>%
    tab_style(
        style = cells_styles(
            text_size = px(14)),
        locations = list(
            cells_column_labels(columns = 1:12),
            cells_stub(),
            cells_data()
        ))
```

It appears the variances of ratings in the 1980s are lower than in the 1990s and 2000s. (A boxplot or scatterplot might be more appropriate here.)

## Acknowledgements

* Erik Bruin's Kaggle notebook ["Movie recommendation systems based on TMDB"](https://www.kaggle.com/erikbruin/movie-recommendation-systems-based-on-tmdb) has been a great source of ideas with regard to manipulating this dataset using R.

--------------------------------------------------------------------------------