├── Deadliest movies scrape
│   ├── code
│   │   ├── film-death-counts-Python.csv
│   │   ├── imdb-scraper.R
│   │   ├── imdb-scraper.py
│   │   ├── movie-scraper.R
│   │   ├── movie-scraper.py
│   │   ├── movies-R-full.csv
│   │   ├── movies-R.csv
│   │   ├── movies-python.csv
│   │   └── movies.csv
│   ├── custom.css
│   ├── notebook.R
│   ├── notebook.html
│   ├── notebook.md
│   ├── notebook2.R
│   ├── notebook2.html
│   ├── notebook2.md
│   ├── pandoc_config.txt
│   ├── pandoc_config2.txt
│   └── programming_cat.jpg
├── Deadliest movies
│   ├── bloody_gun.jpg
│   ├── code
│   │   ├── code.R
│   │   └── code.py
│   ├── custom.css
│   ├── figure
│   │   ├── baseGraphR.png
│   │   ├── gunR.png
│   │   ├── prettyR.png
│   │   └── rightLabelsR.png
│   ├── figurePy
│   │   ├── basePy.png
│   │   ├── finalPy.png
│   │   └── prettyPy.png
│   ├── pandoc_config.txt
│   ├── run.R
│   ├── run.html
│   └── run.md
├── Linear regression
│   ├── Linear regression.Rproj
│   ├── code
│   │   ├── code.R
│   │   └── code.ipynb
│   ├── custom.css
│   ├── figure
│   │   ├── graphBaseR.png
│   │   └── graphPredictR.png
│   ├── figurePy
│   │   ├── graphBasePy.png
│   │   └── graphPredictPy.png
│   ├── notebook.R
│   ├── notebook.html
│   ├── notebook.md
│   └── pandoc_config.txt
└── README.md

/Deadliest movies scrape/code/imdb-scraper.R:
--------------------------------------------------------------------------------
1 | #' Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier)
2 | #'
3 | #' This script is free software: you can redistribute it and/or modify it under
4 | #' the terms of the GNU General Public License as published by the Free Software
5 | #' Foundation, either version 3 of the License, or (at your option) any later
6 | #' version.
7 | #'
8 | #' This script is distributed in the hope that it will be useful, but WITHOUT
9 | #' ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10 | #' FOR A PARTICULAR PURPOSE.
11 | #'
12 | #' See the GNU General Public License for more details.
13 | #'
14 | #' You should have received a copy of the GNU General Public License along with
15 | #' this script. If not, see http://www.gnu.org/licenses/.
16 | #'
17 |
18 | #' **Document title:** R vs Python - Round 2 (2/2)
19 | #'
20 | #' **Date:** February 2, 2014
21 | #'
22 | #' **Author:** Simon Garnier (http://www.theswarmlab.com / @sjmgarnier)
23 | #'
24 | #' **Description:** This script scrapes data out of www.imdb.com. For more
25 | #' information, see http://www.theswarmlab.com/r-vs-python-round-2/ and
26 | #' http://www.theswarmlab.com/r-vs-python-round-2-22
27 | #'
28 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)).
29 | #'
30 |
31 | # Load libraries
32 | # No additional libraries needed here. Yeah!
33 |
34 | # Create IMDB scraper
35 | IMDb <- function(ID) {
36 |   # Retrieve movie info from IMDb.com.
37 |   #
38 |   # Args:
39 |   #   ID: IDs of the movies.
40 |   #
41 |   # Returns:
42 |   #   A data frame containing one line per movie, and nine columns: movie ID,
43 |   #   film title, year of release, duration in minutes, MPAA rating, genre(s),
44 |   #   director(s), IMDb rating, and full cast.
45 |
46 |   # Load required libraries
47 |   require(XML)
48 |   require(pbapply)  # Apply functions with progress bars!!!
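  # Note: require() only warns (and returns FALSE) if a package is missing,
  # whereas library() would stop with an error; both packages are assumed to
  # be installed before running this scraper.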
49 |
50 |   # Wrap core of the function in do.call and pblapply in order to
51 |   # pseudo-vectorize it (pblapply) and return a data frame (do.call)
52 |   info <- do.call(rbind, pblapply(ID, FUN = function(ID) {
53 |     # Create movie URL on IMDb.com
54 |     URL <- paste0("http://www.imdb.com/title/tt", ID)
55 |
56 |     # Download and parse HTML of IMDb page
57 |     parsed.html <- htmlParse(URL)
58 |
59 |     # Find title
60 |     Film <- xpathSApply(parsed.html, "//h1[@class='header']/span[@class='itemprop']", xmlValue)
61 |
62 |     # Find year
63 |     Year <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//h1[@class='header']/span[@class='nobr']", xmlValue)))
64 |
65 |     # Find duration in minutes
66 |     Length_Minutes <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//div[@class='infobar']/time[@itemprop='duration']", xmlValue)))
67 |
68 |     # Find MPAA rating
69 |     MPAA_Rating <- unname(xpathSApply(parsed.html, "//div[@class='infobar']/span/@content"))
70 |     if (!is.character(MPAA_Rating)) {  # Some movies don't have an MPAA rating
71 |       MPAA_Rating <- "UNRATED"
72 |     }
73 |
74 |     # Find genre
75 |     Genre <- paste(xpathSApply(parsed.html, "//span[@class='itemprop' and @itemprop='genre']", xmlValue), collapse='|')
76 |
77 |     # Find director
78 |     Director <- paste(xpathSApply(parsed.html, "//div[@itemprop='director']/a", xmlValue), collapse='|')
79 |
80 |     # Find IMDB rating
81 |     IMDB_rating <- as.numeric(xpathSApply(parsed.html, "//div[@class='titlePageSprite star-box-giga-star']", xmlValue))
82 |
83 |     # Extract full cast from the full credits page
84 |     parsed.html <- htmlParse(paste0(URL, "/fullcredits"))
85 |     Full_Cast <- paste(xpathSApply(parsed.html, "//span[@itemprop='name']", xmlValue), collapse='|')
86 |
87 |     data.frame(ID = ID, Film = Film, Year = Year, Length_Minutes = Length_Minutes,
88 |                MPAA_Rating = MPAA_Rating, Genre = Genre,
89 |                Director = Director, IMDB_rating = IMDB_rating, Full_Cast = Full_Cast)
90 |   }))
91 | }
92 |
93 | # Load data from last challenge
94 | data <- read.csv("movies-R.csv")
95 |
96 | # For each movie, extract IMDb info and append it to the data
97 | data <- within(data, {
98 |   # Extract ID number
99 |   IMDB_ID <- gsub("[^0-9]", "", IMDB_URL)
100 |
101 |   # Download IMDb info into a temporary variable
102 |   IMDB_Info <- IMDb(IMDB_ID)
103 |
104 |   # Save MPAA rating
105 |   MPAA_Rating <- IMDB_Info$MPAA_Rating
106 |
107 |   # Save genre(s)
108 |   Genre <- IMDB_Info$Genre
109 |
110 |   # Save director(s)
111 |   Director <- IMDB_Info$Director
112 |
113 |   # Save duration in minutes
114 |   Length_Minutes <- IMDB_Info$Length_Minutes
115 |
116 |   # Save IMDb rating
117 |   IMDB_rating <- IMDB_Info$IMDB_rating
118 |
119 |   # Save full cast
120 |   Full_Cast <- IMDB_Info$Full_Cast
121 |
122 |   # Delete IMDb info
123 |   IMDB_Info <- NULL
124 | })
125 |
126 | write.csv(data, file = "movies-R-full.csv")
127 |
128 |
129 |

--------------------------------------------------------------------------------
/Deadliest movies scrape/code/imdb-scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2014 Randal S. Olson
3 |
4 | This file is a script that combines data from www.MovieBodyCounts.com and IMDB.com to
5 | create a list of films, metadata about the films, and the number of on-screen body
6 | counts in the films. The script requires an internet connection and two libraries
7 | installed: imdbpy and pandas.
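(Assuming the packaging tools of the era, both can typically be installed
with "pip install imdbpy pandas"; the exact PyPI package names are assumed.)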
8 | 9 | 10 | This script is free software: you can redistribute it and/or modify it under the 11 | terms of the GNU General Public License as published by the Free Software Foundation, 12 | either version 3 of the License, or (at your option) any later version. 13 | 14 | This script is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 15 | without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 16 | See the GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License along with this script. 19 | If not, see http://www.gnu.org/licenses/. 20 | """ 21 | 22 | from imdb import IMDb 23 | import pandas as pd 24 | import re 25 | 26 | imdb_access = IMDb() 27 | movie_data = pd.read_csv("movies.csv") 28 | 29 | # Grab only the movie number out of the IMDB URL 30 | movie_data["Movie_Number"] = movie_data["IMDB_URL"].apply(lambda x: re.sub("[^0-9]", "", x)) 31 | 32 | with open("film-death-counts-Python.csv", "wb") as out_file: 33 | out_file.write("Film,Year,Body_Count,MPAA_Rating,Genre,Director,Actors,Length_Minutes,IMDB_Rating\n") 34 | 35 | for movie_entry in movie_data.iterrows(): 36 | # Use a try-catch on the loop to prevent temporary connection-related issues from stopping the scrape 37 | try: 38 | movie = imdb_access.get_movie(movie_entry[1]["Movie_Number"]) 39 | movie_fields = [] 40 | 41 | # Remove non-ASCII character encodings and commas from movie titles 42 | movie_fields.append(movie["title"].encode("ascii", "replace").replace(",", "")) 43 | movie_fields.append(str(movie["year"])) 44 | movie_fields.append(str(movie_entry[1]["Body_Count"])) 45 | 46 | # Some movies don't have MPAA Ratings on IMDB 47 | try: 48 | movie_fields.append(str(movie["mpaa"].split(" ")[1])) 49 | except: 50 | movie_fields.append("") 51 | 52 | # For movies with multiple genres/directors/actors, join them with bars | 53 | movie_fields.append(str("|".join(movie["genres"]))) 54 | movie_fields.append(str("|".join([str(x) for x in movie["director"]]))) 55 | movie_fields.append(str("|".join([str(x) for x in movie["cast"]]))) 56 | 57 | movie_fields.append(str(int(movie["runtime"][0].split(":")[-1]))) 58 | movie_fields.append(str(float(movie["rating"]))) 59 | 60 | # All entries are comma-delimited 61 | out_file.write(",".join(movie_fields) + "\n") 62 | 63 | except Exception as e: 64 | print "Error with", str(movie) 65 | -------------------------------------------------------------------------------- /Deadliest movies scrape/code/movie-scraper.R: -------------------------------------------------------------------------------- 1 | #' Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier) 2 | #' 3 | #' This script is free software: you can redistribute it and/or modify it under 4 | #' the terms of the GNU General Public License as published by the Free Software 5 | #' Foundation, either version 3 of the License, or (at your option) any later 6 | #' version. 7 | #' 8 | #' This script is distributed in the hope that it will be useful, but WITHOUT 9 | #' ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 10 | #' FOR A PARTICULAR PURPOSE. 11 | #' 12 | #' See the GNU General Public License for more details. 13 | #' 14 | #' You should have received a copy of the GNU General Public License along with 15 | #' this script. If not, see http://www.gnu.org/licenses/. 
16 | #'
17 |
18 | #' **Document title:** R vs Python - Round 2
19 | #'
20 | #' **Date:** January 12, 2014
21 | #'
22 | #' **Author:** Simon Garnier (http://www.theswarmlab.com / @sjmgarnier)
23 | #'
24 | #' **Description:** This script scrapes data out of 2 websites
25 | #' (www.MovieBodyCounts.com and www.imdb.com). For more information, see
26 | #' http://www.theswarmlab.com/r-vs-python-round-2/
27 | #'
28 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)).
29 | #'
30 |
31 | # Load libraries
32 | library(RCurl)   # Everything necessary to grab webpage
33 | library(XML)     # Everything necessary to parse HTML code
34 | library(pbapply) # Progress bars!!!
35 |
36 | # Create curl handle which can be used for multiple HTTP requests.
37 | # followlocation = TRUE in case one of the URLs we want to grab is a redirection
38 | # link.
39 | curl <- getCurlHandle(useragent = "R", followlocation = TRUE)
40 |
41 | # Prepare URLs of the movie lists alphabetically ordered by first letter of
42 | # movie title (capital A to Z, except for v and x) + "numbers" list (for movies
43 | # whose title starts with a number)
44 | urls.by.letter <- paste0("http://www.moviebodycounts.com/movies-",
45 |                          c("numbers", LETTERS[1:21], "v", "W", "x", "Y", "Z"), ".htm")
46 |
47 | # For each movie list... For loops are frowned upon in R, let's use the classier
48 | # apply functions instead. Here I use the pblapply from the pbapply package.
49 | # It's equivalent to the regular lapply function, but it provides a neat
50 | # progress bar. Unlist to get a vector.
51 | urls.by.movie <- unlist(pblapply(urls.by.letter, FUN = function(URL) {
52 |   # Load raw HTML
53 |   raw.html <- getURL(URL, curl = curl)
54 |
55 |   # Parse HTML content
56 |   parsed.html <- htmlParse(raw.html)
57 |
58 |   # Extract desired links from HTML content. The desired links are those after
59 |   # image 'graphic-movies.jpg' in the page
60 |   links <- as.vector(xpathSApply(parsed.html, "//img[@src='graphic-movies.jpg']/following::a/@href"))
61 |
62 |   if (!is.null(links)) {
63 |     ix = grepl("http://www.moviebodycounts.com/", links)
64 |     links[!ix] <- paste0("http://www.moviebodycounts.com/", links[!ix])
65 |     return(links)
66 |   }
67 | }), use.names = FALSE)
68 |
69 | # One URL is actually a shortcut to another page. Let's get rid of it.
70 | ix <- which(grepl("movies-C.htm", urls.by.movie))
71 | urls.by.movie <- urls.by.movie[-ix]
72 |
73 | # Ok, let's get serious now
74 |
75 | data <- do.call(rbind, pblapply(urls.by.movie, FUN = function(URL) {
76 |   # Load raw HTML
77 |   raw.html <- getURL(URL, curl = curl)
78 |
79 |   # Parse HTML content
80 |   parsed.html <- htmlParse(raw.html)
81 |
82 |   # Find movie title
83 |   # Title appears inside an XML/HTML node called "title" ("//title"). In this
84 |   # node, it comes after "Movie Body Counts: ". I use gsub to get rid of "Movie
85 |   # Body Counts: " and keep only the movie title.
86 |   Film <- xpathSApply(parsed.html, "//title", xmlValue)
87 |   Film <- gsub("Movie Body Counts: ", "", Film)
88 |
89 |   # Find movie year
90 |   # The year is usually a text inside ("/descendant::text()") a link node
91 |   # ("//a") which source contains the string "charts-year" ("[contains(@href,
92 |   # 'charts-year')]").
93 |   Year <- as.numeric(xpathSApply(parsed.html, "//a[contains(@href, 'charts-year')]/descendant::text()", xmlValue))
94 |
95 |   # Find IMDB link
96 |   # The IMDB link is inside a link node ("//a") which source contains "imdb"
97 |   # ("/@href[contains(.,'imdb')]")
98 |   IMDB_URL <- as.vector(xpathSApply(parsed.html, "//a/@href[contains(.,'imdb')]"))[1]
99 |
100 |   # Note: We select the first element of the vector because for at least one of
101 |   # the movies, this command returns two links.
102 |
103 |   # Find kill count.
104 |   # Kill count is contained in the first non-empty text node
105 |   # ("/following::text()[normalize-space()]") after the image which source file
106 |   # is called "graphic-bc.jpg" ("//img[@src='graphic-bc.jpg']")
107 |   Body_Count <- xpathSApply(parsed.html, "//img[@src='graphic-bc.jpg']/following::text()[normalize-space()]", xmlValue)[1]
108 |
109 |   # Now we need to clean up the text node that we just extracted because there
110 |   # are lots of inconsistencies in the way the kill counts are displayed across
111 |   # all movie pages. For instance, counts are sometimes accompanied by text, not
112 |   # always the same, and sometimes there is no text at all. Sometimes the total
113 |   # count is split in two numbers (e.g., number of dead humans and number of
114 |   # dead aliens). And sometimes the total count is displayed and accompanied by
115 |   # a split count in parentheses. First, let's remove everything that is
116 |   # written in parentheses or that is not a number.
117 |   # Using gsub, remove everything in parentheses and all non-number characters
118 |   Body_Count <- gsub("\\(.*?\\)", " ", Body_Count)
119 |   Body_Count <- gsub("[^0-9]+", " ", Body_Count)
120 |
121 |   # In case the total count has been split, we want to separate these numbers
122 |   # from each other so that we can add them up later. Using strsplit, split the
123 |   # character string at spaces
124 |   Body_Count <- unlist(strsplit(Body_Count, " "))
125 |
126 |   # For now, we have extracted characters. Transform them into numbers.
127 |   Body_Count <- as.numeric(Body_Count)
128 |
129 |   # Sum up the numbers (in case they have been split into separate categories).
130 |   Body_Count <- sum(Body_Count, na.rm = TRUE)
131 |
132 |   return(data.frame(IMDB_URL, Film, Year, Body_Count))
133 | }))
134 |
135 | # Save scraped data in a .csv file for future use
136 | write.csv(data, "movies-R.csv", row.names = FALSE)
137 |
138 |
139 |
140 |
141 |
142 |

--------------------------------------------------------------------------------
/Deadliest movies scrape/code/movie-scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2014 Randal S. Olson
3 |
4 | This file is a script that scrapes on-screen body counts for various movies on
5 | www.MovieBodyCounts.com. The script requires an internet connection and two libraries
6 | installed: urllib2 and html2text.
7 |
8 | Due to inconsistent formatting of the HTML on www.MovieBodyCounts.com, the script will
9 | not scrape everything perfectly. As such, the resulting output file *will* require some
10 | cleanup afterwards. The manual cleanup will take less time than finding an elegant
11 | solution to perfectly scrape the page.
12 |
13 |
14 | This script is free software: you can redistribute it and/or modify it under the
15 | terms of the GNU General Public License as published by the Free Software Foundation,
16 | either version 3 of the License, or (at your option) any later version.
17 | 18 | This script is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 19 | without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 20 | See the GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License along with this script. 23 | If not, see http://www.gnu.org/licenses/. 24 | """ 25 | 26 | # String parsing libraries 27 | import string 28 | import re 29 | 30 | # urllib2 reads web pages if you provide it an URL 31 | import urllib2 32 | 33 | # html2text converts HTML to Markdown, which is much easier to parse 34 | from html2text import html2text 35 | 36 | # Generate a list of all letters for the Movie pages (+ a "numbers" page) 37 | # MovieBodyCount's actor pages are all with capital letters EXCEPT v and x 38 | letters = ["numbers"] + list(string.letters[26:52].upper().replace("V", "v").replace("X", "x")) 39 | 40 | list_of_films = [] 41 | 42 | # Go through each movie list page and gather all of the movie web page URLs 43 | for letter in letters: 44 | try: 45 | # Read the raw HTML from the web page 46 | page_text = urllib2.urlopen("http://www.moviebodycounts.com/movies-" + letter + ".htm").read() 47 | 48 | # Convert the raw HTML into Markdown 49 | page_text = html2text(page_text).split("\n") 50 | 51 | # Search through the web page for movie page entries 52 | for line in page_text: 53 | # We know it's a movie page entry when it has ".htm" in it, but not ".jpg", "contact.htm", and "movies.htm" 54 | # .jpg means it's a line with an image -- none of the movie entries have an image 55 | # contact.htm and movies.htm means it's a link to the Contact or Movies page -- not what we want 56 | # movies- means it's a redirect link to another page -- just skip over it 57 | if ".htm" in line and ".jpg" not in line and "contact.htm" not in line and "movies.htm" not in line and "movies-" not in line: 58 | #print line 59 | # The URL is in between parentheses (), so we can simply split the string on those 60 | # Some URLs are full URLs, e.g. www.moviebodycounts.com/movie_name.html, so splitting on the / gives us only the page name 61 | list_of_films.append(line.split("(")[-1].strip(")").split("/")[-1]) 62 | 63 | # If the movie list page doesn't exist, keep going 64 | except: 65 | print "\nerror with " + letter + "\n" 66 | 67 | # Now that we have every movie web page URL, go through each movie page and extract the movie name, kill counts, etc. 68 | out_file = open("film-death-counts.csv", "wb") 69 | out_file.write("Film,Year,Kill_Count,IMDB_url\n") 70 | 71 | for film_page in list_of_films: 72 | try: 73 | # The information we're looking for on the page: 74 | film = "" 75 | kills = "" 76 | year = "" 77 | IMDB_url = "" 78 | 79 | # A flag indicating that we've found the film title on the page 80 | found_title = False 81 | 82 | # Read the page's raw HTML and convert it to Markdown (again) and go through each line 83 | for line in html2text(urllib2.urlopen("http://www.moviebodycounts.com/" + film_page).read()).split("\n"): 84 | 85 | # If we haven't found the title yet, these markers tell us we've found the movie title 86 | if not found_title and "!" 
not in line and "(" not in line and "[" not in line and line.strip() != "": 87 | film = line.replace(",", "").strip(":") 88 | found_title = True 89 | 90 | # The kill counts are usually on a line with "Film:" 91 | if "film:" in line.lower() or "kills:" in line.lower() or "count:" in line.lower(): 92 | kills = re.sub("[^0-9]", "", line.split(":")[1].split("(")[0]) 93 | 94 | # The year is usually on a line with "charts-year" 95 | if "charts-year" in line: 96 | year = line.split("[")[1].split("]")[0] 97 | 98 | # The IMDB url is on a line with "[imdb]" 99 | if "[imdb]" in line.lower(): 100 | IMDB_url = line.lower().split("[imdb](")[1].split(")")[0] 101 | 102 | out_file.write(film + "," + year + "," + kills + "," + IMDB_url + "\n") 103 | 104 | # If a movie page fails to open, print out the error and move on to the next movie 105 | except Exception as e: 106 | print film_page 107 | print e 108 | 109 | out_file.close() 110 | -------------------------------------------------------------------------------- /Deadliest movies scrape/custom.css: -------------------------------------------------------------------------------- 1 | body { 2 | font: 14px/1.5em "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | color: #777; 4 | -webkit-font-smoothing: antialiased; /* Fix for webkit rendering */ 5 | -webkit-text-size-adjust: 100%; 6 | /*font-family: "Avenir Next", Helvetica, Arial, sans-serif;*/ 7 | padding:1em; 8 | margin:auto; 9 | max-width:10in; 10 | } 11 | 12 | h1, h2, h3, h4, h5, h6 { 13 | 14 | font-weight: normal; } 15 | h1 a, h2 a, h3 a, h4 a, h5 a, h6 a { font-weight: inherit; } 16 | h1 { font-size: 46px; line-height: 50px; margin-bottom: 14px;} 17 | h2 { font-size: 35px; line-height: 40px; margin-bottom: 10px; } 18 | h3 { font-size: 28px; line-height: 34px; margin-bottom: 8px; } 19 | h4 { font-size: 21px; line-height: 30px; margin-bottom: 4px; } 20 | h5 { font-size: 17px; line-height: 24px; } 21 | h6 { font-size: 14px; line-height: 21px; } 22 | .subheader { color: #777; } 23 | 24 | p { margin: 0 0 20px 0; } 25 | p img { margin: 0; } 26 | p.lead { font-size: 21px; line-height: 27px; color: #444; } 27 | 28 | em { font-style: italic; } 29 | strong { font-weight: bold; } 30 | small { font-size: 80%; } 31 | 32 | hr { 33 | height: 0.2em; 34 | border: 0; 35 | color: #CCCCCC; 36 | background-color: #CCCCCC; 37 | } 38 | 39 | p, blockquote, ul, ol, dl, li, table, pre { 40 | margin: 15px 0; 41 | text-align: justify; 42 | } 43 | 44 | a, a:visited { color: #333; text-decoration: underline; outline: 0; } 45 | a:hover, a:focus { color: #000; } 46 | p a, p a:visited { line-height: inherit; } 47 | 48 | #message { 49 | border-radius: 6px; 50 | border: 1px solid #ccc; 51 | display:block; 52 | width:100%; 53 | height:60px; 54 | margin:6px 0px; 55 | } 56 | 57 | button, #ws { 58 | font-size: 10pt; 59 | padding: 4px 6px; 60 | border-radius: 5px; 61 | border: 1px solid #bbb; 62 | background-color: #eee; 63 | } 64 | 65 | code, pre, #ws, #message { 66 | font-family: Monaco; 67 | font-size: 8pt; 68 | border-radius: 3px; 69 | background-color: #F8F8F8; 70 | color: inherit; 71 | } 72 | 73 | code { 74 | border: 1px solid #EAEAEA; 75 | margin: 0 2px; 76 | padding: 0 5px; 77 | } 78 | 79 | pre.r { 80 | border: 2px solid #8A0606; 81 | } 82 | 83 | pre.r:before { 84 | content: 'R code \A'; 85 | color: #8A0606; 86 | font-weight: bold; 87 | } 88 | 89 | pre.python { 90 | border: 2px solid #068A06; 91 | } 92 | 93 | pre.python:before { 94 | content: 'Python code \A'; 95 | color: #068A06; 96 | font-weight: bold; 97 | } 98 
|
99 | img {
100 |   max-width: 100%;
101 |   height: auto;
102 |   width: auto\9; /* ie8 */
103 | }
104 |
105 | pre {
106 |   border: 1px solid #CCCCCC;
107 |   overflow: auto;
108 |   padding: 4px 8px;
109 | }
110 |
111 | pre > code {
112 |   border: 0;
113 |   margin: 0;
114 |   padding: 0;
115 | }
116 |
117 | blockquote, blockquote p { font-size: 12px; line-height: 24px; color: #000; font-style: italic; }
118 | blockquote { margin: 0 0 20px; padding: 9px 50px 0 49px; border-left: 1px solid #ddd; }
119 | blockquote cite { display: block; font-size: 12px; color: #555; }
120 | blockquote cite:before { content: "\2014 \0020"; }
121 | blockquote cite a, blockquote cite a:visited, blockquote cite a:visited { color: #555; }
122 |
123 | #ws { background-color: #f8f8f8; }
124 |
125 | .send { color:#77bb77; }
126 | .server { color:#7799bb; }
127 | .error { color:#AA0000; }

--------------------------------------------------------------------------------
/Deadliest movies scrape/notebook.R:
--------------------------------------------------------------------------------
1 | #+ licence, echo=FALSE
2 | # Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier)
3 | #
4 | # This script is free software: you can redistribute it and/or modify it under
5 | # the terms of the GNU General Public License as published by the Free Software
6 | # Foundation, either version 3 of the License, or (at your option) any later
7 | # version.
8 | #
9 | # This script is distributed in the hope that it will be useful, but WITHOUT ANY
10 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11 | # A PARTICULAR PURPOSE.
12 | #
13 | # See the GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License along with
16 | # this script. If not, see http://www.gnu.org/licenses/.
17 | #
18 | # You can generate the HTML files by running:
19 | # library(knitr)
20 | # spin("notebook.R")
21 | # pandoc("notebook.md", config = "pandoc_config.txt")
22 |
23 |
24 | #+
25 | #' **Document title:** R vs Python - Round 2 (1/2)
26 | #'
27 | #' **Date:** January 12, 2014
28 | #'
29 | #' **Text by:** Simon Garnier ([www.theswarmlab.com](http://www.theswarmlab.com)
30 | #' / [\@sjmgarnier](http://twitter.com/sjmgarnier))
31 | #'
32 | #' **R code by:** Simon Garnier
33 | #' ([www.theswarmlab.com](http://www.theswarmlab.com) /
34 | #' [\@sjmgarnier](http://twitter.com/sjmgarnier))
35 | #'
36 | #' **Python code by:** Randy Olson
37 | #' ([www.randalolson.com](http://www.randalolson.com) /
38 | #' [\@randal_olson](http://twitter.com/randal_olson))
39 | #'
40 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)),
41 | #' knitr ([www.yihui.name/knitr/](http://yihui.name/knitr/)) and pandoc
42 | #' ([www.johnmacfarlane.net/pandoc/](http://johnmacfarlane.net/pandoc/)). Python
43 | #' figures generated with iPython Notebook
44 | #' ([www.ipython.org/notebook.html](http://ipython.org/notebook.html)).
45 | #'
46 |
47 |
48 | #' ___
49 | #'
50 | #' #### Foreword ####
51 | #'
52 | #' My friend Randy Olson and I got into the habit of arguing about the relative
53 | #' qualities of our favorite languages for data analysis and visualization. I am
54 | #' an enthusiastic R user ([www.r-project.org](http://www.r-project.org)) while
55 | #' Randy is a fan of Python ([www.python.org](http://www.python.org)). One thing
56 | #' we agree on, however, is that our discussions are meaningless unless we
57 | #' actually put R and Python to a series of tests to showcase their relative
58 | #' strengths and weaknesses. Essentially we will set a common goal (*e.g.*,
59 | #' perform a particular type of data analysis or draw a particular type of
60 | #' graph) and create the R and Python code to achieve this goal. And since
61 | #' Randy and I are all about sharing, open source and open access, we decided to
62 | #' make public the results of our friendly challenges so that you can help us
63 | #' decide between R and Python and, hopefully, also learn something along the
64 | #' way.
65 | #'
66 |
67 |
68 | #' ___
69 | #'
70 | #' #### Today's challenge: a data thief manual for honest scientists (Part 1 of 2) ####
71 | #'
72 | #' ##### 1 - Introduction #####
73 | #'
74 | #' Last week we started our challenge series with a rather simple task: plot a
75 | #' pretty barchart from some data collected by Randy for his recent post on the
76 | #' ["Top 25 most violence packed films" in the history of the movie
77 | #' industry](www.randalolson.com/2013/12/31/most-violence-packed-films/). Today
78 | #' we will try to up our game a little bit with a more complex task. We will
79 | #' show you how you can collect the data that Randy used for his post directly
80 | #' from the website they originate from
81 | #' ([www.MovieBodyCounts.com](http://www.moviebodycounts.com)). This is called
82 | #' data scraping, or the art of taking advantage of the gigantic database that
83 | #' is the Internet.
84 | #'
85 | #' The basic principle behind the scraping of website data is simple: a website
86 | #' is like a database, and each page of the website is like a table of this
87 | #' database. All we want is to find in the database the tables that contain
88 | #' information that we would like to acquire, and then extract this information
89 | #' from within these relevant tables. This task can be relatively easy if all
90 | #' the pages of a website have a similar structure (*i.e.*, if the database is
91 | #' clean and well maintained). In this ideal situation, all we have to do is
92 | #' identify one or more stable markers that delimit the desired information and
93 | #' use them to tell R or Python what to save in memory. Unfortunately not all
94 | #' websites have a similar structure across all of their pages and it can
95 | #' quickly become a nightmare to identify such markers. Worse, sometimes you
96 | #' will have to resign yourself to scrape or correct part or all of the data
97 | #' manually.
98 | #'
99 | #' For this challenge, we will attempt to recover the following pieces of
100 | #' information for each movie listed on
101 | #' [www.MovieBodyCounts.com](http://www.moviebodycounts.com): title, release
102 | #' year, count of on-screen deaths and link to the movie page on
103 | #' [www.imdb.com](http://www.imdb.com) (this will help us for part 2 of this
104 | #' challenge next week). We will detail the different steps of the process and
105 | #' provide for each step the corresponding code (red boxes for R, green boxes
106 | #' for Python). You will also find the entire code at the end of this document.
107 | #'
108 |
109 |
110 | #' ##### 2 - Step by step process #####
111 | #'
112 | #' First things first, let's set up our working environment by loading some
113 | #' necessary libraries.
114 | #'
115 |
116 | #+ libR, eval=FALSE, message=FALSE
117 | # Load libraries
118 | library(RCurl)   # Everything necessary to grab webpages on the Web
119 | library(XML)     # Everything necessary to parse XML and HTML code
120 | library(pbapply) # Progress bars!!! Just because why not :-)
121 |
122 | # Create curl handle which can be used for multiple HTTP requests.
123 | # followlocation = TRUE in case one of the URLs we want to grab is a redirection
124 | # link.
125 | curl <- getCurlHandle(useragent = "R", followlocation = TRUE)
126 |
127 | #+ libPy, eval=FALSE, engine="python"
128 | # String parsing libraries
129 | import string
130 | import re
131 |
132 | # urllib2 reads web pages if you provide it an URL
133 | import urllib2
134 |
135 | # html2text converts HTML to Markdown, which is much easier to parse
136 | from html2text import html2text
137 |
138 | #' Now a word about the organization of
139 | #' [www.MovieBodyCounts.com](http://www.moviebodycounts.com). To be perfectly
140 | #' honest, it is a bit messy :-) Movies are organized in a series of
141 | #' alphabetically ordered lists (by the first letter of each movie's title),
142 | #' each letter having its own page
143 | #' (http://www.moviebodycounts.com/movies-[A-Z].htm). There is also a list for
144 | #' movies whose title starts with a number
145 | #' (http://www.moviebodycounts.com/movies-numbers.htm). Finally, all category
146 | #' letters are capitalized in the lists' URLs, except for letters v and x.
147 | #' Annoying, right? This is just one of the many little problems one can
148 | #' encounter when dealing with messy databases :-)
149 | #'
150 | #' With all this information in mind, our first task is to create a list of all
151 | #' these lists.
152 | #'
153 |
154 | #+ listURLsR, eval=FALSE
155 | # Prepare URLs of the movie lists alphabetically ordered by first letter of
156 | # movie title (capital A to Z, except for v and x) + "numbers" list (for movies
157 | # whose title starts with a number)
158 | urls.by.letter <- paste0("http://www.moviebodycounts.com/movies-",
159 |                          c("numbers", LETTERS[1:21], "v", "W", "x", "Y", "Z"), ".htm")
160 |
161 | #+ listURLsPy, eval=FALSE, engine="python"
162 | # Generate a list of all letters for the Movie pages (+ a "numbers" page)
163 | # MovieBodyCount's actor pages are all with capital letters EXCEPT v and x
164 | letters = ["numbers"] + list(string.letters[26:52].upper().replace("V", "v").replace("X", "x"))
165 |
166 | #' Our next task is to go through the HTML code of all these lists and gather
167 | #' the URLs of all the movie webpages. This is where the data scraping really
168 | #' starts.
169 | #'
170 | #' As you will quickly notice by reading the following code, Randy and I have
171 | #' decided to use a different approach to identify and collect the desired URLs
172 | #' (and all of the data in the rest of this challenge). I have decided to rely
173 | #' on the [XML Path Language (XPath)](http://www.w3schools.com/xpath/), a
174 | #' language that makes it easy to navigate through elements and attributes in an
175 | #' XML/HTML document. Randy has decided to use an approach based on more
176 | #' "classical" string parsing and manipulation functions. Note that these are
177 | #' just personal preferences. XPath interpreters are also available in Python,
178 | #' and R is fully equipped for manipulating character strings.
179 | #'
180 | #' For each movie list, we will...
181 | #'
182 |
183 | #+ loop1R, eval=FALSE
184 | # For each movie list...
For loops are frowned upon in R, let's use the classier 185 | # apply functions instead. Here I use the pblapply from the pbapply package. 186 | # It's equivalent to the regular lapply function, but it provides a neat 187 | # progress bar. Unlist to get a vector. 188 | urls.by.movie <- unlist(pblapply(urls.by.letter, FUN = function(URL) { 189 | 190 | #+ loop1Py, eval=FALSE, engine="python" 191 | list_of_films = [] 192 | 193 | # Go through each movie list page and gather all of the movie web page URLs 194 | for letter in letters: 195 | try: 196 | 197 | #' ...download the raw HTML content of the webpage,... 198 | #' 199 | 200 | #+ readRaw1R, eval=FALSE 201 | # Load raw HTML 202 | raw.html <- getURL(URL, curl = curl) 203 | 204 | #+ readRaw1Py, eval=FALSE, engine="python" 205 | # Read the raw HTML from the web page 206 | page_text = urllib2.urlopen("http://www.moviebodycounts.com/movies-" + letter + ".htm").read() 207 | 208 | #' ...transform raw HTML into a more convenient format to work with,... 209 | #' 210 | 211 | #+ parse1R, eval=FALSE 212 | # Parse HTML content 213 | parsed.html <- htmlParse(raw.html) 214 | 215 | #+ parse1Py, eval=FALSE, engine="python" 216 | # Convert the raw HTML into Markdown 217 | page_text = html2text(page_text).split("\n") 218 | 219 | #' ...find movie page entry, store the URL for later use and close the loop. 220 | #' 221 | 222 | #+ movieLinkR, eval=FALSE 223 | # Extract desired links from HTML content using XPath. 224 | # The desired links are all the URLs ("a/@href") directly following 225 | # ("/following::") the image which source file is called "graphic-movies.jpg" 226 | # ("//img[@src='graphic-movies.jpg']"). 227 | links <- as.vector(xpathSApply(parsed.html, "//img[@src='graphic-movies.jpg']/following::a/@href")) 228 | 229 | # Most links are relative URLs. Add root of the website to make them absolute. 230 | if (!is.null(links)) { 231 | ix = grepl("http://www.moviebodycounts.com/", links) # Find relative URLs 232 | links[!ix] <- paste0("http://www.moviebodycounts.com/", links[!ix]) # Add root of website to make URLs absolute 233 | return(links) 234 | } 235 | }), use.names = FALSE) # close the loop 236 | 237 | # One URL is actually just a symbolic link to another page. Let's get rid of it. 238 | ix <- which(grepl("movies-C.htm", urls.by.movie)) 239 | urls.by.movie <- urls.by.movie[-ix] 240 | 241 | #+ movieLinkPy, eval=FALSE, engine="python" 242 | # Search through the web page for movie page entries 243 | for line in page_text: 244 | # We know it's a movie page entry when it has ".htm" in it, but not ".jpg", "contact.htm", and "movies.htm" 245 | # .jpg means it's a line with an image -- none of the movie entries have an image 246 | # contact.htm and movies.htm means it's a link to the Contact or Movies page -- not what we want 247 | # movies- means it's a redirect link to another page -- just skip over it 248 | if ".htm" in line and ".jpg" not in line and "contact.htm" not in line and "movies.htm" not in line and "movies-" not in line: 249 | #print line 250 | # The URL is in between parentheses (), so we can simply split the string on those 251 | # Some URLs are full URLs, e.g. 
www.moviebodycounts.com/movie_name.html, so splitting on the / gives us only the page name
252 |             list_of_films.append(line.split("(")[-1].strip(")").split("/")[-1])
253 |
254 |     # If the movie list page doesn't exist, keep going
255 |     except:
256 |         print "\nerror with " + letter + "\n"
257 |
258 | #' Now that we know where to find each movie, we can start the hard part of this
259 | #' challenge. We will go through each movie webpage and attempt to find its
260 | #' title, release year, count of on-screen deaths and link to its page on
261 | #' [www.imdb.com](http://www.imdb.com). We will save all this information in a
262 | #' .csv file.
263 | #'
264 | #' For each movie, we will...
265 |
266 | #+ loop2R, eval=FALSE
267 | # For each movie...
268 | # do.call(rbind, ...) to reorganize the results in a nice data frame
269 | data <- do.call(rbind, pblapply(urls.by.movie, FUN = function(URL) {
270 |
271 | #+ loop2Py, eval=FALSE, engine="python"
272 | # Now that we have every movie web page URL, go through each movie page and
273 | # extract the movie name, kill counts, etc.
274 | out_file = open("film-death-counts.csv", "wb")
275 | out_file.write("Film,Year,Kill_Count,IMDB_url\n")
276 |
277 | for film_page in list_of_films:
278 |     try:
279 |         # The information we're looking for on the page:
280 |         film = ""
281 |         kills = ""
282 |         year = ""
283 |         IMDB_url = ""
284 |
285 |         # A flag indicating that we've found the film title on the page
286 |         found_title = False
287 |
288 | #' ...download the raw HTML content of the webpage and transform raw HTML into a
289 | #' more convenient format to work with,...
290 | #'
291 |
292 | #+ readRaw2R, eval=FALSE
293 | # Load raw HTML
294 | raw.html <- getURL(URL, curl = curl)
295 |
296 | # Parse HTML content
297 | parsed.html <- htmlParse(raw.html)
298 |
299 | #+ readRaw2Py, eval=FALSE, engine="python"
300 |         # Read the page's raw HTML and convert it to Markdown (again) and go
301 |         # through each line
302 |         for line in html2text(urllib2.urlopen("http://www.moviebodycounts.com/" + film_page).read()).split("\n"):
303 |
304 | #' ...attempt to find movie title,...
305 |
306 | #+ titleR, eval=FALSE
307 | # Find movie title
308 | # Title appears inside an XML/HTML node called "title" ("//title"). In this
309 | # node, it comes after "Movie Body Counts: ". I use gsub to get rid of "Movie
310 | # Body Counts: " and keep only the movie title.
311 | Film <- xpathSApply(parsed.html, "//title", xmlValue)
312 | Film <- gsub("Movie Body Counts: ", "", Film)
313 |
314 | #+ titlePy, eval=FALSE, engine="python"
315 |             # If we haven't found the title yet, these markers tell us we've found the movie
316 |             # title
317 |             if not found_title and "!" not in line and "(" not in line and "[" not in line and line.strip() != "":
318 |                 film = line.replace(",", "").strip(":")
319 |                 found_title = True
320 |
321 | #' ...attempt to find movie year,...
322 |
323 | #+ yearR, eval=FALSE
324 | # Find movie year
325 | # The year is usually a text inside ("/descendant::text()") a link node
326 | # ("//a") which source contains the string "charts-year" ("[contains(@href,
327 | # 'charts-year')]").
328 | Year <- as.numeric(xpathSApply(parsed.html, "//a[contains(@href, 'charts-year')]/descendant::text()", xmlValue))
329 |
330 | #+ yearPy, eval=FALSE, engine="python"
331 |             # The year is usually on a line with "charts-year"
332 |             if "charts-year" in line:
333 |                 year = line.split("[")[1].split("]")[0]
334 |
335 | #' ...attempt to find link to movie on IMDB,...
336 |
337 | #+ imdbR, eval=FALSE
338 | # Find IMDB link
339 | # The IMDB link is inside a link node ("//a") which source contains "imdb"
340 | # ("/@href[contains(.,'imdb')]")
341 | IMDB_URL <- as.vector(xpathSApply(parsed.html, "//a/@href[contains(.,'imdb')]"))[1]
342 |
343 | # Note: We select the first element of the vector because for at least one of
344 | # the movies, this command returns two links.
345 |
346 | #+ imdbPy, eval=FALSE, engine="python"
347 |             # The IMDB url is on a line with "[imdb]"
348 |             if "[imdb]" in line.lower():
349 |                 IMDB_url = line.lower().split("[imdb](")[1].split(")")[0]
350 |
351 | #' ... and finally attempt to find the on-screen kill count. Here, Randy chose
352 | #' an approach that minimizes his coding effort, but that will potentially force
353 | #' him to make several manual corrections a posteriori. I chose to find a
354 | #' solution that works with minimal to no manual corrections, but that requires
355 | #' an extra coding effort. Whichever approach is best depends mostly on the size
356 | #' of the data you want to scrape and the time you have to do it.
357 |
358 | #+killsR, eval=FALSE
359 | # Find kill count.
360 | # Kill count is contained in the first non-empty text node
361 | # ("/following::text()[normalize-space()]") after the image which source file
362 | # is called "graphic-bc.jpg" ("//img[@src='graphic-bc.jpg']")
363 | Body_Count <- xpathSApply(parsed.html, "//img[@src='graphic-bc.jpg']/following::text()[normalize-space()]", xmlValue)[1]
364 |
365 | # Now we need to clean up the text node that we just extracted because there
366 | # are lots of inconsistencies in the way the kill counts are displayed across
367 | # all movie pages. For instance, counts are sometimes accompanied by text, not
368 | # always the same, and sometimes there is no text at all. Sometimes the total
369 | # count is split in two numbers (e.g., number of dead humans and number of
370 | # dead aliens). And sometimes the total count is displayed and accompanied by
371 | # a split count in parentheses. First, let's remove everything that is
372 | # written in parentheses or that is not a number.
373 | # Using gsub, remove everything in parentheses and all non-number characters
374 | Body_Count <- gsub("\\(.*?\\)", " ", Body_Count)
375 | Body_Count <- gsub("[^0-9]+", " ", Body_Count)
376 |
377 | # In case the total count has been split, we want to separate these numbers
378 | # from each other so that we can add them up later. Using strsplit, split the
379 | # character string at spaces
380 | Body_Count <- unlist(strsplit(Body_Count, " "))
381 |
382 | # For now, we have extracted characters. Transform them into numbers.
383 | Body_Count <- as.numeric(Body_Count)
384 |
385 | # Sum up the numbers (in case they have been split into separate categories).
386 | Body_Count <- sum(Body_Count, na.rm = TRUE)
387 |
388 | #+ killsPy, eval=FALSE, engine="python"
389 |             # The kill counts are usually on a line with "Film:"
390 |             if "film:" in line.lower() or "kills:" in line.lower() or "count:" in line.lower():
391 |                 kills = re.sub("[^0-9]", "", line.split(":")[1].split("(")[0])
392 |
393 | #' Almost done! Now we just need to close the loop and write the data frame into
394 | #' a .csv file
395 |
396 | #+ saveR, eval=FALSE
397 | # Return scraped data into a data frame form
398 | return(data.frame(IMDB_URL, Film, Year, Body_Count))
399 | }))
400 |
401 | # Save scraped data in a .csv file for future use
402 | write.csv(data, "movies-R.csv", row.names = FALSE)
403 |
404 | #+ savePy, eval=FALSE, engine="python"
405 |         out_file.write(film + "," + year + "," + kills + "," + IMDB_url + "\n")
406 |
407 |     # If a movie page fails to open, print out the error and move on to the next movie
408 |     except Exception as e:
409 |         print film_page
410 |         print e
411 |
412 | out_file.close()
413 |
414 | #' And voilà! You should now have a .csv file somewhere on your computer
415 | #' containing all the information we just scraped from the website. Not too
416 | #' hard, right?
417 | #'
418 | #' Keep the .csv file, we will use it again next week to complete this challenge
419 | #' by scraping additional information from [www.imdb.com](http://www.imdb.com).
420 | #'
421 |
422 |
423 | #' ___
424 | #'
425 | #' #### 3 - Source code ####
426 | #'
427 | #' R and Python source codes are available
428 | #' [here](https://github.com/morpionZ/R-vs-Python/tree/master/Deadliest%20movies%20scrape/code).
429 | #'
430 |
431 |
432 | #' ___
433 | #'
434 | #' #### 4 - Bonus for the brave ####
435 | #'
436 | #' Today's challenge was code and text heavy. No pretty pictures to please the eye. So, for all the brave people who made it to the end, here is a cat picture :-)
437 |
438 | #+ bonus, echo=FALSE
439 | library(jpeg) # To read JPG images
440 |
441 | # Download a relevant cat picture; mode is set to "wb" because it seems that
442 | # Windows needs it. I don't use Windows, I can't confirm
443 | if (!file.exists("programming_cat.jpg")) {
444 |   download.file(url = "http://i.chzbgr.com/completestore/2010/5/18/129186912722282650.jpg",
445 |                 destfile = "programming_cat.jpg", quiet = TRUE, mode = "wb")
446 | }
447 |
448 | # Display image
449 | #'
--------------------------------------------------------------------------------
/Deadliest movies scrape/notebook2.html:
--------------------------------------------------------------------------------

Document title: R vs Python - Round 2 (2/2)
32 |Date: February 2, 2014
33 |Text by: Simon Garnier (www.theswarmlab.com / @sjmgarnier)
34 |R code by: Simon Garnier (www.theswarmlab.com / @sjmgarnier)
35 |Python code by: Randy Olson (www.randalolson.com / @randal_olson)
36 |Document generated with RStudio (www.rstudio.com), knitr (www.yihui.name/knitr/) and pandoc (www.johnmacfarlane.net/pandoc/). Python figures generated with iPython Notebook (www.ipython.org/notebook.html).
37 |My friend Randy Olson and I got into the habit of arguing about the relative qualities of our favorite languages for data analysis and visualization. I am an enthusiastic R user (www.r-project.org) while Randy is a fan of Python (www.python.org). One thing we agree on, however, is that our discussions are meaningless unless we actually put R and Python to a series of tests to showcase their relative strengths and weaknesses. Essentially we will set a common goal (e.g., perform a particular type of data analysis or draw a particular type of graph) and create the R and Python code to achieve this goal. And since Randy and I are all about sharing, open source and open access, we decided to make public the results of our friendly challenges so that you can help us decide between R and Python and, hopefully, also learn something along the way.
40 |Last time we showed you how to scrape data from www.MovieBodyCounts.com. Today, we will finish what we started by retrieving additional information from www.imdb.com. In particular, we will attempt to recover the following pieces of information for each of the movies we collected last time: MPAA rating, genre(s), director(s), duration in minutes, IMDb rating and full cast. We will detail the different steps of the process and provide for each step the corresponding code (red boxes for R, green boxes for Python). You will also find the entire code at the end of this document.
44 |If you think there’s a better way to code this in either language, open a pull request on our GitHub repository or leave a note with suggestions in the comments below.
45 |First things first, let’s set up our working environment by loading some necessary libraries.
47 |# Load libraries
48 | # No additional libraries needed here. Yeah!
49 | # IMDb scraping, data handling, and string parsing libraries
50 | from imdb import IMDb
51 | import pandas as pd
52 | import re
53 | Randy is lucky today. Someone else has already written a package (‘IMDbPY’) to scrape data from IMDb. Unfortunately for me, R users are too busy working with serious data sets to take the time to write such a package for my favorite data processing language. Hadley Wickham has included a ‘movies’ data set in the ggplot2 package that contains some of the information stored on IMDb, but some of the pieces we need for today’s challenge are missing.
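For the curious, here is a quick way to see what that data set covers (a minimal sketch, assuming the 2014-era ggplot2, which bundled the data under the name ‘movies’; it has since moved to the ggplot2movies package):

# Peek at the IMDb-derived data set shipped with (2014-era) ggplot2
library(ggplot2)
data(movies)
names(movies)  # title, year, length, budget, rating, votes, r1-r10, mpaa, genre flags...
# no director or full-cast columns, hence the custom scraper below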
54 |Since I am not easily discouraged, I decided to write my own IMDb scraping function (see below). It is not as sophisticated as the Python package Randy is using today, but it does the job until someone else decides to write a more complete R/IMDb package. As you will see, I am using the same scraping technique (XPath) as the one I used in the first part of the challenge.
55 |# Create IMDB scraper
56 | IMDb <- function(ID) {
57 | # Retrieve movie info from IMDb.com.
58 | #
59 | # Args:
60 | # ID: IDs of the movies.
61 | #
62 | # Returns:
63 | # A data frame containing one line per movie, and nine columns: movie ID,
64 | # film title, year of release, duration in minutes, MPAA rating, genre(s),
65 | # director(s), IMDb rating, and full cast.
66 |
67 | # Load required libraries
68 | require(XML)
69 | require(pbapply) # Apply functions with progress bars!!!
70 |
71 | # Wrap core of the function in do.call and pblapply in order to
72 | # pseudo-vectorize it (pblapply) and return a data frame (do.call)
73 | info <- do.call(rbind, pblapply(ID, FUN = function(ID) {
74 | # Create movie URL on IMDb.com
75 | URL <- paste0("http://www.imdb.com/title/tt", ID)
76 |
77 | # Download and parse HTML of IMDb page
78 | parsed.html <- htmlParse(URL)
79 |
80 | # Find title
81 | Film <- xpathSApply(parsed.html, "//h1[@class='header']/span[@class='itemprop']", xmlValue)
82 |
83 | # Find year
84 | Year <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//h1[@class='header']/span[@class='nobr']", xmlValue)))
85 |
86 | # Find duration in minutes
87 | Length_Minutes <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//div[@class='infobar']/time[@itemprop='duration']", xmlValue)))
88 |
89 | # Find MPAA rating
90 | MPAA_Rating <- unname(xpathSApply(parsed.html, "//div[@class='infobar']/span/@content"))
91 | if (!is.character(MPAA_Rating)) { # Some movies don't have an MPAA rating
92 | MPAA_Rating <- "UNRATED"
93 | }
94 |
95 | # Find genre
96 | Genre <- paste(xpathSApply(parsed.html, "//span[@class='itemprop' and @itemprop='genre']", xmlValue), collapse='|')
97 |
98 | # Find director
99 | Director <- paste(xpathSApply(parsed.html, "//div[@itemprop='director']/a", xmlValue), collapse='|')
100 |
101 | # Find IMDB rating
102 | IMDB_rating <- as.numeric(xpathSApply(parsed.html, "//div[@class='titlePageSprite star-box-giga-star']", xmlValue))
103 |
104 | # Extract full cast from the full credits page
105 | parsed.html <- htmlParse(paste0(URL,"/fullcredits"))
106 | Full_Cast <- paste(xpathSApply(parsed.html, "//span[@itemprop='name']", xmlValue), collapse='|')
107 |
108 | data.frame(ID = ID, Film = Film, Year = Year, Length_Minutes = Length_Minutes,
109 | MPAA_Rating = MPAA_Rating, Genre = Genre,
110 | Director = Director, IMDB_rating = IMDB_rating, Full_Cast = Full_Cast)
111 | }))
112 | }
113 | imdb_access = IMDb()
114 | Randy and I now have a working IMDb scraper. We can start collecting and organizing the data that we need.
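Before pointing the scraper at the full movie list, it can be sanity-checked on a single ID (a minimal sketch; the ID below is purely illustrative, and the call assumes IMDb still serves the 2014 page layout that the XPath expressions were written for):

# Quick sanity check of the IMDb() scraper defined above (illustrative ID)
test <- IMDb("0317705")
str(test)  # expect a one-row data frame with the nine columns described in the function header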
115 |First, let’s load the data we collected last time.
116 |# Load data from last challenge
117 | data <- read.csv("movies-R.csv")
118 | movie_data = pd.read_csv("movies.csv")
119 | Then, we will extract the movie IMDb ID from the IMDb URL we collected last week. It’s easy, it’s the only number in the URL.
120 |# For each movie, extract IMDb info and append it to the data
121 | data <- within(data, {
122 | # Extract ID number
123 | IMDB_ID <- gsub("[^0-9]", "", IMDB_URL)
124 | # Grab only the movie number out of the IMDB URL
125 | movie_data["Movie_Number"] = movie_data["IMDB_URL"].apply(lambda x: re.sub("[^0-9]", "", x))
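As a worked example of what both one-liners do (the URL is illustrative), stripping every non-digit character from a full IMDb address leaves exactly the ID that IMDb expects after the "tt" prefix:

# Reduce a typical IMDb URL to its bare movie ID
gsub("[^0-9]", "", "http://www.imdb.com/title/tt0317705/")
# [1] "0317705"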
126 | Now that this is done, we will simply let the IMDb scraper collect the data we want and we will append it to the data from the first part of the challenge.
127 | # Download IMDb info into a temporary variable
128 | IMDB_Info <- IMDb(IMDB_ID)
129 |
130 | # Save MPAA rating
131 | MPAA_Rating <- IMDB_Info$MPAA_Rating
132 |
133 | # Save genre(s)
134 | Genre <- IMDB_Info$Genre
135 |
136 | # Save director(s)
137 | Director <- IMDB_Info$Director
138 |
139 | # Save duration in minutes
140 | Length_Minutes <- IMDB_Info$Length_Minutes
141 |
142 | # Save IMDb rating
143 | IMDB_rating <- IMDB_Info$IMDB_rating
144 |
145 | # Save full cast
146 | Full_Cast <- IMDB_Info$Full_Cast
147 |
148 | # Delete IMDb info
149 | IMDB_Info <- NULL
150 | })
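# Note: within() evaluates the block above inside `data` and returns a
# modified copy, so nothing changes until the assignment completes; setting
# IMDB_Info to NULL at the end drops the temporary variable, leaving only the
# extracted columns in the result.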
151 | with open("film-death-counts-Python.csv", "wb") as out_file:
152 | out_file.write("Film,Year,Body_Count,MPAA_Rating,Genre,Director,Actors,Length_Minutes,IMDB_Rating\n")
153 |
154 | for movie_entry in movie_data.iterrows():
155 | # Use a try-catch on the loop to prevent temporary connection-related issues from stopping the scrape
156 | try:
157 | movie = imdb_access.get_movie(movie_entry[1]["Movie_Number"])
158 | movie_fields = []
159 |
160 | # Remove non-ASCII character encodings and commas from movie titles
161 | movie_fields.append(movie["title"].encode("ascii", "replace").replace(",", ""))
162 | movie_fields.append(str(movie["year"]))
163 | movie_fields.append(str(movie_entry[1]["Body_Count"]))
164 |
165 | # Some movies don't have MPAA Ratings on IMDB
166 | try:
167 | movie_fields.append(str(movie["mpaa"].split(" ")[1]))
168 | except:
169 | movie_fields.append("")
170 |
171 | # For movies with multiple genres/directors/actors, join them with bars |
172 | movie_fields.append(str("|".join(movie["genres"])))
173 | movie_fields.append(str("|".join([str(x) for x in movie["director"]])))
174 | movie_fields.append(str("|".join([str(x) for x in movie["cast"]])))
175 |
176 | movie_fields.append(str(int(movie["runtime"][0].split(":")[-1])))
177 | movie_fields.append(str(float(movie["rating"])))
178 | And finally, all that is left to do is to save the complete data set into a .csv file and close the script.
179 |write.csv(data, file = "movies-R-full.csv")
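# Note: unlike the write.csv call of part 1, row.names = FALSE is not set
# here, so the output file gains an automatic row-index column.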
180 | # All entries are comma-delimited
181 | out_file.write(",".join(movie_fields) + "\n")
182 |
183 | except Exception as e:
184 | print "Error with", str(movie)
185 | That’s it! You should now have a .csv file somewhere on your computer containing all the information we just scraped in both parts of this challenge.
186 |Sorry it took us so long to complete this part, but beginnings of semesters are always very busy times at the university.
187 |Stay tuned for our next challenge! It will be about making a linear regression, running basic diagnostic tests and plotting the resulting straight line with its confidence interval.
188 |R and Python source codes are available here.
191 |
192 |
193 |

--------------------------------------------------------------------------------
/Deadliest movies scrape/notebook2.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | **Document title:** R vs Python - Round 2 (2/2)
7 |
8 | **Date:** February 2, 2014
9 |
10 | **Text by:** Simon Garnier ([www.theswarmlab.com](http://www.theswarmlab.com)
11 | / [\@sjmgarnier](http://twitter.com/sjmgarnier))
12 |
13 | **R code by:** Simon Garnier
14 | ([www.theswarmlab.com](http://www.theswarmlab.com) /
15 | [\@sjmgarnier](http://twitter.com/sjmgarnier))
16 |
17 | **Python code by:** Randy Olson
18 | ([www.randalolson.com](http://www.randalolson.com) /
19 | [\@randal_olson](http://twitter.com/randal_olson))
20 |
21 | Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)),
22 | knitr ([www.yihui.name/knitr/](http://yihui.name/knitr/)) and pandoc
23 | ([www.johnmacfarlane.net/pandoc/](http://johnmacfarlane.net/pandoc/)). Python
24 | figures generated with iPython Notebook
25 | ([www.ipython.org/notebook.html](http://ipython.org/notebook.html)).
26 |
27 | ___
28 |
29 | #### Foreword ####
30 |
31 | My friend Randy Olson and I got into the habit of arguing about the relative
32 | qualities of our favorite languages for data analysis and visualization. I am
33 | an enthusiastic R user ([www.r-project.org](http://www.r-project.org)) while
34 | Randy is a fan of Python ([www.python.org](http://www.python.org)). One thing
35 | we agree on, however, is that our discussions are meaningless unless we
36 | actually put R and Python to a series of tests to showcase their relative
37 | strengths and weaknesses. Essentially we will set a common goal (*e.g.*,
38 | perform a particular type of data analysis or draw a particular type of
39 | graph) and create the R and Python code to achieve this goal. And since
40 | Randy and I are all about sharing, open source and open access, we decided to
41 | make public the results of our friendly challenges so that you can help us
42 | decide between R and Python and, hopefully, also learn something along the
43 | way.
44 |
45 | ___
46 |
47 | #### Today's challenge: a data thief manual for honest scientists (Part 2 of 2) ####
48 |
49 | ##### 1 - Introduction #####
50 |
51 | [Last time](http://www.theswarmlab.com/r-vs-python-round-2/) we showed you
52 | how to scrape data from
53 | [www.MovieBodyCounts.com](http://www.moviebodycounts.com). Today, we will
54 | finish what we started by retrieving additional information from
55 | [www.imdb.com](http://www.imdb.com). In particular, we will attempt to
56 | recover the following pieces of information for each of the movies we
57 | collected last time: MPAA rating, genre(s), director(s), duration in minutes,
58 | IMDb rating and full cast. We will detail the different steps of the process
59 | and provide for each step the corresponding code (red boxes for R, green
60 | boxes for Python). You will also find the entire code at the end of this
61 | document.
62 |
63 | If you think there's a better way to code this in either language, open a
64 | pull request on our [GitHub
65 | repository](https://github.com/morpionZ/R-vs-Python/tree/master/Deadliest%20movies%20scrape/code)
66 | or leave a note with suggestions in the comments below.
67 |
68 | ##### 2 - Step by step process #####
69 |
70 | First things first, let's set up our working environment by loading some
71 | necessary libraries.
72 | 73 | 74 | 75 | ```r 76 | # Load libraries 77 | # No additional libraries needed here. Yeah! 78 | 79 | ``` 80 | 81 | ```python 82 | # String parsing libraries 83 | from imdb import IMDb 84 | import pandas as pd 85 | import re 86 | ``` 87 | 88 | 89 | Randy is lucky today. Someone else has already written a package 90 | (['IMDbPY'](http://imdbpy.sourceforge.net/)) to scrape data from IMDb. 91 | Unfortunately for me, R users are too busy working with serious data sets to 92 | take the time to write such a package for my favorite data processing 93 | language. [Hadley Wickham](http://had.co.nz/) has included a ['movie' data 94 | set](http://had.co.nz/data/movies/) in the [ggplot2](http://ggplot2.org/) 95 | package that contains some of the information stored on IMDb, but some of the 96 | pieces we need for today's challenge are missing. 97 | 98 | Since I am not easily discouraged, I decided to write my own IMDb scraping 99 | function (see below). It is not as sophisticated as the Python package Randy 100 | is using today, but it does the job until someone else decides to write a 101 | more complete R/IMDb package. As you will see, I am using the same scraping 102 | technique (XPath) as the one I used in the first part of the challenge. 103 | 104 | 105 | 106 | ```r 107 | # Create IMDB scraper 108 | IMDb <- function(ID) { 109 | # Retrieve movie info from IMDb.com. 110 | # 111 | # Args: 112 | # ID: IDs of the movies. 113 | # 114 | # Returns: 115 | # A data frame containing one line per movie, and nine columns: movie ID, 116 | # film title, year of release, duration in minutes, MPAA rating, genre(s), 117 | # director(s), IMDb rating, and full cast. 118 | 119 | # Load required libraries 120 | require(XML) 121 | require(pbapply) # Apply functions with progress bars!!! 
122 |
123 | # Wrap core of the function in do.call and pblapply in order to
124 | # pseudo-vectorize it (pblapply) and return a data frame (do.call)
125 | info <- do.call(rbind, pblapply(ID, FUN = function(ID) {
126 | # Create movie URL on IMDb.com
127 | URL <- paste0("http://www.imdb.com/title/tt", ID)
128 |
129 | # Download and parse HTML of IMDb page
130 | parsed.html <- htmlParse(URL)
131 |
132 | # Find title
133 | Film <- xpathSApply(parsed.html, "//h1[@class='header']/span[@class='itemprop']", xmlValue)
134 |
135 | # Find year
136 | Year <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//h1[@class='header']/span[@class='nobr']", xmlValue)))
137 |
138 | # Find duration in minutes
139 | Length_Minutes <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//div[@class='infobar']/time[@itemprop='duration']", xmlValue)))
140 |
141 | # Find MPAA rating
142 | MPAA_Rating <- unname(xpathSApply(parsed.html, "//div[@class='infobar']/span/@content"))
143 | if (!is.character(MPAA_Rating)) { # Some movies don't have an MPAA rating
144 | MPAA_Rating <- "UNRATED"
145 | }
146 |
147 | # Find genre
148 | Genre <- paste(xpathSApply(parsed.html, "//span[@class='itemprop' and @itemprop='genre']", xmlValue), collapse='|')
149 |
150 | # Find director
151 | Director <- paste(xpathSApply(parsed.html, "//div[@itemprop='director']/a", xmlValue), collapse='|')
152 |
153 | # Find IMDB rating
154 | IMDB_rating <- as.numeric(xpathSApply(parsed.html, "//div[@class='titlePageSprite star-box-giga-star']", xmlValue))
155 |
156 | # Extract full cast from the full credits page
157 | parsed.html <- htmlParse(paste0(URL,"/fullcredits"))
158 | Full_Cast <- paste(xpathSApply(parsed.html, "//span[@itemprop='name']", xmlValue), collapse='|')
159 |
160 | data.frame(ID = ID, Film = Film, Year = Year, Length_Minutes = Length_Minutes,
161 | MPAA_Rating = MPAA_Rating, Genre = Genre,
162 | Director = Director, IMDB_rating = IMDB_rating, Full_Cast = Full_Cast)
163 | }))
164 | }
165 |
166 | ```
167 |
168 | ```python
169 | imdb_access = IMDb()
170 | ```
171 |
172 |
173 | Randy and I now have a working IMDb scraper. We can start collecting and
174 | organizing the data that we need.
175 |
176 | First, let's load the data we collected last time.
177 |
178 |
179 |
180 | ```r
181 | # Load data from last challenge
182 | data <- read.csv("movies-R.csv")
183 |
184 | ```
185 |
186 | ```python
187 | movie_data = pd.read_csv("movies.csv")
188 | ```
189 |
190 |
191 | Then, we will extract the movie IMDb ID from the IMDb URL we collected last
192 | week. It's easy: it's the only number in the URL.
193 |
194 |
195 |
196 | ```r
197 | # For each movie, extract IMDb info and append it to the data
198 | data <- within(data, {
199 | # Extract ID number
200 | IMDB_ID <- gsub("[^0-9]", "", IMDB_URL)
201 |
202 | ```
203 |
204 | ```python
205 | # Grab only the movie number out of the IMDB URL
206 | movie_data["Movie_Number"] = movie_data["IMDB_URL"].apply(lambda x: re.sub("[^0-9]", "", x))
207 | ```
208 |
209 |
210 | Now that this is done, we will simply let the IMDb scraper collect the data
211 | we want and we will append it to the data from the first part of the
212 | challenge.
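Before unleashing the scraper on the whole list, it is worth a dry run on a single film to check that the XPath selectors still match IMDb's page layout. A minimal sanity check (not in the original script; tt0133093 is The Matrix, but any valid IMDb ID will do):

```r
# Sanity check (not in the original script): scrape one known movie and
# inspect the result before looping over the full data set.
test <- IMDb("0133093")
str(test)
```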
213 |
214 |
215 |
216 | ```r
217 | # Download IMDb info into a temporary variable
218 | IMDB_Info <- IMDb(IMDB_ID)
219 |
220 | # Save MPAA rating
221 | MPAA_Rating <- IMDB_Info$MPAA_Rating
222 |
223 | # Save genre(s)
224 | Genre <- IMDB_Info$Genre
225 |
226 | # Save director(s)
227 | Director <- IMDB_Info$Director
228 |
229 | # Save duration in minutes
230 | Length_Minutes <- IMDB_Info$Length_Minutes
231 |
232 | # Save IMDb rating
233 | IMDB_rating <- IMDB_Info$IMDB_rating
234 |
235 | # Save full cast
236 | Full_Cast <- IMDB_Info$Full_Cast
237 |
238 | # Delete IMDb info
239 | IMDB_Info <- NULL
240 | })
241 |
242 | ```
243 |
244 | ```python
245 | with open("film-death-counts-Python.csv", "wb") as out_file:
246 |     out_file.write("Film,Year,Body_Count,MPAA_Rating,Genre,Director,Actors,Length_Minutes,IMDB_Rating\n")
247 |
248 |     for movie_entry in movie_data.iterrows():
249 |         # Use a try-catch on the loop to prevent temporary connection-related issues from stopping the scrape
250 |         try:
251 |             movie = imdb_access.get_movie(movie_entry[1]["Movie_Number"])
252 |             movie_fields = []
253 |
254 |             # Remove non-ASCII character encodings and commas from movie titles
255 |             movie_fields.append(movie["title"].encode("ascii", "replace").replace(",", ""))
256 |             movie_fields.append(str(movie["year"]))
257 |             movie_fields.append(str(movie_entry[1]["Body_Count"]))
258 |
259 |             # Some movies don't have MPAA Ratings on IMDB
260 |             try:
261 |                 movie_fields.append(str(movie["mpaa"].split(" ")[1]))
262 |             except:
263 |                 movie_fields.append("")
264 |
265 |             # For movies with multiple genres/directors/actors, join them with bars |
266 |             movie_fields.append(str("|".join(movie["genres"])))
267 |             movie_fields.append(str("|".join([str(x) for x in movie["director"]])))
268 |             movie_fields.append(str("|".join([str(x) for x in movie["cast"]])))
269 |
270 |             movie_fields.append(str(int(movie["runtime"][0].split(":")[-1])))
271 |             movie_fields.append(str(float(movie["rating"])))
272 | ```
273 |
274 |
275 | And finally, all that is left to do is to save the complete data set into a
276 | .csv file and close the script.
277 |
278 |
279 |
280 | ```r
281 | write.csv(data, file = "movies-R-full.csv")
282 |
283 | ```
284 |
285 | ```python
286 |             # All entries are comma-delimited
287 |             out_file.write(",".join(movie_fields) + "\n")
288 |
289 |         except Exception as e:
290 |             print "Error with", str(movie)
291 | ```
292 |
293 |
294 | That's it! You should now have a .csv file somewhere on your computer
295 | containing all the information we just scraped in both parts of this
296 | challenge.
297 |
298 | Sorry it took us so long to complete this part, but beginnings of semesters
299 | are always very busy times at the university.
300 |
301 | Stay tuned for our next challenge! It will be about fitting a linear
302 | regression, running basic diagnostic tests and plotting the resulting
303 | regression line with its confidence interval.
304 |
305 | ___
306 |
307 | #### 3 - Source code ####
308 |
309 | The full R and Python source code is available
310 | [here](https://github.com/morpionZ/R-vs-Python/tree/master/Deadliest%20movies%20scrape/code).
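One closing aside: Randy's loop wraps each request in a try/except block so that a dropped connection does not kill the whole scrape, while my R function stops at the first error. A sketch of an equivalent safety net in R (the safe.IMDb wrapper is hypothetical, not part of the script above):

```r
# Hypothetical wrapper (not in the original script): return NULL instead
# of aborting when a single movie page fails to download or parse.
safe.IMDb <- function(ID) {
  tryCatch(IMDb(ID),
           error = function(e) {
             message("Error with ID ", ID, ": ", conditionMessage(e))
             NULL
           })
}
```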
311 | 312 | -------------------------------------------------------------------------------- /Deadliest movies scrape/pandoc_config.txt: -------------------------------------------------------------------------------- 1 | format: html 2 | c: custom.css 3 | s: 4 | S: 5 | mathjax: 6 | o: notebook.html 7 | -------------------------------------------------------------------------------- /Deadliest movies scrape/pandoc_config2.txt: -------------------------------------------------------------------------------- 1 | format: html 2 | c: custom.css 3 | s: 4 | S: 5 | mathjax: 6 | o: notebook2.html 7 | -------------------------------------------------------------------------------- /Deadliest movies scrape/programming_cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies scrape/programming_cat.jpg -------------------------------------------------------------------------------- /Deadliest movies/bloody_gun.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/bloody_gun.jpg -------------------------------------------------------------------------------- /Deadliest movies/code/code.R: -------------------------------------------------------------------------------- 1 | #' Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier) 2 | #' 3 | #' This script is free software: you can redistribute it and/or modify it under 4 | #' the terms of the GNU General Public License as published by the Free Software 5 | #' Foundation, either version 3 of the License, or (at your option) any later 6 | #' version. 7 | #' 8 | #' This script is distributed in the hope that it will be useful, but WITHOUT 9 | #' ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 10 | #' FOR A PARTICULAR PURPOSE. 11 | #' 12 | #' See the GNU General Public License for more details. 13 | #' 14 | #' You should have received a copy of the GNU General Public License along with 15 | #' this script. If not, see http://www.gnu.org/licenses/. 16 | #' 17 | 18 | #' **Document title:** R vs Python - Round 1 19 | #' 20 | #' **Date:** January 5, 2014 21 | #' 22 | #' **Author:** Simon Garnier (http://www.theswarmlab.com / @sjmgarnier) 23 | #' 24 | #' **Description:** This script generates a pretty barchart representing the top 25 | #' 25 most violent movies ordered by number of on screen deaths per minute. For 26 | #' more information, see http://www.theswarmlab.com/r-vs-python-round-1/ 27 | #' 28 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)). 29 | #' 30 | 31 | # Load libraries 32 | library(lattice) # Very versatile graphics package 33 | library(latticeExtra) # Addition to "lattice" that makes layering graphs a 34 | # breathe, and I'm a lazy person, so why not 35 | 36 | # Load data into a data frame 37 | body.count.data <- within(read.csv("http://files.figshare.com/1332945/film_death_counts.csv"), { 38 | 39 | # Compute on screen deaths per minute for each movie. 
40 | Deaths_Per_Minute <- Body_Count / Length_Minutes 41 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) # useful later 42 | 43 | # Combine film title and release date into a new factor column with levels 44 | # ordered by ascending violence 45 | Full_Title <- paste0(Film, " (", Year, ")") 46 | Full_Title <- ordered(Full_Title, levels = rev(unique(Full_Title[ord]))) 47 | 48 | # Combine number of on screen death per minute and duration of the movies into 49 | # a new character string column 50 | Deaths_Per_Minute_With_Length <- paste0(round(Deaths_Per_Minute, digits=2), " (", Length_Minutes, " mins)") 51 | 52 | }) 53 | 54 | # Reorder "body.count.data" by (descending) number of on screen deaths per minute 55 | body.count.data <- body.count.data[body.count.data$ord, ] 56 | 57 | # Select top 25 most violent movies by number of on screen deaths per minute 58 | body.count.data <- body.count.data[1:25,] 59 | 60 | # Generate base graph 61 | graph <- barchart(Full_Title ~ Deaths_Per_Minute, data = body.count.data) 62 | graphics.off() 63 | dev.new(width = 10, height = 8) 64 | print(graph) 65 | 66 | # Create theme 67 | my.bloody.theme <- within(trellis.par.get(), { # Initialize theme with default value 68 | axis.line$col <- NA # Remove axes 69 | plot.polygon <- within(plot.polygon, { 70 | col <- "#8A0606" # Set bar colors to a nice bloody red 71 | border <- NA # Remove bars' outline 72 | }) 73 | axis.text$cex <- 1 # Default axis text size is a bit small. Make it bigger 74 | layout.heights <- within(layout.heights, { 75 | bottom.padding <- 0 # Remove bottom padding 76 | axis.bottom <- 0 # Remove axis padding at the bottom of the graph 77 | axis.top <- 0 # Remove axis padding at the top of the graph 78 | }) 79 | }) 80 | 81 | # Update figure with new theme + other improvements (like a title for instance) 82 | graph <- update( 83 | graph, 84 | main ="25 most violence packed films by deaths per minute", # Title of the barchart 85 | par.settings = my.bloody.theme, # Use custom theme 86 | xlab = NULL, # Remove label of x axis 87 | scales = list(x = list(at = NULL)), # Remove rest of x axis 88 | xlim = c(0, 6.7), # Set graph limits along x axis to accomodate the additional text (requires some trial and error) 89 | box.width = 0.75) # Default bar width is a bit small. Make it bigger) 90 | 91 | print(graph) 92 | 93 | # Add number of on screen deaths per minute and duration of movies at the end of each bar 94 | graph <- graph + layer(with(body.count.data, 95 | panel.text( 96 | Deaths_Per_Minute, # x position of the text 97 | 25:1, # y position of the text 98 | pos = 4, # Position of the text relative to the x and y position (4 = to the right) 99 | Deaths_Per_Minute_With_Length))) # Text to display 100 | 101 | # Print graph 102 | print(graph) 103 | 104 | # Load additional libraries 105 | library(jpeg) # To read JPG images 106 | library(grid) # Graphics library with better image plotting capabilities 107 | 108 | # Download a pretty background image; mode is set to "wb" because it seems that 109 | # Windows needs it. 
I don't use Windows, I can't confirm 110 | download.file(url = "http://www.theswarmlab.com/wp-content/uploads/2014/01/bloody_gun.jpg", 111 | destfile = "bloody_gun.jpg", quiet = TRUE, mode = "wb") 112 | 113 | # Load gun image using "readJPEG" from the "jpeg" package 114 | img <- readJPEG("bloody_gun.jpg") 115 | 116 | # Add image to graph using "grid.raster" from the "grid" package 117 | graph <- graph + layer_( 118 | grid.raster( 119 | as.raster(img), # Image as a raster 120 | x = 1, # x location of image "Normalised Parent Coordinates" 121 | y = 0, # y location of image "Normalised Parent Coordinates" 122 | height = 0.7, # Height of the image. 1 indicates that the image height is equal to the graph height 123 | just = c("right", "bottom"))) # Justification of the image relative to its x and y locations 124 | 125 | # Print graph 126 | print(graph) 127 | -------------------------------------------------------------------------------- /Deadliest movies/code/code.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2014 Randal S. Olson 3 | 4 | This file is a script that makes pretty bar charts. It was written to be executed 5 | in IPython Notebook. 6 | 7 | This script is free software: you can redistribute it and/or modify it under the 8 | terms of the GNU General Public License as published by the Free Software Foundation, 9 | either version 3 of the License, or (at your option) any later version. 10 | 11 | This script is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 12 | without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 | See the GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License along with this script. 16 | If not, see http://www.gnu.org/licenses/. 
17 | """ 18 | 19 | # This starts the IPython Notebook pylab module, useful for plotting and interactive scientific computing 20 | %pylab inline 21 | from pandas import read_csv 22 | 23 | # Read the data into a pandas DataFrame 24 | body_count_data = read_csv("http://files.figshare.com/1332945/film_death_counts.csv") 25 | 26 | # Divide the body counts by the length of the film 27 | body_count_data["Deaths_Per_Minute"] = (body_count_data["Body_Count"].apply(float).values / 28 | body_count_data["Length_Minutes"].values) 29 | 30 | # Only keep the top 25 highest kills per minute films 31 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=False)[:25] 32 | 33 | # Change the order of the data so highest kills per minute films are on top in the plot 34 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=True) 35 | 36 | # Generate the full titles for the movies: movie name (year) 37 | full_title = [] 38 | 39 | for film, year in zip(body_count_data["Film"].values, body_count_data["Year"].values): 40 | full_title.append(film + " (" + str(year) + ")") 41 | 42 | body_count_data["Full_Title"] = array(full_title) 43 | 44 | fig = plt.figure(figsize=(8,12)) 45 | 46 | # Plot the red horizontal bars 47 | rects = plt.barh(range(len(body_count_data["Deaths_Per_Minute"])), 48 | body_count_data["Deaths_Per_Minute"], 49 | height=0.8, 50 | align="center", 51 | color="#8A0707", 52 | edgecolor="none") 53 | 54 | # This function adds the deaths per minute label to the right of the bars 55 | def autolabel(rects): 56 | for i, rect in enumerate(rects): 57 | width = rect.get_width() 58 | label_text = (str(round(float(width), 2)) + 59 | " (" + str(body_count_data["Length_Minutes"].values[i]) + 60 | " mins)") 61 | 62 | plt.text(width + 0.25, 63 | rect.get_y() + rect.get_height() / 2., 64 | label_text, 65 | ha="left", 66 | va="center", 67 | fontsize=14) 68 | 69 | autolabel(rects) 70 | 71 | # Add the film labels to left of the bars (y-axis) 72 | yticks(range(len(body_count_data["Full_Title"])), body_count_data["Full_Title"].values, fontsize=14) 73 | 74 | # Don't have any x tick labels 75 | xticks(arange(0, 5, 1), [""]) 76 | 77 | # Plot styling 78 | 79 | # Remove the plot frame lines 80 | ax = axes() 81 | ax.spines["top"].set_visible(False) 82 | ax.spines["right"].set_visible(False) 83 | ax.spines["left"].set_visible(False) 84 | ax.spines["bottom"].set_visible(False) 85 | 86 | # y-axis ticks on the left and x-axis ticks on the bottom 87 | ax.yaxis.tick_left() 88 | ax.xaxis.tick_bottom() 89 | 90 | # Color the y-axis ticks the same dark red color, and the x-axis ticks white 91 | ax.tick_params(axis="y", color="#8A0707") 92 | ax.tick_params(axis="x", color="white") 93 | 94 | # Don't show the x axis tick markers 95 | ax.xaxis.grid(color="white", linestyle="-") 96 | 97 | # Save the figure as a PNG 98 | # We can also save this as a PDF, JPG, TIFF, or most other image formats 99 | savefig("25-Violence-Packed-Films.png", bbox_inches="tight") 100 | -------------------------------------------------------------------------------- /Deadliest movies/custom.css: -------------------------------------------------------------------------------- 1 | body { 2 | font: 14px/1.5em "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | color: #777; 4 | -webkit-font-smoothing: antialiased; /* Fix for webkit rendering */ 5 | -webkit-text-size-adjust: 100%; 6 | /*font-family: "Avenir Next", Helvetica, Arial, sans-serif;*/ 7 | padding:1em; 8 | margin:auto; 9 | max-width:10in; 10 | } 11 | 12 | h1, h2, 
h3, h4, h5, h6 { 13 | 14 | font-weight: normal; } 15 | h1 a, h2 a, h3 a, h4 a, h5 a, h6 a { font-weight: inherit; } 16 | h1 { font-size: 46px; line-height: 50px; margin-bottom: 14px;} 17 | h2 { font-size: 35px; line-height: 40px; margin-bottom: 10px; } 18 | h3 { font-size: 28px; line-height: 34px; margin-bottom: 8px; } 19 | h4 { font-size: 21px; line-height: 30px; margin-bottom: 4px; } 20 | h5 { font-size: 17px; line-height: 24px; } 21 | h6 { font-size: 14px; line-height: 21px; } 22 | .subheader { color: #777; } 23 | 24 | p { margin: 0 0 20px 0; } 25 | p img { margin: 0; } 26 | p.lead { font-size: 21px; line-height: 27px; color: #444; } 27 | 28 | em { font-style: italic; } 29 | strong { font-weight: bold; } 30 | small { font-size: 80%; } 31 | 32 | hr { 33 | height: 0.2em; 34 | border: 0; 35 | color: #CCCCCC; 36 | background-color: #CCCCCC; 37 | } 38 | 39 | p, blockquote, ul, ol, dl, li, table, pre { 40 | margin: 15px 0; 41 | text-align: justify; 42 | } 43 | 44 | a, a:visited { color: #333; text-decoration: underline; outline: 0; } 45 | a:hover, a:focus { color: #000; } 46 | p a, p a:visited { line-height: inherit; } 47 | 48 | #message { 49 | border-radius: 6px; 50 | border: 1px solid #ccc; 51 | display:block; 52 | width:100%; 53 | height:60px; 54 | margin:6px 0px; 55 | } 56 | 57 | button, #ws { 58 | font-size: 10pt; 59 | padding: 4px 6px; 60 | border-radius: 5px; 61 | border: 1px solid #bbb; 62 | background-color: #eee; 63 | } 64 | 65 | code, pre, #ws, #message { 66 | font-family: Monaco; 67 | font-size: 8pt; 68 | border-radius: 3px; 69 | background-color: #F8F8F8; 70 | color: inherit; 71 | } 72 | 73 | code { 74 | border: 1px solid #EAEAEA; 75 | margin: 0 2px; 76 | padding: 0 5px; 77 | } 78 | 79 | pre.r { 80 | border: 2px solid #8A0606; 81 | } 82 | 83 | pre.r:before { 84 | content: 'R code \A'; 85 | color: #8A0606; 86 | font-weight: bold; 87 | } 88 | 89 | pre.python { 90 | border: 2px solid #068A06; 91 | } 92 | 93 | pre.python:before { 94 | content: 'Python code \A'; 95 | color: #068A06; 96 | font-weight: bold; 97 | } 98 | 99 | img { 100 | max-width: 100%; 101 | height: auto; 102 | width: auto\9; /* ie8 */ 103 | } 104 | 105 | pre { 106 | border: 1px solid #CCCCCC; 107 | overflow: auto; 108 | padding: 4px 8px; 109 | } 110 | 111 | pre > code { 112 | border: 0; 113 | margin: 0; 114 | padding: 0; 115 | } 116 | 117 | blockquote, blockquote p { font-size: 12px; line-height: 24px; color: #000; font-style: italic; } 118 | blockquote { margin: 0 0 20px; padding: 9px 50px 0 49px; border-left: 1px solid #ddd; } 119 | blockquote cite { display: block; font-size: 12px; color: #555; } 120 | blockquote cite:before { content: "\2014 \0020"; } 121 | blockquote cite a, blockquote cite a:visited, blockquote cite a:visited { color: #555; } 122 | 123 | #ws { background-color: #f8f8f8; } 124 | 125 | .send { color:#77bb77; } 126 | .server { color:#7799bb; } 127 | .error { color:#AA0000; } 128 | -------------------------------------------------------------------------------- /Deadliest movies/figure/baseGraphR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figure/baseGraphR.png -------------------------------------------------------------------------------- /Deadliest movies/figure/gunR.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figure/gunR.png -------------------------------------------------------------------------------- /Deadliest movies/figure/prettyR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figure/prettyR.png -------------------------------------------------------------------------------- /Deadliest movies/figure/rightLabelsR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figure/rightLabelsR.png -------------------------------------------------------------------------------- /Deadliest movies/figurePy/basePy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figurePy/basePy.png -------------------------------------------------------------------------------- /Deadliest movies/figurePy/finalPy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figurePy/finalPy.png -------------------------------------------------------------------------------- /Deadliest movies/figurePy/prettyPy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figurePy/prettyPy.png -------------------------------------------------------------------------------- /Deadliest movies/pandoc_config.txt: -------------------------------------------------------------------------------- 1 | format: html 2 | c: custom.css 3 | s: 4 | S: 5 | mathjax: 6 | o: run.html 7 | -------------------------------------------------------------------------------- /Deadliest movies/run.R: -------------------------------------------------------------------------------- 1 | #+ licence, echo=FALSE 2 | # Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier) 3 | # 4 | # This script is free software: you can redistribute it and/or modify it under 5 | # the terms of the GNU General Public License as published by the Free Software 6 | # Foundation, either version 3 of the License, or (at your option) any later 7 | # version. 8 | # 9 | # This script is distributed in the hope that it will be useful, but WITHOUT ANY 10 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 11 | # A PARTICULAR PURPOSE. 12 | # 13 | # See the GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License along with 16 | # this script. If not, see http://www.gnu.org/licenses/. 
17 | # 18 | # You can generate the HTML files by running: 19 | # library(knitr) 20 | # spin("run.R") 21 | # pandoc("run.md", config = "pandoc_config.txt") 22 | 23 | 24 | #+ 25 | #' **Document title:** R vs Python - Round 1 26 | #' 27 | #' **Date:** January 5, 2014 28 | #' 29 | #' **Text by:** Simon Garnier ([www.theswarmlab.com](http://www.theswarmlab.com) 30 | #' / [\@sjmgarnier](http://twitter.com/sjmgarnier)) 31 | #' 32 | #' **R code by:** Simon Garnier 33 | #' ([www.theswarmlab.com](http://www.theswarmlab.com) / 34 | #' [\@sjmgarnier](http://twitter.com/sjmgarnier)) 35 | #' 36 | #' **Python code by:** Randy Olson 37 | #' ([www.randalolson.com](http://www.randalolson.com) / 38 | #' [\@randal_olson](http://twitter.com/randal_olson)) 39 | #' 40 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)), 41 | #' knitr ([www.yihui.name/knitr/](http://yihui.name/knitr/)) and pandoc 42 | #' ([www.johnmacfarlane.net/pandoc/](http://johnmacfarlane.net/pandoc/)). Python 43 | #' figures generated with iPython Notebook 44 | #' ([www.ipython.org/notebook.html](http://ipython.org/notebook.html)). 45 | #' 46 | 47 | 48 | #' ___ 49 | #' 50 | #' #### Foreword #### 51 | #' 52 | #' My friend Randy Olson and I got into the habit to argue about the relative 53 | #' qualities of our favorite languages for data analysis and visualization. I am 54 | #' an enthusiastic R user ([www.r-project.org](http://www.r-project.org)) while 55 | #' Randy is a fan of Python ([www.python.org](http://www.python.org)). One thing 56 | #' we agree on however is that our discussions are meaningless unless we 57 | #' actually put R and Python to a series of tests to showcase their relative 58 | #' strengths and weaknesses. Essentially we will set a common goal (*e.g.*, 59 | #' perform a particular type of data analysis or draw a particular type of 60 | #' graph) and create the R and Python codes to achieve this goal. And since 61 | #' Randy and I are all about sharing, open source and open access, we decided to 62 | #' make public the results of our friendly challenges so that you can help us 63 | #' decide between R and Python and, hopefully, also learn something along the 64 | #' way. 65 | #' 66 | 67 | 68 | #' ___ 69 | #' 70 | #' #### Today's challenge: where we learn that Hollywood's cemetery is full #### 71 | #' 72 | #' ##### 1 - Introduction ##### 73 | #' 74 | #' For this first challenge, we will use data collected by Randy for his recent 75 | #' post on the ["Top 25 most violence packed films" in the history of the movie 76 | #' industry](www.randalolson.com/2013/12/31/most-violence-packed-films/). For 77 | #' his post, Randy generated a simple horizontal barchart showing the top 25 78 | #' most violent films ordered by number of on screen deaths per minute. In the 79 | #' rest of this document, we will show you how to reproduce this graph using 80 | #' Python and how to achieve a similar result with R. We will detail the 81 | #' different steps of the process and provide for each step the corresponding 82 | #' code (red boxes for R, green boxes for Python). You will also find the entire 83 | #' codes at the end of this document. 84 | #' 85 | #' And now without further ado, let's get started! 86 | #' 87 | 88 | #' ##### 2 - Step by step process ##### 89 | #' 90 | #' First things first, let's set up our working environment by loading some 91 | #' necessary libraries. 
92 | #' 93 | 94 | #+ libR, message=FALSE 95 | # Load libraries 96 | library(lattice) # Very versatile graphics package 97 | library(latticeExtra) # Addition to "lattice" that makes layering graphs a 98 | # breathe, and I'm a lazy person, so why not 99 | 100 | #+ libPy, eval=FALSE, engine="python" 101 | # This starts the IPython Notebook pylab module, useful for plotting and 102 | # interactive scientific computing 103 | %pylab inline 104 | from pandas import * 105 | 106 | #' Now let's load the data for today's job. The raw data were scraped by Randy 107 | #' (using Python) from [www.MovieBodyCounts.com](http://www.MovieBodyCounts.com) 108 | #' and he generously provided the result of his hard work on FigShare at this 109 | #' address: 110 | #' [http://dx.doi.org/10.6084/m9.figshare.889719](http://dx.doi.org/10.6084/m9.figshare.889719). 111 | #' 112 | 113 | #+ dataR 114 | # Load data into a data frame 115 | body.count.data <- read.csv("http://files.figshare.com/1332945/film_death_counts.csv") 116 | 117 | #+ dataPy, eval=FALSE, engine='python' 118 | # Read the data into a pandas DataFrame 119 | body_count_data = read_csv("http://files.figshare.com/1332945/film_death_counts.csv") 120 | 121 | #' For each movie, the data frame contains a column for the total number of on 122 | #' screen deaths ("Body_Count") and a column for the duration 123 | #' ("Length_Minutes"). We will now create an extra column for the number of on 124 | #' screen deaths per minute of each movie ("Deaths_Per_Minute") 125 | #' 126 | 127 | #+ deathsPerMinR 128 | # Compute on screen deaths per minute for each movie. 129 | body.count.data <- within(body.count.data, { 130 | Deaths_Per_Minute <- Body_Count / Length_Minutes 131 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) # useful later 132 | }) 133 | 134 | #+ deathsPerMinPy, eval=FALSE, engine="python" 135 | # Divide the body counts by the length of the film 136 | body_count_data["Deaths_Per_Minute"] = (body_count_data["Body_Count"].apply(float).values / 137 | body_count_data["Length_Minutes"].values) 138 | 139 | #' Now we will reorder the data frame by (descending) number of on screen deaths 140 | #' per minute, and select the top 25 most violent movies according to this criterion. 141 | #' 142 | 143 | #+ top25R 144 | # Reorder "body.count.data" by (descending) number of on screen deaths per minute 145 | body.count.data <- body.count.data[body.count.data$ord, ] 146 | 147 | # Select top 25 most violent movies by number of on screen deaths per minute 148 | body.count.data <- body.count.data[1:25,] 149 | 150 | #+ top25Py, eval=FALSE, engine="python" 151 | # Only keep the top 25 highest kills per minute films 152 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=False)[:25] 153 | 154 | # Change the order of the data so highest kills per minute films are on top in the plot 155 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=True) 156 | 157 | #' In Randy's graph, the "y" axis shows the film title with the release date. We 158 | #' will now generate the full title for each movie following a "Movie name 159 | #' (year)" format, and append it to the data frame. 
160 | #'
161 |
162 | #+ filmTitleR
163 | # Combine film title and release date into a new factor column with levels
164 | # ordered by ascending violence
165 | body.count.data <- within(body.count.data, {
166 | Full_Title <- paste0(Film, " (", Year, ")")
167 | ord <- order(Deaths_Per_Minute, decreasing = TRUE)
168 | Full_Title <- ordered(Full_Title, levels = rev(unique(Full_Title[ord])))
169 | })
170 |
171 | #+ filmTitlePy, eval=FALSE, engine="python"
172 | # Generate the full titles for the movies: movie name (year)
173 | full_title = []
174 |
175 | for film, year in zip(body_count_data["Film"].values, body_count_data["Year"].values):
176 |     full_title.append(film + " (" + str(year) + ")")
177 |
178 | body_count_data["Full_Title"] = array(full_title)
181 |
182 | #' Now we are ready to generate the barchart. We're going to start with the
183 | #' default options and then we will make this thing look pretty.
184 | #'
185 |
186 | #+ baseGraphR, fig.width=10, fig.height=8, fig.align="center", dev="png"
187 | # Generate base graph
188 | graph <- barchart(Full_Title ~ Deaths_Per_Minute, data = body.count.data)
189 | print(graph)
190 |
191 | #+ baseGraphPy, eval=FALSE, engine="python"
192 | # plot the bars
193 | fig = plt.figure(figsize=(8,12))
194 |
195 | # Plot the red horizontal bars
196 | rects = plt.barh(range(len(body_count_data["Deaths_Per_Minute"])),
197 |                  body_count_data["Deaths_Per_Minute"],
198 |                  height=0.8,
199 |                  align="center",
200 |                  color="#8A0707",
201 |                  edgecolor="none")
202 |
203 | # Add the film labels to left of the bars (y-axis)
204 | yticks(range(len(body_count_data["Full_Title"])), body_count_data["Full_Title"].values, fontsize=14)
205 |
206 | #'
-------------------------------------------------------------------------------- /Deadliest movies/run.html: --------------------------------------------------------------------------------
31 |Document title: R vs Python - Round 1
32 |Date: January 5, 2014
33 |Text by: Simon Garnier (www.theswarmlab.com / @sjmgarnier)
34 |R code by: Simon Garnier (www.theswarmlab.com / @sjmgarnier)
35 |Python code by: Randy Olson (www.randalolson.com / @randal_olson)
36 |Document generated with RStudio (www.rstudio.com), knitr (www.yihui.name/knitr/) and pandoc (www.johnmacfarlane.net/pandoc/). Python figures generated with iPython Notebook (www.ipython.org/notebook.html).
37 |My friend Randy Olson and I got into the habit of arguing about the relative qualities of our favorite languages for data analysis and visualization. I am an enthusiastic R user (www.r-project.org) while Randy is a fan of Python (www.python.org). One thing we agree on, however, is that our discussions are meaningless unless we actually put R and Python to a series of tests to showcase their relative strengths and weaknesses. Essentially we will set a common goal (e.g., perform a particular type of data analysis or draw a particular type of graph) and create the R and Python code to achieve this goal. And since Randy and I are all about sharing, open source and open access, we decided to make public the results of our friendly challenges so that you can help us decide between R and Python and, hopefully, also learn something along the way.
40 |For this first challenge, we will use data collected by Randy for his recent post on the “Top 25 most violence packed films” in the history of the movie industry. For his post, Randy generated a simple horizontal barchart showing the top 25 most violent films ordered by number of on screen deaths per minute. In the rest of this document, we will show you how to reproduce this graph using Python and how to achieve a similar result with R. We will detail the different steps of the process and provide for each step the corresponding code (red boxes for R, green boxes for Python). You will also find the complete code at the end of this document.
44 |And now without further ado, let’s get started!
45 |First things first, let’s set up our working environment by loading some necessary libraries.
47 |# Load libraries
48 | library(lattice) # Very versatile graphics package
49 | library(latticeExtra) # Addition to "lattice" that makes layering graphs a
50 | # breeze, and I'm a lazy person, so why not
51 | # This starts the IPython Notebook pylab module, useful for plotting and
52 | # interactive scientific computing
53 | %pylab inline
54 | from pandas import *
55 | Now let’s load the data for today’s job. The raw data were scraped by Randy (using Python) from www.MovieBodyCounts.com and he generously provided the result of his hard work on FigShare at this address: http://dx.doi.org/10.6084/m9.figshare.889719.
56 |# Load data into a data frame
57 | body.count.data <- read.csv("http://files.figshare.com/1332945/film_death_counts.csv")
58 | # Read the data into a pandas DataFrame
59 | body_count_data = read_csv("http://files.figshare.com/1332945/film_death_counts.csv")
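Before computing anything, a quick look at the structure of the freshly loaded data never hurts. A small check (mine, not part of the original post) confirming the columns the next steps rely on:

```r
# Quick sanity check (not in the original post): make sure the columns
# used below exist and have the expected types.
str(body.count.data[, c("Film", "Year", "Body_Count", "Length_Minutes")])
```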
60 | For each movie, the data frame contains a column for the total number of on screen deaths (“Body_Count”) and a column for the duration (“Length_Minutes”). We will now create an extra column for the number of on screen deaths per minute of each movie (“Deaths_Per_Minute”).
61 |# Compute on screen deaths per minute for each movie.
62 | body.count.data <- within(body.count.data, {
63 | Deaths_Per_Minute <- Body_Count / Length_Minutes
64 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) # useful later
65 | })
66 | # Divide the body counts by the length of the film
67 | body_count_data["Deaths_Per_Minute"] = (body_count_data["Body_Count"].apply(float).values /
68 | body_count_data["Length_Minutes"].values)
69 | Now we will reorder the data frame by (descending) number of on screen deaths per minute, and select the top 25 most violent movies according to this criterion.
70 |# Reorder "body.count.data" by (descending) number of on screen deaths per minute
71 | body.count.data <- body.count.data[body.count.data$ord, ]
72 |
73 | # Select top 25 most violent movies by number of on screen deaths per minute
74 | body.count.data <- body.count.data[1:25,]
75 | # Only keep the top 25 highest kills per minute films
76 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=False)[:25]
77 |
78 | # Change the order of the data so highest kills per minute films are on top in the plot
79 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=True)
80 | In Randy’s graph, the “y” axis shows the film title with the release date. We will now generate the full title for each movie following a “Movie name (year)” format, and append it to the data frame.
81 |# Combine film title and release date into a new factor column with levels
82 | # ordered by ascending violence
83 | body.count.data <- within(body.count.data, {
84 | Full_Title <- paste0(Film, " (", Year, ")")
85 | ord <- order(Deaths_Per_Minute, decreasing = TRUE)
86 | Full_Title <- ordered(Full_Title, levels = rev(unique(Full_Title[ord]))) # some films are duplicated! Bad Randy!
87 | })
88 | # Generate the full titles for the movies: movie name (year)
89 | full_title = []
90 |
91 | for film, year in zip(body_count_data["Film"].values, body_count_data["Year"].values):
92 | full_title.append(film + " (" + str(year) + ")")
93 |
94 | body_count_data["Full_Title"] = array(full_title)
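The ordered() call in the R version deserves a word: lattice draws the first factor level at the bottom of the chart, so reversing the levels is what puts the deadliest film on top. A toy illustration (made-up values, not the movie data):

```r
# Toy example (not the movie data): barchart() draws factor levels
# bottom-up, so the last level of the ordered factor ends up on top.
library(lattice)
vals <- c(a = 3, b = 1, c = 2)
lbls <- ordered(names(vals), levels = names(sort(vals)))  # levels: b, c, a
barchart(lbls ~ vals)  # largest bar ("a") is drawn at the top
```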
97 | Now we are ready to generate the barchart. We’re going to start with the default options and then we will make this thing look pretty.
98 |# Generate base graph
99 | graph <- barchart(Full_Title ~ Deaths_Per_Minute, data = body.count.data)
100 | print(graph)
101 | # plot the bars
103 | fig = plt.figure(figsize=(8,12))
104 |
105 | # Plot the red horizontal bars
106 | rects = plt.barh(range(len(body_count_data["Deaths_Per_Minute"])),
107 | body_count_data["Deaths_Per_Minute"],
108 | height=0.8,
109 | align="center",
110 | color="#8A0707",
111 | edgecolor="none")
112 |
113 | # Add the film labels to left of the bars (y-axis)
114 | yticks(range(len(body_count_data["Full_Title"])), body_count_data["Full_Title"].values, fontsize=14)
115 | OK, now let’s make this pretty.
120 |# Create theme
121 | my.bloody.theme <- within(trellis.par.get(), { # Initialize theme with default value
122 | axis.line$col <- NA # Remove axes
123 | plot.polygon <- within(plot.polygon, {
124 | col <- "#8A0606" # Set bar colors to a nice bloody red
125 | border <- NA # Remove bars' outline
126 | })
127 | axis.text$cex <- 1 # Default axis text size is a bit small. Make it bigger
128 | layout.heights <- within(layout.heights, {
129 | bottom.padding <- 0 # Remove bottom padding
130 | axis.bottom <- 0 # Remove axis padding at the bottom of the graph
131 | axis.top <- 0 # Remove axis padding at the top of the graph
132 | })
133 | })
134 |
135 | # Update figure with new theme + other improvements (like a title for instance)
136 | graph <- update(
137 | graph,
138 | main='25 most violence packed films by deaths per minute', # Title of the barchart
139 | par.settings = my.bloody.theme, # Use custom theme
140 | xlab = NULL, # Remove label of x axis
141 | scales=list(x=list(at=NULL)), # Remove rest of x axis
142 | xlim = c(0, 6.7), # Set graph limits along x axis to accommodate the additional text (requires some trial and error)
143 | box.width=0.75) # Default bar width is a bit small. Make it bigger
144 |
145 | print(graph)
146 | # Don't have any x tick labels
148 | xticks(arange(0, 5, 1), [""])
149 |
150 | # Plot styling
151 |
152 | # Remove the plot frame lines
153 | ax = axes()
154 | ax.spines["top"].set_visible(False)
155 | ax.spines["right"].set_visible(False)
156 | ax.spines["left"].set_visible(False)
157 | ax.spines["bottom"].set_visible(False)
158 |
# y-axis ticks on the left and x-axis ticks on the bottom
ax.yaxis.tick_left()
ax.xaxis.tick_bottom()
159 | # Color the y-axis ticks the same dark red color, and the x-axis ticks white
160 | ax.tick_params(axis="y", color="#8A0707")
161 | ax.tick_params(axis="x", color="white")
162 |
163 | ax.xaxis.grid(color="white", linestyle="-")
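As an aside, the xlim = c(0, 6.7) above was found by trial and error. One could instead derive the limit from the data; a hedged alternative (the 35% headroom for the text labels is a guess, not a rule):

```r
# Hypothetical alternative (not in the original post): size the x axis
# from the data, leaving headroom for the labels added in the next step.
xmax <- max(body.count.data$Deaths_Per_Minute) * 1.35
graph <- update(graph, xlim = c(0, xmax))
```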
164 | Finally, the last thing we want to add to our graph is the number of on screen deaths per minute and the duration of each movie, displayed to the right of each bar.
169 |# Combine number of on screen deaths per minute and duration of the movies into a new character string column
170 | body.count.data <- within(body.count.data, {
171 | Deaths_Per_Minute_With_Length = paste0(round(body.count.data$Deaths_Per_Minute, digits=2), " (", body.count.data$Length_Minutes, " mins)")
172 | })
173 |
174 | # Add number of on screen deaths per minute and duration of movies at the end of each bar
175 | graph <- graph + layer(with(body.count.data,
176 | panel.text(
177 | Deaths_Per_Minute, # x position of the text
178 | 25:1, # y position of the text
179 | pos = 4, # Position of the text relative to the x and y position (4 = to the right)
180 | Deaths_Per_Minute_With_Length))) # Text to display
181 |
182 | # Print graph
183 | print(graph)
184 | # This function adds the deaths per minute label to the right of the bars
186 | def autolabel(rects):
187 | for i, rect in enumerate(rects):
188 | width = rect.get_width()
189 | label_text = (str(round(float(width), 2)) +
190 | " (" + str(body_count_data["Length_Minutes"].values[i]) +
191 | " mins)")
192 |
193 | plt.text(width + 0.25,
194 | rect.get_y() + rect.get_height() / 2.,
195 | label_text,
196 | ha="left",
197 | va="center",
198 | fontsize=14)
199 |
200 | autolabel(rects)
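One fragile spot in the R snippet above: the y positions of the labels are hard-coded as 25:1, which silently breaks if the number of movies ever changes. A minimal size-agnostic tweak (hypothetical, not in the original code):

```r
# Hypothetical tweak (not in the original post): derive the y positions
# from the data instead of hard-coding 25 bars.
n <- nrow(body.count.data)
y.positions <- n:1  # use this in place of 25:1 in the panel.text() call
```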
201 | Just for fun, I decided to add to the R graph a little accessory in keeping with the general theme of this data set.
208 |# Load additional libraries
209 | library(jpeg) # To read JPG images
210 | library(grid) # Graphics library with better image plotting capabilities
211 |
212 | # Download a pretty background image; mode is set to "wb" because it seems that
213 | # Windows needs it. I don't use Windows, so I can't confirm.
214 | download.file(url = "http://www.theswarmlab.com/wp-content/uploads/2014/01/bloody_gun.jpg",
215 | destfile = "bloody_gun.jpg", quiet = TRUE, mode = "wb")
216 |
217 | # Load gun image using "readJPEG" from the "jpeg" package
218 | img <- readJPEG("bloody_gun.jpg")
219 |
220 | # Add image to graph using "grid.raster" from the "grid" package
221 | graph <- graph + layer_(
222 | grid.raster(
223 | as.raster(img), # Image as a raster
224 | x = 1, # x location of image "Normalised Parent Coordinates"
225 | y = 0, # y location of image "Normalised Parent Coordinates"
226 | height = 0.7, # Height of the image. 1 indicates that the image height is equal to the graph height
227 | just = c("right", "bottom"))) # Justification of the image relative to its x and y locations
228 |
229 | # Print graph
230 | print(graph)
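Randy's script ends by saving the figure with savefig(); the R code above only prints to the screen device. A minimal way to write the finished lattice graph to disk (file name and resolution are my choices, not the post's):

```r
# Hypothetical save step (not in the original post): mirror Python's
# savefig() by writing the final graph to a PNG file.
png("25-violence-packed-films-R.png", width = 10, height = 8,
    units = "in", res = 150)
print(graph)
dev.off()
```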
231 | The full R and Python source code is available here.
235 |For F# fans, Terje Tyldum has written his version of the code in F# here.
236 |Randy and I also recommend that you check out this post by Ramiro Gómez (@yaph) where he does a more in-depth analysis of the data set we used for today’s challenge.
237 | 238 | 239 | -------------------------------------------------------------------------------- /Deadliest movies/run.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | **Document title:** R vs Python - Round 1 7 | 8 | **Date:** January 5, 2014 9 | 10 | **Text by:** Simon Garnier ([www.theswarmlab.com](http://www.theswarmlab.com) 11 | / [\@sjmgarnier](http://twitter.com/sjmgarnier)) 12 | 13 | **R code by:** Simon Garnier 14 | ([www.theswarmlab.com](http://www.theswarmlab.com) / 15 | [\@sjmgarnier](http://twitter.com/sjmgarnier)) 16 | 17 | **Python code by:** Randy Olson 18 | ([www.randalolson.com](http://www.randalolson.com) / 19 | [\@randal_olson](http://twitter.com/randal_olson)) 20 | 21 | Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)), 22 | knitr ([www.yihui.name/knitr/](http://yihui.name/knitr/)) and pandoc 23 | ([www.johnmacfarlane.net/pandoc/](http://johnmacfarlane.net/pandoc/)). Python 24 | figures generated with iPython Notebook 25 | ([www.ipython.org/notebook.html](http://ipython.org/notebook.html)). 26 | 27 | ___ 28 | 29 | #### Foreword #### 30 | 31 | My friend Randy Olson and I got into the habit to argue about the relative 32 | qualities of our favorite languages for data analysis and visualization. I am 33 | an enthusiastic R user ([www.r-project.org](http://www.r-project.org)) while 34 | Randy is a fan of Python ([www.python.org](http://www.python.org)). One thing 35 | we agree on however is that our discussions are meaningless unless we 36 | actually put R and Python to a series of tests to showcase their relative 37 | strengths and weaknesses. Essentially we will set a common goal (*e.g.*, 38 | perform a particular type of data analysis or draw a particular type of 39 | graph) and create the R and Python codes to achieve this goal. And since 40 | Randy and I are all about sharing, open source and open access, we decided to 41 | make public the results of our friendly challenges so that you can help us 42 | decide between R and Python and, hopefully, also learn something along the 43 | way. 44 | 45 | ___ 46 | 47 | #### Today's challenge: where we learn that Hollywood's cemetery is full #### 48 | 49 | ##### 1 - Introduction ##### 50 | 51 | For this first challenge, we will use data collected by Randy for his recent 52 | post on the ["Top 25 most violence packed films" in the history of the movie 53 | industry](www.randalolson.com/2013/12/31/most-violence-packed-films/). For 54 | his post, Randy generated a simple horizontal barchart showing the top 25 55 | more violent films ordered by number of on screen deaths per minute. In the 56 | rest of this document, we will show you how to reproduce this graph using 57 | Python and how to achieve a similar result with R. We will detail the 58 | different steps of the process and provide for each step the corresponding 59 | code (red boxes for R, green boxes for Python). You will also find the entire 60 | codes at the end of this document. 61 | 62 | And now without further ado, let's get started! 63 | 64 | ##### 2 - Step by step process ##### 65 | 66 | First thing first, let's set up our working environment by loading some 67 | necessary libraries. 
68 | 69 | 70 | 71 | ```r 72 | # Load libraries 73 | library(lattice) # Very versatile graphics package 74 | library(latticeExtra) # Addition to "lattice" that makes layering graphs a 75 | # breathe, and I'm a lazy person, so why not 76 | ``` 77 | 78 | ```python 79 | # This starts the IPython Notebook pylab module, useful for plotting and 80 | # interactive scientific computing 81 | %pylab inline 82 | from pandas import * 83 | ``` 84 | 85 | 86 | Now let's load the data for today's job. The raw data were scraped by Randy 87 | (using Python) from [www.MovieBodyCounts.com](http://www.MovieBodyCounts.com) 88 | and he generously provided the result of his hard work on FigShare at this 89 | address: 90 | [http://dx.doi.org/10.6084/m9.figshare.889719](http://dx.doi.org/10.6084/m9.figshare.889719). 91 | 92 | 93 | 94 | ```r 95 | # Load data into a data frame 96 | body.count.data <- read.csv("http://files.figshare.com/1332945/film_death_counts.csv") 97 | ``` 98 | 99 | ```python 100 | # Read the data into a pandas DataFrame 101 | body_count_data = read_csv("http://files.figshare.com/1332945/film_death_counts.csv") 102 | ``` 103 | 104 | 105 | For each movie, the data frame contains a column for the total number of on 106 | screen deaths ("Body_Count") and a column for the duration 107 | ("Length_Minutes"). We will now create an extra column for the number of on 108 | screen deaths per minute of each movie ("Deaths_Per_Minute") 109 | 110 | 111 | 112 | ```r 113 | # Compute on screen deaths per minute for each movie. 114 | body.count.data <- within(body.count.data, { 115 | Deaths_Per_Minute <- Body_Count / Length_Minutes 116 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) # useful later 117 | }) 118 | ``` 119 | 120 | ```python 121 | # Divide the body counts by the length of the film 122 | body_count_data["Deaths_Per_Minute"] = (body_count_data["Body_Count"].apply(float).values / 123 | body_count_data["Length_Minutes"].values) 124 | ``` 125 | 126 | 127 | Now we will reorder the data frame by (descending) number of on screen deaths 128 | per minute, and select the top 25 most violent movies according to this criterion. 129 | 130 | 131 | 132 | ```r 133 | # Reorder "body.count.data" by (descending) number of on screen deaths per minute 134 | body.count.data <- body.count.data[body.count.data$ord, ] 135 | 136 | # Select top 25 most violent movies by number of on screen deaths per minute 137 | body.count.data <- body.count.data[1:25,] 138 | ``` 139 | 140 | ```python 141 | # Only keep the top 25 highest kills per minute films 142 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=False)[:25] 143 | 144 | # Change the order of the data so highest kills per minute films are on top in the plot 145 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=True) 146 | ``` 147 | 148 | 149 | In Randy's graph, the "y" axis shows the film title with the release date. We 150 | will now generate the full title for each movie following a "Movie name 151 | (year)" format, and append it to the data frame. 152 | 153 | 154 | 155 | ```r 156 | # Combine film title and release date into a new factor column with levels 157 | # ordered by ascending violence 158 | body.count.data <- within(body.count.data, { 159 | Full_Title <- paste0(Film, " (", Year, ")") 160 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) 161 | Full_Title <- ordered(Full_Title, levels = rev(unique(Full_Title[ord]))) # some films are duplicated! Bad Randy! 
162 | })
163 | ```
164 |
165 | ```python
166 | # Generate the full titles for the movies: movie name (year)
167 | full_title = []
168 |
169 | for film, year in zip(body_count_data["Film"].values, body_count_data["Year"].values):
170 |     full_title.append(film + " (" + str(year) + ")")
171 |
172 | body_count_data["Full_Title"] = array(full_title)
175 | ```
176 |
177 |
178 | Now we are ready to generate the barchart. We're going to start with the
179 | default options and then we will make this thing look pretty.
180 |
181 |
182 |
183 | ```r
184 | # Generate base graph
185 | graph <- barchart(Full_Title ~ Deaths_Per_Minute, data = body.count.data)
186 | print(graph)
187 | ```
188 |
189 |