├── Deadliest movies scrape
│   ├── code
│   │   ├── film-death-counts-Python.csv
│   │   ├── imdb-scraper.R
│   │   ├── imdb-scraper.py
│   │   ├── movie-scraper.R
│   │   ├── movie-scraper.py
│   │   ├── movies-R-full.csv
│   │   ├── movies-R.csv
│   │   ├── movies-python.csv
│   │   └── movies.csv
│   ├── custom.css
│   ├── notebook.R
│   ├── notebook.html
│   ├── notebook.md
│   ├── notebook2.R
│   ├── notebook2.html
│   ├── notebook2.md
│   ├── pandoc_config.txt
│   ├── pandoc_config2.txt
│   └── programming_cat.jpg
├── Deadliest movies
│   ├── bloody_gun.jpg
│   ├── code
│   │   ├── code.R
│   │   └── code.py
│   ├── custom.css
│   ├── figure
│   │   ├── baseGraphR.png
│   │   ├── gunR.png
│   │   ├── prettyR.png
│   │   └── rightLabelsR.png
│   ├── figurePy
│   │   ├── basePy.png
│   │   ├── finalPy.png
│   │   └── prettyPy.png
│   ├── pandoc_config.txt
│   ├── run.R
│   ├── run.html
│   └── run.md
├── Linear regression
│   ├── Linear regression.Rproj
│   ├── code
│   │   ├── code.R
│   │   └── code.ipynb
│   ├── custom.css
│   ├── figure
│   │   ├── graphBaseR.png
│   │   └── graphPredictR.png
│   ├── figurePy
│   │   ├── graphBasePy.png
│   │   └── graphPredictPy.png
│   ├── notebook.R
│   ├── notebook.html
│   ├── notebook.md
│   └── pandoc_config.txt
└── README.md

/Deadliest movies scrape/code/imdb-scraper.R:
--------------------------------------------------------------------------------
1 | #' Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier)
2 | #'
3 | #' This script is free software: you can redistribute it and/or modify it under
4 | #' the terms of the GNU General Public License as published by the Free Software
5 | #' Foundation, either version 3 of the License, or (at your option) any later
6 | #' version.
7 | #'
8 | #' This script is distributed in the hope that it will be useful, but WITHOUT
9 | #' ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
10 | #' FOR A PARTICULAR PURPOSE.
11 | #'
12 | #' See the GNU General Public License for more details.
13 | #'
14 | #' You should have received a copy of the GNU General Public License along with
15 | #' this script. If not, see http://www.gnu.org/licenses/.
16 | #'
17 |
18 | #' **Document title:** R vs Python - Round 2 (2/2)
19 | #'
20 | #' **Date:** February 2, 2014
21 | #'
22 | #' **Author:** Simon Garnier (http://www.theswarmlab.com / @sjmgarnier)
23 | #'
24 | #' **Description:** This script scrapes data out of www.imdb.com. For more
25 | #' information, see http://www.theswarmlab.com/r-vs-python-round-2/ and
26 | #' http://www.theswarmlab.com/r-vs-python-round-2-22
27 | #'
28 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)).
29 | #'
30 |
31 | # Load libraries
32 | # No additional libraries needed here. Yeah!
33 |
34 | # Create IMDB scraper
35 | IMDb <- function(ID) {
36 |   # Retrieve movie info from IMDb.com.
37 |   #
38 |   # Args:
39 |   #   ID: IDs of the movies.
40 |   #
41 |   # Returns:
42 |   #   A data frame containing one line per movie, and nine columns: movie ID,
43 |   #   film title, year of release, duration in minutes, MPAA rating, genre(s),
44 |   #   director(s), IMDb rating, and full cast.
45 |
46 |   # Load required libraries
47 |   require(XML)
48 |   require(pbapply)  # Apply functions with progress bars!!!
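  # Note: require() only warns (and returns FALSE) if a package is missing,
  # whereas library() would stop with an error; both packages are assumed to
  # be installed before running this scraper.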
49 |
50 |   # Wrap core of the function in do.call and pblapply in order to
51 |   # pseudo-vectorize it (pblapply) and return a data frame (do.call)
52 |   info <- do.call(rbind, pblapply(ID, FUN = function(ID) {
53 |     # Create movie URL on IMDb.com
54 |     URL <- paste0("http://www.imdb.com/title/tt", ID)
55 |
56 |     # Download and parse HTML of IMDb page
57 |     parsed.html <- htmlParse(URL)
58 |
59 |     # Find title
60 |     Film <- xpathSApply(parsed.html, "//h1[@class='header']/span[@class='itemprop']", xmlValue)
61 |
62 |     # Find year
63 |     Year <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//h1[@class='header']/span[@class='nobr']", xmlValue)))
64 |
65 |     # Find duration in minutes
66 |     Length_Minutes <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//div[@class='infobar']/time[@itemprop='duration']", xmlValue)))
67 |
68 |     # Find MPAA rating
69 |     MPAA_Rating <- unname(xpathSApply(parsed.html, "//div[@class='infobar']/span/@content"))
70 |     if (!is.character(MPAA_Rating)) {  # Some movies don't have an MPAA rating
71 |       MPAA_Rating <- "UNRATED"
72 |     }
73 |
74 |     # Find genre
75 |     Genre <- paste(xpathSApply(parsed.html, "//span[@class='itemprop' and @itemprop='genre']", xmlValue), collapse='|')
76 |
77 |     # Find director
78 |     Director <- paste(xpathSApply(parsed.html, "//div[@itemprop='director']/a", xmlValue), collapse='|')
79 |
80 |     # Find IMDB rating
81 |     IMDB_rating <- as.numeric(xpathSApply(parsed.html, "//div[@class='titlePageSprite star-box-giga-star']", xmlValue))
82 |
83 |     # Extract full cast from the full credits page
84 |     parsed.html <- htmlParse(paste0(URL, "/fullcredits"))
85 |     Full_Cast <- paste(xpathSApply(parsed.html, "//span[@itemprop='name']", xmlValue), collapse='|')
86 |
87 |     data.frame(ID = ID, Film = Film, Year = Year, Length_Minutes = Length_Minutes,
88 |                MPAA_Rating = MPAA_Rating, Genre = Genre,
89 |                Director = Director, IMDB_rating = IMDB_rating, Full_Cast = Full_Cast)
90 |   }))
91 | }
92 |
93 | # Load data from last challenge
94 | data <- read.csv("movies-R.csv")
95 |
96 | # For each movie, extract IMDb info and append it to the data
97 | data <- within(data, {
98 |   # Extract ID number
99 |   IMDB_ID <- gsub("[^0-9]", "", IMDB_URL)
100 |
101 |   # Download IMDb info into a temporary variable
102 |   IMDB_Info <- IMDb(IMDB_ID)
103 |
104 |   # Save MPAA rating
105 |   MPAA_Rating <- IMDB_Info$MPAA_Rating
106 |
107 |   # Save genre(s)
108 |   Genre <- IMDB_Info$Genre
109 |
110 |   # Save director(s)
111 |   Director <- IMDB_Info$Director
112 |
113 |   # Save duration in minutes
114 |   Length_Minutes <- IMDB_Info$Length_Minutes
115 |
116 |   # Save IMDb rating
117 |   IMDB_rating <- IMDB_Info$IMDB_rating
118 |
119 |   # Save full cast
120 |   Full_Cast <- IMDB_Info$Full_Cast
121 |
122 |   # Delete IMDb info
123 |   IMDB_Info <- NULL
124 | })
125 |
126 | write.csv(data, file = "movies-R-full.csv")
127 |
128 |
129 |

--------------------------------------------------------------------------------
/Deadliest movies scrape/code/imdb-scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2014 Randal S. Olson
3 |
4 | This file is a script that combines data from www.MovieBodyCounts.com and IMDB.com to
5 | create a list of films, metadata about the films, and the number of on-screen body
6 | counts in the films. The script requires an internet connection and two libraries
7 | installed: imdbpy and pandas.
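(Assuming the packaging tools of the era, both can typically be installed
with "pip install imdbpy pandas"; the exact PyPI package names are assumed.)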
8 | 9 | 10 | This script is free software: you can redistribute it and/or modify it under the 11 | terms of the GNU General Public License as published by the Free Software Foundation, 12 | either version 3 of the License, or (at your option) any later version. 13 | 14 | This script is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 15 | without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 16 | See the GNU General Public License for more details. 17 | 18 | You should have received a copy of the GNU General Public License along with this script. 19 | If not, see http://www.gnu.org/licenses/. 20 | """ 21 | 22 | from imdb import IMDb 23 | import pandas as pd 24 | import re 25 | 26 | imdb_access = IMDb() 27 | movie_data = pd.read_csv("movies.csv") 28 | 29 | # Grab only the movie number out of the IMDB URL 30 | movie_data["Movie_Number"] = movie_data["IMDB_URL"].apply(lambda x: re.sub("[^0-9]", "", x)) 31 | 32 | with open("film-death-counts-Python.csv", "wb") as out_file: 33 | out_file.write("Film,Year,Body_Count,MPAA_Rating,Genre,Director,Actors,Length_Minutes,IMDB_Rating\n") 34 | 35 | for movie_entry in movie_data.iterrows(): 36 | # Use a try-catch on the loop to prevent temporary connection-related issues from stopping the scrape 37 | try: 38 | movie = imdb_access.get_movie(movie_entry[1]["Movie_Number"]) 39 | movie_fields = [] 40 | 41 | # Remove non-ASCII character encodings and commas from movie titles 42 | movie_fields.append(movie["title"].encode("ascii", "replace").replace(",", "")) 43 | movie_fields.append(str(movie["year"])) 44 | movie_fields.append(str(movie_entry[1]["Body_Count"])) 45 | 46 | # Some movies don't have MPAA Ratings on IMDB 47 | try: 48 | movie_fields.append(str(movie["mpaa"].split(" ")[1])) 49 | except: 50 | movie_fields.append("") 51 | 52 | # For movies with multiple genres/directors/actors, join them with bars | 53 | movie_fields.append(str("|".join(movie["genres"]))) 54 | movie_fields.append(str("|".join([str(x) for x in movie["director"]]))) 55 | movie_fields.append(str("|".join([str(x) for x in movie["cast"]]))) 56 | 57 | movie_fields.append(str(int(movie["runtime"][0].split(":")[-1]))) 58 | movie_fields.append(str(float(movie["rating"]))) 59 | 60 | # All entries are comma-delimited 61 | out_file.write(",".join(movie_fields) + "\n") 62 | 63 | except Exception as e: 64 | print "Error with", str(movie) 65 | -------------------------------------------------------------------------------- /Deadliest movies scrape/code/movie-scraper.R: -------------------------------------------------------------------------------- 1 | #' Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier) 2 | #' 3 | #' This script is free software: you can redistribute it and/or modify it under 4 | #' the terms of the GNU General Public License as published by the Free Software 5 | #' Foundation, either version 3 of the License, or (at your option) any later 6 | #' version. 7 | #' 8 | #' This script is distributed in the hope that it will be useful, but WITHOUT 9 | #' ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 10 | #' FOR A PARTICULAR PURPOSE. 11 | #' 12 | #' See the GNU General Public License for more details. 13 | #' 14 | #' You should have received a copy of the GNU General Public License along with 15 | #' this script. If not, see http://www.gnu.org/licenses/. 
16 | #'
17 |
18 | #' **Document title:** R vs Python - Round 2
19 | #'
20 | #' **Date:** January 12, 2014
21 | #'
22 | #' **Author:** Simon Garnier (http://www.theswarmlab.com / @sjmgarnier)
23 | #'
24 | #' **Description:** This script scrapes data out of 2 websites
25 | #' (www.MovieBodyCounts.com and www.imdb.com). For more information, see
26 | #' http://www.theswarmlab.com/r-vs-python-round-2/
27 | #'
28 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)).
29 | #'
30 |
31 | # Load libraries
32 | library(RCurl)   # Everything necessary to grab webpage
33 | library(XML)     # Everything necessary to parse HTML code
34 | library(pbapply) # Progress bars!!!
35 |
36 | # Create curl handle which can be used for multiple HTTP requests.
37 | # followlocation = TRUE in case one of the URLs we want to grab is a redirection
38 | # link.
39 | curl <- getCurlHandle(useragent = "R", followlocation = TRUE)
40 |
41 | # Prepare URLs of the movie lists alphabetically ordered by first letter of
42 | # movie title (capital A to Z, except for v and x) + "numbers" list (for movies
43 | # whose title starts with a number)
44 | urls.by.letter <- paste0("http://www.moviebodycounts.com/movies-",
45 |                          c("numbers", LETTERS[1:21], "v", "W", "x", "Y", "Z"), ".htm")
46 |
47 | # For each movie list... For loops are frowned upon in R, let's use the classier
48 | # apply functions instead. Here I use the pblapply from the pbapply package.
49 | # It's equivalent to the regular lapply function, but it provides a neat
50 | # progress bar. Unlist to get a vector.
51 | urls.by.movie <- unlist(pblapply(urls.by.letter, FUN = function(URL) {
52 |   # Load raw HTML
53 |   raw.html <- getURL(URL, curl = curl)
54 |
55 |   # Parse HTML content
56 |   parsed.html <- htmlParse(raw.html)
57 |
58 |   # Extract desired links from HTML content. The desired links are those after
59 |   # image 'graphic-movies.jpg' in the page
60 |   links <- as.vector(xpathSApply(parsed.html, "//img[@src='graphic-movies.jpg']/following::a/@href"))
61 |
62 |   if (!is.null(links)) {
63 |     ix = grepl("http://www.moviebodycounts.com/", links)
64 |     links[!ix] <- paste0("http://www.moviebodycounts.com/", links[!ix])
65 |     return(links)
66 |   }
67 | }), use.names = FALSE)
68 |
69 | # One URL is actually a shortcut to another page. Let's get rid of it.
70 | ix <- which(grepl("movies-C.htm", urls.by.movie))
71 | urls.by.movie <- urls.by.movie[-ix]
72 |
73 | # Ok, let's get serious now
74 |
75 | data <- do.call(rbind, pblapply(urls.by.movie, FUN = function(URL) {
76 |   # Load raw HTML
77 |   raw.html <- getURL(URL, curl = curl)
78 |
79 |   # Parse HTML content
80 |   parsed.html <- htmlParse(raw.html)
81 |
82 |   # Find movie title
83 |   # Title appears inside an XML/HTML node called "title" ("//title"). In this
84 |   # node, it comes after "Movie Body Counts: ". I use gsub to get rid of "Movie
85 |   # Body Counts: " and keep only the movie title.
86 |   Film <- xpathSApply(parsed.html, "//title", xmlValue)
87 |   Film <- gsub("Movie Body Counts: ", "", Film)
88 |
89 |   # Find movie year
90 |   # The year is usually a text inside ("/descendant::text()") a link node
91 |   # ("//a") which source contains the string "charts-year" ("[contains(@href,
92 |   # 'charts-year')]").
93 |   Year <- as.numeric(xpathSApply(parsed.html, "//a[contains(@href, 'charts-year')]/descendant::text()", xmlValue))
94 |
95 |   # Find IMDB link
96 |   # The IMDB link is inside a link node ("//a") which source contains "imdb"
97 |   # ("/@href[contains(.,'imdb')]")
98 |   IMDB_URL <- as.vector(xpathSApply(parsed.html, "//a/@href[contains(.,'imdb')]"))[1]
99 |
100 |   # Note: We select the first element of the vector because for at least one of
101 |   # the movies, this command returns two links.
102 |
103 |   # Find kill count.
104 |   # Kill count is contained in the first non-empty text node
105 |   # ("/following::text()[normalize-space()]") after the image which source file
106 |   # is called "graphic-bc.jpg" ("//img[@src='graphic-bc.jpg']")
107 |   Body_Count <- xpathSApply(parsed.html, "//img[@src='graphic-bc.jpg']/following::text()[normalize-space()]", xmlValue)[1]
108 |
109 |   # Now we need to clean up the text node that we just extracted because there
110 |   # are lots of inconsistencies in the way the kill counts are displayed across
111 |   # all movie pages. For instance, counts are sometimes accompanied by text, not
112 |   # always the same, and sometimes there is no text at all. Sometimes the total
113 |   # count is split in two numbers (e.g., number of dead humans and number of
114 |   # dead aliens). And sometimes the total count is displayed and accompanied by
115 |   # a split count in parentheses. First, let's remove everything that is
116 |   # written in parentheses or that is not a number.
117 |   # Using gsub, remove everything in parentheses and all non-number characters
118 |   Body_Count <- gsub("\\(.*?\\)", " ", Body_Count)
119 |   Body_Count <- gsub("[^0-9]+", " ", Body_Count)
120 |
121 |   # In case the total count has been split, we want to separate these numbers
122 |   # from each other so that we can add them up later. Using strsplit, split the
123 |   # character string at spaces
124 |   Body_Count <- unlist(strsplit(Body_Count, " "))
125 |
126 |   # For now, we have extracted characters. Transform them into numbers.
127 |   Body_Count <- as.numeric(Body_Count)
128 |
129 |   # Sum up the numbers (in case they have been split into separate categories).
130 |   Body_Count <- sum(Body_Count, na.rm = TRUE)
131 |
132 |   return(data.frame(IMDB_URL, Film, Year, Body_Count))
133 | }))
134 |
135 | # Save scraped data in a .csv file for future use
136 | write.csv(data, "movies-R.csv", row.names = FALSE)
137 |
138 |
139 |
140 |
141 |
142 |

--------------------------------------------------------------------------------
/Deadliest movies scrape/code/movie-scraper.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2014 Randal S. Olson
3 |
4 | This file is a script that scrapes on-screen body counts for various movies on
5 | www.MovieBodyCounts.com. The script requires an internet connection and two libraries
6 | installed: urllib2 and html2text.
7 |
8 | Due to inconsistent formatting of the HTML on www.MovieBodyCounts.com, the script will
9 | not scrape everything perfectly. As such, the resulting output file *will* require some
10 | cleanup afterwards. The manual cleanup will take less time than finding an elegant
11 | solution to perfectly scrape the page.
12 |
13 |
14 | This script is free software: you can redistribute it and/or modify it under the
15 | terms of the GNU General Public License as published by the Free Software Foundation,
16 | either version 3 of the License, or (at your option) any later version.
17 | 18 | This script is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 19 | without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 20 | See the GNU General Public License for more details. 21 | 22 | You should have received a copy of the GNU General Public License along with this script. 23 | If not, see http://www.gnu.org/licenses/. 24 | """ 25 | 26 | # String parsing libraries 27 | import string 28 | import re 29 | 30 | # urllib2 reads web pages if you provide it an URL 31 | import urllib2 32 | 33 | # html2text converts HTML to Markdown, which is much easier to parse 34 | from html2text import html2text 35 | 36 | # Generate a list of all letters for the Movie pages (+ a "numbers" page) 37 | # MovieBodyCount's actor pages are all with capital letters EXCEPT v and x 38 | letters = ["numbers"] + list(string.letters[26:52].upper().replace("V", "v").replace("X", "x")) 39 | 40 | list_of_films = [] 41 | 42 | # Go through each movie list page and gather all of the movie web page URLs 43 | for letter in letters: 44 | try: 45 | # Read the raw HTML from the web page 46 | page_text = urllib2.urlopen("http://www.moviebodycounts.com/movies-" + letter + ".htm").read() 47 | 48 | # Convert the raw HTML into Markdown 49 | page_text = html2text(page_text).split("\n") 50 | 51 | # Search through the web page for movie page entries 52 | for line in page_text: 53 | # We know it's a movie page entry when it has ".htm" in it, but not ".jpg", "contact.htm", and "movies.htm" 54 | # .jpg means it's a line with an image -- none of the movie entries have an image 55 | # contact.htm and movies.htm means it's a link to the Contact or Movies page -- not what we want 56 | # movies- means it's a redirect link to another page -- just skip over it 57 | if ".htm" in line and ".jpg" not in line and "contact.htm" not in line and "movies.htm" not in line and "movies-" not in line: 58 | #print line 59 | # The URL is in between parentheses (), so we can simply split the string on those 60 | # Some URLs are full URLs, e.g. www.moviebodycounts.com/movie_name.html, so splitting on the / gives us only the page name 61 | list_of_films.append(line.split("(")[-1].strip(")").split("/")[-1]) 62 | 63 | # If the movie list page doesn't exist, keep going 64 | except: 65 | print "\nerror with " + letter + "\n" 66 | 67 | # Now that we have every movie web page URL, go through each movie page and extract the movie name, kill counts, etc. 68 | out_file = open("film-death-counts.csv", "wb") 69 | out_file.write("Film,Year,Kill_Count,IMDB_url\n") 70 | 71 | for film_page in list_of_films: 72 | try: 73 | # The information we're looking for on the page: 74 | film = "" 75 | kills = "" 76 | year = "" 77 | IMDB_url = "" 78 | 79 | # A flag indicating that we've found the film title on the page 80 | found_title = False 81 | 82 | # Read the page's raw HTML and convert it to Markdown (again) and go through each line 83 | for line in html2text(urllib2.urlopen("http://www.moviebodycounts.com/" + film_page).read()).split("\n"): 84 | 85 | # If we haven't found the title yet, these markers tell us we've found the movie title 86 | if not found_title and "!" 
not in line and "(" not in line and "[" not in line and line.strip() != "": 87 | film = line.replace(",", "").strip(":") 88 | found_title = True 89 | 90 | # The kill counts are usually on a line with "Film:" 91 | if "film:" in line.lower() or "kills:" in line.lower() or "count:" in line.lower(): 92 | kills = re.sub("[^0-9]", "", line.split(":")[1].split("(")[0]) 93 | 94 | # The year is usually on a line with "charts-year" 95 | if "charts-year" in line: 96 | year = line.split("[")[1].split("]")[0] 97 | 98 | # The IMDB url is on a line with "[imdb]" 99 | if "[imdb]" in line.lower(): 100 | IMDB_url = line.lower().split("[imdb](")[1].split(")")[0] 101 | 102 | out_file.write(film + "," + year + "," + kills + "," + IMDB_url + "\n") 103 | 104 | # If a movie page fails to open, print out the error and move on to the next movie 105 | except Exception as e: 106 | print film_page 107 | print e 108 | 109 | out_file.close() 110 | -------------------------------------------------------------------------------- /Deadliest movies scrape/custom.css: -------------------------------------------------------------------------------- 1 | body { 2 | font: 14px/1.5em "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | color: #777; 4 | -webkit-font-smoothing: antialiased; /* Fix for webkit rendering */ 5 | -webkit-text-size-adjust: 100%; 6 | /*font-family: "Avenir Next", Helvetica, Arial, sans-serif;*/ 7 | padding:1em; 8 | margin:auto; 9 | max-width:10in; 10 | } 11 | 12 | h1, h2, h3, h4, h5, h6 { 13 | 14 | font-weight: normal; } 15 | h1 a, h2 a, h3 a, h4 a, h5 a, h6 a { font-weight: inherit; } 16 | h1 { font-size: 46px; line-height: 50px; margin-bottom: 14px;} 17 | h2 { font-size: 35px; line-height: 40px; margin-bottom: 10px; } 18 | h3 { font-size: 28px; line-height: 34px; margin-bottom: 8px; } 19 | h4 { font-size: 21px; line-height: 30px; margin-bottom: 4px; } 20 | h5 { font-size: 17px; line-height: 24px; } 21 | h6 { font-size: 14px; line-height: 21px; } 22 | .subheader { color: #777; } 23 | 24 | p { margin: 0 0 20px 0; } 25 | p img { margin: 0; } 26 | p.lead { font-size: 21px; line-height: 27px; color: #444; } 27 | 28 | em { font-style: italic; } 29 | strong { font-weight: bold; } 30 | small { font-size: 80%; } 31 | 32 | hr { 33 | height: 0.2em; 34 | border: 0; 35 | color: #CCCCCC; 36 | background-color: #CCCCCC; 37 | } 38 | 39 | p, blockquote, ul, ol, dl, li, table, pre { 40 | margin: 15px 0; 41 | text-align: justify; 42 | } 43 | 44 | a, a:visited { color: #333; text-decoration: underline; outline: 0; } 45 | a:hover, a:focus { color: #000; } 46 | p a, p a:visited { line-height: inherit; } 47 | 48 | #message { 49 | border-radius: 6px; 50 | border: 1px solid #ccc; 51 | display:block; 52 | width:100%; 53 | height:60px; 54 | margin:6px 0px; 55 | } 56 | 57 | button, #ws { 58 | font-size: 10pt; 59 | padding: 4px 6px; 60 | border-radius: 5px; 61 | border: 1px solid #bbb; 62 | background-color: #eee; 63 | } 64 | 65 | code, pre, #ws, #message { 66 | font-family: Monaco; 67 | font-size: 8pt; 68 | border-radius: 3px; 69 | background-color: #F8F8F8; 70 | color: inherit; 71 | } 72 | 73 | code { 74 | border: 1px solid #EAEAEA; 75 | margin: 0 2px; 76 | padding: 0 5px; 77 | } 78 | 79 | pre.r { 80 | border: 2px solid #8A0606; 81 | } 82 | 83 | pre.r:before { 84 | content: 'R code \A'; 85 | color: #8A0606; 86 | font-weight: bold; 87 | } 88 | 89 | pre.python { 90 | border: 2px solid #068A06; 91 | } 92 | 93 | pre.python:before { 94 | content: 'Python code \A'; 95 | color: #068A06; 96 | font-weight: bold; 97 | } 98 
|
99 | img {
100 |   max-width: 100%;
101 |   height: auto;
102 |   width: auto\9; /* ie8 */
103 | }
104 |
105 | pre {
106 |   border: 1px solid #CCCCCC;
107 |   overflow: auto;
108 |   padding: 4px 8px;
109 | }
110 |
111 | pre > code {
112 |   border: 0;
113 |   margin: 0;
114 |   padding: 0;
115 | }
116 |
117 | blockquote, blockquote p { font-size: 12px; line-height: 24px; color: #000; font-style: italic; }
118 | blockquote { margin: 0 0 20px; padding: 9px 50px 0 49px; border-left: 1px solid #ddd; }
119 | blockquote cite { display: block; font-size: 12px; color: #555; }
120 | blockquote cite:before { content: "\2014 \0020"; }
121 | blockquote cite a, blockquote cite a:visited, blockquote cite a:visited { color: #555; }
122 |
123 | #ws { background-color: #f8f8f8; }
124 |
125 | .send { color:#77bb77; }
126 | .server { color:#7799bb; }
127 | .error { color:#AA0000; }

--------------------------------------------------------------------------------
/Deadliest movies scrape/notebook.R:
--------------------------------------------------------------------------------
1 | #+ licence, echo=FALSE
2 | # Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier)
3 | #
4 | # This script is free software: you can redistribute it and/or modify it under
5 | # the terms of the GNU General Public License as published by the Free Software
6 | # Foundation, either version 3 of the License, or (at your option) any later
7 | # version.
8 | #
9 | # This script is distributed in the hope that it will be useful, but WITHOUT ANY
10 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11 | # A PARTICULAR PURPOSE.
12 | #
13 | # See the GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License along with
16 | # this script. If not, see http://www.gnu.org/licenses/.
17 | #
18 | # You can generate the HTML files by running:
19 | # library(knitr)
20 | # spin("notebook.R")
21 | # pandoc("notebook.md", config = "pandoc_config.txt")
22 |
23 |
24 | #+
25 | #' **Document title:** R vs Python - Round 2 (1/2)
26 | #'
27 | #' **Date:** January 12, 2014
28 | #'
29 | #' **Text by:** Simon Garnier ([www.theswarmlab.com](http://www.theswarmlab.com)
30 | #' / [\@sjmgarnier](http://twitter.com/sjmgarnier))
31 | #'
32 | #' **R code by:** Simon Garnier
33 | #' ([www.theswarmlab.com](http://www.theswarmlab.com) /
34 | #' [\@sjmgarnier](http://twitter.com/sjmgarnier))
35 | #'
36 | #' **Python code by:** Randy Olson
37 | #' ([www.randalolson.com](http://www.randalolson.com) /
38 | #' [\@randal_olson](http://twitter.com/randal_olson))
39 | #'
40 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)),
41 | #' knitr ([www.yihui.name/knitr/](http://yihui.name/knitr/)) and pandoc
42 | #' ([www.johnmacfarlane.net/pandoc/](http://johnmacfarlane.net/pandoc/)). Python
43 | #' figures generated with iPython Notebook
44 | #' ([www.ipython.org/notebook.html](http://ipython.org/notebook.html)).
45 | #'
46 |
47 |
48 | #' ___
49 | #'
50 | #' #### Foreword ####
51 | #'
52 | #' My friend Randy Olson and I got into the habit of arguing about the relative
53 | #' qualities of our favorite languages for data analysis and visualization. I am
54 | #' an enthusiastic R user ([www.r-project.org](http://www.r-project.org)) while
55 | #' Randy is a fan of Python ([www.python.org](http://www.python.org)). One thing
56 | #' we agree on, however, is that our discussions are meaningless unless we
57 | #' actually put R and Python to a series of tests to showcase their relative
58 | #' strengths and weaknesses. Essentially we will set a common goal (*e.g.*,
59 | #' perform a particular type of data analysis or draw a particular type of
60 | #' graph) and create the R and Python code to achieve this goal. And since
61 | #' Randy and I are all about sharing, open source and open access, we decided to
62 | #' make public the results of our friendly challenges so that you can help us
63 | #' decide between R and Python and, hopefully, also learn something along the
64 | #' way.
65 | #'
66 |
67 |
68 | #' ___
69 | #'
70 | #' #### Today's challenge: a data thief manual for honest scientists (Part 1 of 2) ####
71 | #'
72 | #' ##### 1 - Introduction #####
73 | #'
74 | #' Last week we started our challenge series with a rather simple task: plot a
75 | #' pretty barchart from some data collected by Randy for his recent post on the
76 | #' ["Top 25 most violence packed films" in the history of the movie
77 | #' industry](www.randalolson.com/2013/12/31/most-violence-packed-films/). Today
78 | #' we will try to up our game a little bit with a more complex task. We will
79 | #' show you how you can collect the data that Randy used for his post directly
80 | #' from the website they originate from
81 | #' ([www.MovieBodyCounts.com](http://www.moviebodycounts.com)). This is called
82 | #' data scraping, or the art of taking advantage of the gigantic database that
83 | #' is the Internet.
84 | #'
85 | #' The basic principle behind the scraping of website data is simple: a website
86 | #' is like a database, and each page of the website is like a table of this
87 | #' database. All we want is to find in the database the tables that contain
88 | #' information that we would like to acquire, and then extract this information
89 | #' from within these relevant tables. This task can be relatively easy if all
90 | #' the pages of a website have a similar structure (*i.e.*, if the database is
91 | #' clean and well maintained). In this ideal situation, all we have to do is
92 | #' identify one or more stable markers that delimit the desired information and
93 | #' use them to tell R or Python what to save in memory. Unfortunately not all
94 | #' websites have a similar structure across all of their pages and it can
95 | #' quickly become a nightmare to identify such markers. Worse, sometimes you
96 | #' will have to resign yourself to scrape or correct part or all of the data
97 | #' manually.
98 | #'
99 | #' For this challenge, we will attempt to recover the following pieces of
100 | #' information for each movie listed on
101 | #' [www.MovieBodyCounts.com](http://www.moviebodycounts.com): title, release
102 | #' year, count of on-screen deaths and link to the movie page on
103 | #' [www.imdb.com](http://www.imdb.com) (this will help us for part 2 of this
104 | #' challenge next week). We will detail the different steps of the process and
105 | #' provide for each step the corresponding code (red boxes for R, green boxes
106 | #' for Python). You will also find the entire code at the end of this document.
107 | #'
108 |
109 |
110 | #' ##### 2 - Step by step process #####
111 | #'
112 | #' First things first, let's set up our working environment by loading some
113 | #' necessary libraries.
114 | #'
115 |
116 | #+ libR, eval=FALSE, message=FALSE
117 | # Load libraries
118 | library(RCurl)   # Everything necessary to grab webpages on the Web
119 | library(XML)     # Everything necessary to parse XML and HTML code
120 | library(pbapply) # Progress bars!!! Just because why not :-)
121 |
122 | # Create curl handle which can be used for multiple HTTP requests.
123 | # followlocation = TRUE in case one of the URLs we want to grab is a redirection
124 | # link.
125 | curl <- getCurlHandle(useragent = "R", followlocation = TRUE)
126 |
127 | #+ libPy, eval=FALSE, engine="python"
128 | # String parsing libraries
129 | import string
130 | import re
131 |
132 | # urllib2 reads web pages if you provide it an URL
133 | import urllib2
134 |
135 | # html2text converts HTML to Markdown, which is much easier to parse
136 | from html2text import html2text
137 |
138 | #' Now a word about the organization of
139 | #' [www.MovieBodyCounts.com](http://www.moviebodycounts.com). To be perfectly
140 | #' honest, it is a bit messy :-) Movies are organized in a series of
141 | #' alphabetically ordered lists (by the first letter of each movie's title),
142 | #' each letter having its own page
143 | #' (http://www.moviebodycounts.com/movies-[A-Z].htm). There is also a list for
144 | #' movies whose title starts with a number
145 | #' (http://www.moviebodycounts.com/movies-numbers.htm). Finally, all category
146 | #' letters are capitalized in the lists' URLs, except for letters v and x.
147 | #' Annoying, right? This is just one of the many little problems one can
148 | #' encounter when dealing with messy databases :-)
149 | #'
150 | #' With all this information in mind, our first task is to create a list of all
151 | #' these lists.
152 | #'
153 |
154 | #+ listURLsR, eval=FALSE
155 | # Prepare URLs of the movie lists alphabetically ordered by first letter of
156 | # movie title (capital A to Z, except for v and x) + "numbers" list (for movies
157 | # whose title starts with a number)
158 | urls.by.letter <- paste0("http://www.moviebodycounts.com/movies-",
159 |                          c("numbers", LETTERS[1:21], "v", "W", "x", "Y", "Z"), ".htm")
160 |
161 | #+ listURLsPy, eval=FALSE, engine="python"
162 | # Generate a list of all letters for the Movie pages (+ a "numbers" page)
163 | # MovieBodyCount's actor pages are all with capital letters EXCEPT v and x
164 | letters = ["numbers"] + list(string.letters[26:52].upper().replace("V", "v").replace("X", "x"))
165 |
166 | #' Our next task is to go through the HTML code of all these lists and gather
167 | #' the URLs of all the movie webpages. This is where the data scraping really
168 | #' starts.
169 | #'
170 | #' As you will quickly notice by reading the following code, Randy and I have
171 | #' decided to use a different approach to identify and collect the desired URLs
172 | #' (and all of the data in the rest of this challenge). I have decided to rely
173 | #' on the [XML Path Language (XPath)](http://www.w3schools.com/xpath/), a
174 | #' language that makes it easy to navigate through elements and attributes in an
175 | #' XML/HTML document. Randy has decided to use an approach based on more
176 | #' "classical" string parsing and manipulation functions. Note that these are
177 | #' just personal preferences. XPath interpreters are also available in Python,
178 | #' and R is fully equipped for manipulating character strings.
179 | #'
180 | #' For each movie list, we will...
181 | #'
182 |
183 | #+ loop1R, eval=FALSE
184 | # For each movie list...
For loops are frowned upon in R, let's use the classier 185 | # apply functions instead. Here I use the pblapply from the pbapply package. 186 | # It's equivalent to the regular lapply function, but it provides a neat 187 | # progress bar. Unlist to get a vector. 188 | urls.by.movie <- unlist(pblapply(urls.by.letter, FUN = function(URL) { 189 | 190 | #+ loop1Py, eval=FALSE, engine="python" 191 | list_of_films = [] 192 | 193 | # Go through each movie list page and gather all of the movie web page URLs 194 | for letter in letters: 195 | try: 196 | 197 | #' ...download the raw HTML content of the webpage,... 198 | #' 199 | 200 | #+ readRaw1R, eval=FALSE 201 | # Load raw HTML 202 | raw.html <- getURL(URL, curl = curl) 203 | 204 | #+ readRaw1Py, eval=FALSE, engine="python" 205 | # Read the raw HTML from the web page 206 | page_text = urllib2.urlopen("http://www.moviebodycounts.com/movies-" + letter + ".htm").read() 207 | 208 | #' ...transform raw HTML into a more convenient format to work with,... 209 | #' 210 | 211 | #+ parse1R, eval=FALSE 212 | # Parse HTML content 213 | parsed.html <- htmlParse(raw.html) 214 | 215 | #+ parse1Py, eval=FALSE, engine="python" 216 | # Convert the raw HTML into Markdown 217 | page_text = html2text(page_text).split("\n") 218 | 219 | #' ...find movie page entry, store the URL for later use and close the loop. 220 | #' 221 | 222 | #+ movieLinkR, eval=FALSE 223 | # Extract desired links from HTML content using XPath. 224 | # The desired links are all the URLs ("a/@href") directly following 225 | # ("/following::") the image which source file is called "graphic-movies.jpg" 226 | # ("//img[@src='graphic-movies.jpg']"). 227 | links <- as.vector(xpathSApply(parsed.html, "//img[@src='graphic-movies.jpg']/following::a/@href")) 228 | 229 | # Most links are relative URLs. Add root of the website to make them absolute. 230 | if (!is.null(links)) { 231 | ix = grepl("http://www.moviebodycounts.com/", links) # Find relative URLs 232 | links[!ix] <- paste0("http://www.moviebodycounts.com/", links[!ix]) # Add root of website to make URLs absolute 233 | return(links) 234 | } 235 | }), use.names = FALSE) # close the loop 236 | 237 | # One URL is actually just a symbolic link to another page. Let's get rid of it. 238 | ix <- which(grepl("movies-C.htm", urls.by.movie)) 239 | urls.by.movie <- urls.by.movie[-ix] 240 | 241 | #+ movieLinkPy, eval=FALSE, engine="python" 242 | # Search through the web page for movie page entries 243 | for line in page_text: 244 | # We know it's a movie page entry when it has ".htm" in it, but not ".jpg", "contact.htm", and "movies.htm" 245 | # .jpg means it's a line with an image -- none of the movie entries have an image 246 | # contact.htm and movies.htm means it's a link to the Contact or Movies page -- not what we want 247 | # movies- means it's a redirect link to another page -- just skip over it 248 | if ".htm" in line and ".jpg" not in line and "contact.htm" not in line and "movies.htm" not in line and "movies-" not in line: 249 | #print line 250 | # The URL is in between parentheses (), so we can simply split the string on those 251 | # Some URLs are full URLs, e.g. 
www.moviebodycounts.com/movie_name.html, so splitting on the / gives us only the page name
252 |             list_of_films.append(line.split("(")[-1].strip(")").split("/")[-1])
253 |
254 |     # If the movie list page doesn't exist, keep going
255 |     except:
256 |         print "\nerror with " + letter + "\n"
257 |
258 | #' Now that we know where to find each movie, we can start the hard part of this
259 | #' challenge. We will go through each movie webpage and attempt to find its
260 | #' title, release year, count of on-screen deaths and link to its page on
261 | #' [www.imdb.com](http://www.imdb.com). We will save all this information in a
262 | #' .csv file.
263 | #'
264 | #' For each movie, we will...
265 |
266 | #+ loop2R, eval=FALSE
267 | # For each movie...
268 | # do.call(rbind, ...) to reorganize the results in a nice data frame
269 | data <- do.call(rbind, pblapply(urls.by.movie, FUN = function(URL) {
270 |
271 | #+ loop2Py, eval=FALSE, engine="python"
272 | # Now that we have every movie web page URL, go through each movie page and
273 | # extract the movie name, kill counts, etc.
274 | out_file = open("film-death-counts.csv", "wb")
275 | out_file.write("Film,Year,Kill_Count,IMDB_url\n")
276 |
277 | for film_page in list_of_films:
278 |     try:
279 |         # The information we're looking for on the page:
280 |         film = ""
281 |         kills = ""
282 |         year = ""
283 |         IMDB_url = ""
284 |
285 |         # A flag indicating that we've found the film title on the page
286 |         found_title = False
287 |
288 | #' ...download the raw HTML content of the webpage and transform raw HTML into a
289 | #' more convenient format to work with,...
290 | #'
291 |
292 | #+ readRaw2R, eval=FALSE
293 | # Load raw HTML
294 | raw.html <- getURL(URL, curl = curl)
295 |
296 | # Parse HTML content
297 | parsed.html <- htmlParse(raw.html)
298 |
299 | #+ readRaw2Py, eval=FALSE, engine="python"
300 |         # Read the page's raw HTML and convert it to Markdown (again) and go
301 |         # through each line
302 |         for line in html2text(urllib2.urlopen("http://www.moviebodycounts.com/" + film_page).read()).split("\n"):
303 |
304 | #' ...attempt to find movie title,...
305 |
306 | #+ titleR, eval=FALSE
307 | # Find movie title
308 | # Title appears inside an XML/HTML node called "title" ("//title"). In this
309 | # node, it comes after "Movie Body Counts: ". I use gsub to get rid of "Movie
310 | # Body Counts: " and keep only the movie title.
311 | Film <- xpathSApply(parsed.html, "//title", xmlValue)
312 | Film <- gsub("Movie Body Counts: ", "", Film)
313 |
314 | #+ titlePy, eval=FALSE, engine="python"
315 |             # If we haven't found the title yet, these markers tell us we've found the movie
316 |             # title
317 |             if not found_title and "!" not in line and "(" not in line and "[" not in line and line.strip() != "":
318 |                 film = line.replace(",", "").strip(":")
319 |                 found_title = True
320 |
321 | #' ...attempt to find movie year,...
322 |
323 | #+ yearR, eval=FALSE
324 | # Find movie year
325 | # The year is usually a text inside ("/descendant::text()") a link node
326 | # ("//a") which source contains the string "charts-year" ("[contains(@href,
327 | # 'charts-year')]").
328 | Year <- as.numeric(xpathSApply(parsed.html, "//a[contains(@href, 'charts-year')]/descendant::text()", xmlValue))
329 |
330 | #+ yearPy, eval=FALSE, engine="python"
331 |             # The year is usually on a line with "charts-year"
332 |             if "charts-year" in line:
333 |                 year = line.split("[")[1].split("]")[0]
334 |
335 | #' ...attempt to find link to movie on IMDB,...
336 |
337 | #+ imdbR, eval=FALSE
338 | # Find IMDB link
339 | # The IMDB link is inside a link node ("//a") which source contains "imdb"
340 | # ("/@href[contains(.,'imdb')]")
341 | IMDB_URL <- as.vector(xpathSApply(parsed.html, "//a/@href[contains(.,'imdb')]"))[1]
342 |
343 | # Note: We select the first element of the vector because for at least one of
344 | # the movies, this command returns two links.
345 |
346 | #+ imdbPy, eval=FALSE, engine="python"
347 |             # The IMDB url is on a line with "[imdb]"
348 |             if "[imdb]" in line.lower():
349 |                 IMDB_url = line.lower().split("[imdb](")[1].split(")")[0]
350 |
351 | #' ... and finally attempt to find the on-screen kill count. Here, Randy chose
352 | #' an approach that minimizes his coding effort, but that will potentially force
353 | #' him to make several manual corrections a posteriori. I chose to find a
354 | #' solution that works with minimal to no manual corrections, but that requires
355 | #' an extra coding effort. Whichever approach is best depends mostly on the size
356 | #' of the data you want to scrape and the time you have to do it.
357 |
358 | #+killsR, eval=FALSE
359 | # Find kill count.
360 | # Kill count is contained in the first non-empty text node
361 | # ("/following::text()[normalize-space()]") after the image which source file
362 | # is called "graphic-bc.jpg" ("//img[@src='graphic-bc.jpg']")
363 | Body_Count <- xpathSApply(parsed.html, "//img[@src='graphic-bc.jpg']/following::text()[normalize-space()]", xmlValue)[1]
364 |
365 | # Now we need to clean up the text node that we just extracted because there
366 | # are lots of inconsistencies in the way the kill counts are displayed across
367 | # all movie pages. For instance, counts are sometimes accompanied by text, not
368 | # always the same, and sometimes there is no text at all. Sometimes the total
369 | # count is split in two numbers (e.g., number of dead humans and number of
370 | # dead aliens). And sometimes the total count is displayed and accompanied by
371 | # a split count in parentheses. First, let's remove everything that is
372 | # written in parentheses or that is not a number.
373 | # Using gsub, remove everything in parentheses and all non-number characters
374 | Body_Count <- gsub("\\(.*?\\)", " ", Body_Count)
375 | Body_Count <- gsub("[^0-9]+", " ", Body_Count)
376 |
377 | # In case the total count has been split, we want to separate these numbers
378 | # from each other so that we can add them up later. Using strsplit, split the
379 | # character string at spaces
380 | Body_Count <- unlist(strsplit(Body_Count, " "))
381 |
382 | # For now, we have extracted characters. Transform them into numbers.
383 | Body_Count <- as.numeric(Body_Count)
384 |
385 | # Sum up the numbers (in case they have been split into separate categories).
386 | Body_Count <- sum(Body_Count, na.rm = TRUE)
387 |
388 | #+ killsPy, eval=FALSE, engine="python"
389 |             # The kill counts are usually on a line with "Film:"
390 |             if "film:" in line.lower() or "kills:" in line.lower() or "count:" in line.lower():
391 |                 kills = re.sub("[^0-9]", "", line.split(":")[1].split("(")[0])
392 |
393 | #' Almost done! Now we just need to close the loop and write the data frame into
394 | #' a .csv file
395 |
396 | #+ saveR, eval=FALSE
397 | # Return scraped data into a data frame form
398 | return(data.frame(IMDB_URL, Film, Year, Body_Count))
399 | }))
400 |
401 | # Save scraped data in a .csv file for future use
402 | write.csv(data, "movies-R.csv", row.names = FALSE)
403 |
404 | #+ savePy, eval=FALSE, engine="python"
405 |         out_file.write(film + "," + year + "," + kills + "," + IMDB_url + "\n")
406 |
407 |     # If a movie page fails to open, print out the error and move on to the next movie
408 |     except Exception as e:
409 |         print film_page
410 |         print e
411 |
412 | out_file.close()
413 |
414 | #' And voilà! You should now have a .csv file somewhere on your computer
415 | #' containing all the information we just scraped from the website. Not too
416 | #' hard, right?
417 | #'
418 | #' Keep the .csv file, we will use it again next week to complete this challenge
419 | #' by scraping additional information from [www.imdb.com](http://www.imdb.com).
420 | #'
421 |
422 |
423 | #' ___
424 | #'
425 | #' #### 3 - Source code ####
426 | #'
427 | #' R and Python source codes are available
428 | #' [here](https://github.com/morpionZ/R-vs-Python/tree/master/Deadliest%20movies%20scrape/code).
429 | #'
430 |
431 |
432 | #' ___
433 | #'
434 | #' #### 4 - Bonus for the brave ####
435 | #'
436 | #' Today's challenge was code and text heavy. No pretty pictures to please the eye. So, for all the brave people who made it to the end, here is a cat picture :-)
437 |
438 | #+ bonus, echo=FALSE
439 | library(jpeg) # To read JPG images
440 |
441 | # Download a relevant cat picture; mode is set to "wb" because it seems that
442 | # Windows needs it. I don't use Windows, I can't confirm
443 | if (!file.exists("programming_cat.jpg")) {
444 |   download.file(url = "http://i.chzbgr.com/completestore/2010/5/18/129186912722282650.jpg",
445 |                 destfile = "programming_cat.jpg", quiet = TRUE, mode = "wb")
446 | }
447 |
448 | # Display image
449 | #'
--------------------------------------------------------------------------------
/Deadliest movies scrape/notebook2.html:
--------------------------------------------------------------------------------

Document title: R vs Python - Round 2 (2/2)
32 |Date: February 2, 2014
33 |Text by: Simon Garnier (www.theswarmlab.com / @sjmgarnier)
34 |R code by: Simon Garnier (www.theswarmlab.com / @sjmgarnier)
35 |Python code by: Randy Olson (www.randalolson.com / @randal_olson)
36 |Document generated with RStudio (www.rstudio.com), knitr (www.yihui.name/knitr/) and pandoc (www.johnmacfarlane.net/pandoc/). Python figures generated with iPython Notebook (www.ipython.org/notebook.html).
37 |My friend Randy Olson and I got into the habit of arguing about the relative qualities of our favorite languages for data analysis and visualization. I am an enthusiastic R user (www.r-project.org) while Randy is a fan of Python (www.python.org). One thing we agree on, however, is that our discussions are meaningless unless we actually put R and Python to a series of tests to showcase their relative strengths and weaknesses. Essentially we will set a common goal (e.g., perform a particular type of data analysis or draw a particular type of graph) and create the R and Python code to achieve this goal. And since Randy and I are all about sharing, open source and open access, we decided to make public the results of our friendly challenges so that you can help us decide between R and Python and, hopefully, also learn something along the way.
40 |Last time we showed you how to scrape data from www.MovieBodyCounts.com. Today, we will finish what we started by retrieving additional information from www.imdb.com. In particular, we will attempt to recover the following pieces of information for each of the movies we collected last time: MPAA rating, genre(s), director(s), duration in minutes, IMDb rating and full cast. We will detail the different steps of the process and provide for each step the corresponding code (red boxes for R, green boxes for Python). You will also find the entire code at the end of this document.
44 |If you think there’s a better way to code this in either language, open a pull request on our GitHub repository or leave a note with suggestions in the comments below.
45 |First things first, let’s set up our working environment by loading some necessary libraries.
47 |# Load libraries
48 | # No additional libraries needed here. Yeah!
49 | # IMDb scraping, data handling, and string parsing libraries
50 | from imdb import IMDb
51 | import pandas as pd
52 | import re
53 | Randy is lucky today. Someone else has already written a package (‘IMDbPY’) to scrape data from IMDb. Unfortunately for me, R users are too busy working with serious data sets to take the time to write such a package for my favorite data processing language. Hadley Wickham has included a ‘movies’ data set in the ggplot2 package that contains some of the information stored on IMDb, but some of the pieces we need for today’s challenge are missing.
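For the curious, here is a quick way to see what that data set covers (a minimal sketch, assuming the 2014-era ggplot2, which bundled the data under the name ‘movies’; it has since moved to the ggplot2movies package):

# Peek at the IMDb-derived data set shipped with (2014-era) ggplot2
library(ggplot2)
data(movies)
names(movies)  # title, year, length, budget, rating, votes, r1-r10, mpaa, genre flags...
# no director or full-cast columns, hence the custom scraper below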
54 |Since I am not easily discouraged, I decided to write my own IMDb scraping function (see below). It is not as sophisticated as the Python package Randy is using today, but it does the job until someone else decides to write a more complete R/IMDb package. As you will see, I am using the same scraping technique (XPath) as the one I used in the first part of the challenge.
55 |# Create IMDB scraper
56 | IMDb <- function(ID) {
57 | # Retrieve movie info from IMDb.com.
58 | #
59 | # Args:
60 | # ID: IDs of the movies.
61 | #
62 | # Returns:
63 | # A data frame containing one line per movie, and nine columns: movie ID,
64 | # film title, year of release, duration in minutes, MPAA rating, genre(s),
65 | # director(s), IMDb rating, and full cast.
66 |
67 | # Load required libraries
68 | require(XML)
69 | require(pbapply) # Apply functions with progress bars!!!
70 |
71 | # Wrap core of the function in do.call and pblapply in order to
72 | # pseudo-vectorize it (pblapply) and return a data frame (do.call)
73 | info <- do.call(rbind, pblapply(ID, FUN = function(ID) {
74 | # Create movie URL on IMDb.com
75 | URL <- paste0("http://www.imdb.com/title/tt", ID)
76 |
77 | # Download and parse HTML of IMDb page
78 | parsed.html <- htmlParse(URL)
79 |
80 | # Find title
81 | Film <- xpathSApply(parsed.html, "//h1[@class='header']/span[@class='itemprop']", xmlValue)
82 |
83 | # Find year
84 | Year <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//h1[@class='header']/span[@class='nobr']", xmlValue)))
85 |
86 | # Find duration in minutes
87 | Length_Minutes <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//div[@class='infobar']/time[@itemprop='duration']", xmlValue)))
88 |
89 | # Find MPAA rating
90 | MPAA_Rating <- unname(xpathSApply(parsed.html, "//div[@class='infobar']/span/@content"))
91 | if (!is.character(MPAA_Rating)) { # Some movies don't have an MPAA rating
92 | MPAA_Rating <- "UNRATED"
93 | }
94 |
95 | # Find genre
96 | Genre <- paste(xpathSApply(parsed.html, "//span[@class='itemprop' and @itemprop='genre']", xmlValue), collapse='|')
97 |
98 | # Find director
99 | Director <- paste(xpathSApply(parsed.html, "//div[@itemprop='director']/a", xmlValue), collapse='|')
100 |
101 | # Find IMDB rating
102 | IMDB_rating <- as.numeric(xpathSApply(parsed.html, "//div[@class='titlePageSprite star-box-giga-star']", xmlValue))
103 |
104 | # Extract full cast from the full credits page
105 | parsed.html <- htmlParse(paste0(URL,"/fullcredits"))
106 | Full_Cast <- paste(xpathSApply(parsed.html, "//span[@itemprop='name']", xmlValue), collapse='|')
107 |
108 | data.frame(ID = ID, Film = Film, Year = Year, Length_Minutes = Length_Minutes,
109 | MPAA_Rating = MPAA_Rating, Genre = Genre,
110 | Director = Director, IMDB_rating = IMDB_rating, Full_Cast = Full_Cast)
111 | }))
112 | }
113 | imdb_access = IMDb()
114 | Randy and I now have a working IMDb scraper. We can start collecting and organizing the data that we need.
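Before pointing the scraper at the full movie list, it can be sanity-checked on a single ID (a minimal sketch; the ID below is purely illustrative, and the call assumes IMDb still serves the 2014 page layout that the XPath expressions were written for):

# Quick sanity check of the IMDb() scraper defined above (illustrative ID)
test <- IMDb("0317705")
str(test)  # expect a one-row data frame with the nine columns described in the function header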
115 |First, let’s load the data we collected last time.
116 |# Load data from last challenge
117 | data <- read.csv("movies-R.csv")
118 | movie_data = pd.read_csv("movies.csv")
119 | Then, we will extract the movie IMDb ID from the IMDb URL we collected last week. It’s easy, it’s the only number in the URL.
120 |# For each movie, extract IMDb info and append it to the data
121 | data <- within(data, {
122 | # Extract ID number
123 | IMDB_ID <- gsub("[^0-9]", "", IMDB_URL)
124 | # Grab only the movie number out of the IMDB URL
125 | movie_data["Movie_Number"] = movie_data["IMDB_URL"].apply(lambda x: re.sub("[^0-9]", "", x))
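As a worked example of what both one-liners do (the URL is illustrative), stripping every non-digit character from a full IMDb address leaves exactly the ID that IMDb expects after the "tt" prefix:

# Reduce a typical IMDb URL to its bare movie ID
gsub("[^0-9]", "", "http://www.imdb.com/title/tt0317705/")
# [1] "0317705"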
126 | Now that this is done, we will simply let the IMDb scraper collect the data we want and we will append it to the data from the first part of the challenge.
127 | # Download IMDb info into a temporary variable
128 | IMDB_Info <- IMDb(IMDB_ID)
129 |
130 | # Save MPAA rating
131 | MPAA_Rating <- IMDB_Info$MPAA_Rating
132 |
133 | # Save genre(s)
134 | Genre <- IMDB_Info$Genre
135 |
136 | # Save director(s)
137 | Director <- IMDB_Info$Director
138 |
139 | # Save duration in minutes
140 | Length_Minutes <- IMDB_Info$Length_Minutes
141 |
142 | # Save IMDb rating
143 | IMDB_rating <- IMDB_Info$IMDB_rating
144 |
145 | # Save full cast
146 | Full_Cast <- IMDB_Info$Full_Cast
147 |
148 | # Delete IMDb info
149 | IMDB_Info <- NULL
150 | })
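# Note: within() evaluates the block above inside `data` and returns a
# modified copy, so nothing changes until the assignment completes; setting
# IMDB_Info to NULL at the end drops the temporary variable, leaving only the
# extracted columns in the result.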
151 | with open("film-death-counts-Python.csv", "wb") as out_file:
152 | out_file.write("Film,Year,Body_Count,MPAA_Rating,Genre,Director,Actors,Length_Minutes,IMDB_Rating\n")
153 |
154 | for movie_entry in movie_data.iterrows():
155 | # Use a try-catch on the loop to prevent temporary connection-related issues from stopping the scrape
156 | try:
157 | movie = imdb_access.get_movie(movie_entry[1]["Movie_Number"])
158 | movie_fields = []
159 |
160 | # Remove non-ASCII character encodings and commas from movie titles
161 | movie_fields.append(movie["title"].encode("ascii", "replace").replace(",", ""))
162 | movie_fields.append(str(movie["year"]))
163 | movie_fields.append(str(movie_entry[1]["Body_Count"]))
164 |
165 | # Some movies don't have MPAA Ratings on IMDB
166 | try:
167 | movie_fields.append(str(movie["mpaa"].split(" ")[1]))
168 | except:
169 | movie_fields.append("")
170 |
171 | # For movies with multiple genres/directors/actors, join them with bars |
172 | movie_fields.append(str("|".join(movie["genres"])))
173 | movie_fields.append(str("|".join([str(x) for x in movie["director"]])))
174 | movie_fields.append(str("|".join([str(x) for x in movie["cast"]])))
175 |
176 | movie_fields.append(str(int(movie["runtime"][0].split(":")[-1])))
177 | movie_fields.append(str(float(movie["rating"])))
178 | And finally, all that is left to do is to save the complete data set into a .csv file and close the script.
179 |write.csv(data, file = "movies-R-full.csv")
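# Note: unlike the write.csv call of part 1, row.names = FALSE is not set
# here, so the output file gains an automatic row-index column.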
180 | # All entries are comma-delimited
181 | out_file.write(",".join(movie_fields) + "\n")
182 |
183 | except Exception as e:
184 | print "Error with", str(movie)
185 | That’s it! You should now have a .csv file somewhere on your computer containing all the information we just scraped in both parts of this challenge.
186 |Sorry it took us so long to complete this part, but beginnings of semesters are always very busy times at the university.
187 |Stay tuned for our next challenge! It will be about making a linear regression, running basic diagnostic tests and plotting the resulting straight line with its confidence interval.
188 |R and Python source codes are available here.
191 |
192 |
193 |

--------------------------------------------------------------------------------
/Deadliest movies scrape/notebook2.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | **Document title:** R vs Python - Round 2 (2/2)
7 |
8 | **Date:** February 2, 2014
9 |
10 | **Text by:** Simon Garnier ([www.theswarmlab.com](http://www.theswarmlab.com)
11 | / [\@sjmgarnier](http://twitter.com/sjmgarnier))
12 |
13 | **R code by:** Simon Garnier
14 | ([www.theswarmlab.com](http://www.theswarmlab.com) /
15 | [\@sjmgarnier](http://twitter.com/sjmgarnier))
16 |
17 | **Python code by:** Randy Olson
18 | ([www.randalolson.com](http://www.randalolson.com) /
19 | [\@randal_olson](http://twitter.com/randal_olson))
20 |
21 | Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)),
22 | knitr ([www.yihui.name/knitr/](http://yihui.name/knitr/)) and pandoc
23 | ([www.johnmacfarlane.net/pandoc/](http://johnmacfarlane.net/pandoc/)). Python
24 | figures generated with iPython Notebook
25 | ([www.ipython.org/notebook.html](http://ipython.org/notebook.html)).
26 |
27 | ___
28 |
29 | #### Foreword ####
30 |
31 | My friend Randy Olson and I got into the habit of arguing about the relative
32 | qualities of our favorite languages for data analysis and visualization. I am
33 | an enthusiastic R user ([www.r-project.org](http://www.r-project.org)) while
34 | Randy is a fan of Python ([www.python.org](http://www.python.org)). One thing
35 | we agree on, however, is that our discussions are meaningless unless we
36 | actually put R and Python to a series of tests to showcase their relative
37 | strengths and weaknesses. Essentially we will set a common goal (*e.g.*,
38 | perform a particular type of data analysis or draw a particular type of
39 | graph) and create the R and Python code to achieve this goal. And since
40 | Randy and I are all about sharing, open source and open access, we decided to
41 | make public the results of our friendly challenges so that you can help us
42 | decide between R and Python and, hopefully, also learn something along the
43 | way.
44 |
45 | ___
46 |
47 | #### Today's challenge: a data thief manual for honest scientists (Part 2 of 2) ####
48 |
49 | ##### 1 - Introduction #####
50 |
51 | [Last time](http://www.theswarmlab.com/r-vs-python-round-2/) we showed you
52 | how to scrape data from
53 | [www.MovieBodyCounts.com](http://www.moviebodycounts.com). Today, we will
54 | finish what we started by retrieving additional information from
55 | [www.imdb.com](http://www.imdb.com). In particular, we will attempt to
56 | recover the following pieces of information for each of the movies we
57 | collected last time: MPAA rating, genre(s), director(s), duration in minutes,
58 | IMDb rating and full cast. We will detail the different steps of the process
59 | and provide for each step the corresponding code (red boxes for R, green
60 | boxes for Python). You will also find the entire code at the end of this
61 | document.
62 |
63 | If you think there's a better way to code this in either language, open a
64 | pull request on our [GitHub
65 | repository](https://github.com/morpionZ/R-vs-Python/tree/master/Deadliest%20movies%20scrape/code)
66 | or leave a note with suggestions in the comments below.
67 |
68 | ##### 2 - Step by step process #####
69 |
70 | First things first, let's set up our working environment by loading some
71 | necessary libraries.
72 | 73 | 74 | 75 | ```r 76 | # Load libraries 77 | # No additional libraries needed here. Yeah! 78 | 79 | ``` 80 | 81 | ```python 82 | # String parsing libraries 83 | from imdb import IMDb 84 | import pandas as pd 85 | import re 86 | ``` 87 | 88 | 89 | Randy is lucky today. Someone else has already written a package 90 | (['IMDbPY'](http://imdbpy.sourceforge.net/)) to scrape data from IMDb. 91 | Unfortunately for me, R users are too busy working with serious data sets to 92 | take the time to write such a package for my favorite data processing 93 | language. [Hadley Wickham](http://had.co.nz/) has included a ['movie' data 94 | set](http://had.co.nz/data/movies/) in the [ggplot2](http://ggplot2.org/) 95 | package that contains some of the information stored on IMDb, but some of the 96 | pieces we need for today's challenge are missing. 97 | 98 | Since I am not easily discouraged, I decided to write my own IMDb scraping 99 | function (see below). It is not as sophisticated as the Python package Randy 100 | is using today, but it does the job until someone else decides to write a 101 | more complete R/IMDb package. As you will see, I am using the same scraping 102 | technique (XPath) as the one I used in the first part of the challenge. 103 | 104 | 105 | 106 | ```r 107 | # Create IMDB scraper 108 | IMDb <- function(ID) { 109 | # Retrieve movie info from IMDb.com. 110 | # 111 | # Args: 112 | # ID: IDs of the movies. 113 | # 114 | # Returns: 115 | # A data frame containing one line per movie, and nine columns: movie ID, 116 | # film title, year of release, duration in minutes, MPAA rating, genre(s), 117 | # director(s), IMDb rating, and full cast. 118 | 119 | # Load required libraries 120 | require(XML) 121 | require(pbapply) # Apply functions with progress bars!!! 
122 |
123 | # Wrap core of the function in do.call and pblapply in order to
124 | # pseudo-vectorize it (pblapply) and return a data frame (do.call)
125 | info <- do.call(rbind, pblapply(ID, FUN = function(ID) {
126 | # Create movie URL on IMDb.com
127 | URL <- paste0("http://www.imdb.com/title/tt", ID)
128 |
129 | # Download and parse HTML of IMDb page
130 | parsed.html <- htmlParse(URL)
131 |
132 | # Find title
133 | Film <- xpathSApply(parsed.html, "//h1[@class='header']/span[@class='itemprop']", xmlValue)
134 |
135 | # Find year
136 | Year <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//h1[@class='header']/span[@class='nobr']", xmlValue)))
137 |
138 | # Find duration in minutes
139 | Length_Minutes <- as.numeric(gsub("[^0-9]", "", xpathSApply(parsed.html, "//div[@class='infobar']/time[@itemprop='duration']", xmlValue)))
140 |
141 | # Find MPAA rating
142 | MPAA_Rating <- unname(xpathSApply(parsed.html, "//div[@class='infobar']/span/@content"))
143 | if (!is.character(MPAA_Rating)) { # Some movies don't have an MPAA rating
144 | MPAA_Rating <- "UNRATED"
145 | }
146 |
147 | # Find genre
148 | Genre <- paste(xpathSApply(parsed.html, "//span[@class='itemprop' and @itemprop='genre']", xmlValue), collapse='|')
149 |
150 | # Find director
151 | Director <- paste(xpathSApply(parsed.html, "//div[@itemprop='director']/a", xmlValue), collapse='|')
152 |
153 | # Find IMDB rating
154 | IMDB_rating <- as.numeric(xpathSApply(parsed.html, "//div[@class='titlePageSprite star-box-giga-star']", xmlValue))
155 |
156 | # Extract full cast from the full credits page
157 | parsed.html <- htmlParse(paste0(URL,"/fullcredits"))
158 | Full_Cast <- paste(xpathSApply(parsed.html, "//span[@itemprop='name']", xmlValue), collapse='|')
159 |
160 | data.frame(ID = ID, Film = Film, Year = Year, Length_Minutes = Length_Minutes,
161 | MPAA_Rating = MPAA_Rating, Genre = Genre,
162 | Director = Director, IMDB_rating = IMDB_rating, Full_Cast = Full_Cast)
163 | }))
164 | }
165 |
166 | ```
167 |
168 | ```python
169 | imdb_access = IMDb()
170 | ```
171 |
172 |
173 | Randy and I now have a working IMDb scraper. We can start collecting and
174 | organizing the data that we need.
175 |
176 | First, let's load the data we collected last time.
177 |
178 |
179 |
180 | ```r
181 | # Load data from last challenge
182 | data <- read.csv("movies-R.csv")
183 |
184 | ```
185 |
186 | ```python
187 | movie_data = pd.read_csv("movies.csv")
188 | ```
189 |
190 |
191 | Then, we will extract the movie IMDb ID from the IMDb URL we collected last
192 | week. It's easy: it's the only number in the URL.
193 |
194 |
195 |
196 | ```r
197 | # For each movie, extract IMDb info and append it to the data
198 | data <- within(data, {
199 | # Extract ID number
200 | IMDB_ID <- gsub("[^0-9]", "", IMDB_URL)
201 |
202 | ```
203 |
204 | ```python
205 | # Grab only the movie number out of the IMDB URL
206 | movie_data["Movie_Number"] = movie_data["IMDB_URL"].apply(lambda x: re.sub("[^0-9]", "", x))
207 | ```
208 |
209 |
210 | Now that this is done, we will simply let the IMDb scraper collect the data
211 | we want and we will append it to the data from the first part of the
212 | challenge.
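Before unleashing the scraper on the whole list, it is worth a dry run on a single film to check that the XPath selectors still match IMDb's page layout. A minimal sanity check (not in the original script; tt0133093 is The Matrix, but any valid IMDb ID will do):

```r
# Sanity check (not in the original script): scrape one known movie and
# inspect the result before looping over the full data set.
test <- IMDb("0133093")
str(test)
```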
213 |
214 |
215 |
216 | ```r
217 | # Download IMDb info into a temporary variable
218 | IMDB_Info <- IMDb(IMDB_ID)
219 |
220 | # Save MPAA rating
221 | MPAA_Rating <- IMDB_Info$MPAA_Rating
222 |
223 | # Save genre(s)
224 | Genre <- IMDB_Info$Genre
225 |
226 | # Save director(s)
227 | Director <- IMDB_Info$Director
228 |
229 | # Save duration in minutes
230 | Length_Minutes <- IMDB_Info$Length_Minutes
231 |
232 | # Save IMDb rating
233 | IMDB_rating <- IMDB_Info$IMDB_rating
234 |
235 | # Save full cast
236 | Full_Cast <- IMDB_Info$Full_Cast
237 |
238 | # Delete IMDb info
239 | IMDB_Info <- NULL
240 | })
241 |
242 | ```
243 |
244 | ```python
245 | with open("film-death-counts-Python.csv", "wb") as out_file:
246 |     out_file.write("Film,Year,Body_Count,MPAA_Rating,Genre,Director,Actors,Length_Minutes,IMDB_Rating\n")
247 |
248 |     for movie_entry in movie_data.iterrows():
249 |         # Use a try-catch on the loop to prevent temporary connection-related issues from stopping the scrape
250 |         try:
251 |             movie = imdb_access.get_movie(movie_entry[1]["Movie_Number"])
252 |             movie_fields = []
253 |
254 |             # Remove non-ASCII character encodings and commas from movie titles
255 |             movie_fields.append(movie["title"].encode("ascii", "replace").replace(",", ""))
256 |             movie_fields.append(str(movie["year"]))
257 |             movie_fields.append(str(movie_entry[1]["Body_Count"]))
258 |
259 |             # Some movies don't have MPAA Ratings on IMDB
260 |             try:
261 |                 movie_fields.append(str(movie["mpaa"].split(" ")[1]))
262 |             except:
263 |                 movie_fields.append("")
264 |
265 |             # For movies with multiple genres/directors/actors, join them with bars |
266 |             movie_fields.append(str("|".join(movie["genres"])))
267 |             movie_fields.append(str("|".join([str(x) for x in movie["director"]])))
268 |             movie_fields.append(str("|".join([str(x) for x in movie["cast"]])))
269 |
270 |             movie_fields.append(str(int(movie["runtime"][0].split(":")[-1])))
271 |             movie_fields.append(str(float(movie["rating"])))
272 | ```
273 |
274 |
275 | And finally, all that is left to do is to save the complete data set into a
276 | .csv file and close the script.
277 |
278 |
279 |
280 | ```r
281 | write.csv(data, file = "movies-R-full.csv")
282 |
283 | ```
284 |
285 | ```python
286 |             # All entries are comma-delimited
287 |             out_file.write(",".join(movie_fields) + "\n")
288 |
289 |         except Exception as e:
290 |             print "Error with", str(movie)
291 | ```
292 |
293 |
294 | That's it! You should now have a .csv file somewhere on your computer
295 | containing all the information we just scraped in both parts of this
296 | challenge.
297 |
298 | Sorry it took us so long to complete this part, but beginnings of semesters
299 | are always very busy times at the university.
300 |
301 | Stay tuned for our next challenge! It will be about fitting a linear
302 | regression, running basic diagnostic tests and plotting the resulting
303 | regression line with its confidence interval.
304 |
305 | ___
306 |
307 | #### 3 - Source code ####
308 |
309 | The full R and Python source code is available
310 | [here](https://github.com/morpionZ/R-vs-Python/tree/master/Deadliest%20movies%20scrape/code).
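One closing aside: Randy's loop wraps each request in a try/except block so that a dropped connection does not kill the whole scrape, while my R function stops at the first error. A sketch of an equivalent safety net in R (the safe.IMDb wrapper is hypothetical, not part of the script above):

```r
# Hypothetical wrapper (not in the original script): return NULL instead
# of aborting when a single movie page fails to download or parse.
safe.IMDb <- function(ID) {
  tryCatch(IMDb(ID),
           error = function(e) {
             message("Error with ID ", ID, ": ", conditionMessage(e))
             NULL
           })
}
```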
311 | 312 | -------------------------------------------------------------------------------- /Deadliest movies scrape/pandoc_config.txt: -------------------------------------------------------------------------------- 1 | format: html 2 | c: custom.css 3 | s: 4 | S: 5 | mathjax: 6 | o: notebook.html 7 | -------------------------------------------------------------------------------- /Deadliest movies scrape/pandoc_config2.txt: -------------------------------------------------------------------------------- 1 | format: html 2 | c: custom.css 3 | s: 4 | S: 5 | mathjax: 6 | o: notebook2.html 7 | -------------------------------------------------------------------------------- /Deadliest movies scrape/programming_cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies scrape/programming_cat.jpg -------------------------------------------------------------------------------- /Deadliest movies/bloody_gun.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/bloody_gun.jpg -------------------------------------------------------------------------------- /Deadliest movies/code/code.R: -------------------------------------------------------------------------------- 1 | #' Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier) 2 | #' 3 | #' This script is free software: you can redistribute it and/or modify it under 4 | #' the terms of the GNU General Public License as published by the Free Software 5 | #' Foundation, either version 3 of the License, or (at your option) any later 6 | #' version. 7 | #' 8 | #' This script is distributed in the hope that it will be useful, but WITHOUT 9 | #' ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 10 | #' FOR A PARTICULAR PURPOSE. 11 | #' 12 | #' See the GNU General Public License for more details. 13 | #' 14 | #' You should have received a copy of the GNU General Public License along with 15 | #' this script. If not, see http://www.gnu.org/licenses/. 16 | #' 17 | 18 | #' **Document title:** R vs Python - Round 1 19 | #' 20 | #' **Date:** January 5, 2014 21 | #' 22 | #' **Author:** Simon Garnier (http://www.theswarmlab.com / @sjmgarnier) 23 | #' 24 | #' **Description:** This script generates a pretty barchart representing the top 25 | #' 25 most violent movies ordered by number of on screen deaths per minute. For 26 | #' more information, see http://www.theswarmlab.com/r-vs-python-round-1/ 27 | #' 28 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)). 29 | #' 30 | 31 | # Load libraries 32 | library(lattice) # Very versatile graphics package 33 | library(latticeExtra) # Addition to "lattice" that makes layering graphs a 34 | # breathe, and I'm a lazy person, so why not 35 | 36 | # Load data into a data frame 37 | body.count.data <- within(read.csv("http://files.figshare.com/1332945/film_death_counts.csv"), { 38 | 39 | # Compute on screen deaths per minute for each movie. 
40 | Deaths_Per_Minute <- Body_Count / Length_Minutes 41 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) # useful later 42 | 43 | # Combine film title and release date into a new factor column with levels 44 | # ordered by ascending violence 45 | Full_Title <- paste0(Film, " (", Year, ")") 46 | Full_Title <- ordered(Full_Title, levels = rev(unique(Full_Title[ord]))) 47 | 48 | # Combine number of on screen death per minute and duration of the movies into 49 | # a new character string column 50 | Deaths_Per_Minute_With_Length <- paste0(round(Deaths_Per_Minute, digits=2), " (", Length_Minutes, " mins)") 51 | 52 | }) 53 | 54 | # Reorder "body.count.data" by (descending) number of on screen deaths per minute 55 | body.count.data <- body.count.data[body.count.data$ord, ] 56 | 57 | # Select top 25 most violent movies by number of on screen deaths per minute 58 | body.count.data <- body.count.data[1:25,] 59 | 60 | # Generate base graph 61 | graph <- barchart(Full_Title ~ Deaths_Per_Minute, data = body.count.data) 62 | graphics.off() 63 | dev.new(width = 10, height = 8) 64 | print(graph) 65 | 66 | # Create theme 67 | my.bloody.theme <- within(trellis.par.get(), { # Initialize theme with default value 68 | axis.line$col <- NA # Remove axes 69 | plot.polygon <- within(plot.polygon, { 70 | col <- "#8A0606" # Set bar colors to a nice bloody red 71 | border <- NA # Remove bars' outline 72 | }) 73 | axis.text$cex <- 1 # Default axis text size is a bit small. Make it bigger 74 | layout.heights <- within(layout.heights, { 75 | bottom.padding <- 0 # Remove bottom padding 76 | axis.bottom <- 0 # Remove axis padding at the bottom of the graph 77 | axis.top <- 0 # Remove axis padding at the top of the graph 78 | }) 79 | }) 80 | 81 | # Update figure with new theme + other improvements (like a title for instance) 82 | graph <- update( 83 | graph, 84 | main ="25 most violence packed films by deaths per minute", # Title of the barchart 85 | par.settings = my.bloody.theme, # Use custom theme 86 | xlab = NULL, # Remove label of x axis 87 | scales = list(x = list(at = NULL)), # Remove rest of x axis 88 | xlim = c(0, 6.7), # Set graph limits along x axis to accomodate the additional text (requires some trial and error) 89 | box.width = 0.75) # Default bar width is a bit small. Make it bigger) 90 | 91 | print(graph) 92 | 93 | # Add number of on screen deaths per minute and duration of movies at the end of each bar 94 | graph <- graph + layer(with(body.count.data, 95 | panel.text( 96 | Deaths_Per_Minute, # x position of the text 97 | 25:1, # y position of the text 98 | pos = 4, # Position of the text relative to the x and y position (4 = to the right) 99 | Deaths_Per_Minute_With_Length))) # Text to display 100 | 101 | # Print graph 102 | print(graph) 103 | 104 | # Load additional libraries 105 | library(jpeg) # To read JPG images 106 | library(grid) # Graphics library with better image plotting capabilities 107 | 108 | # Download a pretty background image; mode is set to "wb" because it seems that 109 | # Windows needs it. 
I don't use Windows, I can't confirm 110 | download.file(url = "http://www.theswarmlab.com/wp-content/uploads/2014/01/bloody_gun.jpg", 111 | destfile = "bloody_gun.jpg", quiet = TRUE, mode = "wb") 112 | 113 | # Load gun image using "readJPEG" from the "jpeg" package 114 | img <- readJPEG("bloody_gun.jpg") 115 | 116 | # Add image to graph using "grid.raster" from the "grid" package 117 | graph <- graph + layer_( 118 | grid.raster( 119 | as.raster(img), # Image as a raster 120 | x = 1, # x location of image "Normalised Parent Coordinates" 121 | y = 0, # y location of image "Normalised Parent Coordinates" 122 | height = 0.7, # Height of the image. 1 indicates that the image height is equal to the graph height 123 | just = c("right", "bottom"))) # Justification of the image relative to its x and y locations 124 | 125 | # Print graph 126 | print(graph) 127 | -------------------------------------------------------------------------------- /Deadliest movies/code/code.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2014 Randal S. Olson 3 | 4 | This file is a script that makes pretty bar charts. It was written to be executed 5 | in IPython Notebook. 6 | 7 | This script is free software: you can redistribute it and/or modify it under the 8 | terms of the GNU General Public License as published by the Free Software Foundation, 9 | either version 3 of the License, or (at your option) any later version. 10 | 11 | This script is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 12 | without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 13 | See the GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License along with this script. 16 | If not, see http://www.gnu.org/licenses/. 
17 | """ 18 | 19 | # This starts the IPython Notebook pylab module, useful for plotting and interactive scientific computing 20 | %pylab inline 21 | from pandas import read_csv 22 | 23 | # Read the data into a pandas DataFrame 24 | body_count_data = read_csv("http://files.figshare.com/1332945/film_death_counts.csv") 25 | 26 | # Divide the body counts by the length of the film 27 | body_count_data["Deaths_Per_Minute"] = (body_count_data["Body_Count"].apply(float).values / 28 | body_count_data["Length_Minutes"].values) 29 | 30 | # Only keep the top 25 highest kills per minute films 31 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=False)[:25] 32 | 33 | # Change the order of the data so highest kills per minute films are on top in the plot 34 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=True) 35 | 36 | # Generate the full titles for the movies: movie name (year) 37 | full_title = [] 38 | 39 | for film, year in zip(body_count_data["Film"].values, body_count_data["Year"].values): 40 | full_title.append(film + " (" + str(year) + ")") 41 | 42 | body_count_data["Full_Title"] = array(full_title) 43 | 44 | fig = plt.figure(figsize=(8,12)) 45 | 46 | # Plot the red horizontal bars 47 | rects = plt.barh(range(len(body_count_data["Deaths_Per_Minute"])), 48 | body_count_data["Deaths_Per_Minute"], 49 | height=0.8, 50 | align="center", 51 | color="#8A0707", 52 | edgecolor="none") 53 | 54 | # This function adds the deaths per minute label to the right of the bars 55 | def autolabel(rects): 56 | for i, rect in enumerate(rects): 57 | width = rect.get_width() 58 | label_text = (str(round(float(width), 2)) + 59 | " (" + str(body_count_data["Length_Minutes"].values[i]) + 60 | " mins)") 61 | 62 | plt.text(width + 0.25, 63 | rect.get_y() + rect.get_height() / 2., 64 | label_text, 65 | ha="left", 66 | va="center", 67 | fontsize=14) 68 | 69 | autolabel(rects) 70 | 71 | # Add the film labels to left of the bars (y-axis) 72 | yticks(range(len(body_count_data["Full_Title"])), body_count_data["Full_Title"].values, fontsize=14) 73 | 74 | # Don't have any x tick labels 75 | xticks(arange(0, 5, 1), [""]) 76 | 77 | # Plot styling 78 | 79 | # Remove the plot frame lines 80 | ax = axes() 81 | ax.spines["top"].set_visible(False) 82 | ax.spines["right"].set_visible(False) 83 | ax.spines["left"].set_visible(False) 84 | ax.spines["bottom"].set_visible(False) 85 | 86 | # y-axis ticks on the left and x-axis ticks on the bottom 87 | ax.yaxis.tick_left() 88 | ax.xaxis.tick_bottom() 89 | 90 | # Color the y-axis ticks the same dark red color, and the x-axis ticks white 91 | ax.tick_params(axis="y", color="#8A0707") 92 | ax.tick_params(axis="x", color="white") 93 | 94 | # Don't show the x axis tick markers 95 | ax.xaxis.grid(color="white", linestyle="-") 96 | 97 | # Save the figure as a PNG 98 | # We can also save this as a PDF, JPG, TIFF, or most other image formats 99 | savefig("25-Violence-Packed-Films.png", bbox_inches="tight") 100 | -------------------------------------------------------------------------------- /Deadliest movies/custom.css: -------------------------------------------------------------------------------- 1 | body { 2 | font: 14px/1.5em "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 3 | color: #777; 4 | -webkit-font-smoothing: antialiased; /* Fix for webkit rendering */ 5 | -webkit-text-size-adjust: 100%; 6 | /*font-family: "Avenir Next", Helvetica, Arial, sans-serif;*/ 7 | padding:1em; 8 | margin:auto; 9 | max-width:10in; 10 | } 11 | 12 | h1, h2, 
h3, h4, h5, h6 { 13 | 14 | font-weight: normal; } 15 | h1 a, h2 a, h3 a, h4 a, h5 a, h6 a { font-weight: inherit; } 16 | h1 { font-size: 46px; line-height: 50px; margin-bottom: 14px;} 17 | h2 { font-size: 35px; line-height: 40px; margin-bottom: 10px; } 18 | h3 { font-size: 28px; line-height: 34px; margin-bottom: 8px; } 19 | h4 { font-size: 21px; line-height: 30px; margin-bottom: 4px; } 20 | h5 { font-size: 17px; line-height: 24px; } 21 | h6 { font-size: 14px; line-height: 21px; } 22 | .subheader { color: #777; } 23 | 24 | p { margin: 0 0 20px 0; } 25 | p img { margin: 0; } 26 | p.lead { font-size: 21px; line-height: 27px; color: #444; } 27 | 28 | em { font-style: italic; } 29 | strong { font-weight: bold; } 30 | small { font-size: 80%; } 31 | 32 | hr { 33 | height: 0.2em; 34 | border: 0; 35 | color: #CCCCCC; 36 | background-color: #CCCCCC; 37 | } 38 | 39 | p, blockquote, ul, ol, dl, li, table, pre { 40 | margin: 15px 0; 41 | text-align: justify; 42 | } 43 | 44 | a, a:visited { color: #333; text-decoration: underline; outline: 0; } 45 | a:hover, a:focus { color: #000; } 46 | p a, p a:visited { line-height: inherit; } 47 | 48 | #message { 49 | border-radius: 6px; 50 | border: 1px solid #ccc; 51 | display:block; 52 | width:100%; 53 | height:60px; 54 | margin:6px 0px; 55 | } 56 | 57 | button, #ws { 58 | font-size: 10pt; 59 | padding: 4px 6px; 60 | border-radius: 5px; 61 | border: 1px solid #bbb; 62 | background-color: #eee; 63 | } 64 | 65 | code, pre, #ws, #message { 66 | font-family: Monaco; 67 | font-size: 8pt; 68 | border-radius: 3px; 69 | background-color: #F8F8F8; 70 | color: inherit; 71 | } 72 | 73 | code { 74 | border: 1px solid #EAEAEA; 75 | margin: 0 2px; 76 | padding: 0 5px; 77 | } 78 | 79 | pre.r { 80 | border: 2px solid #8A0606; 81 | } 82 | 83 | pre.r:before { 84 | content: 'R code \A'; 85 | color: #8A0606; 86 | font-weight: bold; 87 | } 88 | 89 | pre.python { 90 | border: 2px solid #068A06; 91 | } 92 | 93 | pre.python:before { 94 | content: 'Python code \A'; 95 | color: #068A06; 96 | font-weight: bold; 97 | } 98 | 99 | img { 100 | max-width: 100%; 101 | height: auto; 102 | width: auto\9; /* ie8 */ 103 | } 104 | 105 | pre { 106 | border: 1px solid #CCCCCC; 107 | overflow: auto; 108 | padding: 4px 8px; 109 | } 110 | 111 | pre > code { 112 | border: 0; 113 | margin: 0; 114 | padding: 0; 115 | } 116 | 117 | blockquote, blockquote p { font-size: 12px; line-height: 24px; color: #000; font-style: italic; } 118 | blockquote { margin: 0 0 20px; padding: 9px 50px 0 49px; border-left: 1px solid #ddd; } 119 | blockquote cite { display: block; font-size: 12px; color: #555; } 120 | blockquote cite:before { content: "\2014 \0020"; } 121 | blockquote cite a, blockquote cite a:visited, blockquote cite a:visited { color: #555; } 122 | 123 | #ws { background-color: #f8f8f8; } 124 | 125 | .send { color:#77bb77; } 126 | .server { color:#7799bb; } 127 | .error { color:#AA0000; } 128 | -------------------------------------------------------------------------------- /Deadliest movies/figure/baseGraphR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figure/baseGraphR.png -------------------------------------------------------------------------------- /Deadliest movies/figure/gunR.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figure/gunR.png -------------------------------------------------------------------------------- /Deadliest movies/figure/prettyR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figure/prettyR.png -------------------------------------------------------------------------------- /Deadliest movies/figure/rightLabelsR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figure/rightLabelsR.png -------------------------------------------------------------------------------- /Deadliest movies/figurePy/basePy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figurePy/basePy.png -------------------------------------------------------------------------------- /Deadliest movies/figurePy/finalPy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figurePy/finalPy.png -------------------------------------------------------------------------------- /Deadliest movies/figurePy/prettyPy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sjmgarnier/R-vs-Python/73e33c1623b12ce3fbb8a6ccca2d661571f40455/Deadliest movies/figurePy/prettyPy.png -------------------------------------------------------------------------------- /Deadliest movies/pandoc_config.txt: -------------------------------------------------------------------------------- 1 | format: html 2 | c: custom.css 3 | s: 4 | S: 5 | mathjax: 6 | o: run.html 7 | -------------------------------------------------------------------------------- /Deadliest movies/run.R: -------------------------------------------------------------------------------- 1 | #+ licence, echo=FALSE 2 | # Copyright 2014 Simon Garnier (http://www.theswarmlab.com / @sjmgarnier) 3 | # 4 | # This script is free software: you can redistribute it and/or modify it under 5 | # the terms of the GNU General Public License as published by the Free Software 6 | # Foundation, either version 3 of the License, or (at your option) any later 7 | # version. 8 | # 9 | # This script is distributed in the hope that it will be useful, but WITHOUT ANY 10 | # WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR 11 | # A PARTICULAR PURPOSE. 12 | # 13 | # See the GNU General Public License for more details. 14 | # 15 | # You should have received a copy of the GNU General Public License along with 16 | # this script. If not, see http://www.gnu.org/licenses/. 
17 | # 18 | # You can generate the HTML files by running: 19 | # library(knitr) 20 | # spin("run.R") 21 | # pandoc("run.md", config = "pandoc_config.txt") 22 | 23 | 24 | #+ 25 | #' **Document title:** R vs Python - Round 1 26 | #' 27 | #' **Date:** January 5, 2014 28 | #' 29 | #' **Text by:** Simon Garnier ([www.theswarmlab.com](http://www.theswarmlab.com) 30 | #' / [\@sjmgarnier](http://twitter.com/sjmgarnier)) 31 | #' 32 | #' **R code by:** Simon Garnier 33 | #' ([www.theswarmlab.com](http://www.theswarmlab.com) / 34 | #' [\@sjmgarnier](http://twitter.com/sjmgarnier)) 35 | #' 36 | #' **Python code by:** Randy Olson 37 | #' ([www.randalolson.com](http://www.randalolson.com) / 38 | #' [\@randal_olson](http://twitter.com/randal_olson)) 39 | #' 40 | #' Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)), 41 | #' knitr ([www.yihui.name/knitr/](http://yihui.name/knitr/)) and pandoc 42 | #' ([www.johnmacfarlane.net/pandoc/](http://johnmacfarlane.net/pandoc/)). Python 43 | #' figures generated with iPython Notebook 44 | #' ([www.ipython.org/notebook.html](http://ipython.org/notebook.html)). 45 | #' 46 | 47 | 48 | #' ___ 49 | #' 50 | #' #### Foreword #### 51 | #' 52 | #' My friend Randy Olson and I got into the habit to argue about the relative 53 | #' qualities of our favorite languages for data analysis and visualization. I am 54 | #' an enthusiastic R user ([www.r-project.org](http://www.r-project.org)) while 55 | #' Randy is a fan of Python ([www.python.org](http://www.python.org)). One thing 56 | #' we agree on however is that our discussions are meaningless unless we 57 | #' actually put R and Python to a series of tests to showcase their relative 58 | #' strengths and weaknesses. Essentially we will set a common goal (*e.g.*, 59 | #' perform a particular type of data analysis or draw a particular type of 60 | #' graph) and create the R and Python codes to achieve this goal. And since 61 | #' Randy and I are all about sharing, open source and open access, we decided to 62 | #' make public the results of our friendly challenges so that you can help us 63 | #' decide between R and Python and, hopefully, also learn something along the 64 | #' way. 65 | #' 66 | 67 | 68 | #' ___ 69 | #' 70 | #' #### Today's challenge: where we learn that Hollywood's cemetery is full #### 71 | #' 72 | #' ##### 1 - Introduction ##### 73 | #' 74 | #' For this first challenge, we will use data collected by Randy for his recent 75 | #' post on the ["Top 25 most violence packed films" in the history of the movie 76 | #' industry](www.randalolson.com/2013/12/31/most-violence-packed-films/). For 77 | #' his post, Randy generated a simple horizontal barchart showing the top 25 78 | #' most violent films ordered by number of on screen deaths per minute. In the 79 | #' rest of this document, we will show you how to reproduce this graph using 80 | #' Python and how to achieve a similar result with R. We will detail the 81 | #' different steps of the process and provide for each step the corresponding 82 | #' code (red boxes for R, green boxes for Python). You will also find the entire 83 | #' codes at the end of this document. 84 | #' 85 | #' And now without further ado, let's get started! 86 | #' 87 | 88 | #' ##### 2 - Step by step process ##### 89 | #' 90 | #' First things first, let's set up our working environment by loading some 91 | #' necessary libraries. 
92 | #' 93 | 94 | #+ libR, message=FALSE 95 | # Load libraries 96 | library(lattice) # Very versatile graphics package 97 | library(latticeExtra) # Addition to "lattice" that makes layering graphs a 98 | # breathe, and I'm a lazy person, so why not 99 | 100 | #+ libPy, eval=FALSE, engine="python" 101 | # This starts the IPython Notebook pylab module, useful for plotting and 102 | # interactive scientific computing 103 | %pylab inline 104 | from pandas import * 105 | 106 | #' Now let's load the data for today's job. The raw data were scraped by Randy 107 | #' (using Python) from [www.MovieBodyCounts.com](http://www.MovieBodyCounts.com) 108 | #' and he generously provided the result of his hard work on FigShare at this 109 | #' address: 110 | #' [http://dx.doi.org/10.6084/m9.figshare.889719](http://dx.doi.org/10.6084/m9.figshare.889719). 111 | #' 112 | 113 | #+ dataR 114 | # Load data into a data frame 115 | body.count.data <- read.csv("http://files.figshare.com/1332945/film_death_counts.csv") 116 | 117 | #+ dataPy, eval=FALSE, engine='python' 118 | # Read the data into a pandas DataFrame 119 | body_count_data = read_csv("http://files.figshare.com/1332945/film_death_counts.csv") 120 | 121 | #' For each movie, the data frame contains a column for the total number of on 122 | #' screen deaths ("Body_Count") and a column for the duration 123 | #' ("Length_Minutes"). We will now create an extra column for the number of on 124 | #' screen deaths per minute of each movie ("Deaths_Per_Minute") 125 | #' 126 | 127 | #+ deathsPerMinR 128 | # Compute on screen deaths per minute for each movie. 129 | body.count.data <- within(body.count.data, { 130 | Deaths_Per_Minute <- Body_Count / Length_Minutes 131 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) # useful later 132 | }) 133 | 134 | #+ deathsPerMinPy, eval=FALSE, engine="python" 135 | # Divide the body counts by the length of the film 136 | body_count_data["Deaths_Per_Minute"] = (body_count_data["Body_Count"].apply(float).values / 137 | body_count_data["Length_Minutes"].values) 138 | 139 | #' Now we will reorder the data frame by (descending) number of on screen deaths 140 | #' per minute, and select the top 25 most violent movies according to this criterion. 141 | #' 142 | 143 | #+ top25R 144 | # Reorder "body.count.data" by (descending) number of on screen deaths per minute 145 | body.count.data <- body.count.data[body.count.data$ord, ] 146 | 147 | # Select top 25 most violent movies by number of on screen deaths per minute 148 | body.count.data <- body.count.data[1:25,] 149 | 150 | #+ top25Py, eval=FALSE, engine="python" 151 | # Only keep the top 25 highest kills per minute films 152 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=False)[:25] 153 | 154 | # Change the order of the data so highest kills per minute films are on top in the plot 155 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=True) 156 | 157 | #' In Randy's graph, the "y" axis shows the film title with the release date. We 158 | #' will now generate the full title for each movie following a "Movie name 159 | #' (year)" format, and append it to the data frame. 
160 | #'
161 |
162 | #+ filmTitleR
163 | # Combine film title and release date into a new factor column with levels
164 | # ordered by ascending violence
165 | body.count.data <- within(body.count.data, {
166 | Full_Title <- paste0(Film, " (", Year, ")")
167 | ord <- order(Deaths_Per_Minute, decreasing = TRUE)
168 | Full_Title <- ordered(Full_Title, levels = rev(unique(Full_Title[ord])))
169 | })
170 |
171 | #+ filmTitlePy, eval=FALSE, engine="python"
172 | # Generate the full titles for the movies: movie name (year)
173 | full_title = []
174 |
175 | for film, year in zip(body_count_data["Film"].values, body_count_data["Year"].values):
176 |     full_title.append(film + " (" + str(year) + ")")
177 |
178 | body_count_data["Full_Title"] = array(full_title)
181 |
182 | #' Now we are ready to generate the barchart. We're going to start with the
183 | #' default options and then we will make this thing look pretty.
184 | #'
185 |
186 | #+ baseGraphR, fig.width=10, fig.height=8, fig.align="center", dev="png"
187 | # Generate base graph
188 | graph <- barchart(Full_Title ~ Deaths_Per_Minute, data = body.count.data)
189 | print(graph)
190 |
191 | #+ baseGraphPy, eval=FALSE, engine="python"
192 | # plot the bars
193 | fig = plt.figure(figsize=(8,12))
194 |
195 | # Plot the red horizontal bars
196 | rects = plt.barh(range(len(body_count_data["Deaths_Per_Minute"])),
197 |                  body_count_data["Deaths_Per_Minute"],
198 |                  height=0.8,
199 |                  align="center",
200 |                  color="#8A0707",
201 |                  edgecolor="none")
202 |
203 | # Add the film labels to left of the bars (y-axis)
204 | yticks(range(len(body_count_data["Full_Title"])), body_count_data["Full_Title"].values, fontsize=14)
205 |
206 | #'
-------------------------------------------------------------------------------- /Deadliest movies/run.html: --------------------------------------------------------------------------------
31 |Document title: R vs Python - Round 1
32 |Date: January 5, 2014
33 |Text by: Simon Garnier (www.theswarmlab.com / @sjmgarnier)
34 |R code by: Simon Garnier (www.theswarmlab.com / @sjmgarnier)
35 |Python code by: Randy Olson (www.randalolson.com / @randal_olson)
36 |Document generated with RStudio (www.rstudio.com), knitr (www.yihui.name/knitr/) and pandoc (www.johnmacfarlane.net/pandoc/). Python figures generated with iPython Notebook (www.ipython.org/notebook.html).
37 |My friend Randy Olson and I got into the habit of arguing about the relative qualities of our favorite languages for data analysis and visualization. I am an enthusiastic R user (www.r-project.org) while Randy is a fan of Python (www.python.org). One thing we agree on, however, is that our discussions are meaningless unless we actually put R and Python to a series of tests to showcase their relative strengths and weaknesses. Essentially we will set a common goal (e.g., perform a particular type of data analysis or draw a particular type of graph) and create the R and Python code to achieve this goal. And since Randy and I are all about sharing, open source and open access, we decided to make public the results of our friendly challenges so that you can help us decide between R and Python and, hopefully, also learn something along the way.
40 |For this first challenge, we will use data collected by Randy for his recent post on the “Top 25 most violence packed films” in the history of the movie industry. For his post, Randy generated a simple horizontal barchart showing the top 25 most violent films ordered by number of on screen deaths per minute. In the rest of this document, we will show you how to reproduce this graph using Python and how to achieve a similar result with R. We will detail the different steps of the process and provide for each step the corresponding code (red boxes for R, green boxes for Python). You will also find the complete code at the end of this document.
44 |And now without further ado, let’s get started!
45 |First things first, let’s set up our working environment by loading some necessary libraries.
47 |# Load libraries
48 | library(lattice) # Very versatile graphics package
49 | library(latticeExtra) # Addition to "lattice" that makes layering graphs a
50 | # breeze, and I'm a lazy person, so why not
51 | # This starts the IPython Notebook pylab module, useful for plotting and
52 | # interactive scientific computing
53 | %pylab inline
54 | from pandas import *
55 | Now let’s load the data for today’s job. The raw data were scraped by Randy (using Python) from www.MovieBodyCounts.com and he generously provided the result of his hard work on FigShare at this address: http://dx.doi.org/10.6084/m9.figshare.889719.
56 |# Load data into a data frame
57 | body.count.data <- read.csv("http://files.figshare.com/1332945/film_death_counts.csv")
58 | # Read the data into a pandas DataFrame
59 | body_count_data = read_csv("http://files.figshare.com/1332945/film_death_counts.csv")
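Before computing anything, a quick look at the structure of the freshly loaded data never hurts. A small check (mine, not part of the original post) confirming the columns the next steps rely on:

```r
# Quick sanity check (not in the original post): make sure the columns
# used below exist and have the expected types.
str(body.count.data[, c("Film", "Year", "Body_Count", "Length_Minutes")])
```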
60 | For each movie, the data frame contains a column for the total number of on screen deaths (“Body_Count”) and a column for the duration (“Length_Minutes”). We will now create an extra column for the number of on screen deaths per minute of each movie (“Deaths_Per_Minute”).
61 |# Compute on screen deaths per minute for each movie.
62 | body.count.data <- within(body.count.data, {
63 | Deaths_Per_Minute <- Body_Count / Length_Minutes
64 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) # useful later
65 | })
66 | # Divide the body counts by the length of the film
67 | body_count_data["Deaths_Per_Minute"] = (body_count_data["Body_Count"].apply(float).values /
68 | body_count_data["Length_Minutes"].values)
69 | Now we will reorder the data frame by (descending) number of on screen deaths per minute, and select the top 25 most violent movies according to this criterion.
70 |# Reorder "body.count.data" by (descending) number of on screen deaths per minute
71 | body.count.data <- body.count.data[body.count.data$ord, ]
72 |
73 | # Select top 25 most violent movies by number of on screen deaths per minute
74 | body.count.data <- body.count.data[1:25,]
75 | # Only keep the top 25 highest kills per minute films
76 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=False)[:25]
77 |
78 | # Change the order of the data so highest kills per minute films are on top in the plot
79 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=True)
80 | In Randy’s graph, the “y” axis shows the film title with the release date. We will now generate the full title for each movie following a “Movie name (year)” format, and append it to the data frame.
81 |# Combine film title and release date into a new factor column with levels
82 | # ordered by ascending violence
83 | body.count.data <- within(body.count.data, {
84 | Full_Title <- paste0(Film, " (", Year, ")")
85 | ord <- order(Deaths_Per_Minute, decreasing = TRUE)
86 | Full_Title <- ordered(Full_Title, levels = rev(unique(Full_Title[ord]))) # some films are duplicated! Bad Randy!
87 | })
88 | # Generate the full titles for the movies: movie name (year)
89 | full_title = []
90 |
91 | for film, year in zip(body_count_data["Film"].values, body_count_data["Year"].values):
92 | full_title.append(film + " (" + str(year) + ")")
93 |
94 | body_count_data["Full_Title"] = array(full_title)
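The ordered() call in the R version deserves a word: lattice draws the first factor level at the bottom of the chart, so reversing the levels is what puts the deadliest film on top. A toy illustration (made-up values, not the movie data):

```r
# Toy example (not the movie data): barchart() draws factor levels
# bottom-up, so the last level of the ordered factor ends up on top.
library(lattice)
vals <- c(a = 3, b = 1, c = 2)
lbls <- ordered(names(vals), levels = names(sort(vals)))  # levels: b, c, a
barchart(lbls ~ vals)  # largest bar ("a") is drawn at the top
```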
97 | Now we are ready to generate the barchart. We’re going to start with the default options and then we will make this thing look pretty.
98 |# Generate base graph
99 | graph <- barchart(Full_Title ~ Deaths_Per_Minute, data = body.count.data)
100 | print(graph)
101 | # plot the bars
103 | fig = plt.figure(figsize=(8,12))
104 |
105 | # Plot the red horizontal bars
106 | rects = plt.barh(range(len(body_count_data["Deaths_Per_Minute"])),
107 | body_count_data["Deaths_Per_Minute"],
108 | height=0.8,
109 | align="center",
110 | color="#8A0707",
111 | edgecolor="none")
112 |
113 | # Add the film labels to left of the bars (y-axis)
114 | yticks(range(len(body_count_data["Full_Title"])), body_count_data["Full_Title"].values, fontsize=14)
115 | OK, now let’s make this pretty.
120 |# Create theme
121 | my.bloody.theme <- within(trellis.par.get(), { # Initialize theme with default value
122 | axis.line$col <- NA # Remove axes
123 | plot.polygon <- within(plot.polygon, {
124 | col <- "#8A0606" # Set bar colors to a nice bloody red
125 | border <- NA # Remove bars' outline
126 | })
127 | axis.text$cex <- 1 # Default axis text size is a bit small. Make it bigger
128 | layout.heights <- within(layout.heights, {
129 | bottom.padding <- 0 # Remove bottom padding
130 | axis.bottom <- 0 # Remove axis padding at the bottom of the graph
131 | axis.top <- 0 # Remove axis padding at the top of the graph
132 | })
133 | })
134 |
135 | # Update figure with new theme + other improvements (like a title for instance)
136 | graph <- update(
137 | graph,
138 | main='25 most violence packed films by deaths per minute', # Title of the barchart
139 | par.settings = my.bloody.theme, # Use custom theme
140 | xlab = NULL, # Remove label of x axis
141 | scales=list(x=list(at=NULL)), # Remove rest of x axis
142 | xlim = c(0, 6.7), # Set graph limits along x axis to accommodate the additional text (requires some trial and error)
143 | box.width=0.75) # Default bar width is a bit small. Make it bigger
144 |
145 | print(graph)
146 | # Don't have any x tick labels
148 | xticks(arange(0, 5, 1), [""])
149 |
150 | # Plot styling
151 |
152 | # Remove the plot frame lines
153 | ax = axes()
154 | ax.spines["top"].set_visible(False)
155 | ax.spines["right"].set_visible(False)
156 | ax.spines["left"].set_visible(False)
157 | ax.spines["bottom"].set_visible(False)
158 |
# y-axis ticks on the left and x-axis ticks on the bottom
ax.yaxis.tick_left()
ax.xaxis.tick_bottom()
159 | # Color the y-axis ticks the same dark red color, and the x-axis ticks white
160 | ax.tick_params(axis="y", color="#8A0707")
161 | ax.tick_params(axis="x", color="white")
162 |
163 | ax.xaxis.grid(color="white", linestyle="-")
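As an aside, the xlim = c(0, 6.7) above was found by trial and error. One could instead derive the limit from the data; a hedged alternative (the 35% headroom for the text labels is a guess, not a rule):

```r
# Hypothetical alternative (not in the original post): size the x axis
# from the data, leaving headroom for the labels added in the next step.
xmax <- max(body.count.data$Deaths_Per_Minute) * 1.35
graph <- update(graph, xlim = c(0, xmax))
```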
164 | Finally, the last thing we want to add to our graph is the number of on screen deaths per minute and the duration of each movie, displayed to the right of each bar.
169 |# Combine number of on screen deaths per minute and duration of the movies into a new character string column
170 | body.count.data <- within(body.count.data, {
171 | Deaths_Per_Minute_With_Length = paste0(round(body.count.data$Deaths_Per_Minute, digits=2), " (", body.count.data$Length_Minutes, " mins)")
172 | })
173 |
174 | # Add number of on screen deaths per minute and duration of movies at the end of each bar
175 | graph <- graph + layer(with(body.count.data,
176 | panel.text(
177 | Deaths_Per_Minute, # x position of the text
178 | 25:1, # y position of the text
179 | pos = 4, # Position of the text relative to the x and y position (4 = to the right)
180 | Deaths_Per_Minute_With_Length))) # Text to display
181 |
182 | # Print graph
183 | print(graph)
184 | # This function adds the deaths per minute label to the right of the bars
186 | def autolabel(rects):
187 | for i, rect in enumerate(rects):
188 | width = rect.get_width()
189 | label_text = (str(round(float(width), 2)) +
190 | " (" + str(body_count_data["Length_Minutes"].values[i]) +
191 | " mins)")
192 |
193 | plt.text(width + 0.25,
194 | rect.get_y() + rect.get_height() / 2.,
195 | label_text,
196 | ha="left",
197 | va="center",
198 | fontsize=14)
199 |
200 | autolabel(rects)
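One fragile spot in the R snippet above: the y positions of the labels are hard-coded as 25:1, which silently breaks if the number of movies ever changes. A minimal size-agnostic tweak (hypothetical, not in the original code):

```r
# Hypothetical tweak (not in the original post): derive the y positions
# from the data instead of hard-coding 25 bars.
n <- nrow(body.count.data)
y.positions <- n:1  # use this in place of 25:1 in the panel.text() call
```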
201 | Just for fun, I decided to add to the R graph a little accessory in keeping with the general theme of this data set.
208 |# Load additional libraries
209 | library(jpeg) # To read JPG images
210 | library(grid) # Graphics library with better image plotting capabilities
211 |
212 | # Download a pretty background image; mode is set to "wb" because it seems that
213 | # Windows needs it. I don't use Windows, so I can't confirm.
214 | download.file(url = "http://www.theswarmlab.com/wp-content/uploads/2014/01/bloody_gun.jpg",
215 | destfile = "bloody_gun.jpg", quiet = TRUE, mode = "wb")
216 |
217 | # Load gun image using "readJPEG" from the "jpeg" package
218 | img <- readJPEG("bloody_gun.jpg")
219 |
220 | # Add image to graph using "grid.raster" from the "grid" package
221 | graph <- graph + layer_(
222 | grid.raster(
223 | as.raster(img), # Image as a raster
224 | x = 1, # x location of image "Normalised Parent Coordinates"
225 | y = 0, # y location of image "Normalised Parent Coordinates"
226 | height = 0.7, # Height of the image. 1 indicates that the image height is equal to the graph height
227 | just = c("right", "bottom"))) # Justification of the image relative to its x and y locations
228 |
229 | # Print graph
230 | print(graph)
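Randy's script ends by saving the figure with savefig(); the R code above only prints to the screen device. A minimal way to write the finished lattice graph to disk (file name and resolution are my choices, not the post's):

```r
# Hypothetical save step (not in the original post): mirror Python's
# savefig() by writing the final graph to a PNG file.
png("25-violence-packed-films-R.png", width = 10, height = 8,
    units = "in", res = 150)
print(graph)
dev.off()
```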
231 | The full R and Python source code is available here.
235 |For F# fans, Terje Tyldum has written his version of the code in F# here.
236 |Randy and I also recommend that you check out this post by Ramiro Gómez (@yaph) where he does a more in-depth analysis of the data set we used for today’s challenge.
237 | 238 | 239 | -------------------------------------------------------------------------------- /Deadliest movies/run.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | **Document title:** R vs Python - Round 1 7 | 8 | **Date:** January 5, 2014 9 | 10 | **Text by:** Simon Garnier ([www.theswarmlab.com](http://www.theswarmlab.com) 11 | / [\@sjmgarnier](http://twitter.com/sjmgarnier)) 12 | 13 | **R code by:** Simon Garnier 14 | ([www.theswarmlab.com](http://www.theswarmlab.com) / 15 | [\@sjmgarnier](http://twitter.com/sjmgarnier)) 16 | 17 | **Python code by:** Randy Olson 18 | ([www.randalolson.com](http://www.randalolson.com) / 19 | [\@randal_olson](http://twitter.com/randal_olson)) 20 | 21 | Document generated with RStudio ([www.rstudio.com](http://www.rstudio.com)), 22 | knitr ([www.yihui.name/knitr/](http://yihui.name/knitr/)) and pandoc 23 | ([www.johnmacfarlane.net/pandoc/](http://johnmacfarlane.net/pandoc/)). Python 24 | figures generated with iPython Notebook 25 | ([www.ipython.org/notebook.html](http://ipython.org/notebook.html)). 26 | 27 | ___ 28 | 29 | #### Foreword #### 30 | 31 | My friend Randy Olson and I got into the habit to argue about the relative 32 | qualities of our favorite languages for data analysis and visualization. I am 33 | an enthusiastic R user ([www.r-project.org](http://www.r-project.org)) while 34 | Randy is a fan of Python ([www.python.org](http://www.python.org)). One thing 35 | we agree on however is that our discussions are meaningless unless we 36 | actually put R and Python to a series of tests to showcase their relative 37 | strengths and weaknesses. Essentially we will set a common goal (*e.g.*, 38 | perform a particular type of data analysis or draw a particular type of 39 | graph) and create the R and Python codes to achieve this goal. And since 40 | Randy and I are all about sharing, open source and open access, we decided to 41 | make public the results of our friendly challenges so that you can help us 42 | decide between R and Python and, hopefully, also learn something along the 43 | way. 44 | 45 | ___ 46 | 47 | #### Today's challenge: where we learn that Hollywood's cemetery is full #### 48 | 49 | ##### 1 - Introduction ##### 50 | 51 | For this first challenge, we will use data collected by Randy for his recent 52 | post on the ["Top 25 most violence packed films" in the history of the movie 53 | industry](www.randalolson.com/2013/12/31/most-violence-packed-films/). For 54 | his post, Randy generated a simple horizontal barchart showing the top 25 55 | more violent films ordered by number of on screen deaths per minute. In the 56 | rest of this document, we will show you how to reproduce this graph using 57 | Python and how to achieve a similar result with R. We will detail the 58 | different steps of the process and provide for each step the corresponding 59 | code (red boxes for R, green boxes for Python). You will also find the entire 60 | codes at the end of this document. 61 | 62 | And now without further ado, let's get started! 63 | 64 | ##### 2 - Step by step process ##### 65 | 66 | First thing first, let's set up our working environment by loading some 67 | necessary libraries. 
68 | 69 | 70 | 71 | ```r 72 | # Load libraries 73 | library(lattice) # Very versatile graphics package 74 | library(latticeExtra) # Addition to "lattice" that makes layering graphs a 75 | # breathe, and I'm a lazy person, so why not 76 | ``` 77 | 78 | ```python 79 | # This starts the IPython Notebook pylab module, useful for plotting and 80 | # interactive scientific computing 81 | %pylab inline 82 | from pandas import * 83 | ``` 84 | 85 | 86 | Now let's load the data for today's job. The raw data were scraped by Randy 87 | (using Python) from [www.MovieBodyCounts.com](http://www.MovieBodyCounts.com) 88 | and he generously provided the result of his hard work on FigShare at this 89 | address: 90 | [http://dx.doi.org/10.6084/m9.figshare.889719](http://dx.doi.org/10.6084/m9.figshare.889719). 91 | 92 | 93 | 94 | ```r 95 | # Load data into a data frame 96 | body.count.data <- read.csv("http://files.figshare.com/1332945/film_death_counts.csv") 97 | ``` 98 | 99 | ```python 100 | # Read the data into a pandas DataFrame 101 | body_count_data = read_csv("http://files.figshare.com/1332945/film_death_counts.csv") 102 | ``` 103 | 104 | 105 | For each movie, the data frame contains a column for the total number of on 106 | screen deaths ("Body_Count") and a column for the duration 107 | ("Length_Minutes"). We will now create an extra column for the number of on 108 | screen deaths per minute of each movie ("Deaths_Per_Minute") 109 | 110 | 111 | 112 | ```r 113 | # Compute on screen deaths per minute for each movie. 114 | body.count.data <- within(body.count.data, { 115 | Deaths_Per_Minute <- Body_Count / Length_Minutes 116 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) # useful later 117 | }) 118 | ``` 119 | 120 | ```python 121 | # Divide the body counts by the length of the film 122 | body_count_data["Deaths_Per_Minute"] = (body_count_data["Body_Count"].apply(float).values / 123 | body_count_data["Length_Minutes"].values) 124 | ``` 125 | 126 | 127 | Now we will reorder the data frame by (descending) number of on screen deaths 128 | per minute, and select the top 25 most violent movies according to this criterion. 129 | 130 | 131 | 132 | ```r 133 | # Reorder "body.count.data" by (descending) number of on screen deaths per minute 134 | body.count.data <- body.count.data[body.count.data$ord, ] 135 | 136 | # Select top 25 most violent movies by number of on screen deaths per minute 137 | body.count.data <- body.count.data[1:25,] 138 | ``` 139 | 140 | ```python 141 | # Only keep the top 25 highest kills per minute films 142 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=False)[:25] 143 | 144 | # Change the order of the data so highest kills per minute films are on top in the plot 145 | body_count_data = body_count_data.sort("Deaths_Per_Minute", ascending=True) 146 | ``` 147 | 148 | 149 | In Randy's graph, the "y" axis shows the film title with the release date. We 150 | will now generate the full title for each movie following a "Movie name 151 | (year)" format, and append it to the data frame. 152 | 153 | 154 | 155 | ```r 156 | # Combine film title and release date into a new factor column with levels 157 | # ordered by ascending violence 158 | body.count.data <- within(body.count.data, { 159 | Full_Title <- paste0(Film, " (", Year, ")") 160 | ord <- order(Deaths_Per_Minute, decreasing = TRUE) 161 | Full_Title <- ordered(Full_Title, levels = rev(unique(Full_Title[ord]))) # some films are duplicated! Bad Randy! 
162 | })
163 | ```
164 |
165 | ```python
166 | # Generate the full titles for the movies: movie name (year)
167 | full_title = []
168 |
169 | for film, year in zip(body_count_data["Film"].values, body_count_data["Year"].values):
170 |     full_title.append(film + " (" + str(year) + ")")
171 |
172 | body_count_data["Full_Title"] = array(full_title)
175 | ```
176 |
177 |
178 | Now we are ready to generate the barchart. We're going to start with the
179 | default options and then we will make this thing look pretty.
180 |
181 |
182 |
183 | ```r
184 | # Generate base graph
185 | graph <- barchart(Full_Title ~ Deaths_Per_Minute, data = body.count.data)
186 | print(graph)
187 | ```
188 |
189 |