├── .gitignore ├── boxofficemojoAPI ├── __init__.py ├── base.py ├── utils.py ├── boxofficemojo.py └── movie.py ├── LICENSE.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | test 2 | Analysis 3 | .idea -------------------------------------------------------------------------------- /boxofficemojoAPI/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rastko' 2 | from movie import * 3 | from boxofficemojo import BoxOfficeMojo 4 | from base import MovieBase -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2015 Rastko Anicic, https://github.com/skozilla 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 
__author__ = 'rastko'

import bs4
import json
from bson import json_util


class MovieBase(object):
    """Shared base class for the BoxOfficeMojo page scrapers.

    Stores the parsed page in ``self.soup``, strips presentational markup,
    then calls ``extract_data`` (overridden by subclasses) to fill the
    ``self.data`` dictionary.
    """

    def __init__(self, html_soup):
        """Movie class which parses html BeautifulSoup object and extracts information about the movie"""
        self.data = {}
        assert isinstance(html_soup, bs4.BeautifulSoup)
        self.soup = html_soup

        self.clean_html()
        self.extract_data()

    def clean_html(self):
        """Get rid of all bold, italic, underline, link tags, script tag, img tag, etc"""
        # Unwrap inline formatting tags but keep their text content.
        invalid_tags = ['a', 'b', 'i', 'u', 'nobr', 'font']
        for tag in invalid_tags:
            for match in self.soup.findAll(tag):
                match.replaceWithChildren()

        # Remove whole elements (and their contents) that never carry data.
        # Plain for-loops instead of side-effecting list comprehensions.
        for name in ('script', 'img', 'br'):
            for node in self.soup.findAll(name):
                node.extract()
        for node in self.soup.findAll('div', id='hp_banner'):
            node.extract()
        for node in self.soup.findAll('ul', id='leftnav'):
            node.extract()

    def print_to_file(self, file_name):
        """Print a pretty and clean html string to a file."""
        # BUG FIX: use a context manager so the handle is closed even if
        # prettify()/write() raises (the old code leaked it on error).
        with open(file_name, 'w') as f:
            f.write(self.soup.prettify().encode('utf8'))

    def extract_data(self):
        """Hook for subclasses: populate self.data from self.soup."""
        pass

    def clean_data(self):
        """Hook for subclasses: normalise the values stored in self.data."""
        pass

    def to_json(self):
        """Returns a JSON string of the Data member"""
        # json_util.default serialises the datetime objects produced by
        # utils.convert_date_field, which plain json.dumps cannot handle.
        return json.dumps(self.data, indent=4, sort_keys=True, default=json_util.default)
www.boxofficemojo.com 5 | 6 | The module is used to extract information for any movie in the www.boxofficemojo.com domain. The information that it obtains 7 | includes financial information (domestic gross, foreign gross, budget), cast, directors, composers, runtime, rating, etc. 8 | It could also be used to get the weekly performance of the movie in the box office. 9 | Not all the information is present on the website, so it gets whatever is available. 10 | 11 | Feel free to make suggestions about the code or the functionality, as they would be greatly appreciated. Contributions are welcome. 12 | 13 | ### Example 14 | 15 | ```python 16 | import boxofficemojoAPI as bom 17 | 18 | box_office_mojo = bom.BoxOfficeMojo() 19 | box_office_mojo.crawl_for_urls() 20 | 21 | 22 | movie = box_office_mojo.get_movie_summary("titanic") 23 | movie.clean_data() 24 | print movie.to_json() 25 | 26 | weekly = box_office_mojo.get_weekly_summary("titanic") 27 | weekly.clean_data() 28 | print weekly.to_json() 29 | 30 | ``` 31 | 32 | The output would be: 33 | 34 | ```json 35 | { 36 | "Actors": [ 37 | "Leonardo DiCaprio", 38 | "Kate Winslet", 39 | "Billy Zane", 40 | "Kathy Bates", 41 | "Bill Paxton", 42 | "Bernard Hill", 43 | "Ioan Gruffudd" 44 | ], 45 | "Composers": [ 46 | "James Horner" 47 | ], 48 | "Directors": [ 49 | "James Cameron" 50 | ], 51 | "Distributor": "Paramount", 52 | "Domestic": 658672302.0, 53 | "Foreign": 1528100000.0, 54 | "Genre": "Romance", 55 | "MPAA Rating": "PG-13", 56 | "Producers": [ 57 | "Jon Landau" 58 | ], 59 | "Production Budget": 200000000.0, 60 | "Release Date": "December 19, 1997", 61 | "Runtime": 194, 62 | "Title": "Titanic (1997)", 63 | "Worldwide": 2186772302.0, 64 | "Writers": [ 65 | "James Cameron" 66 | ] 67 | } 68 | 69 | { 70 | "Title": "Titanic (1997)", 71 | "Weekly": [ 72 | { 73 | "Average Per Theatre": 19539.0, 74 | "Gross": 52969336.0, 75 | "Gross To Date": 52969336.0, 76 | "Rank": 1, 77 | "Theaters": 2711, 78 | "Theatre Change": null, 79 | "Week": 
"Dec 19, 1997", 80 | "Week Number": 1, 81 | "Week Over Week Change": null 82 | }, 83 | { 84 | "Average Per Theatre": 26257.0, 85 | "Gross": 71183357.0, 86 | "Gross To Date": 124152693.0, 87 | "Rank": 1, 88 | "Theaters": 2711, 89 | "Theatre Change": null, 90 | "Week": "Dec 26, 1997", 91 | "Week Number": 2, 92 | "Week Over Week Change": 0.344 93 | }, 94 | . 95 | . 96 | . 97 | ] 98 | } 99 | ``` 100 | 101 | ### Known Issues 102 | 103 | The library is a work in progress and has not been tested on all movies present on BoxOfficeMojo, so it might crash sometimes. 104 | This library is not compatible with python 3.x 105 | -------------------------------------------------------------------------------- /boxofficemojoAPI/utils.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rastko' 2 | 3 | import decorator 4 | import requests.exceptions 5 | import time 6 | import re 7 | import inflection 8 | import bs4 9 | from datetime import datetime 10 | 11 | 12 | @decorator.decorator 13 | def na_or_empty(func, *args): 14 | """Decorator for handling conversion of the Movie.Data member""" 15 | data = args[0] 16 | key = args[1] 17 | try: 18 | if key in data: 19 | if data[key].upper() != "N/A" and data[key] != "-": 20 | return func(*args) 21 | else: 22 | data[key] = None 23 | else: 24 | data[key] = None 25 | except: 26 | print "Error cleaning: ", key 27 | raise 28 | 29 | 30 | @decorator.decorator 31 | def catch_connection_error(func, *args): 32 | has_error = True 33 | while has_error is True: 34 | try: 35 | has_error = False 36 | return func(*args) 37 | except requests.exceptions.ConnectionError as e: 38 | has_error = True 39 | print e.message 40 | print "Too many HTTP requests. wait 10 sec." 
41 | time.sleep(10) 42 | except: 43 | raise 44 | 45 | 46 | @na_or_empty 47 | def convert_financial_field(data, key): 48 | """Formats financial values in the Data dictionary""" 49 | data[key] = re.sub(r'[(\xc2|\xa0|+|=|:|$|,)]', '', data[key]) 50 | if key == "Production Budget": 51 | digits = re.findall(r'([\d\.\d]+)', data[key]) 52 | digits = float(digits[0]) 53 | if 'million' in data[key]: 54 | data[key] = digits*1000000.0 55 | elif 'thousand' in data[key]: 56 | data[key] = digits*1000.0 57 | else: 58 | data[key] = digits 59 | else: 60 | data[key] = float(data[key]) 61 | 62 | 63 | @na_or_empty 64 | def convert_date_field(data, key): 65 | """Formats date values in the Data dictionary""" 66 | try: 67 | if len(re.findall(r'^(\w{3}) ', data[key])) == 0: 68 | data[key] = datetime.fromtimestamp(time.mktime(time.strptime(data[key], "%B %d, %Y"))) 69 | else: 70 | data[key] = datetime.fromtimestamp(time.mktime(time.strptime(data[key], "%b %d, %Y"))) 71 | except ValueError: 72 | data[key] = None 73 | pass 74 | 75 | 76 | @na_or_empty 77 | def convert_runtime_field(data, key): 78 | """Formats runtime values in the Data dictionary""" 79 | m = re.match(r"^((\d*) hrs\. 
)?(\d*)", data[key]) 80 | if m.group(2) is None: 81 | data[key] = int(m.group(3)) 82 | else: 83 | data[key] = int(m.group(2))*60 + int(m.group(3)) 84 | 85 | 86 | @na_or_empty 87 | def convert_int_field(data, key): 88 | """Formats integer values in the Data dictionary""" 89 | data[key] = re.sub(r'[=|:|$|,)]', '', data[key]) 90 | data[key] = int(data[key]) 91 | 92 | @na_or_empty 93 | def convert_percent_field(data, key): 94 | """Formats integer values in the Data dictionary""" 95 | data[key] = re.sub(r'[(%|,)]', '', data[key]) 96 | data[key] = float(data[key])*0.01 97 | 98 | 99 | def standardize_keys(obj): 100 | if isinstance(obj, list): 101 | for element in obj: 102 | standardize_keys(element) 103 | elif isinstance(obj, dict): 104 | temp = [] 105 | for key, val in obj.iteritems(): 106 | standardize_keys(val) 107 | temp.append(key) 108 | for key in temp: 109 | obj[inflection.underscore(key.replace(" ", ""))] = obj.pop(key) 110 | else: 111 | pass 112 | 113 | 114 | def get_soup(url): 115 | r = requests.get(url) 116 | if r.status_code == 200: 117 | return bs4.BeautifulSoup(r.content) 118 | else: 119 | return None -------------------------------------------------------------------------------- /boxofficemojoAPI/boxofficemojo.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rastko' 2 | 3 | import bs4 4 | import re 5 | import requests 6 | import movie 7 | import utils 8 | 9 | 10 | class BoxOfficeMojo(object): 11 | """API client object for interacting with BoxOfficeMojo website""" 12 | 13 | BOMURL = "http://www.boxofficemojo.com/movies" 14 | 15 | def __init__(self): 16 | self.letters = ['NUM'] 17 | self.movie_urls = {} 18 | self.total_movies = 0 19 | for i in range(65, 91): 20 | self.letters.append(chr(i)) 21 | 22 | def find_number_of_pages(self, soup): 23 | """Returns the number of sub-pages a certain letter will have""" 24 | pages = soup.findAll(href=re.compile("page")) 25 | if len(pages) > 0: 26 | max_page_url = 
pages[-1]['href'] 27 | max_page = re.findall("\d+", max_page_url)[0] 28 | return int(max_page) 29 | else: 30 | return 1 31 | 32 | def clean_html(self, soup): 33 | """Get rid of all bold, italic, underline and link tags""" 34 | invalid_tags = ['b', 'i', 'u', 'nobr', 'font'] 35 | for tag in invalid_tags: 36 | for match in soup.findAll(tag): 37 | match.replaceWithChildren() 38 | 39 | def find_urls_in_html(self, soup): 40 | """Adds all the specific movie urls to the movie_urls dictionary""" 41 | urls = soup.findAll(href=re.compile("id=")) 42 | # First URL is an ad for a movie so get rid of it 43 | del(urls[0]) 44 | 45 | self.total_movies += len(urls) 46 | for url in urls: 47 | movie_name = url.renderContents() 48 | suffix = 1 49 | while movie_name in self.movie_urls.keys(): 50 | movie_name = url.renderContents() + '(' + str(suffix) + ')' 51 | suffix += 1 52 | #save only the movie ids 53 | a = re.findall(r'id=((\w|[-(),\':\s.])+).htm', url["href"]) 54 | if len(a) == 1: 55 | self.movie_urls[a[0][0]] = movie_name 56 | 57 | def crawl_for_urls(self): 58 | """Gets all the movie urls and puts them in a dictionary""" 59 | for letter in self.letters: 60 | print 'Processing letter: ' + letter 61 | url = self.BOMURL + "/alphabetical.htm?letter=" + letter 62 | r = requests.get(url) 63 | if r.status_code != 200: 64 | print "HTTP Status code returned:", r.status_code 65 | soup = bs4.BeautifulSoup(r.content) 66 | self.clean_html(soup) 67 | num_pages = self.find_number_of_pages(soup) 68 | self.find_urls_in_html(soup) 69 | for num in range(2, num_pages+1): 70 | new_url = url + "&page=" + str(num) 71 | r = requests.get(new_url) 72 | if r.status_code != 200: 73 | print "HTTP Status code returned:", r.status_code 74 | soup = bs4.BeautifulSoup(r.content) 75 | self.clean_html(soup) 76 | self.find_urls_in_html(soup) 77 | 78 | @utils.catch_connection_error 79 | def get_movie_summary(self, url_or_id): 80 | if 'http' in url_or_id.lower(): 81 | soup = utils.get_soup(url_or_id) 82 | if soup is 
not None: 83 | return movie.Movie(soup) 84 | else: 85 | print "Not able to parse url: " + url_or_id 86 | pass 87 | elif url_or_id in self.movie_urls.keys(): 88 | url = self.BOMURL + "/?page=main&id=" + url_or_id + ".htm" 89 | soup = utils.get_soup(url) 90 | if soup is not None: 91 | return movie.Movie(soup) 92 | else: 93 | print "Not able to parse url: " + url 94 | pass 95 | else: 96 | print "Invalid movie name or URL ", url_or_id 97 | 98 | @utils.catch_connection_error 99 | def get_weekly_summary(self, url_or_id): 100 | if 'http' in url_or_id.lower(): 101 | soup = utils.get_soup(url_or_id) 102 | if soup is not None: 103 | return movie.Weekly(soup) 104 | else: 105 | print "Not able to parse url: " + url_or_id 106 | pass 107 | elif url_or_id in self.movie_urls.keys(): 108 | url = self.BOMURL + "/?page=weekly&id=" + url_or_id + ".htm" 109 | soup = utils.get_soup(url) 110 | if soup is not None: 111 | return movie.Weekly(soup) 112 | else: 113 | print "Not able to parse url: " + url 114 | pass 115 | else: 116 | print "Invalid movie name or URL ", url_or_id 117 | 118 | def get_all_movies(self): 119 | for key, val in self.movie_urls.iteritems(): 120 | movie = self.get_movie_details(key) 121 | movie.clean_data() 122 | print movie.to_json() 123 | 124 | -------------------------------------------------------------------------------- /boxofficemojoAPI/movie.py: -------------------------------------------------------------------------------- 1 | __author__ = 'rastko' 2 | 3 | import bs4 4 | import re 5 | import json 6 | from bson import json_util 7 | 8 | from . 
import utils 9 | from base import MovieBase 10 | 11 | 12 | class Movie(MovieBase): 13 | def __init__(self, html_soup): 14 | """Movie class which parses html BeautifulSoup object and extracts information about the movie""" 15 | 16 | MovieBase.__init__(self, html_soup) 17 | 18 | def extract_data(self): 19 | """Extract all the relevant information from the html file""" 20 | title = self.soup.title.contents[0].encode('utf8') 21 | self.data["Title"] = title.replace(" - Box Office Mojo", "") 22 | try: 23 | center = self.soup.findAll("center") 24 | 25 | if len(center) == 0: 26 | pass 27 | 28 | table = center[0].find("table") 29 | 30 | if len(center) is None: 31 | pass 32 | 33 | rows = table.findAll('tr') 34 | for tr in rows: 35 | cols = tr.findAll('td') 36 | contents = [a.renderContents() for a in cols] 37 | for con in contents: 38 | keyval = con.split(":") 39 | key = keyval[0] 40 | val = keyval[1].strip() 41 | self.data[key] = val 42 | 43 | tables = self.soup.find_all("div", "mp_box") 44 | 45 | for table in tables: 46 | box_table_name = table.find("div", "mp_box_tab").string 47 | 48 | if box_table_name == "Total Lifetime Grosses": 49 | rows = table.findAll('tr') 50 | for tr in rows: 51 | cols = tr.findAll('td') 52 | if len(cols) > 1: 53 | contents = [re.sub(r'[(\xc2|\xa0|+|=|:|$|,)]', '', a.renderContents()) for a in cols] 54 | key = contents[0] 55 | val = contents[1] 56 | self.data[key] = val 57 | 58 | elif box_table_name == "Domestic Summary": 59 | pass 60 | 61 | elif box_table_name == "The Players": 62 | # Exclude any results which are just whitespaces or cast member descriptors (i.e. 
(Voice), (Minor role)) 63 | exclude_pattern = '(^\s*\(.+\)|^\s+$)' 64 | rows = table.findAll('tr') 65 | for tr in rows: 66 | cols = tr.findAll('td') 67 | if len(cols) > 1: 68 | key = cols[0].text 69 | val = cols[1] 70 | key = key.replace(':', '') 71 | if key[-1] != 's': 72 | key += 's' 73 | self.data[key] = [re.sub('\*+\s*$', '', child.encode('utf-8')) for child in val.children 74 | if re.search(exclude_pattern, child) is None] 75 | else: 76 | pass 77 | except: 78 | print "Error parsing movie: ", title 79 | raise 80 | 81 | def clean_data(self): 82 | """Formats all the extracted data into the appropriate types""" 83 | utils.convert_financial_field(self.data, "Domestic") 84 | utils.convert_financial_field(self.data, "Worldwide") 85 | utils.convert_financial_field(self.data, "Foreign") 86 | utils.convert_financial_field(self.data, "Production Budget") 87 | utils.convert_date_field(self.data, "Release Date") 88 | utils.convert_runtime_field(self.data, "Runtime") 89 | 90 | for key, value in self.data.iteritems(): 91 | if "Total Gross" in key or "." 
in key: 92 | self.data.pop(key) 93 | break 94 | utils.standardize_keys(self.data) 95 | 96 | 97 | class Weekly(MovieBase): 98 | def __init__(self, html_soup): 99 | """Movie class which parses html BeautifulSoup object and extracts information about the movie""" 100 | 101 | MovieBase.__init__(self, html_soup) 102 | 103 | def extract_data(self): 104 | """Extract all the relevant information from the html file""" 105 | title = self.soup.title.contents[0].encode('utf8') 106 | self.data["Title"] = title.replace(" - Weekly Box Office Results - Box Office Mojo", "") 107 | try: 108 | center = self.soup.findAll("center") 109 | 110 | x = center[1].contents[0::2] 111 | years = [year.encode('utf-8') for year in x] 112 | 113 | tables = self.soup.find_all("table", "chart-wide") 114 | 115 | results_collection = [] 116 | year = 0 117 | if len(tables) == 0: 118 | self.data["Weekly"] = None 119 | pass 120 | 121 | for table in tables: 122 | rows = table.findAll("tr") 123 | del(rows[0]) 124 | 125 | for tr in rows: 126 | results_week = {} 127 | cols = tr.findAll("td") 128 | results_week["Week"] = re.sub(ur'(\u2013|\u0096)[\s\w\s]+', '', cols[0].renderContents().decode("utf-8")) + ", " + years[year] 129 | results_week["Rank"] = cols[1].renderContents() 130 | results_week["Gross"] = cols[2].renderContents() 131 | results_week["Week Over Week Change"] = cols[3].renderContents() 132 | results_week["Theaters"] = cols[4].renderContents() 133 | results_week["Theatre Change"] = cols[5].renderContents() 134 | results_week["Average Per Theatre"] = cols[6].renderContents() 135 | results_week["Gross To Date"] = cols[7].renderContents() 136 | results_week["Week Number"] = cols[8].renderContents() 137 | results_collection.append(results_week) 138 | year += 1 139 | 140 | self.data["Weekly"] = results_collection 141 | except: 142 | print "Error parsing movie: ", title 143 | raise 144 | 145 | def clean_data(self): 146 | """Formats all the extracted data into the appropriate types""" 147 | 148 | for 
results in self.data["Weekly"]: 149 | utils.convert_financial_field(results, "Average Per Theatre") 150 | utils.convert_financial_field(results, "Gross") 151 | utils.convert_financial_field(results, "Gross To Date") 152 | utils.convert_percent_field(results, "Week Over Week Change") 153 | utils.convert_date_field(results, "Week") 154 | utils.convert_int_field(results, "Rank") 155 | utils.convert_int_field(results, "Theaters") 156 | utils.convert_int_field(results, "Theatre Change") 157 | utils.convert_int_field(results, "Week Number") 158 | 159 | for key, value in self.data.iteritems(): 160 | if "Total Gross" in key or "." in key: 161 | self.data.pop(key) 162 | break 163 | utils.standardize_keys(self.data) 164 | --------------------------------------------------------------------------------