├── .gitignore
├── boxofficemojoAPI
    ├── __init__.py
    ├── base.py
    ├── utils.py
    ├── boxofficemojo.py
    └── movie.py
├── LICENSE.txt
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | test
2 | Analysis
3 | .idea


--------------------------------------------------------------------------------
/boxofficemojoAPI/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'rastko'
2 | from movie import *
3 | from boxofficemojo import BoxOfficeMojo
4 | from base import MovieBase


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright 2015 Rastko Anicic, https://github.com/skozilla
 2 | 
 3 | Permission is hereby granted, free of charge, to any person obtaining a copy
 4 | of this software and associated documentation files (the "Software"), to deal
 5 | in the Software without restriction, including without limitation the rights
 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 7 | copies of the Software, and to permit persons to whom the Software is
 8 | furnished to do so, subject to the following conditions:
 9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 
21 | 


--------------------------------------------------------------------------------
/boxofficemojoAPI/base.py:
--------------------------------------------------------------------------------
 1 | __author__ = 'rastko'
 2 | 
 3 | import bs4
 4 | import json
 5 | from bson import json_util
 6 | 
 7 | class MovieBase(object):
 8 | 
 9 |     def __init__(self, html_soup):
10 |         """Movie class which parses html BeautifulSoup object and extracts information about the movie"""
11 | 
12 |         self.data = {}
13 |         assert isinstance(html_soup, bs4.BeautifulSoup)
14 |         self.soup = html_soup
15 | 
16 |         self.clean_html()
17 |         self.extract_data()
18 | 
19 |     def clean_html(self):
20 |         """Get rid of all bold, italic, underline, link tags, script tag, img tag, etc"""
21 |         invalid_tags = ['a', 'b', 'i', 'u', 'nobr', 'font']
22 |         for tag in invalid_tags:
23 |             for match in self.soup.findAll(tag):
24 |                 match.replaceWithChildren()
25 | 
26 |         # delete all contents in script and img tags
27 |         [x.extract() for x in self.soup.findAll('script')]
28 |         [x.extract() for x in self.soup.findAll('img')]
29 |         [x.extract() for x in self.soup.findAll('br')]
30 |         [x.extract() for x in self.soup.findAll('div', id='hp_banner')]
31 |         [x.extract() for x in self.soup.findAll('ul', id='leftnav')]
32 | 
33 |     def print_to_file(self, file_name):
34 |         """"Print a pretty and clean html string to a file"""
35 |         f = open(file_name, 'w')
36 |         f.write(self.soup.prettify().encode('utf8'))
37 |         f.close()
38 | 
39 |     def extract_data(self):
40 |         pass
41 | 
42 |     def clean_data(self):
43 |         pass
44 | 
45 |     def to_json(self):
46 |         """Returns a JSON string of the Data member"""
47 |         return json.dumps(self.data, indent=4, sort_keys=True, default=json_util.default)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | BoxOfficeMojo
  2 | =============
  3 | 
  4 | A simple python module for scrapping movie information from www.boxofficemojo.com
  5 | 
  6 | The module is used to extract information for any movie in the www.boxofficemojo.com domain. The information that it obtains
  7 | includes financial information (domestic gross, foreign gross, budget), cast, directors, composers, runtime, raiting, etc.. 
  8 | It could also be used to get the weekly performance of the movie in the box office. 
  9 | Not all the information is present on the website, so it gets whatever is available. 
 10 | 
 11 | Feel free to make suggestions about the code or the functionality, as they would be greatly appreciated. Contributions are welcome. 
 12 | 
 13 | ### Example 
 14 | 
 15 | ```python
 16 | import boxofficemojoAPI as bom
 17 | 
 18 | box_office_mojo = bom.BoxOfficeMojo()
 19 | box_office_mojo.crawl_for_urls()
 20 | 
 21 | 
 22 | movie = box_office_mojo.get_movie_summary("titanic")
 23 | movie.clean_data()
 24 | print movie.to_json()
 25 | 
 26 | weekly = box_office_mojo.get_weekly_summary("titanic")
 27 | weekly.clean_data()
 28 | print weekly.to_json()
 29 | 
 30 | ```
 31 | 
 32 | the output would be 
 33 | 
 34 | ```json
 35 | {
 36 |     "Actors": [
 37 |         "Leonardo DiCaprio", 
 38 |         "Kate Winslet", 
 39 |         "Billy Zane", 
 40 |         "Kathy Bates", 
 41 |         "Bill Paxton", 
 42 |         "Bernard Hill", 
 43 |         "Ioan Gruffudd"
 44 |     ], 
 45 |     "Composers": [
 46 |         "James Horner"
 47 |     ], 
 48 |     "Directors": [
 49 |         "James Cameron"
 50 |     ], 
 51 |     "Distributor": "Paramount", 
 52 |     "Domestic": 658672302.0,  
 53 |     "Foreign": 1528100000.0, 
 54 |     "Genre": "Romance", 
 55 |     "MPAA Rating": "PG-13", 
 56 |     "Producers": [
 57 |         "Jon Landau"
 58 |     ], 
 59 |     "Production Budget": 200000000.0, 
 60 |     "Release Date": "December 19, 1997", 
 61 |     "Runtime": 194, 
 62 |     "Title": "Titanic (1997)", 
 63 |     "Worldwide": 2186772302.0, 
 64 |     "Writers": [
 65 |         "James Cameron"
 66 |     ]
 67 | }
 68 | 
 69 | {
 70 |     "Title": "Titanic (1997)", 
 71 |     "Weekly": [
 72 |         {
 73 |             "Average Per Theatre": 19539.0, 
 74 |             "Gross": 52969336.0, 
 75 |             "Gross To Date": 52969336.0, 
 76 |             "Rank": 1, 
 77 |             "Theaters": 2711, 
 78 |             "Theatre Change": null, 
 79 |             "Week": "Dec 19, 1997", 
 80 |             "Week Number": 1, 
 81 |             "Week Over Week Change": null
 82 |         }, 
 83 |         {
 84 |             "Average Per Theatre": 26257.0, 
 85 |             "Gross": 71183357.0, 
 86 |             "Gross To Date": 124152693.0, 
 87 |             "Rank": 1, 
 88 |             "Theaters": 2711, 
 89 |             "Theatre Change": null, 
 90 |             "Week": "Dec 26, 1997", 
 91 |             "Week Number": 2, 
 92 |             "Week Over Week Change": 0.344
 93 |         }, 
 94 |         .
 95 |         .
 96 |         .
 97 |     ]
 98 | }
 99 | ```
100 | 
101 | ### Known Issues
102 | 
103 | The library is a work in progress and has not been tested on all movies present on BoxOfficeMojo, so it might crash sometimes. 
104 | This library is not compatible with python 3.x
105 | 


--------------------------------------------------------------------------------
/boxofficemojoAPI/utils.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'rastko'
  2 | 
  3 | import decorator
  4 | import requests.exceptions
  5 | import time
  6 | import re
  7 | import inflection
  8 | import bs4
  9 | from datetime import  datetime
 10 | 
 11 | 
 12 | @decorator.decorator
 13 | def na_or_empty(func, *args):
 14 |     """Decorator for handling conversion of the Movie.Data member"""
 15 |     data = args[0]
 16 |     key = args[1]
 17 |     try:
 18 |         if key in data:
 19 |             if data[key].upper() != "N/A" and data[key] != "-":
 20 |                 return func(*args)
 21 |             else:
 22 |                 data[key] = None
 23 |         else:
 24 |             data[key] = None
 25 |     except:
 26 |         print "Error cleaning: ", key
 27 |         raise
 28 | 
 29 | 
 30 | @decorator.decorator
 31 | def catch_connection_error(func, *args):
 32 |     has_error = True
 33 |     while has_error is True:
 34 |         try:
 35 |             has_error = False
 36 |             return func(*args)
 37 |         except requests.exceptions.ConnectionError as e:
 38 |             has_error = True
 39 |             print e.message
 40 |             print "Too many HTTP requests. wait 10 sec."
 41 |             time.sleep(10)
 42 |         except:
 43 |             raise
 44 | 
 45 | 
 46 | @na_or_empty
 47 | def convert_financial_field(data, key):
 48 |     """Formats financial values in the Data dictionary"""
 49 |     data[key] = re.sub(r'[(\xc2|\xa0|+|=|:|$|,)]', '', data[key])
 50 |     if key == "Production Budget":
 51 |         digits = re.findall(r'([\d\.\d]+)', data[key])
 52 |         digits = float(digits[0])
 53 |         if 'million' in data[key]:
 54 |             data[key] = digits*1000000.0
 55 |         elif 'thousand' in data[key]:
 56 |             data[key] = digits*1000.0
 57 |         else:
 58 |             data[key] = digits
 59 |     else:
 60 |         data[key] = float(data[key])
 61 | 
 62 | 
 63 | @na_or_empty
 64 | def convert_date_field(data, key):
 65 |     """Formats date values in the Data dictionary"""
 66 |     try:
 67 |         if len(re.findall(r'^(\w{3}) ', data[key])) == 0:
 68 |             data[key] = datetime.fromtimestamp(time.mktime(time.strptime(data[key], "%B %d, %Y")))
 69 |         else:
 70 |             data[key] = datetime.fromtimestamp(time.mktime(time.strptime(data[key], "%b %d, %Y")))
 71 |     except ValueError:
 72 |         data[key] = None
 73 |         pass
 74 | 
 75 | 
 76 | @na_or_empty
 77 | def convert_runtime_field(data, key):
 78 |     """Formats runtime values in the Data dictionary"""
 79 |     m = re.match(r"^((\d*) hrs\. )?(\d*)", data[key])
 80 |     if m.group(2) is None:
 81 |         data[key] = int(m.group(3))
 82 |     else:
 83 |         data[key] = int(m.group(2))*60 + int(m.group(3))
 84 | 
 85 | 
 86 | @na_or_empty
 87 | def convert_int_field(data, key):
 88 |     """Formats integer values in the Data dictionary"""
 89 |     data[key] = re.sub(r'[=|:|$|,)]', '', data[key])
 90 |     data[key] = int(data[key])
 91 | 
 92 | @na_or_empty
 93 | def convert_percent_field(data, key):
 94 |     """Formats integer values in the Data dictionary"""
 95 |     data[key] = re.sub(r'[(%|,)]', '', data[key])
 96 |     data[key] = float(data[key])*0.01
 97 | 
 98 | 
 99 | def standardize_keys(obj):
100 |     if isinstance(obj, list):
101 |         for element in obj:
102 |             standardize_keys(element)
103 |     elif isinstance(obj, dict):
104 |         temp = []
105 |         for key, val in obj.iteritems():
106 |             standardize_keys(val)
107 |             temp.append(key)
108 |         for key in temp:
109 |             obj[inflection.underscore(key.replace(" ", ""))] = obj.pop(key)
110 |     else:
111 |         pass
112 | 
113 | 
114 | def get_soup(url):
115 |     r = requests.get(url)
116 |     if r.status_code == 200:
117 |         return bs4.BeautifulSoup(r.content)
118 |     else:
119 |         return None


--------------------------------------------------------------------------------
/boxofficemojoAPI/boxofficemojo.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'rastko'
  2 | 
  3 | import bs4
  4 | import re
  5 | import requests
  6 | import movie
  7 | import utils
  8 | 
  9 | 
 10 | class BoxOfficeMojo(object):
 11 |     """API client object for interacting with BoxOfficeMojo website"""
 12 | 
 13 |     BOMURL = "http://www.boxofficemojo.com/movies"
 14 | 
 15 |     def __init__(self):
 16 |         self.letters = ['NUM']
 17 |         self.movie_urls = {}
 18 |         self.total_movies = 0
 19 |         for i in range(65, 91):
 20 |             self.letters.append(chr(i))
 21 | 
 22 |     def find_number_of_pages(self, soup):
 23 |         """Returns the number of sub-pages a certain letter will have"""
 24 |         pages = soup.findAll(href=re.compile("page"))
 25 |         if len(pages) > 0:
 26 |             max_page_url = pages[-1]['href']
 27 |             max_page = re.findall("\d+", max_page_url)[0]
 28 |             return int(max_page)
 29 |         else:
 30 |             return 1
 31 | 
 32 |     def clean_html(self, soup):
 33 |         """Get rid of all bold, italic, underline and link tags"""
 34 |         invalid_tags = ['b', 'i', 'u', 'nobr', 'font']
 35 |         for tag in invalid_tags:
 36 |             for match in soup.findAll(tag):
 37 |                 match.replaceWithChildren()
 38 | 
 39 |     def find_urls_in_html(self, soup):
 40 |         """Adds all the specific movie urls to the movie_urls dictionary"""
 41 |         urls = soup.findAll(href=re.compile("id="))
 42 |         # First URL is an ad for a movie so get rid of it
 43 |         del(urls[0])
 44 | 
 45 |         self.total_movies += len(urls)
 46 |         for url in urls:
 47 |             movie_name = url.renderContents()
 48 |             suffix = 1
 49 |             while movie_name in self.movie_urls.keys():
 50 |                 movie_name = url.renderContents() + '(' + str(suffix) + ')'
 51 |                 suffix += 1
 52 |             #save only the movie ids
 53 |             a = re.findall(r'id=((\w|[-(),\':\s.])+).htm', url["href"])
 54 |             if len(a) == 1:
 55 |                 self.movie_urls[a[0][0]] = movie_name
 56 | 
 57 |     def crawl_for_urls(self):
 58 |         """Gets all the movie urls and puts them in a dictionary"""
 59 |         for letter in self.letters:
 60 |             print 'Processing letter: ' + letter
 61 |             url = self.BOMURL + "/alphabetical.htm?letter=" + letter
 62 |             r = requests.get(url)
 63 |             if r.status_code != 200:
 64 |                 print "HTTP Status code returned:", r.status_code
 65 |             soup = bs4.BeautifulSoup(r.content)
 66 |             self.clean_html(soup)
 67 |             num_pages = self.find_number_of_pages(soup)
 68 |             self.find_urls_in_html(soup)
 69 |             for num in range(2, num_pages+1):
 70 |                 new_url = url + "&page=" + str(num)
 71 |                 r = requests.get(new_url)
 72 |                 if r.status_code != 200:
 73 |                     print "HTTP Status code returned:", r.status_code
 74 |                 soup = bs4.BeautifulSoup(r.content)
 75 |                 self.clean_html(soup)
 76 |                 self.find_urls_in_html(soup)
 77 | 
 78 |     @utils.catch_connection_error
 79 |     def get_movie_summary(self, url_or_id):
 80 |         if 'http' in url_or_id.lower():
 81 |             soup = utils.get_soup(url_or_id)
 82 |             if soup is not None:
 83 |                 return movie.Movie(soup)
 84 |             else:
 85 |                 print "Not able to parse url: " + url_or_id
 86 |                 pass
 87 |         elif url_or_id in self.movie_urls.keys():
 88 |             url = self.BOMURL + "/?page=main&id=" + url_or_id + ".htm"
 89 |             soup = utils.get_soup(url)
 90 |             if soup is not None:
 91 |                 return movie.Movie(soup)
 92 |             else:
 93 |                 print "Not able to parse url: " + url
 94 |                 pass
 95 |         else:
 96 |             print "Invalid movie name or URL ", url_or_id
 97 | 
 98 |     @utils.catch_connection_error
 99 |     def get_weekly_summary(self, url_or_id):
100 |         if 'http' in url_or_id.lower():
101 |             soup = utils.get_soup(url_or_id)
102 |             if soup is not None:
103 |                 return movie.Weekly(soup)
104 |             else:
105 |                 print "Not able to parse url: " + url_or_id
106 |                 pass
107 |         elif url_or_id in self.movie_urls.keys():
108 |             url = self.BOMURL + "/?page=weekly&id=" + url_or_id + ".htm"
109 |             soup = utils.get_soup(url)
110 |             if soup is not None:
111 |                 return movie.Weekly(soup)
112 |             else:
113 |                 print "Not able to parse url: " + url
114 |                 pass
115 |         else:
116 |             print "Invalid movie name or URL ", url_or_id
117 | 
118 |     def get_all_movies(self):
119 |         for key, val in self.movie_urls.iteritems():
120 |             movie = self.get_movie_details(key)
121 |             movie.clean_data()
122 |             print movie.to_json()
123 | 
124 | 


--------------------------------------------------------------------------------
/boxofficemojoAPI/movie.py:
--------------------------------------------------------------------------------
  1 | __author__ = 'rastko'
  2 | 
  3 | import bs4
  4 | import re
  5 | import json
  6 | from bson import json_util
  7 | 
  8 | from . import utils
  9 | from base import MovieBase
 10 | 
 11 | 
 12 | class Movie(MovieBase):
 13 |     def __init__(self, html_soup):
 14 |         """Movie class which parses html BeautifulSoup object and extracts information about the movie"""
 15 | 
 16 |         MovieBase.__init__(self, html_soup)
 17 | 
 18 |     def extract_data(self):
 19 |         """Extract all the relevant information from the html file"""
 20 |         title = self.soup.title.contents[0].encode('utf8')
 21 |         self.data["Title"] = title.replace(" - Box Office Mojo", "")
 22 |         try:
 23 |             center = self.soup.findAll("center")
 24 | 
 25 |             if len(center) == 0:
 26 |                 pass
 27 | 
 28 |             table = center[0].find("table")
 29 | 
 30 |             if len(center) is None:
 31 |                 pass
 32 | 
 33 |             rows = table.findAll('tr')
 34 |             for tr in rows:
 35 |                 cols = tr.findAll('td')
 36 |                 contents = [a.renderContents() for a in cols]
 37 |                 for con in contents:
 38 |                     keyval = con.split(":")
 39 |                     key = keyval[0]
 40 |                     val = keyval[1].strip()
 41 |                     self.data[key] = val
 42 | 
 43 |             tables = self.soup.find_all("div", "mp_box")
 44 | 
 45 |             for table in tables:
 46 |                 box_table_name = table.find("div", "mp_box_tab").string
 47 | 
 48 |                 if box_table_name == "Total Lifetime Grosses":
 49 |                     rows = table.findAll('tr')
 50 |                     for tr in rows:
 51 |                         cols = tr.findAll('td')
 52 |                         if len(cols) > 1:
 53 |                             contents = [re.sub(r'[(\xc2|\xa0|+|=|:|$|,)]', '', a.renderContents()) for a in cols]
 54 |                             key = contents[0]
 55 |                             val = contents[1]
 56 |                             self.data[key] = val
 57 | 
 58 |                 elif box_table_name == "Domestic Summary":
 59 |                     pass
 60 | 
 61 |                 elif box_table_name == "The Players":
 62 |                     # Exclude any results which are just whitespaces or cast member descriptors (i.e. (Voice), (Minor role))
 63 |                     exclude_pattern = '(^\s*\(.+\)|^\s+$)'
 64 |                     rows = table.findAll('tr')
 65 |                     for tr in rows:
 66 |                         cols = tr.findAll('td')
 67 |                         if len(cols) > 1:
 68 |                             key = cols[0].text
 69 |                             val = cols[1]
 70 |                             key = key.replace(':', '')
 71 |                             if key[-1] != 's':
 72 |                                 key += 's'
 73 |                             self.data[key] = [re.sub('\*+\s*$', '', child.encode('utf-8')) for child in val.children
 74 |                                               if re.search(exclude_pattern, child) is None]
 75 |                 else:
 76 |                     pass
 77 |         except:
 78 |             print "Error parsing movie: ", title
 79 |             raise
 80 | 
 81 |     def clean_data(self):
 82 |         """Formats all the extracted data into the appropriate types"""
 83 |         utils.convert_financial_field(self.data, "Domestic")
 84 |         utils.convert_financial_field(self.data, "Worldwide")
 85 |         utils.convert_financial_field(self.data, "Foreign")
 86 |         utils.convert_financial_field(self.data, "Production Budget")
 87 |         utils.convert_date_field(self.data, "Release Date")
 88 |         utils.convert_runtime_field(self.data, "Runtime")
 89 | 
 90 |         for key, value in self.data.iteritems():
 91 |             if "Total Gross" in key or "." in key:
 92 |                 self.data.pop(key)
 93 |                 break
 94 |         utils.standardize_keys(self.data)
 95 | 
 96 | 
 97 | class Weekly(MovieBase):
 98 |     def __init__(self, html_soup):
 99 |         """Movie class which parses html BeautifulSoup object and extracts information about the movie"""
100 | 
101 |         MovieBase.__init__(self, html_soup)
102 | 
103 |     def extract_data(self):
104 |         """Extract all the relevant information from the html file"""
105 |         title = self.soup.title.contents[0].encode('utf8')
106 |         self.data["Title"] = title.replace(" - Weekly Box Office Results - Box Office Mojo", "")
107 |         try:
108 |             center = self.soup.findAll("center")
109 | 
110 |             x = center[1].contents[0::2]
111 |             years = [year.encode('utf-8') for year in x]
112 | 
113 |             tables = self.soup.find_all("table", "chart-wide")
114 | 
115 |             results_collection = []
116 |             year = 0
117 |             if len(tables) == 0:
118 |                 self.data["Weekly"] = None
119 |                 pass
120 | 
121 |             for table in tables:
122 |                 rows = table.findAll("tr")
123 |                 del(rows[0])
124 | 
125 |                 for tr in rows:
126 |                     results_week = {}
127 |                     cols = tr.findAll("td")
128 |                     results_week["Week"] = re.sub(ur'(\u2013|\u0096)[\s\w\s]+', '', cols[0].renderContents().decode("utf-8")) + ", " + years[year]
129 |                     results_week["Rank"] = cols[1].renderContents()
130 |                     results_week["Gross"] = cols[2].renderContents()
131 |                     results_week["Week Over Week Change"] = cols[3].renderContents()
132 |                     results_week["Theaters"] = cols[4].renderContents()
133 |                     results_week["Theatre Change"] = cols[5].renderContents()
134 |                     results_week["Average Per Theatre"] = cols[6].renderContents()
135 |                     results_week["Gross To Date"] = cols[7].renderContents()
136 |                     results_week["Week Number"] = cols[8].renderContents()
137 |                     results_collection.append(results_week)
138 |                 year += 1
139 | 
140 |             self.data["Weekly"] = results_collection
141 |         except:
142 |             print "Error parsing movie: ", title
143 |             raise
144 | 
145 |     def clean_data(self):
146 |         """Formats all the extracted data into the appropriate types"""
147 | 
148 |         for results in self.data["Weekly"]:
149 |             utils.convert_financial_field(results, "Average Per Theatre")
150 |             utils.convert_financial_field(results, "Gross")
151 |             utils.convert_financial_field(results, "Gross To Date")
152 |             utils.convert_percent_field(results, "Week Over Week Change")
153 |             utils.convert_date_field(results, "Week")
154 |             utils.convert_int_field(results, "Rank")
155 |             utils.convert_int_field(results, "Theaters")
156 |             utils.convert_int_field(results, "Theatre Change")
157 |             utils.convert_int_field(results, "Week Number")
158 | 
159 |         for key, value in self.data.iteritems():
160 |             if "Total Gross" in key or "." in key:
161 |                 self.data.pop(key)
162 |                 break
163 |         utils.standardize_keys(self.data)
164 | 


--------------------------------------------------------------------------------