├── .gitignore
└── pipeline
    ├── database.py
    ├── storage.py
    ├── main.py
    ├── processing.py
    └── ingestion.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /data
2 | __pycache__
3 | .DS_Store
4 | 


--------------------------------------------------------------------------------
/pipeline/database.py:
--------------------------------------------------------------------------------
1 | from sqlalchemy import create_engine
2 | 
3 | def insert_db(df, name, engine):
4 |     df.to_sql(name, engine, if_exists="replace")
5 | 
6 |     return None
7 | 


--------------------------------------------------------------------------------
/pipeline/storage.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | 
3 | def archive_data(df, name):
4 |     "Stores the data locally as a pickle file"
5 | 
6 |     pickle.dump(df, open('../data/{}.pkl'.format(name), 'wb'))
7 | 
8 |     return None
9 | 


--------------------------------------------------------------------------------
/pipeline/main.py:
--------------------------------------------------------------------------------
 1 | from sqlalchemy import create_engine
 2 | 
 3 | from ingestion import scrape_eberts_listing, scrape_movie_reviews, scrape_imdb_listing
 4 | from storage import archive_data
 5 | from processing import clean_ebert_listings, clean_ebert_reviews, clean_imdb
 6 | from database import insert_db
 7 | 
 8 | if __name__ == "__main__":
 9 |     print("This pipeline scrapes, cleans and stores movie data from Roger Ebert and IMDB's website.")
10 | 
11 |     # Initialize database connection
12 |     try:
13 |         engine = create_engine('postgresql://postgres:password@35.224.187.73:5432/postgres')
14 |         print("Database found.")
15 |     except:
16 |         print("No database available.")
17 |         quit()
18 | 
19 |     # Ebert's website for listings
20 |     pages = int(input("How many pages would you like to scrape (24 movies per page)? "))
21 |     print("Scraping movie listings from Ebert's website.")
22 |     ebert_listing = scrape_eberts_listing()
23 |     archive_data(ebert_listing, "ebert_listing")
24 |     ebert_listing = clean_ebert_listings(ebert_listing)
25 |     insert_db(ebert_listing, 'ebert_listing', engine)
26 | 
27 |     # Ebert's website for reviews
28 |     print("Scraping movie reviews from Ebert's website.")
29 |     ebert_reviews, _ = scrape_movie_reviews(ebert_listing)
30 |     archive_data(ebert_reviews, "ebert_reviews")
31 |     ebert_reviews = clean_ebert_reviews(ebert_reviews)
32 |     insert_db(ebert_reviews, 'ebert_reviews', engine)
33 | 
34 |     # IMDB website for other movie info
35 |     print("Scraping movie information from IMDB.")
36 |     imdb_info, _ = scrape_imdb_listing(ebert_listing)
37 |     archive_data(imdb_info, "imdb_info")
38 |     imdb_info = clean_imdb(imdb_info)
39 |     insert_db(imdb_info, 'imdb_info', engine)
40 | 
41 |     print("Program complete.")
42 | 


--------------------------------------------------------------------------------
/pipeline/processing.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | 
  4 | # Ebert Listings
  5 | def convert_year(row):
  6 |     try:
  7 |         year = int(row['year'])
  8 |         return year
  9 |     except:
 10 |         return np.nan
 11 | 
 12 | def clean_ebert_listings(df):
 13 |     df['year'] = df.apply(lambda x: convert_year(x), 1)
 14 | 
 15 |     return df
 16 | 
 17 | # Ebert Reviews
 18 | def convert_runtime(row):
 19 |     try:
 20 |         runtime = int(row['runtime'])
 21 |         return runtime
 22 |     except:
 23 |         return np.nan
 24 | 
 25 | def clean_ebert_reviews(df):
 26 |     df['runtime'] = df.apply(lambda x: convert_runtime(x), 1)
 27 | 
 28 |     return df
 29 | 
 30 | # IMDB Information
 31 | def convert_imdb_rating(row):
 32 |     try:
 33 |         rating = float(row['imdb_rating'])
 34 |         return rating
 35 |     except:
 36 |         return np.nan
 37 | 
 38 | def convert_rating_count(row):
 39 |     try:
 40 |         count = float(row['rating_count'].replace(',', ''))
 41 |         return count
 42 |     except:
 43 |         return np.nan
 44 | 
 45 | def user_review_count(row):
 46 |     try:
 47 |         count = float(row['user_review_count'].replace(',', ''))
 48 |         return count
 49 |     except:
 50 |         return np.nan
 51 | 
 52 | def critic_review_count(row):
 53 |     try:
 54 |         count = float(row['critic_review_count'].replace(',', ''))
 55 |         return count
 56 |     except:
 57 |         return np.nan
 58 | 
 59 | def convert_metascore(row):
 60 |     try:
 61 |         score = float(row['metascore'].strip())
 62 |         return score
 63 |     except:
 64 |         return np.nan
 65 | 
 66 | def convert_country(row):
 67 |     try:
 68 |         country = row['country'].strip()
 69 |         return country
 70 |     except:
 71 |         return np.nan
 72 | 
 73 | def convert_release_date(row):
 74 |     try:
 75 |         rel_date = row['release_date'].strip()
 76 | 
 77 |         if 'TV' in rel_date:
 78 |             return np.nan
 79 |         else:
 80 |             try:
 81 |                 rel_date = datetime.datetime.strptime(rel_date, "%d, %B, %Y")
 82 |                 return rel_date
 83 |             except:
 84 |                 return np.nan
 85 | 
 86 |     except:
 87 |         return np.nan
 88 | 
 89 | 
 90 | def clean_imdb(df):
 91 |     df['IMDB_Rating'] = df.apply(lambda x: convert_imdb_rating(x), 1)
 92 |     df['Rating_Count'] = df.apply(lambda x: convert_rating_count(x), 1)
 93 |     df['User_Review_Count'] = df.apply(lambda x: user_review_count(x), 1)
 94 |     df['Critic_Review_Count'] = df.apply(lambda x: critic_review_count(x), 1)
 95 |     df['Metascore'] = df.apply(lambda x: convert_metascore(x), 1)
 96 |     df['Country'] = df.apply(lambda x: convert_country(x), 1)
 97 |     df['Release_Date'] = df.apply(lambda x: convert_release_date(x), 1)
 98 | 
 99 |     return df
100 | 


--------------------------------------------------------------------------------
/pipeline/ingestion.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | import time
  3 | import re
  4 | 
  5 | import pandas as pd
  6 | from bs4 import BeautifulSoup
  7 | 
  8 | 
  9 | def scrape_eberts_listing(num_pages=10):
 10 |     """
 11 |     Parses through webpage with list of movies and returns DataFrame.
 12 |     :num_pages = Number of pages to go through
 13 |     """
 14 |     url = "http://www.rogerebert.com/reviews?great_movies=0&no_stars=0&title=Cabin+in+the+Woods&filtersgreat_movies%5D%5B%5D=&filters%5Bno_stars%5D%5B%5D=&filters%5Bno_stars%5D%5B%5D=1&filters%5Btitle%5D=&filters%5Breviewers%5D=&filters%5Bgenres%5D=&page={}&sort%5Border%5D=newest"
 15 |     pages = list(range(1, num_pages))
 16 |     links = [url.format(i) for i in pages]
 17 | 
 18 |     review_list = list()
 19 |     count = 0
 20 | 
 21 |     for link in links:
 22 |         webpage = requests.get(link).text
 23 |         soup = BeautifulSoup(webpage, 'lxml')
 24 |         all_movies = soup('figure', {'class':'movie review'})
 25 | 
 26 |         for movie in all_movies:
 27 |             url = movie.a.get('href')
 28 |             title = movie.find_all('a')[1].text
 29 |             stars = len(movie.find_all('i', {'class':'icon-star-full'})) + 0.5 * len(movie.find_all('i', {'class':'icon-star-half'}))
 30 | 
 31 |             try:
 32 |                 year = movie.find('span', {'class':'release-year'}).text[1:-1]
 33 |             except:
 34 |                 year = ''
 35 | 
 36 |             count += 1
 37 |             review_list.append([count, title, stars, year, url])
 38 | 
 39 | 
 40 |     df = pd.DataFrame(review_list, columns = ['id', 'title', 'ebertstars', 'year', 'url'])
 41 |     return df
 42 | 
 43 | def scrape_movie_reviews(df):
 44 |     """
 45 |     Parses each individual review page and returns list of key attributes.
 46 |     :link = URL for review
 47 |     """
 48 |     scraped_list = list()
 49 | 
 50 |     for link in df['url']:
 51 |         full_link = "http://www.rogerebert.com" + link
 52 |         webpage = requests.get(full_link).text
 53 |         soup = BeautifulSoup(webpage, 'lxml')
 54 | 
 55 |         try:
 56 |             mpaa = soup.find('p', {'class':'mpaa-rating'}).strong.text[6:]
 57 |         except:
 58 |             mpaa = ''
 59 | 
 60 |         try:
 61 |             runtime = int(soup.find('p', {'class':'running-time'}).strong.text[:3].strip())
 62 |         except:
 63 |             runtime = ''
 64 | 
 65 |         try:
 66 |             review = ' '.join([paragraph.text for paragraph in soup.find('div', {'itemprop':'reviewBody'}).find_all('p')])
 67 |         except:
 68 |             review = ''
 69 | 
 70 |         scraped_list.append([link, mpaa, runtime, review])
 71 | 
 72 |         time.sleep(0.25)
 73 | 
 74 |     df = pd.DataFrame(scraped_list, columns = ['url', 'rating', 'runtime', 'review'])
 75 | 
 76 |     return df, scraped_list
 77 | 
 78 | def scrape_imdb_listing(df):
 79 |     """
 80 |     Searches IMDB, parses results and returns DataFrame.
 81 |     :df = DataFrame with movie titles
 82 |     """
 83 |     movie_list = list()
 84 | 
 85 |     for movie in df['title']:
 86 |         base_url = 'http://www.imdb.com/find?q='
 87 |         url = base_url + movie +'&s=all'
 88 |         webpage = requests.get(url).text
 89 |         soup = BeautifulSoup(webpage, 'lxml')
 90 | 
 91 |         try:
 92 |             results = soup('table', {'class':'findList'})[0]
 93 |         except:
 94 |             continue
 95 | 
 96 |         title = results.find_all('tr')[0]
 97 |         link = title.find('a', href=True)['href']
 98 | 
 99 |         url = 'http://www.imdb.com' + link
100 |         webpage = requests.get(url).text
101 |         soup = BeautifulSoup(webpage, 'lxml')
102 | 
103 |         movie_title = soup.find('title')
104 | 
105 |         try:
106 |             rate = soup.find('span', itemprop='ratingValue').text
107 |         except:
108 |             rate = ''
109 | 
110 |         try:
111 |             count = soup.find('span', itemprop='ratingCount').text
112 |         except:
113 |             count = ''
114 | 
115 |         try:
116 |             des = soup.find('meta',{'name':'description'})['content']
117 |         except:
118 |             des = ''
119 | 
120 |         try:
121 |             metascore = soup.find('div', class_='metacriticScore').text
122 |         except:
123 |             metascore = ''
124 | 
125 |         try:
126 |             reviews_count = soup.find('div', class_='titleReviewbarItemBorder')
127 |             u_reviews = reviews_count.find_all('a')[0].text.split(' ')[0]
128 |             c_reviews = reviews_count.find_all('a')[1].text.split(' ')[0]
129 |         except:
130 |             u_reviews = ''
131 |             c_review = ''
132 | 
133 |         try:
134 |             director = soup.find('span', itemprop='name').text
135 |         except:
136 |             director = ''
137 | 
138 |         try:
139 |             country = soup.find('div', class_='subtext').find_all('a', title=True)[-1].text.split(' ')[-1]
140 |             country = re.sub('[\(\)\{\}<>]', '', country)
141 |         except:
142 |             country = ''
143 | 
144 |         try:
145 |             rel_date = (', ').join(soup.find('div', class_='subtext').find_all('a',
146 |                                             title=True)[-1].text.split(' ')[:-1])
147 |         except:
148 |             rel_date = ''
149 | 
150 |         movie_list.append([movie, rate, count, des, metascore, u_reviews, c_reviews,
151 |                        director, country, rel_date])
152 | 
153 |         time.sleep(0.25)
154 | 
155 | 
156 |     df = pd.DataFrame(movie_list, columns = ['title', 'imdb_rating', 'rating_count',
157 |         'description', 'metascore', 'user_review_count', 'critic_review_count',
158 |         'director', 'country', 'release_date'])
159 | 
160 |     return df, movie_list
161 | 


--------------------------------------------------------------------------------