├── .gitignore
└── pipeline
    ├── database.py
    ├── storage.py
    ├── main.py
    ├── processing.py
    └── ingestion.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
/data
__pycache__
.DS_Store

--------------------------------------------------------------------------------
/pipeline/database.py:
--------------------------------------------------------------------------------
def insert_db(df, name, engine):
    """Writes the DataFrame to the given table, replacing it if it already exists."""
    df.to_sql(name, engine, if_exists="replace")

--------------------------------------------------------------------------------
/pipeline/storage.py:
--------------------------------------------------------------------------------
import pickle

def archive_data(df, name):
    """Stores the raw data locally as a pickle file under ../data/."""
    with open('../data/{}.pkl'.format(name), 'wb') as f:
        pickle.dump(df, f)

--------------------------------------------------------------------------------
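For orientation, here is a minimal usage sketch for the two helpers above. It is illustrative only: it assumes an in-memory SQLite engine as a stand-in for the Postgres instance that main.py connects to, and assumes a ../data/ directory exists for the pickle.

# demo_storage_db.py — illustrative sketch, not part of the repo
import pandas as pd
from sqlalchemy import create_engine

from database import insert_db
from storage import archive_data

df = pd.DataFrame({'title': ['The Cabin in the Woods'], 'year': [2012]})

engine = create_engine('sqlite:///:memory:')  # stand-in for the Postgres URL in main.py
insert_db(df, 'demo', engine)                 # creates (or replaces) the 'demo' table

archive_data(df, 'demo')                      # writes ../data/demo.pkl (directory must exist)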
")) 21 | print("Scraping movie listings from Ebert's website.") 22 | ebert_listing = scrape_eberts_listing() 23 | archive_data(ebert_listing, "ebert_listing") 24 | ebert_listing = clean_ebert_listings(ebert_listing) 25 | insert_db(ebert_listing, 'ebert_listing', engine) 26 | 27 | # Ebert's website for reviews 28 | print("Scraping movie reviews from Ebert's website.") 29 | ebert_reviews, _ = scrape_movie_reviews(ebert_listing) 30 | archive_data(ebert_reviews, "ebert_reviews") 31 | ebert_reviews = clean_ebert_reviews(ebert_reviews) 32 | insert_db(ebert_reviews, 'ebert_reviews', engine) 33 | 34 | # IMDB website for other movie info 35 | print("Scraping movie information from IMDB.") 36 | imdb_info, _ = scrape_imdb_listing(ebert_listing) 37 | archive_data(imdb_info, "imdb_info") 38 | imdb_info = clean_imdb(imdb_info) 39 | insert_db(imdb_info, 'imdb_info', engine) 40 | 41 | print("Program complete.") 42 | -------------------------------------------------------------------------------- /pipeline/processing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | # Ebert Listings 5 | def convert_year(row): 6 | try: 7 | year = int(row['year']) 8 | return year 9 | except: 10 | return np.nan 11 | 12 | def clean_ebert_listings(df): 13 | df['year'] = df.apply(lambda x: convert_year(x), 1) 14 | 15 | return df 16 | 17 | # Ebert Reviews 18 | def convert_runtime(row): 19 | try: 20 | runtime = int(row['runtime']) 21 | return runtime 22 | except: 23 | return np.nan 24 | 25 | def clean_ebert_reviews(df): 26 | df['runtime'] = df.apply(lambda x: convert_runtime(x), 1) 27 | 28 | return df 29 | 30 | # IMDB Information 31 | def convert_imdb_rating(row): 32 | try: 33 | rating = float(row['imdb_rating']) 34 | return rating 35 | except: 36 | return np.nan 37 | 38 | def convert_rating_count(row): 39 | try: 40 | count = float(row['rating_count'].replace(',', '')) 41 | return count 42 | except: 43 | return np.nan 44 | 45 | def user_review_count(row): 46 | try: 47 | count = float(row['user_review_count'].replace(',', '')) 48 | return count 49 | except: 50 | return np.nan 51 | 52 | def critic_review_count(row): 53 | try: 54 | count = float(row['critic_review_count'].replace(',', '')) 55 | return count 56 | except: 57 | return np.nan 58 | 59 | def convert_metascore(row): 60 | try: 61 | score = float(row['metascore'].strip()) 62 | return score 63 | except: 64 | return np.nan 65 | 66 | def convert_country(row): 67 | try: 68 | country = row['country'].strip() 69 | return country 70 | except: 71 | return np.nan 72 | 73 | def convert_release_date(row): 74 | try: 75 | rel_date = row['release_date'].strip() 76 | 77 | if 'TV' in rel_date: 78 | return np.nan 79 | else: 80 | try: 81 | rel_date = datetime.datetime.strptime(rel_date, "%d, %B, %Y") 82 | return rel_date 83 | except: 84 | return np.nan 85 | 86 | except: 87 | return np.nan 88 | 89 | 90 | def clean_imdb(df): 91 | df['IMDB_Rating'] = df.apply(lambda x: convert_imdb_rating(x), 1) 92 | df['Rating_Count'] = df.apply(lambda x: convert_rating_count(x), 1) 93 | df['User_Review_Count'] = df.apply(lambda x: user_review_count(x), 1) 94 | df['Critic_Review_Count'] = df.apply(lambda x: critic_review_count(x), 1) 95 | df['Metascore'] = df.apply(lambda x: convert_metascore(x), 1) 96 | df['Country'] = df.apply(lambda x: convert_country(x), 1) 97 | df['Release_Date'] = df.apply(lambda x: convert_release_date(x), 1) 98 | 99 | return df 100 | 
--------------------------------------------------------------------------------
/pipeline/ingestion.py:
--------------------------------------------------------------------------------
import re
import time
from urllib.parse import quote_plus

import requests
import pandas as pd
from bs4 import BeautifulSoup


def scrape_eberts_listing(num_pages=10):
    """
    Parses the paginated movie listing on Ebert's site and returns a DataFrame.
    :num_pages = number of listing pages to scrape (24 movies per page)
    """
    # Listing URL with all search filters left empty; the page number is
    # filled in per request.
    url = ("http://www.rogerebert.com/reviews?great_movies=0&no_stars=0&title="
           "&filters%5Bgreat_movies%5D%5B%5D=&filters%5Bno_stars%5D%5B%5D="
           "&filters%5Bno_stars%5D%5B%5D=1&filters%5Btitle%5D=&filters%5Breviewers%5D="
           "&filters%5Bgenres%5D=&page={}&sort%5Border%5D=newest")
    links = [url.format(i) for i in range(1, num_pages + 1)]

    review_list = list()
    count = 0

    for link in links:
        webpage = requests.get(link).text
        soup = BeautifulSoup(webpage, 'lxml')
        all_movies = soup('figure', {'class': 'movie review'})

        for movie in all_movies:
            url = movie.a.get('href')
            title = movie.find_all('a')[1].text
            # Full stars count 1 each, half stars 0.5 each
            stars = (len(movie.find_all('i', {'class': 'icon-star-full'}))
                     + 0.5 * len(movie.find_all('i', {'class': 'icon-star-half'})))

            try:
                year = movie.find('span', {'class': 'release-year'}).text[1:-1]  # strip parentheses
            except AttributeError:
                year = ''

            count += 1
            review_list.append([count, title, stars, year, url])

    df = pd.DataFrame(review_list, columns=['id', 'title', 'ebertstars', 'year', 'url'])
    return df


def scrape_movie_reviews(df):
    """
    Parses each individual review page and returns key attributes.
    :df = DataFrame with a 'url' column, as returned by scrape_eberts_listing
    """
    scraped_list = list()

    for link in df['url']:
        full_link = "http://www.rogerebert.com" + link
        webpage = requests.get(full_link).text
        soup = BeautifulSoup(webpage, 'lxml')

        try:
            mpaa = soup.find('p', {'class': 'mpaa-rating'}).strong.text[6:]  # drop the 'Rated ' prefix
        except AttributeError:
            mpaa = ''

        try:
            runtime = int(soup.find('p', {'class': 'running-time'}).strong.text[:3].strip())
        except (AttributeError, ValueError):
            runtime = ''

        try:
            review = ' '.join(paragraph.text for paragraph in
                              soup.find('div', {'itemprop': 'reviewBody'}).find_all('p'))
        except AttributeError:
            review = ''

        scraped_list.append([link, mpaa, runtime, review])

        time.sleep(0.25)  # throttle requests

    df = pd.DataFrame(scraped_list, columns=['url', 'rating', 'runtime', 'review'])

    return df, scraped_list

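# Illustrative usage of the two scrapers above (hypothetical, not part of the
# pipeline -- main.py drives them with a user-supplied page count):
#
#   listing = scrape_eberts_listing(num_pages=2)
#   reviews, raw = scrape_movie_reviews(listing)
#   print(reviews[['url', 'runtime']].head())
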
def scrape_imdb_listing(df):
    """
    Searches IMDB for each title, parses the results and returns a DataFrame.
    :df = DataFrame with a 'title' column
    """
    movie_list = list()

    for movie in df['title']:
        # Search IMDB for the title (URL-encode it so spaces etc. are safe)
        base_url = 'http://www.imdb.com/find?q='
        url = base_url + quote_plus(movie) + '&s=all'
        webpage = requests.get(url).text
        soup = BeautifulSoup(webpage, 'lxml')

        try:
            results = soup('table', {'class': 'findList'})[0]
        except IndexError:
            continue  # no search results for this title

        # Follow the first search result to the movie's own page
        first_hit = results.find_all('tr')[0]
        link = first_hit.find('a', href=True)['href']

        url = 'http://www.imdb.com' + link
        webpage = requests.get(url).text
        soup = BeautifulSoup(webpage, 'lxml')

        try:
            rate = soup.find('span', itemprop='ratingValue').text
        except AttributeError:
            rate = ''

        try:
            count = soup.find('span', itemprop='ratingCount').text
        except AttributeError:
            count = ''

        try:
            des = soup.find('meta', {'name': 'description'})['content']
        except (AttributeError, TypeError):
            des = ''

        try:
            metascore = soup.find('div', class_='metacriticScore').text
        except AttributeError:
            metascore = ''

        try:
            reviews_count = soup.find('div', class_='titleReviewbarItemBorder')
            u_reviews = reviews_count.find_all('a')[0].text.split(' ')[0]
            c_reviews = reviews_count.find_all('a')[1].text.split(' ')[0]
        except (AttributeError, IndexError):
            u_reviews = ''
            c_reviews = ''

        try:
            director = soup.find('span', itemprop='name').text
        except AttributeError:
            director = ''

        # The last link in the subtext bar looks like '12 April 2012 (USA)':
        # the final token is the country, the rest is the release date.
        try:
            subtext_links = soup.find('div', class_='subtext').find_all('a', title=True)
            tokens = subtext_links[-1].text.split(' ')
            country = re.sub(r'[(){}<>]', '', tokens[-1])
            rel_date = ', '.join(tokens[:-1])
        except (AttributeError, IndexError):
            country = ''
            rel_date = ''

        movie_list.append([movie, rate, count, des, metascore, u_reviews, c_reviews,
                           director, country, rel_date])

        time.sleep(0.25)  # throttle requests

    df = pd.DataFrame(movie_list, columns=['title', 'imdb_rating', 'rating_count',
                                           'description', 'metascore', 'user_review_count',
                                           'critic_review_count', 'director', 'country',
                                           'release_date'])

    return df, movie_list
--------------------------------------------------------------------------------
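A possible refinement to ingestion.py, sketched under the assumption that the sites tolerate pooled connections: route all page fetches through one requests.Session so TCP connections are reused and the scraper identifies itself. The fetch helper below is hypothetical and not part of the repo.

import requests

session = requests.Session()
session.headers.update({'User-Agent': 'movie-pipeline/0.1'})  # identify the scraper

def fetch(url):
    """Fetches a page through the shared session and returns its HTML text."""
    resp = session.get(url, timeout=10)   # fail fast instead of hanging
    resp.raise_for_status()               # surface HTTP errors early
    return resp.text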