# ├── IMDBProject.py ├── LICENSE ├── README.md ├── UI-1.png ├── UI-2.png ├── UI-3.png ├── UI-4.png └── imdbmovies.csv
# /IMDBProject.py:
# --------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""Command-line movie search and data-analysis program.

Movie details come from the OMDb API, budget and a user review are scraped
from the IMDb title page, everything is stored in a MySQL ``movies`` table and
a TF-IDF + LinearSVC model labels reviews as Positive/Negative.

Created on Thu Jan 24 13:50:03 2019
@author: shaz-
"""
#########################################################################################################################
# Importing Packages
#########################################################################################################################
import json
import os
import re
import unicodedata
import urllib.request
import warnings

import mysql.connector
import nltk
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import vaderSentiment
from bs4 import BeautifulSoup
from currency_converter import CurrencyConverter
from matplotlib import pyplot as plt
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from scipy import stats
from sklearn import metrics as sm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.svm import LinearSVC
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

warnings.filterwarnings('ignore')
sns.set(style="darkgrid", color_codes=True)

# Seconds to wait for IMDb / OMDb before giving up instead of hanging forever.
REQUEST_TIMEOUT = 15
OMDB_URL = "http://www.omdbapi.com/"
# NOTE(review): the API key and the DB credentials below were hard-coded in the
# original file; they are kept only as fallbacks so existing setups keep
# working. Prefer supplying them through the environment.
OMDB_API_KEY = os.environ.get('OMDB_API_KEY', '5ddb11dd')


#########################################################################################################################
# Defining Functions
#########################################################################################################################
class ImdbMovies:
    """Interactive session: movie lookup, review sentiment and plots.

    Instantiating the class connects to the database and trains the review
    sentiment model, so construction can take a while.
    """

    model = ''
    vectorizer = ''
    mydb = ''
    # Lazily created VADER analyzer, shared by every sentiment_lexicon() call.
    _sia = None

    # Currency symbols found on IMDb mapped to ISO codes.
    _CURRENCY_SYMBOLS = {'$': 'USD', '£': 'GBP', '¥': 'JPY', '€': 'EUR', '₹': 'INR'}
    # Fixed "USD per unit" rates for currencies handled without CurrencyConverter.
    # The original code multiplied NPR amounts by 87.0521, which is the number
    # of rupees per dollar (inverted); the rate is therefore 1 / 87.0521.
    _FIXED_USD_RATES = {
        'XAF': 0.00174637,
        'FRF': 0.17,
        'IRR': 0.0000237954,
        'PKR': 0.007225614,
        'NPR': 1 / 87.0521,
    }
    # Human readable label -> DataFrame column used by the plotting helpers.
    _COLUMN_BY_LABEL = {
        'Critic Rating': 'critic_rating',
        'Audience Rating': 'audience_rating',
        'Budget In Millions': 'budget_in_millions',
        'Genre': 'genre',
        'Year': 'year',
    }

    def __init__(self, **kwargs):
        """Connect to the database and train the sentiment model.

        Keyword Args:
            firstname: user's first name (used only in messages).
            lastname: user's last name (used only in messages).
        """
        self.firstname = kwargs.get('firstname', 'Firstname Not Provided')
        self.lastname = kwargs.get('lastname', 'LastName Not Provided')
        self.mydb = self.DatabaseConnection(
            os.environ.get('IMDB_DB_USER', 'root'),
            os.environ.get('IMDB_DB_PASSWORD', 'Sagar$256'),
            os.environ.get('IMDB_DB_NAME', 'imdbmovies'))
        print("\nPlease wait {}, while we're running the model.....".format(self.firstname))
        self.model, self.vectorizer = self.UserReview_SentimentAnalyzer()
        print('''Done!!, you're good to go''')
        print("#########################################################################################################################")
        print("Welcome! {} {} to our movie search and data analysis program:\n".format(self.firstname.capitalize(), self.lastname.capitalize()))
        print("#########################################################################################################################")

    def __str__(self):
        """Return a friendly greeting for ``print(instance)``."""
        return '''What's going on {} {}, enjoy your movie buddy'''.format(self.firstname.capitalize(), self.lastname.capitalize())

    def sentiment_lexicon(self, review, threshold=0.1):
        """Label *review* with the VADER lexicon.

        Returns:
            'Positive' when the compound score is >= *threshold*, otherwise
            'Negative'.
        """
        # Building the analyzer loads the lexicon from disk; do it once, not
        # once per review.
        if self._sia is None:
            self._sia = SIA()
        compound = self._sia.polarity_scores(review)['compound']
        return 'Positive' if compound >= threshold else 'Negative'

    def UserReview_SentimentAnalyzer(self):
        """Train the review classifier on the reviews stored in the database.

        Reviews are labelled with :meth:`sentiment_lexicon`, vectorised with
        TF-IDF (uni- and bi-grams) and fitted with a ``LinearSVC`` on an 80 %
        training split.

        Returns:
            tuple: ``(fitted LinearSVC, fitted TfidfVectorizer)``.
        """
        self.df = pd.read_sql("select imdbid,User_Review,Polarity from movies;", self.mydb)
        # NULL reviews would crash both VADER and the vectorizer.
        self.df['User_Review'] = self.df['User_Review'].fillna('')
        # New 'polarity' column holding the 'Positive'/'Negative' lexicon label.
        self.list_senti = [self.sentiment_lexicon(text) for text in self.df['User_Review']]
        self.df['polarity'] = self.list_senti
        self.features = self.df.loc[:, 'User_Review']
        self.senti = self.df.loc[:, 'polarity']
        self.vectorizer = TfidfVectorizer(min_df=3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
        # Kept sparse: LinearSVC consumes sparse matrices directly and a dense
        # copy of a bigram TF-IDF matrix is needlessly large.
        self.final_features = self.vectorizer.fit_transform(self.features)
        (self.features_train, self.features_test,
         self.senti_train, self.senti_test) = train_test_split(
            self.final_features, self.senti, test_size=0.2, random_state=23)
        self.lsvm = LinearSVC()
        self.l = self.lsvm.fit(self.features_train, self.senti_train)
        return self.l, self.vectorizer

    def Predict_Sentiment(self, model, vectorizer, User_Review):
        """Predict the polarity of one review.

        Args:
            model: fitted classifier exposing ``predict``.
            vectorizer: fitted vectorizer exposing ``transform``.
            User_Review: review text.

        Returns:
            The array returned by ``model.predict`` (one label).
        """
        # Use the arguments that were passed in; the original silently read
        # self.vectorizer / self.User_Review instead.
        self.y = vectorizer.transform([User_Review])
        self.z = model.predict(self.y)
        return self.z

    def displayMenu(self):
        """Show the main menu and read the user's choice.

        Returns:
            tuple: ``(choice, flag)`` where *flag* is 0 for a valid option,
            1 for empty input and 2 for an unknown option.
        """
        print("\nMenu:\n[S]earch Movie,[A]nalyze the data,[Q]uit:\n")
        print("---------------------------------------------------------------------------")
        options = {'s': 'search', 'a': 'analyse', 'q': 'quit'}
        Choice = input("Please enter your option below:\n").strip().lower()
        if Choice == '':
            return Choice, 1
        if Choice not in options:
            return Choice, 2
        return Choice, 0

    def _budget_in_millions(self, converter, raw):
        """Convert a scraped budget string such as ``'$25,000,000'`` to millions of USD.

        Returns 0 when the text cannot be parsed or the currency is unknown.
        """
        match = re.search(r'([\D]+)([\d,]+)', raw)
        if match is None:
            return 0
        currency = match.group(1).replace('\xa0', '').strip()
        try:
            amount = float(match.group(2).replace(',', ''))
        except ValueError:
            return 0
        code = self._CURRENCY_SYMBOLS.get(currency, currency)
        if code in self._FIXED_USD_RATES:
            usd = amount * self._FIXED_USD_RATES[code]
        elif len(code) == 3:
            try:
                usd = converter.convert(amount, code, 'USD')
            except ValueError:
                # Currency not supported by CurrencyConverter.
                return 0
        else:
            return 0
        return round(usd / 1000000, 2)

    def Extract_Budget_UserReview(self, imdbID):
        """Scrape the budget and the featured user review from the IMDb title page.

        Args:
            imdbID: IMDb title identifier.

        Returns:
            tuple: ``(budget in millions of USD or 0, review text or '')``.
        """
        converter = CurrencyConverter()
        url = 'http://www.imdb.com/title/{}/?ref_=fn_al_nm_1a'.format(imdbID)
        page = requests.get(url, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(page.text, 'html.parser')
        Budget = 0
        userReview = ""

        # Extracting the user review of the movie
        for block in soup.find_all('div', {'class': 'user-comments'}):
            try:
                userReview = block.span.strong.text
            except AttributeError:
                # Layout without <span><strong>…; keep whatever we had.
                continue

        # Extracting the budget of the movie
        for h4 in soup.find_all('h4'):
            if "Budget:" in h4:
                Budget = self._budget_in_millions(converter, str(h4.next_sibling))
        return Budget, userReview

    def getMovieData(self, Movietitle):
        """Fetch movie details from the OMDb API.

        Returns:
            dict: the decoded API answer; ``{'Response': 'False'}`` when the
            request or the decoding fails, which callers treat as "not found".
        """
        print("Retrieving the data of \"{}\" now…".format(Movietitle))
        try:
            # `params` URL-encodes the title (spaces, '&', …).
            api_request = requests.get(
                OMDB_URL,
                params={'t': Movietitle, 'apikey': OMDB_API_KEY},
                timeout=REQUEST_TIMEOUT)
            return json.loads(api_request.content)
        except (requests.RequestException, ValueError) as e:
            print(f"ERROR: {e}")
        return {'Response': 'False'}

    def DatabaseConnection(self, user, passwd, database):
        """Open a MySQL connection on localhost.

        Returns:
            The connection, or ``''`` (the class-level "unset" value) when the
            connection attempt fails.
        """
        try:
            return mysql.connector.connect(host='localhost',
                                           user=user,
                                           passwd=passwd,
                                           db=database)
        except mysql.connector.Error:
            print("The login credentials you entered are not valid for\n"
                  "the database you indicated. Please check your login details and try\n"
                  "again.")
            return ''

    def explode(self, df, lst_cols, fill_value=''):
        """Stack list-valued columns into long format (one row per list item).

        Args:
            df: input frame.
            lst_cols: name or list of names of the list-valued columns.
            fill_value: value used for rows whose list was empty.

        Returns:
            pandas.DataFrame with the same columns as *df*.
        """
        # make sure `lst_cols` is a list
        if lst_cols and not isinstance(lst_cols, list):
            lst_cols = [lst_cols]
        if df.empty:
            # np.concatenate() refuses an empty sequence.
            return df.copy()
        # all columns except `lst_cols`
        idx_cols = df.columns.difference(lst_cols)
        # calculate lengths of lists
        lens = df[lst_cols[0]].str.len()

        exploded = pd.DataFrame({
            col: np.repeat(df[col].values, lens)
            for col in idx_cols
        }).assign(**{col: np.concatenate(df[col].values) for col in lst_cols})
        if not (lens > 0).all():
            # Keep rows whose list was empty (DataFrame.append was removed in
            # pandas 2.0, hence pd.concat).
            exploded = pd.concat(
                [exploded, df.loc[lens == 0, idx_cols]], sort=False
            ).fillna(fill_value)
        return exploded.loc[:, df.columns]

    def DataIntoDatabase(self, MovieData, mydb, mycursor):
        """Insert (or refresh) one movie row built from OMDb data and scraped data.

        Args:
            MovieData: dict returned by :meth:`getMovieData`.
            mydb: open database connection (committed on success).
            mycursor: cursor belonging to *mydb*.
        """
        self.budget, self.User_Review = self.Extract_Budget_UserReview(MovieData['imdbID'])
        self.pred = self.Predict_Sentiment(self.model, self.vectorizer, self.User_Review)
        self.polarity = ''.join(self.pred)

        raw_metascore = MovieData.get('Metascore', 'N/A')
        metascore = 0 if raw_metascore == 'N/A' else float(raw_metascore) / 10

        raw_rating = MovieData.get('imdbRating', 'N/A')
        imdb_rating = 0 if raw_rating == 'N/A' else float(raw_rating)

        # 'Released' looks like '14 Oct 1994'; the year is the third token.
        try:
            release_year = int(MovieData.get('Released', 'N/A').split(' ')[2])
        except (IndexError, ValueError):
            release_year = 0

        poster = MovieData.get('Poster', 'N/A')
        image_url = 'Image Not Available' if poster == "N/A" else poster

        sql = """INSERT INTO movies(IMDBID, Title, Genre, Year, URL, Audience_Rating, Critic_Rating, Budget_In_Millions, User_Review,Polarity)
                 VALUES (%s, %s,%s, %s,%s,%s,%s,%s,%s,%s)
                 ON DUPLICATE KEY UPDATE
                 Audience_Rating=values(Audience_Rating),
                 Critic_Rating=values(Critic_Rating),
                 Budget_In_Millions=values(Budget_In_Millions),
                 User_Review=values(User_Review),
                 Polarity=values(Polarity);"""

        row = (MovieData['imdbID'], MovieData['Title'],
               MovieData['Genre'], release_year, image_url,
               imdb_rating, metascore, self.budget, self.User_Review, self.polarity)
        mycursor.execute(sql, row)
        mydb.commit()

    def getDataFromDatabase(self, UserInputTitle):
        """Return the first stored movie whose title contains *UserInputTitle*.

        Returns:
            list: zero or one row
            ``(title, genre, year, audience_rating, critic_rating, polarity)``.
        """
        cursor = self.mydb.cursor()
        try:
            cursor.execute("""select title,genre,year,audience_rating,critic_rating,polarity
                              from movies where title like %s limit 1""",
                           ("%" + UserInputTitle + "%",))
            self.myresult = cursor.fetchall()
        finally:
            cursor.close()
        return self.myresult

    def _print_movie_summary(self, row, suggestion):
        """Print the summary card for one database *row*."""
        print('*********************************************************')
        print("Title: {}".format(row[0]))
        print("Genre: {}".format(row[1]))
        print("Year: {}".format(row[2]))
        print("Audience Rating: {}".format(row[3]))
        print("Critic Rating: {}".format(row[4]))
        print("What's My Sugesstion: {}".format(suggestion))
        print('*********************************************************')

    def DisplayMovieInfo(self, UserInputTitle):
        """Show details for *UserInputTitle*, fetching and storing the movie if needed.

        The database is searched first; on a miss the movie is looked up via
        OMDb, inserted, and read back from the database.
        """
        try:
            myresult = self.getDataFromDatabase(UserInputTitle)
            if myresult:
                if myresult[0][5] == 'Positive':
                    res = "Your can enjoy this with your buddy!"
                else:
                    res = "Well you've decide so enjoy with your popcorn."
                self._print_movie_summary(myresult[0], res)
                return

            MovieData = self.getMovieData(UserInputTitle)
            if MovieData.get('Response') == 'False':
                print("Sorry!!! The Movie Doesn't Exist.....:(")
                return
            mycursor = self.mydb.cursor()
            try:
                self.DataIntoDatabase(MovieData, self.mydb, mycursor)
            finally:
                mycursor.close()
            # OMDb matches titles loosely, so the stored title may not contain
            # the user's text; fall back to the exact stored title.
            myresult = (self.getDataFromDatabase(UserInputTitle)
                        or self.getDataFromDatabase(MovieData['Title']))
            if myresult[0][5] == 'Positive':
                res = "Good Choice! & you can enjoy this with your buddy. :)"
            else:
                res = "Well, you've decide so enjoy this with your popcorn. :)."
            self._print_movie_summary(myresult[0], res)
        except Exception:
            # UI boundary: scraping, the API and the database can all fail;
            # report and return to the menu instead of crashing the session.
            print('''Sorry its doesnt exist...please try once again''')

    def getDataByYear(self, FirstRange, SecondRange):
        """Load complete movie rows released between the two years (inclusive).

        Returns:
            pandas.DataFrame with title, genre, year, ratings and budget.
        """
        # Years are bound as query parameters rather than formatted into SQL.
        self.movieData = pd.read_sql(
            """select title,genre,year,audience_rating,critic_rating,budget_in_millions
               from movies
               where url!='N/A' and
               genre!='N/A' and
               year!=0 and
               Audience_rating!=0 and
               critic_rating!=0 and
               budget_in_millions!=0 and
               user_review <> '' and polarity <> '' and Year BETWEEN %s and %s;""",
            self.mydb, params=(FirstRange, SecondRange))
        return self.movieData

    def getChoiceMovie(self):
        """Ask for a movie title and display its information."""
        self.UserInputTitle = input("Please enter the title of the movie:\n")
        if self.UserInputTitle == '':
            print("No Input Provided.")
        else:
            self.DisplayMovieInfo(self.UserInputTitle)

    def DataAnalysis(self):
        """Ask for a year range and return the matching movies in long format.

        Keeps prompting until two valid 4-digit years (first <= second) are
        given. Genres are split into one row per genre and both ratings are
        scaled by 10.

        Returns:
            pandas.DataFrame ready for the plotting methods.
        """
        self.movieData = ''
        self.year = pd.read_sql('''select max(year) as Max_Year, min(year) as Min_Year
                                   from movies where url!='N/A' and
                                   genre!='N/A' and year!=0 and
                                   Audience_rating!=0 and
                                   critic_rating!=0 and
                                   budget_in_millions!=0 and
                                   user_review <> '' and
                                   Polarity <>'' ''', self.mydb)
        while True:
            print('***********************************************************************************************')
            print("Data is avialable from {} to {}.\nEnter the range so as to bring you the analysis".format(self.year.Min_Year[0], self.year.Max_Year[0]))
            print('***********************************************************************************************')
            self.FirstRange = input("Please Enter The Range 1:\n").strip()
            self.SecondRange = input("Please Enter The Range 2:\n").strip()
            if self.FirstRange == '' or self.SecondRange == '':
                print("No input was provided.\n")
                continue
            try:
                first_year = int(self.FirstRange)
                second_year = int(self.SecondRange)
            except ValueError:
                print("Please provide correct input...since the entered value is not a number.\n")
                continue
            if len(self.FirstRange) != 4 or len(self.SecondRange) != 4:
                print("Provided range is invalid...\n")
                continue
            # Compare as numbers; the original compared the raw strings.
            if first_year > second_year:
                print('\nProvided range is invalid...since the first range cannot be greater than the second.\n')
                continue
            break

        movies = self.getDataByYear(first_year, second_year)
        movies['genre'] = movies['genre'].str.split(',')
        movies = self.explode(movies, ['genre'])
        movies['genre'] = movies['genre'].str.strip(' ').astype('category')
        movies['audience_rating'] = movies['audience_rating'] * 10
        movies['critic_rating'] = movies['critic_rating'] * 10
        self.movieData = movies
        return self.movieData

    def OptionChoice(self):
        """Print the list of available analyses."""
        print("Please Choose option below:\n"
              "1. Display Top 10 Rated Movies\n"
              "2. Display Top 10 High Budget Movies\n"
              "3. Display Critic Vs Audience Rating\n"
              "4. Distribution of Critic or Audience Rating\n"
              "5. Display Stack Distribution of Budget\n"
              "6. Display Boxplot\n"
              "7. Display Barplot\n\n")

    def _ask_choice(self, prompt, valid):
        """Print *prompt* and read an integer until it is one of *valid*."""
        while True:
            print(prompt)
            try:
                picked = int(input(">"))
            except ValueError:
                print("Invalid input provided.")
                continue
            if picked in valid:
                return picked
            print("Sorry please enter your choice from the option below\n")

    def OptionChoiceDist(self):
        """Ask which rating distribution to draw and draw it."""
        self.optionDist = {1: 'Critic Rating Distribution', 2: 'Audience Rating Distribution'}
        self.Dist = self._ask_choice(
            """Please Enter:\n[1] Critic Rating Distribution\n[2] Audience Rating Distribution\n""",
            self.optionDist)
        column = 'critic rating' if self.Dist == 1 else 'audience rating'
        self.DisplayHistogram(self.movieData, column)

    def OptionChoiceBox(self):
        """Ask which rating boxplot (by genre) to draw and draw it."""
        self.optionbox = {1: 'Critic Rating Boxplot', 2: 'Audience Rating Boxplot'}
        self.box = self._ask_choice(
            """Please Enter:\n[1] Display boxplot for critic rating by genre\n[2] Display boxplot for audience rating by genre\n""",
            self.optionbox)
        column = 'critic rating' if self.box == 1 else 'audience rating'
        self.DisplayBoxplot(self.movieData, 'genre', column)

    def OptionChoiceBar(self):
        """Ask whether to count movies by genre or by year and draw the barplot."""
        self.optionbar = {1: 'Genre bar plot', 2: 'Year bar plot'}
        self.bar = self._ask_choice(
            """Please Enter:\n[1] Display barplot to display Data by Genre\n[2] Display barplot to display Data by year:\n""",
            self.optionbar)
        self.catPlot(self.movieData, 'genre' if self.bar == 1 else 'year')

    def DisplayCricticAudienceRating(self, movieData):
        """Joint plot of critic rating against audience rating with Pearson r."""
        sns.set(style='whitegrid')
        grid = sns.JointGrid(data=movieData, x='critic_rating', y='audience_rating')
        grid = grid.plot_joint(plt.scatter, color="g", s=40, edgecolor="black")
        # distplot is deprecated in newer seaborn; use histplot when available.
        marginal = getattr(sns, 'histplot', None) or sns.distplot
        grid = grid.plot_marginals(marginal, kde=False)
        # JointGrid.annotate() no longer exists in current seaborn, so the
        # correlation is written onto the joint axes directly.
        if len(movieData) >= 2:
            r_value, p_value = stats.pearsonr(movieData['critic_rating'],
                                              movieData['audience_rating'])
            grid.ax_joint.annotate(
                "pearsonr = {:.2g}; p = {:.2g}".format(r_value, p_value),
                xy=(0.05, 0.95), xycoords='axes fraction', ha='left', va='top')
        grid.set_axis_labels('Critic Ratings', 'Audience Rating')
        plt.show()

    def DisplayHistogram(self, movieData, column):
        """Histogram of *column* ('critic rating', 'audience rating', 'budget in millions')."""
        column = column.title()
        sns.set(style='whitegrid')
        fig, ax = plt.subplots()
        fig.set_size_inches(11.7, 8.27)
        plt.hist(movieData[self._COLUMN_BY_LABEL[column]], bins=15, color='black')
        plt.title("{} Distribution".format(column), fontsize=20)
        plt.ylabel("Frequency", fontsize=15)
        plt.xlabel("{} (%)".format(column), fontsize=15)
        plt.show()

    def DisplayStackedHistogram(self, movie):
        """Stacked histogram of budgets, one layer per genre category."""
        GenreLabels = list(movie.genre.cat.categories)
        budgets = [movie[movie.genre == gen].budget_in_millions for gen in GenreLabels]
        sns.set(style='whitegrid')
        fig, ax = plt.subplots()
        fig.set_size_inches(11.7, 8.27)
        plt.hist(budgets, bins=30, stacked=True, rwidth=1, label=GenreLabels)
        plt.title("Movie Budget Distribution", fontsize=20)
        plt.ylabel("Number of Movies", fontsize=15)
        plt.xlabel("Budget$$$", fontsize=15)
        plt.legend(frameon=True, fancybox=True, prop={'size': 10}, framealpha=1)
        plt.show()

    def DisplayBoxplot(self, data, column1, column2):
        """Boxplot of *column2* grouped by *column1* (labels as in the menus)."""
        column1 = column1.title()
        column2 = column2.title()
        fig, ax = plt.subplots()
        fig.set_size_inches(11.7, 8.27)
        # whis=[0, 100] spans the full data range; the string "range" is no
        # longer accepted by current matplotlib.
        sns.boxplot(data=data, x=self._COLUMN_BY_LABEL[column1],
                    y=self._COLUMN_BY_LABEL[column2], palette='vlag', whis=[0, 100])
        ax.yaxis.grid(True)
        ax.xaxis.grid(True)
        plt.title('{} Vs {} Boxplot'.format(column1, column2), fontsize=20)
        plt.xlabel('{}'.format(column1), fontsize=15)
        plt.ylabel('{}'.format(column2), fontsize=15)
        plt.xticks(rotation=30)
        sns.despine(trim=True, left=True)
        plt.show()

    def catPlot(self, data, column):
        """Barplot of the number of movies per genre or per year."""
        column = column.title()
        sns.catplot(y=self._COLUMN_BY_LABEL[column], kind="count", palette="ch:.25", data=data)
        plt.title('Barplot For {}'.format(column.capitalize()), fontsize=20)
        plt.ylabel('{}'.format(column.capitalize()), fontsize=15)
        plt.xlabel('')
        plt.show()

    def _top10_barplot(self, data, sort_cols, value_col):
        """Draw a horizontal bar chart of the 10 movies ranking highest on *sort_cols*."""
        top = (data.sort_values(sort_cols, ascending=False)
                   .drop(['genre'], axis=1)   # one row per movie, not per genre
                   .drop_duplicates()
                   .head(10))
        # catplot is the current name of the removed sns.factorplot.
        return sns.catplot(aspect=1.5, y='title', x=value_col, data=top,
                           palette="ch:.25", kind='bar')

    def getTop10(self, data):
        """Display the 10 best rated movies (audience rating, then critic rating)."""
        p = self._top10_barplot(data, ['audience_rating', 'critic_rating'], 'audience_rating')
        p.set(xlim=(10, 100))
        sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
        plt.title('Top 10 Rated Movies', fontsize=20)
        plt.ylabel('Title', fontsize=15)
        plt.xlabel('Audience Rating', fontsize=15)
        sns.despine(trim=True, left=True)
        plt.show()

    def getTop10HighBudgetMovie(self, data):
        """Display the 10 movies with the highest budget."""
        self._top10_barplot(data, ['budget_in_millions'], 'budget_in_millions')
        sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
        plt.title('Top 10 High Budget Movies', fontsize=20)
        plt.ylabel('Title', fontsize=15)
        plt.xlabel('Budget In Millions', fontsize=15)
        sns.despine(trim=True, left=True)
        plt.show()

    def _run_analysis(self):
        """Load a year range, show the analysis menu and run the chosen plot."""
        # Only options that actually have a handler are offered.
        self.optionAnalyze = {1: 'Display Top 10 Rated Movies', 2: 'Display Top 10 High Budget Movies',
                              3: 'Display Critic Vs Audience', 4: 'Distribution of Critic Vs Audience',
                              5: 'Display Stack Distribution of Budget', 6: 'Display Boxplot',
                              7: 'Display Barplot'}
        self.movieData = self.DataAnalysis()
        if len(self.movieData) == 0:
            print("No movies were found for the selected range.")
            return
        self.OptionChoice()
        try:
            choice = int(input("Please enter the option below:\n"))
        except ValueError:
            print("Sorry! please enter a number.")
            return
        if choice not in self.optionAnalyze:
            print("Sorry please enter your choice from the option below ")
            return
        handlers = {
            1: lambda: self.getTop10(self.movieData),
            2: lambda: self.getTop10HighBudgetMovie(self.movieData),
            3: lambda: self.DisplayCricticAudienceRating(self.movieData),
            4: self.OptionChoiceDist,
            5: lambda: self.DisplayStackedHistogram(self.movieData),
            6: self.OptionChoiceBox,
            7: self.OptionChoiceBar,
        }
        handlers[choice]()

    def DisplayTheDetails(self):
        """Main interactive loop: menu -> search / analyse -> repeat until quit."""
        self.options = {'s': 'search', 'a': 'analyse', 'q': 'quit'}
        while True:
            Choice, flag = self.displayMenu()
            if flag != 0:
                # Empty or unknown option: show the menu again.
                print("Please select the option from the menu:\n")
                continue
            if Choice == 'q':
                break
            if Choice == 's':
                self.getChoiceMovie()
            elif Choice == 'a':
                self._run_analysis()
            check_again = input("Do you want to check again? Y/n:\n")
            if check_again.lower() == 'n':
                break
        print("\n***********************************************************************************************")
        print("Thanks for your participation, GoodBye!!!")


#########################################################################################################################
# The Main Part for Displaying Movie info to the user
#########################################################################################################################
def MainFunction():
    """Ask for the user's name and start an :class:`ImdbMovies` session.

    Re-prompts after blank input or after a failure while setting up or running
    the session; Ctrl-C / end-of-input leave the program.
    """
    while True:
        try:
            FirstName = input("Please Enter Your First Name:\n")
            LastName = input("Please Enter Your Last Name:\n")
            if FirstName == '' or LastName == '':
                print("Input cannot be blank...")
                continue
            User = ImdbMovies(firstname=FirstName, lastname=LastName)
            User.DisplayTheDetails()
            return
        except (KeyboardInterrupt, EOFError):
            # The original bare `except` swallowed Ctrl-C and restarted.
            print("\nThanks for your participation, GoodBye!!!")
            return
        except Exception as exc:
            print("\n***********************************************************************************************")
            print("Please provide a valid input")
            print("Details: {}".format(exc))


# NOTE(review): the program starts on import, exactly as in the original file.
# Consider guarding this call with `if __name__ == "__main__":`.
MainFunction()

# --------------------------------------------------------------------------------
# /LICENSE:
# --------------------------------------------------------------------------------
# MIT License
#
# Copyright (c) 2019 Shreyas Wankhede
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated
documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # IMDb-Web-Scraping-and-Sentiment-Analysis 2 | * The first objective of this project is to scrape data from the IMDb website and build an analysis that helps data analysts or production companies decide how to proceed with making a new movie; the second is to build a model that predicts the sentiment of movies based on user reviews. 3 | *** 4 | 5 | ### Description: 6 | * The Internet Movie Database (IMDb) is one of the world’s most popular sources for movie, TV and celebrity content with more than 100 million unique visitors per month. 7 | IMDb has a huge movie database that includes various details of movies along with different ratings and user reviews. 8 | These movie reviews affect everyone, from audiences and film critics to production companies. 
9 | 10 | ### Problem Statement: 11 | * The idea of our project is to scrape data from IMDb and build an analysis that helps data analysts or production companies decide how to proceed with making a new movie; the second goal is to build a model that predicts the sentiment of movies based on user reviews. 12 | 13 | ### Dataset: 14 | * 3500+ records with 10 columns. 15 | 16 | ### Dashboard: Using Django Framework 17 | ![alt text](https://github.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/blob/master/UI-1.png "Logo Title Text 1") 18 | 
19 |
This page shows the top 10 movies, top genres and number of movies liked and disliked. 20 | 21 | ![alt text](https://github.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/blob/master/UI-2.png "Logo Title Text 1") 22 |
23 |
This page displays sentiment analysis based on user reviews for each movie, along with the movie's other details. 24 | 25 | ![alt text](https://github.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/blob/master/UI-3.png "Logo Title Text 1") 26 | 
27 |
Visualization of Critics ratings for movies. 28 | 29 | ![alt text](https://github.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/blob/master/UI-4.png "Logo Title Text 1") 30 |
31 |
Distribution of Audience Ratings vs Critic Ratings. 32 | 33 | *** 34 | 35 |

Thank You! 36 |

37 | Follow @shreyaswankhede 38 | -------------------------------------------------------------------------------- /UI-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/c916807575e061e2fce36f28f93ef14e17ef7244/UI-1.png -------------------------------------------------------------------------------- /UI-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/c916807575e061e2fce36f28f93ef14e17ef7244/UI-2.png -------------------------------------------------------------------------------- /UI-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/c916807575e061e2fce36f28f93ef14e17ef7244/UI-3.png -------------------------------------------------------------------------------- /UI-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/c916807575e061e2fce36f28f93ef14e17ef7244/UI-4.png --------------------------------------------------------------------------------