├── IMDBProject.py
├── LICENSE
├── README.md
├── UI-1.png
├── UI-2.png
├── UI-3.png
├── UI-4.png
└── imdbmovies.csv
/IMDBProject.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """Created on Thu Jan 24 13:50:03 2019
3 | @author: shaz-
4 | """
5 | #########################################################################################################################
6 | # Importing Packages
7 | #########################################################################################################################
8 |
9 | '''
10 | Importing The Necessary Packages
11 | '''
12 | import json
13 | import re
14 | import requests
15 | import warnings
16 | import numpy as np
17 | import pandas as pd
18 | import mysql.connector
19 | import urllib.request
20 | from scipy import stats
21 | import seaborn as sns
22 | from bs4 import BeautifulSoup
23 | from currency_converter import CurrencyConverter
24 | from matplotlib import pyplot as plt
25 | import nltk
26 | import unicodedata
27 | import vaderSentiment
28 | from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
29 | from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
30 | from sklearn.preprocessing import scale
31 | from sklearn.model_selection import train_test_split
32 | from sklearn import metrics as sm
33 | from sklearn.svm import LinearSVC
34 | from sklearn.feature_extraction.text import TfidfVectorizer
35 | warnings.filterwarnings('ignore')
36 | sns.set(style="darkgrid", color_codes=True)
37 |
38 | #########################################################################################################################
39 | # Defining Functions
40 | #########################################################################################################################
class ImdbMovies:
    """Interactive IMDb movie search and data-analysis session.

    Creating an instance opens a MySQL connection and trains a sentiment
    classifier on the reviews stored in the ``movies`` table.
    ``DisplayTheDetails`` then drives the text menu (search / analyse / quit).
    """

    # Class-level placeholders; __init__ replaces them on every instance.
    model = ''
    vectorizer = ''
    mydb = ''

    # Currency symbols recognised in a scraped budget and their ISO codes.
    _CURRENCY_SYMBOLS = {'$': 'USD', '£': 'GBP', '¥': 'JPY', '€': 'EUR', '₹': 'INR'}

    # Hand-maintained "local currency -> USD" factors for codes that are
    # converted without CurrencyConverter.
    # NOTE(review): the NPR factor is several orders of magnitude larger than
    # the others and looks inverted -- confirm the intended rate.
    _FIXED_USD_RATES = {
        'XAF': 0.00174637,
        'FRF': 0.17,
        'IRR': 0.0000237954,
        'PKR': 0.007225614,
        'NPR': 87.0521,
    }

    def __init__(self, **kwargs):
        """Connect to the database, train the sentiment model and greet the user.

        Keyword Args:
            firstname: user's first name (used in greetings).
            lastname: user's last name (used in greetings).
            db_user, db_password, db_name: optional database credentials.
                They default to the values that used to be hard-coded here.
        """
        self.firstname = kwargs.get('firstname', 'Firstname Not Provided')
        self.lastname = kwargs.get('lastname', 'LastName Not Provided')
        # NOTE(review): the default password is a secret committed to source
        # control; pass db_password explicitly and rotate this credential.
        self.mydb = self.DatabaseConnection(
            kwargs.get('db_user', 'root'),
            kwargs.get('db_password', 'Sagar$256'),
            kwargs.get('db_name', 'imdbmovies'),
        )
        print("\nPlease wait {}, while we're running the model.....".format(self.firstname))
        self.model, self.vectorizer = self.UserReview_SentimentAnalyzer()
        print('''Done!!, you're good to go''')
        print("#########################################################################################################################")
        print("Welcome! {} {} to our movie search and data analysis program:\n".format(
            self.firstname.capitalize(), self.lastname.capitalize()))
        print("#########################################################################################################################")

    def __str__(self):
        """Return a friendly greeting that includes the user's name."""
        return '''What's going on {} {}, enjoy your movie buddy'''.format(
            self.firstname.capitalize(), self.lastname.capitalize())

    def sentiment_lexicon(self, review, threshold=0.1):
        """Label *review* 'Positive' or 'Negative' from its VADER compound score.

        The review is 'Positive' when the compound score is >= *threshold*.
        """
        # Build the analyzer once per instance instead of once per review.
        if getattr(self, '_sia', None) is None:
            self._sia = SIA()
        agg_score = self._sia.polarity_scores(review)['compound']
        return 'Positive' if agg_score >= threshold else 'Negative'

    def UserReview_SentimentAnalyzer(self):
        """Train a LinearSVC on the stored user reviews.

        Reviews are read from the ``movies`` table, labelled with
        ``sentiment_lexicon``, vectorised with TF-IDF and split 80/20; the
        classifier is fitted on the training part.

        Returns:
            (fitted LinearSVC, fitted TfidfVectorizer)
        """
        self.df = pd.read_sql("select imdbid,User_Review,Polarity from movies;", self.mydb)
        # Lexicon-based labels ('Positive' / 'Negative') used as training targets.
        self.list_senti = [self.sentiment_lexicon(review) for review in self.df['User_Review']]
        self.df['polarity'] = self.list_senti
        self.features = self.df.loc[:, 'User_Review']
        self.senti = self.df.loc[:, 'polarity']
        self.vectorizer = TfidfVectorizer(min_df=3, stop_words="english", sublinear_tf=True,
                                          norm='l2', ngram_range=(1, 2))
        self.final_features = self.vectorizer.fit_transform(self.features).toarray()
        (self.features_train, self.features_test,
         self.senti_train, self.senti_test) = train_test_split(
            self.final_features, self.senti, test_size=0.2, random_state=23)
        self.lsvm = LinearSVC()
        self.l = self.lsvm.fit(self.features_train, self.senti_train)
        return self.l, self.vectorizer

    def Predict_Sentiment(self, model, vectorizer, User_Review):
        """Predict the polarity of one review with the given model/vectorizer.

        Returns the array produced by ``model.predict`` for the single review.
        """
        # Use the arguments that were passed in, not whatever happens to be
        # stored on the instance.
        self.y = vectorizer.transform([User_Review]).toarray()
        self.z = model.predict(self.y)
        return self.z

    def displayMenu(self):
        """Show the main menu and read the user's choice.

        Returns:
            (choice, flag) where choice is the stripped, lower-cased input and
            flag is 0 for a valid option, 1 for empty input, 2 for an
            unknown option.
        """
        print("\nMenu:\n[S]earch Movie,[A]nalyze the data,[Q]uit:\n")
        print("---------------------------------------------------------------------------")
        options = {'s': 'search', 'a': 'analyse', 'q': 'quit'}
        Choice = input("Please enter your option below:\n").strip().lower()
        if Choice == '':
            return Choice, 1
        if Choice not in options:
            return Choice, 2
        return Choice, 0

    def _budget_to_usd_millions(self, raw_budget):
        """Convert scraped budget text such as '$237,000,000' to USD millions.

        Returns 0 when the text cannot be parsed or the currency is not
        recognised, so callers always receive a number.
        """
        match = re.search(r'([\D]+)([\d,]+)', str(raw_budget))
        if match is None:
            return 0
        currency = match.group(1).replace('\xa0', '').strip()
        amount = float(match.group(2).replace(',', ''))

        if len(currency) == 1:
            code = self._CURRENCY_SYMBOLS.get(currency)
        elif len(currency) == 3:
            code = currency
        else:
            code = None
        if code is None:
            return 0

        if code in self._FIXED_USD_RATES:
            return round((amount * self._FIXED_USD_RATES[code]) / 1000000, 2)
        try:
            return round(CurrencyConverter().convert(amount, code, 'USD') / 1000000, 2)
        except ValueError:
            # presumably raised for currencies the converter does not know
            return 0

    def Extract_Budget_UserReview(self, imdbID):
        """Scrape the budget and the featured user review from an IMDb title page.

        Returns:
            (budget in millions of USD, review text). Budget is 0 and the
            review is '' when the page does not contain the expected markup.
        """
        url = 'http://www.imdb.com/title/{}/?ref_=fn_al_nm_1a'.format(imdbID)
        data = requests.get(url, timeout=30)
        soup = BeautifulSoup(data.text, 'html.parser')
        Budget = 0
        userReview = ""

        # Featured review: <div class="user-comments"><span><strong>...</strong>
        for res in soup.find_all('div', {'class': 'user-comments'}):
            try:
                userReview = res.span.strong.text
            except AttributeError:
                # the div exists but does not have the expected children
                continue
        if userReview is None:
            userReview = 'N/A'

        # Budget: the text node that follows an <h4> whose content is "Budget:"
        for h4 in soup.find_all('h4'):
            if "Budget:" in h4:
                Budget = self._budget_to_usd_millions(h4.next_sibling)
        return Budget, userReview

    def getMovieData(self, Movietitle):
        """Fetch movie details for *Movietitle* from the OMDb API.

        Returns the decoded JSON dict. On a network or decoding failure a
        dict with ``Response == 'False'`` is returned, so callers can treat
        it like a "movie not found" answer.
        """
        print("Retrieving the data of \"{}\" now…".format(Movietitle))
        try:
            # params= takes care of URL-encoding titles with spaces, '&', etc.
            api_request = requests.get("http://www.omdbapi.com/",
                                       params={'t': Movietitle, 'apikey': '5ddb11dd'},
                                       timeout=30)
            return json.loads(api_request.content)
        except (requests.RequestException, ValueError) as e:
            print(f"ERROR: {e}")
            return {'Response': 'False', 'Error': str(e)}

    def DatabaseConnection(self, user, passwd, database):
        """Open a MySQL connection on localhost.

        Returns the connection, or '' (after printing a message) when the
        connector reports an error.
        """
        mydb = ''
        try:
            mydb = mysql.connector.connect(host='localhost',
                                           user=user,
                                           passwd=passwd,
                                           db=database)
        except mysql.connector.Error as err:
            print("The login credentials you entered are not valid for\n"
                  "the database you indicated. Please check your login details and try\n"
                  "again.")
            print("Details: {}".format(err))
        return mydb

    def explode(self, df, lst_cols, fill_value=''):
        """Expand list-valued columns so every list element gets its own row.

        Args:
            df: source DataFrame.
            lst_cols: name (or list of names) of the columns holding lists.
            fill_value: value written into the list columns for rows whose
                list was empty.

        Returns:
            A new DataFrame with the same column order as *df*.
        """
        if lst_cols and not isinstance(lst_cols, list):
            lst_cols = [lst_cols]
        idx_cols = df.columns.difference(lst_cols)
        lens = df[lst_cols[0]].str.len()

        # Repeat the scalar columns once per list element ...
        exploded = pd.DataFrame({
            col: np.repeat(df[col].to_numpy(), lens.to_numpy())
            for col in idx_cols
        })
        # ... and flatten the list columns next to them.
        exploded = exploded.assign(**{
            col: [item for cell in df[col] for item in cell]
            for col in lst_cols
        })
        if not (lens > 0).all():
            # Rows with an empty list would otherwise vanish; keep them and
            # fill the missing list value. (DataFrame.append no longer exists
            # in pandas 2.x, hence concat.)
            exploded = pd.concat([exploded, df.loc[lens == 0, idx_cols]],
                                 sort=False).fillna(fill_value)
        return exploded.loc[:, df.columns]

    def DataIntoDatabase(self, MovieData, mydb, mycursor):
        """Insert (or refresh) one movie row built from API and scraped data.

        Args:
            MovieData: dict returned by the OMDb API.
            mydb: open database connection (committed after the insert).
            mycursor: cursor used to run the statement.
        """
        self.budget, self.User_Review = self.Extract_Budget_UserReview(MovieData['imdbID'])
        self.pred = self.Predict_Sentiment(self.model, self.vectorizer, self.User_Review)
        self.polarity = ''.join(self.pred)

        # The API reports missing values as the string 'N/A'.
        metascore = 0 if MovieData['Metascore'] == 'N/A' else float(MovieData['Metascore']) / 10
        imdb_rating = 0 if MovieData['imdbRating'] == 'N/A' else float(MovieData['imdbRating'])
        # 'Released' is split on spaces and the third token is taken as the year.
        release_year = 0 if MovieData['Released'] == 'N/A' else int(MovieData['Released'].split(' ')[2])
        image_url = 'Image Not Available' if MovieData['Poster'] == "N/A" else MovieData['Poster']

        sql = """INSERT INTO movies(IMDBID, Title, Genre, Year, URL, Audience_Rating, Critic_Rating, Budget_In_Millions, User_Review,Polarity)
        VALUES (%s, %s,%s, %s,%s,%s,%s,%s,%s,%s)
        ON DUPLICATE KEY UPDATE
        Audience_Rating=values(Audience_Rating),
        Critic_Rating=values(Critic_Rating),
        Budget_In_Millions=values(Budget_In_Millions),
        User_Review=values(User_Review),
        Polarity=values(Polarity);"""

        row = (MovieData['imdbID'], MovieData['Title'],
               MovieData['Genre'], release_year, image_url,
               imdb_rating, metascore, self.budget, self.User_Review, self.polarity)
        mycursor.execute(sql, row)
        mydb.commit()

    def getDataFromDatabase(self, UserInputTitle):
        """Return at most one movie row whose title contains *UserInputTitle*.

        Each row is (title, genre, year, audience_rating, critic_rating,
        polarity); the result is a list as returned by ``fetchall``.
        """
        self.mycursor = self.mydb.cursor()
        self.mycursor.execute("""select title,genre,year,audience_rating,critic_rating,polarity
        from movies where title like %s limit 1""", ("%" + UserInputTitle + "%",))
        self.myresult = self.mycursor.fetchall()
        return self.myresult

    def _print_movie_summary(self, row, suggestion):
        """Print one movie row (as returned by getDataFromDatabase) with a suggestion."""
        print('*********************************************************')
        print("Title: {}".format(row[0]))
        print("Genre: {}".format(row[1]))
        print("Year: {}".format(row[2]))
        print("Audience Rating: {}".format(row[3]))
        print("Critic Rating: {}".format(row[4]))
        print("What's My Sugesstion: {}".format(suggestion))
        print('*********************************************************')

    def DisplayMovieInfo(self, UserInputTitle):
        """Show details for a title, fetching and storing it first if unknown.

        When the title is not in the database the OMDb API is queried, the
        movie is stored and then read back for display.
        """
        mydb = self.mydb
        mycursor = mydb.cursor()
        try:
            myresult = self.getDataFromDatabase(UserInputTitle)
            newly_added = False
            if not myresult:
                MovieData = self.getMovieData(UserInputTitle)
                if MovieData.get('Response') == 'False':
                    print("Sorry!!! The Movie Doesn't Exist.....:(")
                    return
                self.DataIntoDatabase(MovieData, mydb, mycursor)
                # Look the row up by the title the API returned: the text the
                # user typed may not be a substring of the stored title.
                myresult = self.getDataFromDatabase(MovieData['Title'])
                newly_added = True

            positive = myresult[0][5] == 'Positive'
            if newly_added:
                res = ("Good Choice! & you can enjoy this with your buddy. :)" if positive
                       else "Well, you've decide so enjoy this with your popcorn. :).")
            else:
                res = ("Your can enjoy this with your buddy!" if positive
                       else "Well you've decide so enjoy with your popcorn.")
            self._print_movie_summary(myresult[0], res)
        except Exception as err:
            # Best-effort UI: report the problem and return to the menu.
            print('''Sorry its doesnt exist...please try once again''')
            print("Details: {}".format(err))

    def getDataByYear(self, FirstRange, SecondRange):
        """Return complete movie rows released between the two years (inclusive).

        Rows with placeholder values ('N/A', 0 or '') in the filtered columns
        are excluded by the query.
        """
        query = """select title,genre,year,audience_rating,critic_rating,budget_in_millions
        from movies
        where url!='N/A' and
        genre!='N/A' and
        year!=0 and
        Audience_rating!=0 and
        critic_rating!=0 and
        budget_in_millions!=0 and
        user_review <> '' and polarity <> '' and Year BETWEEN %s and %s;"""
        # Bind the years as parameters instead of formatting them into the SQL.
        self.movieData = pd.read_sql(query, self.mydb,
                                     params=(int(FirstRange), int(SecondRange)))
        return self.movieData

    def getChoiceMovie(self):
        """Ask for a movie title and display its details."""
        self.UserInputTitle = input("Please enter the title of the movie:\n").strip()
        if self.UserInputTitle == '':
            print("No Input Provided.")
        else:
            self.DisplayMovieInfo(self.UserInputTitle)

    def DataAnalysis(self):
        """Ask for a year range and return the matching data in long format.

        The prompt repeats until two four-character years with first <= second
        are entered. The result has one row per (movie, genre), genre as a
        categorical column and both ratings multiplied by 10.
        """
        self.year = pd.read_sql('''select max(year) as Max_Year, min(year) as Min_Year
        from movies where url!='N/A' and
        genre!='N/A' and year!=0 and
        Audience_rating!=0 and
        critic_rating!=0 and
        budget_in_millions!=0 and
        user_review <> '' and
        Polarity <>'' ''', self.mydb)

        while True:
            self.movieData = ''
            print('***********************************************************************************************')
            print("Data is avialable from {} to {}.\nEnter the range so as to bring you the analysis".format(
                self.year.Min_Year[0], self.year.Max_Year[0]))
            print('***********************************************************************************************')
            self.FirstRange = input("Please Enter The Range 1:\n")
            self.SecondRange = input("Please Enter The Range 2:\n")

            if self.FirstRange == '' or self.SecondRange == '':
                print("No input was provided.\n")
                continue
            if len(self.FirstRange) != 4 or len(self.SecondRange) != 4:
                print("Provided range is invalid...\n")
                continue
            try:
                first_year = int(self.FirstRange)
                second_year = int(self.SecondRange)
            except ValueError:
                print("Please provide correct input...since the entered value is not a number.\n")
                continue
            # Compare as numbers, not as strings.
            if first_year > second_year:
                print('\nProvided range is invalid...since the first range cannot be greater than the second.\n')
                continue

            self.movieData = self.getDataByYear(first_year, second_year)
            # "Action, Drama" -> one row per genre
            self.movieData.genre = self.movieData.genre.str.split(',')
            self.movieData = self.explode(self.movieData, ['genre'])
            self.movieData.genre = self.movieData.genre.str.strip(' ')
            self.movieData.genre = self.movieData.genre.astype('category')
            self.movieData.audience_rating = self.movieData.audience_rating * 10
            self.movieData.critic_rating = self.movieData.critic_rating * 10
            return self.movieData

    def OptionChoice(self):
        """Print the list of available analyses."""
        print("Please Choose option below:\n"
              "1. Display Top 10 Rated Movies\n"
              "2. Display Top 10 High Budget Movies\n"
              "3. Display Critic Vs Audience Rating\n"
              "4. Distribution of Critic or Audience Rating\n"
              "5. Display Stack Distribution of Budget\n"
              "6. Display Boxplot\n"
              "7. Display Barplot\n\n")

    def _choose_and_run(self, prompt, actions):
        """Print *prompt*, read a number after '>' and run the matching action.

        Re-prompts until the number is a key of *actions*; returns that key.
        """
        while True:
            print(prompt)
            try:
                selection = int(input(">"))
            except ValueError:
                print("Invalid input provided.")
                continue
            action = actions.get(selection)
            if action is None:
                print("Sorry please enter your choice from the option below\n")
                continue
            action()
            return selection

    def OptionChoiceDist(self):
        """Ask which rating distribution to draw and draw it."""
        self.optionDist = {1: 'Critic Rating Distribution', 2: 'Audience Rating Distribution'}
        self.Dist = self._choose_and_run(
            """Please Enter:\n[1] Critic Rating Distribution\n[2] Audience Rating Distribution\n""",
            {1: lambda: self.DisplayHistogram(self.movieData, 'critic rating'),
             2: lambda: self.DisplayHistogram(self.movieData, 'audience rating')})

    def OptionChoiceBox(self):
        """Ask which rating to box-plot by genre and draw it."""
        self.optionbox = {1: 'Critic Rating Boxplot', 2: 'Audience Rating Boxplot'}
        self.box = self._choose_and_run(
            """Please Enter:\n[1] Display boxplot for critic rating by genre\n[2] Display boxplot for audience rating by genre\n""",
            {1: lambda: self.DisplayBoxplot(self.movieData, 'genre', 'critic rating'),
             2: lambda: self.DisplayBoxplot(self.movieData, 'genre', 'audience rating')})

    def OptionChoiceBar(self):
        """Ask whether to count movies by genre or by year and draw the bar plot."""
        self.optionbar = {1: 'Genre bar plot', 2: 'Year bar plot'}
        self.bar = self._choose_and_run(
            """Please Enter:\n[1] Display barplot to display Data by Genre\n[2] Display barplot to display Data by year:\n""",
            {1: lambda: self.catPlot(self.movieData, 'genre'),
             2: lambda: self.catPlot(self.movieData, 'year')})

    def DisplayCricticAudienceRating(self, movieData):
        """Draw critic vs audience rating with marginal histograms and Pearson r."""
        sns.set(style='whitegrid')
        critic = movieData['critic_rating'].astype(float)
        audience = movieData['audience_rating'].astype(float)
        j = sns.JointGrid(x=critic, y=audience)
        j.plot_joint(plt.scatter, color="g", s=40, edgecolor="black")
        # Marginals are drawn with matplotlib directly so the plot does not
        # depend on seaborn's deprecated distplot.
        j.ax_marg_x.hist(critic)
        j.ax_marg_y.hist(audience, orientation="horizontal")
        # JointGrid.annotate is gone from current seaborn; write the
        # correlation onto the joint axes ourselves.
        r_value, p_value = stats.pearsonr(critic, audience)
        j.ax_joint.annotate("pearsonr = {:.2f}; p = {:.2g}".format(r_value, p_value),
                            xy=(0.05, 0.95), xycoords='axes fraction',
                            ha='left', va='top')
        j.set_axis_labels('Critic Ratings', 'Audience Rating')
        plt.show()

    def DisplayHistogram(self, movieData, column):
        """Draw a 15-bin histogram of a rating or budget column.

        *column* is a human-readable name such as 'critic rating'.
        """
        column = column.title()
        LabelDictCol = {'Critic Rating': 'critic_rating',
                        'Audience Rating': 'audience_rating',
                        'Budget In Millions': 'budget_in_millions'}
        sns.set(style='whitegrid')
        fig, ax = plt.subplots()
        fig.set_size_inches(11.7, 8.27)
        plt.hist(movieData[LabelDictCol[column]], bins=15, color='black')
        plt.title("{} Distribution".format(column), fontsize=20)
        plt.ylabel("Frequency", fontsize=15)
        plt.xlabel("{} (%)".format(column), fontsize=15)
        plt.show()

    def DisplayStackedHistogram(self, movie):
        """Draw the budget distribution as a histogram stacked by genre."""
        GenreLabels = list(movie.genre.cat.categories)
        budgets_by_genre = [movie[movie.genre == gen].budget_in_millions for gen in GenreLabels]
        sns.set(style='whitegrid')
        fig, ax = plt.subplots()
        fig.set_size_inches(11.7, 8.27)
        plt.hist(budgets_by_genre, bins=30, stacked=True, rwidth=1, label=GenreLabels)
        plt.title("Movie Budget Distribution", fontsize=20)
        plt.ylabel("Number of Movies", fontsize=15)
        plt.xlabel("Budget$$$", fontsize=15)
        plt.legend(frameon=True, fancybox=True, prop={'size': 10}, framealpha=1)
        plt.show()

    def DisplayBoxplot(self, data, column1, column2):
        """Draw a box plot of *column2* grouped by *column1*.

        Both arguments are human-readable names such as 'genre' or
        'critic rating'.
        """
        column1 = column1.title()
        column2 = column2.title()
        LabelDictCol = {'Critic Rating': 'critic_rating',
                        'Audience Rating': 'audience_rating',
                        'Budget In Millions': 'budget_in_millions',
                        'Genre': 'genre', 'Year': 'year'}
        fig, ax = plt.subplots()
        fig.set_size_inches(11.7, 8.27)
        # whis=(0, 100) extends the whiskers to min/max, which is what the
        # no-longer-accepted whis="range" used to mean.
        sns.boxplot(data=data, x=LabelDictCol[column1], y=LabelDictCol[column2],
                    palette='vlag', whis=(0, 100))
        ax.yaxis.grid(True)
        ax.xaxis.grid(True)
        plt.title('{} Vs {} Boxplot'.format(column1, column2), fontsize=20)
        plt.xlabel('{}'.format(column1), fontsize=15)
        plt.ylabel('{}'.format(column2), fontsize=15)
        plt.xticks(rotation=30)
        sns.despine(trim=True, left=True)
        plt.show()

    def catPlot(self, data, column):
        """Draw a count bar plot of movies per genre or per year."""
        column = column.title()
        LabelDictCol = {'Critic Rating': 'critic_rating',
                        'Audience Rating': 'audience_rating',
                        'Budget In Millions': 'budget_in_millions',
                        'Genre': 'genre', 'Year': 'year'}
        sns.catplot(y=LabelDictCol[column], kind="count", palette="ch:.25", data=data)
        plt.title('Barplot For {}'.format(column.capitalize()), fontsize=20)
        plt.ylabel('{}'.format(column.capitalize()), fontsize=15)
        plt.xlabel('')
        plt.show()

    def getTop10(self, data):
        """Draw the ten movies with the highest audience (then critic) rating."""
        top_rated = (data.sort_values(['audience_rating', 'critic_rating'], ascending=False)
                     .drop(['genre'], axis=1)
                     .drop_duplicates()
                     .head(10))
        # catplot replaces factorplot and is already used by catPlot above.
        p = sns.catplot(aspect=1.5, y='title', x='audience_rating', data=top_rated,
                        palette="ch:.25", kind='bar')
        p.set(xlim=(10, 100))
        sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
        plt.title('Top 10 Rated Movies', fontsize=20)
        plt.ylabel('Title', fontsize=15)
        plt.xlabel('Audience Rating', fontsize=15)
        sns.despine(trim=True, left=True)
        plt.show()

    def getTop10HighBudgetMovie(self, data):
        """Draw the ten movies with the highest budget."""
        top_budget = (data.sort_values(['budget_in_millions'], ascending=False)
                      .drop(['genre'], axis=1)
                      .drop_duplicates()
                      .head(10))
        sns.catplot(aspect=1.5, y='title', x='budget_in_millions', data=top_budget,
                    palette="ch:.25", kind='bar')
        sns.set_style("ticks", {"xtick.major.size": 8, "ytick.major.size": 8})
        plt.title('Top 10 High Budget Movies', fontsize=20)
        plt.ylabel('Title', fontsize=15)
        plt.xlabel('Budget In Millions', fontsize=15)
        sns.despine(trim=True, left=True)
        plt.show()

    def _run_analysis_menu(self):
        """Load the data for a year range, then run one analysis chosen by number."""
        self.optionAnalyze = {1: 'Display Top 10 Rated Movies', 2: 'Display Top 10 High Budget Movies',
                              3: 'Display Critic Vs Audience', 4: 'Distribution of Critic Vs Audience',
                              5: 'Display Stack Distribution of Budget', 6: 'Display Boxplot',
                              7: 'Display Barplot', 8: 'Display Dashboard'}
        self.movieData = self.DataAnalysis()
        # Only options that have an implementation are dispatchable; option 8
        # is listed in optionAnalyze but has no handler in this class.
        handlers = {
            1: lambda: self.getTop10(self.movieData),
            2: lambda: self.getTop10HighBudgetMovie(self.movieData),
            3: lambda: self.DisplayCricticAudienceRating(self.movieData),
            4: self.OptionChoiceDist,
            5: lambda: self.DisplayStackedHistogram(self.movieData),
            6: self.OptionChoiceBox,
            7: self.OptionChoiceBar,
        }
        self.OptionChoice()
        try:
            choice = int(input("Please enter the option below:\n"))
        except ValueError:
            print("Sorry! please enter a number.")
            return
        handler = handlers.get(choice)
        if handler is None:
            print("Sorry please enter your choice from the option below ")
            return
        handler()

    def _say_goodbye(self):
        """Print the closing message."""
        print("\n***********************************************************************************************")
        print("Thanks for your participation, GoodBye!!!")

    def DisplayTheDetails(self):
        """Run the interactive menu until the user quits.

        After a search or an analysis the user is asked whether to go again;
        answering 'n' (or choosing Quit from the menu) ends the session.
        """
        self.options = {'s': 'search', 'a': 'analyse', 'q': 'quit'}
        while True:
            Choice, flag = self.displayMenu()
            if flag != 0 or Choice not in self.options:
                # Empty or unknown input: show the menu again.
                print("Please select the option from the menu:\n")
                continue
            if Choice == 'q':
                self._say_goodbye()
                return
            if Choice == 's':
                self.getChoiceMovie()
            else:
                self._run_analysis_menu()

            check_again = input("Do you want to check again? Y/n:\n")
            if check_again.lower() == 'n':
                self._say_goodbye()
                return
589 | #########################################################################################################################
590 | # The Main Part for Displaying Movie info to the user
591 | #########################################################################################################################
592 |
def MainFunction():
    """Ask for the user's name and start an interactive ImdbMovies session.

    Blank names end the function with a message. Any other failure prints an
    error and asks for the names again. Ctrl-C or end-of-input leaves the
    program instead of being swallowed by the retry.
    """
    while True:
        try:
            FirstName = input("Please Enter Your First Name:\n")
            LastName = input("Please Enter Your Last Name:\n")
            if FirstName == '' or LastName == '':
                print("Input cannot be blank...")
            else:
                User = ImdbMovies(firstname=FirstName, lastname=LastName)
                User.DisplayTheDetails()
            return
        except (EOFError, KeyboardInterrupt):
            # The user closed the input stream or pressed Ctrl-C: stop.
            return
        except Exception as err:
            print("\n***********************************************************************************************")
            print("Please provide a valid input")
            print("Details: {}".format(err))


# Start the interactive program only when run as a script, so importing this
# module does not block on input().
if __name__ == "__main__":
    MainFunction()
608 |
609 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Shreyas Wankhede
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # IMDb-Web-Scraping-and-Sentiment-Analysis
2 | * The objective of this project is twofold: first, to scrape data from the IMDb website and build an analysis that helps data analysts or production companies decide how to proceed with making a new movie; second, to build a model that predicts the sentiment of a movie from its user reviews.
3 | ***
4 |
5 | ### Description:
6 | * The Internet Movie Database (IMDb) is one of the world’s most popular sources for movie, TV and celebrity content with more than 100 million unique visitors per month.
7 | IMDb has a huge movie database that includes various details about each movie along with different ratings and user reviews.
8 | These movie reviews affect everyone, from audiences and film critics to production companies.
9 |
10 | ### Problem Statement:
11 | * The idea of our project is twofold: first, to scrape data from IMDb and build an analysis that helps data analysts or production companies decide how to proceed with making a new movie; second, to build a model that predicts the sentiment of a movie from its user reviews.
12 |
13 | ### Dataset:
14 | * 3500+ records and has 10 columns.
15 |
16 | ### Dashboard: Using Django Framework
17 | 
18 |
19 |
This page shows the top 10 movies, top genres and number of movies liked and disliked.
20 |
21 | 
22 |
23 |
This page displays Sentiment Analysis based on user reviews for each movie along with their other details.
24 |
25 | 
26 |
27 |
Visualization of Critics ratings for movies.
28 |
29 | 
30 |
31 |
Distribution of Audience Ratings vs Critic Ratings.
32 |
33 | ***
34 |
35 |
Thank You! 36 |
37 | Follow @shreyaswankhede 38 | -------------------------------------------------------------------------------- /UI-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/c916807575e061e2fce36f28f93ef14e17ef7244/UI-1.png -------------------------------------------------------------------------------- /UI-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/c916807575e061e2fce36f28f93ef14e17ef7244/UI-2.png -------------------------------------------------------------------------------- /UI-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/c916807575e061e2fce36f28f93ef14e17ef7244/UI-3.png -------------------------------------------------------------------------------- /UI-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shreyaswankhede/IMDb-Web-Scraping-and-Sentiment-Analysis/c916807575e061e2fce36f28f93ef14e17ef7244/UI-4.png --------------------------------------------------------------------------------