├── CA2.py
├── IMDB_top_1000.csv
├── MovieReport.docx
└── README.md


/CA2.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import seaborn as sns
  5 | 
  6 | df = pd.read_csv("Dataset\\IMDB_top_1000.csv")  
  7 | 
  8 | # Basic overview
  9 | print(df.head())
 10 | print(df.info())
 11 | print(df.describe(include='all'))
 12 | 
 13 | # Cleaning & Preprocessing
 14 | df['Duration'] = df['Duration'].str.extract(r'(\d+)').astype(float)
 15 | df['Year'] = df['Title'].str.extract(r'\((\d{4})\)').astype(float)
 16 | df['Votes'] = df['Info'].str.extract(r'Votes:\s([\d,]+)')[0].str.replace(',', '').astype(float)
 17 | df['Gross'] = df['Info'].str.extract(r'Gross:\s\$(\d+\.\d+)M')[0].astype(float)
 18 | df['Genre'] = df['Genre'].fillna('').apply(lambda x: x.split(', '))
 19 | df['Main Genre'] = df['Genre'].apply(lambda x: x[0] if x else None)
 20 | df['Rate'] = df['Rate'].astype(float)
 21 | 
 22 | # Objective 1: How strongly are IMDb rating and metascore related on a scale from -1 to 1
 23 | 
 24 | df_corr = df[['Rate', 'Metascore']].dropna()
 25 | 
 26 | correlation = df_corr['Rate'].corr(df_corr['Metascore'])
 27 | print(f"Correlation: {correlation:.2f}")
 28 | 
 29 | plt.figure(figsize=(8, 6))
 30 | sns.regplot(x='Metascore', y='Rate', data=df_corr, scatter_kws={'alpha':0.5}, line_kws={'color':'red'})
 31 | plt.title(f"IMDb Rating vs Metascore\nCorrelation: {correlation:.2f}", fontsize=14)
 32 | plt.xlabel("Metascore")
 33 | plt.ylabel("IMDb Rating")
 34 | plt.show()
 35 | 
 36 | # Objective 2: Genre-wise average IMDb rating — are horror movies rated better than action?
 37 | 
 38 | genre_rating = df[['Main Genre', 'Rate']].dropna()
 39 | 
 40 | genre_avg = genre_rating.groupby('Main Genre')['Rate'].mean().sort_values(ascending=False)
 41 | 
 42 | plt.figure(figsize=(12, 6))
 43 | sns.barplot(x=genre_avg.index, y=genre_avg.values, palette='coolwarm')
 44 | 
 45 | for index, value in enumerate(genre_avg.values):
 46 |     plt.text(index, value + 0.02, f"{value:.2f}", ha='center', va='bottom', fontsize=10)
 47 | 
 48 | plt.title("Average IMDb Rating by Genre", fontsize=16)
 49 | plt.xlabel("Main Genre")
 50 | plt.ylabel("Average IMDb Rating")
 51 | plt.xticks(rotation=45)
 52 | plt.grid(axis='y', linestyle='--', alpha=0.5)
 53 | plt.tight_layout()
 54 | plt.show()
 55 | 
 56 | # Objective 3: Gross earnings vs IMDb rating — does quality equal money?
 57 | 
 58 | clean_df = df.dropna(subset=['Gross', 'Rate'])
 59 | 
 60 | plt.figure(figsize=(10,6))
 61 | sns.scatterplot(data=clean_df, x='Rate', y='Gross', hue='Certificate', alpha=0.7)
 62 | 
 63 | plt.title('Gross Earnings vs IMDb Rating: Does Quality Equal Money?')
 64 | plt.xlabel('IMDb Rating')
 65 | plt.ylabel('Gross Earnings (in Millions)')
 66 | plt.legend(title='Certificate')
 67 | plt.grid(True)
 68 | plt.tight_layout()
 69 | plt.show()
 70 | 
 71 | # Objective 4: Trend of Number of Movies in IMDb Top 1000 Over the Years
 72 | 
 73 | movies_per_year = df['Year'].value_counts().sort_index().reset_index()
 74 | movies_per_year.columns = ['Year', 'Number of Movies']
 75 | 
 76 | plt.figure(figsize=(14, 6))
 77 | 
 78 | sns.lineplot(
 79 |     x='Year', 
 80 |     y='Number of Movies', 
 81 |     data=movies_per_year, 
 82 |     marker='o', 
 83 |     color='royalblue',
 84 |     linewidth=2.5
 85 | )
 86 | 
 87 | plt.title('Trend of Number of Movies in IMDb Top 1000 Over the Years', fontsize=16)
 88 | plt.xlabel('Year', fontsize=12)
 89 | plt.ylabel('Number of Movies', fontsize=12)
 90 | plt.grid(linestyle='--', alpha=0.6)
 91 | 
 92 | plt.xticks(rotation=45)
 93 | 
 94 | peak_year = movies_per_year.loc[movies_per_year['Number of Movies'].idxmax()]
 95 | plt.annotate(
 96 |     f'Peak: {int(peak_year["Year"])} ({int(peak_year["Number of Movies"])} movies)',
 97 |     xy=(peak_year['Year'], peak_year['Number of Movies']),
 98 |     xytext=(10, 10), 
 99 |     textcoords='offset points',
100 |     arrowprops=dict(arrowstyle='->', color='red'))
101 |     
102 | plt.tight_layout()
103 | plt.show()
104 | 
105 | # Objective 5: Analyze the covariance between IMDb ratings and Metascore to understand if higher-rated movies on IMDb also tend to get higher Metascores from critics.”
106 | 
107 | df_clean = df.dropna(subset=['Rate', 'Metascore'])
108 | 
109 | covariance = df_clean[['Rate', 'Metascore']].cov().iloc[0, 1]
110 | print(f"Covariance between IMDb Rating and Metascore: {covariance:.2f}")
111 | 
112 | plt.figure(figsize=(10, 6))
113 | sns.regplot(data=df_clean, x='Rate', y='Metascore', scatter_kws={'alpha': 0.5}, line_kws={'color': 'red'})
114 | plt.title('IMDb Rating vs Metascore')
115 | plt.xlabel('IMDb Rating')
116 | plt.ylabel('Metascore')
117 | plt.grid(True)
118 | plt.tight_layout()
119 | plt.show()
120 | 
121 | # Objective 6: Top 10 Most Frequent Movie Certificates
122 | 
123 | top_certificates = df['Certificate'].value_counts().nlargest(10)
124 | 
125 | plt.figure(figsize=(10, 6))
126 | sns.barplot(x=top_certificates.values, y=top_certificates.index, palette='viridis')
127 | plt.title('Top 10 Most Frequent Movie Certificates')
128 | plt.xlabel('Number of Movies')
129 | plt.ylabel('Certificate')
130 | plt.tight_layout()
131 | plt.grid(axis='x', linestyle='--', alpha=0.7)
132 | plt.show()
133 | 
134 | # Objective 7: Identify which genres have the most number of movies: Are dramas really more liked than action
135 | 
136 | df_exploded = df.explode('Genre')
137 | 
138 | genre_counts = df_exploded['Genre'].value_counts().reset_index()
139 | genre_counts.columns = ['Genre', 'Number of Movies']
140 | 
141 | top_genres = genre_counts.head(15)
142 | 
143 | plt.figure(figsize=(12, 6))
144 | ax = sns.barplot(x='Number of Movies', y='Genre', data=top_genres, palette='viridis')
145 | 
146 | for p in ax.patches:
147 |     width = p.get_width()
148 |     plt.text(
149 |         width + 1,
150 |         p.get_y() + p.get_height() / 2,
151 |         f'{int(width)}',
152 |         ha='left',
153 |         va='center'
154 |     )
155 | 
156 | plt.title('Top Genres by Number of Movies in IMDb Top 1000', fontsize=16)
157 | plt.xlabel('Number of Movies', fontsize=12)
158 | plt.ylabel('Genre', fontsize=12)
159 | plt.grid(axis='x', linestyle='--', alpha=0.7)
160 | plt.tight_layout()
161 | plt.show()
162 | 
163 | # Objective 8: See if popularity (votes) correlates with commercial success (gross). 
164 | 
165 | df_votes_gross = df.dropna(subset=['Votes', 'Gross'])
166 | 
167 | plt.figure(figsize=(10, 6))
168 | sns.scatterplot(data=df_votes_gross, x='Votes', y='Gross', alpha=0.6)
169 | sns.regplot(data=df_votes_gross, x='Votes', y='Gross', scatter=False, color='red')  # Optional trendline
170 | 
171 | plt.title('Votes vs Gross Earnings')
172 | plt.xlabel('Votes (Popularity)')
173 | plt.ylabel('Gross Earnings (in Millions USD)')
174 | plt.grid(True, linestyle='--', alpha=0.5)
175 | plt.tight_layout()
176 | plt.show()
177 | 
178 | # Objective 9: Understand if longer movies tend to have higher/lower ratings.
179 | 
180 | plt.figure(figsize=(10, 6))
181 | 
182 | sns.scatterplot(data=df, x='Duration', y='Rate', alpha=0.6)
183 | sns.regplot(data=df, x='Duration', y='Rate', scatter=False, color='red')
184 | 
185 | plt.title('IMDb Rating vs Duration')
186 | plt.xlabel('Duration (minutes)')
187 | plt.ylabel('IMDb Rating')
188 | plt.grid(True)
189 | plt.tight_layout()
190 | plt.show()
191 | 
192 | #Objective 10: Top 10 directors with highest number of movies in top 1000 IMDb ratings
193 | 
194 | df['Director'] = df['Cast'].str.extract(r'Director:\s([^\|]+)')
195 | 
196 | director_counts = df['Director'].value_counts().reset_index()
197 | director_counts.columns = ['Director', 'Number of Movies']
198 | 
199 | top_directors = director_counts.head(10)
200 | 
201 | plt.figure(figsize=(12, 6))
202 | sns.barplot(x='Number of Movies', y='Director', data=top_directors, palette='viridis')
203 | plt.title('Top 10 Directors with Most Movies in IMDb Top 1000', fontsize=16)
204 | plt.xlabel('Number of Movies', fontsize=12)
205 | plt.ylabel('Director', fontsize=12)
206 | plt.grid(axis='x', linestyle='--', alpha=0.7)
207 | plt.tight_layout()
208 | plt.show()
209 | 
210 | #Objective 11: Calculate correlations between numerical features
211 | 
212 | corr_matrix = df[['Rate', 'Duration', 'Year', 'Votes', 'Gross']].corr()
213 | 
214 | plt.figure(figsize=(10, 8))
215 | sns.heatmap(
216 |     corr_matrix, 
217 |     annot=True, 
218 |     cmap='coolwarm', 
219 |     center=0,
220 |     fmt=".2f",
221 |     linewidths=0.5,
222 |     cbar_kws={'label': 'Correlation Strength'}
223 | )
224 | plt.title('Correlation Between Movie Features\n(Rating, Runtime, Year, Votes, Gross)', fontsize=14)
225 | plt.xticks(rotation=45)
226 | plt.tight_layout()
227 | plt.show()


--------------------------------------------------------------------------------
/MovieReport.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Parsh-Kalwania/Sem4_Python_CA2_project/19aefed525a5b4038c98c988fb98332da20daf24/MovieReport.docx


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # 🎬 IMDb Top 1000 Movies - Exploratory Data Analysis (EDA)
 2 | 
 3 | This project presents an Exploratory Data Analysis (EDA) of the **IMDb Top 1000 Movies** dataset. It explores trends and insights from movies based on their ratings, genre, duration, gross earnings, and other features using Python libraries like Pandas, Matplotlib, and Seaborn.
 4 | 
 5 | ---
 6 | 
 7 | ## 📁 Dataset
 8 | 
 9 | - **Source**: [IMDb Top 1000 Movies dataset (CSV format)](https://data.world/melanithapa/movie-data/workspace/file?filename=IMDB_top_1000.csv)
10 | - **Size**: 1000 rows × 10 columns
11 | - **Features Include**:
12 |   - `Title`: Name of the movie
13 |   - `Certificate`: Age rating
14 |   - `Duration`: Length of the movie
15 |   - `Genre`: Movie genres
16 |   - `Rate`: IMDb rating
17 |   - `Metascore`: Metacritic score
18 |   - `Gross`: Box office gross earnings (in millions of USD)
19 |   - `Cast`: Cast and crew text, from which the `Director` name is extracted
20 | 
21 | ---
22 | 
23 | ## 📊 Objective
24 | 
25 | - Analyze and visualize trends in top-rated IMDb movies.
26 | - Understand relationships between IMDb rating, Metascore, and Gross earnings.
27 | - Identify the most common genres, certificates, and directors.
28 | - Detect outliers and explore data cleaning decisions.
29 | 
30 | ---
31 | 
32 | ## 📌 Key Analysis & Decisions
33 | 
34 | ### 🔍 Data Cleaning
35 | - **Dropped Nulls**: Chose to drop rows with missing values instead of filling them with mean/median/forward fill to avoid introducing bias or assumptions not supported by the data.
36 | 
37 | ### 📈 Insights
38 | - **Top Genres & Directors**: Visualized frequency of genres and directors with multiple top-rated movies.
39 | - **Rating Analysis**: Found correlation between IMDb rating and Metascore.
40 | - **Gross Earnings**: Detected outliers using a boxplot, showing a few high-grossing movies skewing the distribution.
41 | 
42 | ### 📦 Outlier Detection
43 | - Used **boxplot analysis** on Gross to identify high-grossing outliers.
44 | - These outliers were **retained** as they represent genuinely popular/blockbuster films.
45 | 
46 | ---
47 | 
48 | ## 🛠️ Tools Used
49 | 
50 | - **Python 3.13**
51 | - **Pandas**, **NumPy** for data manipulation
52 | - **Seaborn**, **Matplotlib** for visualizations
53 | - **VSCode** for development
54 | 
55 | ---
56 | 
57 | ## 📷 Visuals
58 | 
59 | - Barplots of Genre, Certificate, and Director distributions
60 | - Boxplot of Gross earnings showing outliers
61 | - Scatterplots for correlation analysis
62 | 
63 | ---
64 | 
65 | ## 📌 Conclusion
66 | 
67 | This EDA helped uncover interesting patterns in IMDb's top-rated films. The analysis highlights the dominance of certain genres and directors, the skewed nature of box office revenues, and the reasonable correlation between critic and user ratings.
68 | 


--------------------------------------------------------------------------------