"""Exploratory data analysis of the IMDb Top 1000 movies dataset (CA2.py).

Repository layout this script ships in::

    ├── CA2.py
    ├── IMDB_top_1000.csv
    ├── MovieReport.docx
    └── README.md

The script loads the CSV, derives numeric columns from the raw text columns
(``Duration``, ``Title``, ``Info``, ``Genre``, ``Rate``), and then renders one
figure per analysis objective.  Every figure is shown with ``plt.show()``, so
the script is meant to be run interactively.
"""

from pathlib import Path

import pandas as pd
import numpy as np  # noqa: F401  (kept: file-level import of the original script)
import matplotlib.pyplot as plt
import seaborn as sns

DATASET_NAME = "IMDB_top_1000.csv"


def _dataset_candidates():
    """Return the locations where the dataset is looked up, in priority order."""
    try:
        script_dir = Path(__file__).resolve().parent
    except NameError:
        # No __file__ (e.g. code pasted into an interactive session).
        script_dir = Path.cwd()
    return [
        # Original location ("Dataset\\IMDB_top_1000.csv"), now spelled with a
        # portable separator so it also resolves on non-Windows systems.
        Path("Dataset") / DATASET_NAME,
        script_dir / "Dataset" / DATASET_NAME,
        # Layout of the repository itself: the CSV sits next to the script.
        script_dir / DATASET_NAME,
    ]


def load_dataset():
    """Read the IMDb CSV from the first candidate location that exists.

    Returns:
        The raw, unprocessed DataFrame.

    Raises:
        FileNotFoundError: if none of the candidate paths is a file.
    """
    candidates = _dataset_candidates()
    for candidate in candidates:
        if candidate.is_file():
            return pd.read_csv(candidate)
    searched = ", ".join(str(path) for path in candidates)
    raise FileNotFoundError(f"{DATASET_NAME} not found; looked in: {searched}")


def _split_genres(raw):
    """Split a ', '-separated genre string; an empty string yields no genres."""
    # ''.split(', ') returns [''] (truthy), which used to create a bogus ''
    # genre for rows with a missing Genre value.
    return raw.split(', ') if raw else []


def preprocess(df):
    """Return a copy of *df* with the derived columns used by the plots.

    Adds or converts:
        Duration   -- first run of digits in the ``Duration`` text, as float
        Year       -- four-digit year in parentheses inside ``Title``, as float
        Votes      -- number following ``Votes:`` in ``Info``, commas removed
        Gross      -- number in ``Gross: $<n.nn>M`` in ``Info`` (millions)
        Genre      -- list of genre names
        Main Genre -- first genre of the list, or None when there is none
        Rate       -- cast to float
    """
    df = df.copy()

    df['Duration'] = df['Duration'].str.extract(r'(\d+)', expand=False).astype(float)
    df['Year'] = df['Title'].str.extract(r'\((\d{4})\)', expand=False).astype(float)
    df['Votes'] = (
        df['Info']
        .str.extract(r'Votes:\s([\d,]+)', expand=False)
        .str.replace(',', '', regex=False)
        .astype(float)
    )
    df['Gross'] = df['Info'].str.extract(r'Gross:\s\$(\d+\.\d+)M', expand=False).astype(float)
    df['Genre'] = df['Genre'].fillna('').apply(_split_genres)
    df['Main Genre'] = df['Genre'].apply(lambda genres: genres[0] if genres else None)
    df['Rate'] = df['Rate'].astype(float)
    return df


def plot_rating_vs_metascore_correlation(df):
    """Objective 1: how strongly IMDb rating and Metascore are related (-1 to 1)."""
    df_corr = df[['Rate', 'Metascore']].dropna()

    correlation = df_corr['Rate'].corr(df_corr['Metascore'])
    print(f"Correlation: {correlation:.2f}")

    plt.figure(figsize=(8, 6))
    sns.regplot(
        x='Metascore',
        y='Rate',
        data=df_corr,
        scatter_kws={'alpha': 0.5},
        line_kws={'color': 'red'},
    )
    plt.title(f"IMDb Rating vs Metascore\nCorrelation: {correlation:.2f}", fontsize=14)
    plt.xlabel("Metascore")
    plt.ylabel("IMDb Rating")
    plt.show()


def plot_average_rating_by_genre(df):
    """Objective 2: genre-wise average IMDb rating (e.g. horror vs action)."""
    genre_rating = df[['Main Genre', 'Rate']].dropna()
    genre_avg = (
        genre_rating.groupby('Main Genre')['Rate'].mean().sort_values(ascending=False)
    )

    plt.figure(figsize=(12, 6))
    sns.barplot(x=genre_avg.index, y=genre_avg.values, palette='coolwarm')

    # Print each bar's value just above it.
    for index, value in enumerate(genre_avg.values):
        plt.text(index, value + 0.02, f"{value:.2f}", ha='center', va='bottom', fontsize=10)

    plt.title("Average IMDb Rating by Genre", fontsize=16)
    plt.xlabel("Main Genre")
    plt.ylabel("Average IMDb Rating")
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()


def plot_gross_vs_rating(df):
    """Objective 3: gross earnings vs IMDb rating, coloured by certificate."""
    clean_df = df.dropna(subset=['Gross', 'Rate'])

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=clean_df, x='Rate', y='Gross', hue='Certificate', alpha=0.7)

    plt.title('Gross Earnings vs IMDb Rating: Does Quality Equal Money?')
    plt.xlabel('IMDb Rating')
    plt.ylabel('Gross Earnings (in Millions)')
    plt.legend(title='Certificate')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def plot_movies_per_year(df):
    """Objective 4: number of Top 1000 movies per release year, with the peak marked."""
    movies_per_year = df['Year'].value_counts().sort_index().reset_index()
    movies_per_year.columns = ['Year', 'Number of Movies']

    plt.figure(figsize=(14, 6))

    sns.lineplot(
        x='Year',
        y='Number of Movies',
        data=movies_per_year,
        marker='o',
        color='royalblue',
        linewidth=2.5,
    )

    plt.title('Trend of Number of Movies in IMDb Top 1000 Over the Years', fontsize=16)
    plt.xlabel('Year', fontsize=12)
    plt.ylabel('Number of Movies', fontsize=12)
    plt.grid(linestyle='--', alpha=0.6)

    plt.xticks(rotation=45)

    # idxmax() raises on an empty table (no title carried a parsable year),
    # so only annotate when there is a peak to point at.
    if not movies_per_year.empty:
        peak_year = movies_per_year.loc[movies_per_year['Number of Movies'].idxmax()]
        plt.annotate(
            f'Peak: {int(peak_year["Year"])} ({int(peak_year["Number of Movies"])} movies)',
            xy=(peak_year['Year'], peak_year['Number of Movies']),
            xytext=(10, 10),
            textcoords='offset points',
            arrowprops=dict(arrowstyle='->', color='red'),
        )

    plt.tight_layout()
    plt.show()


def plot_rating_metascore_covariance(df):
    """Objective 5: covariance between IMDb rating and Metascore.

    Shows whether movies rated higher on IMDb also tend to get higher
    Metascores from critics.
    """
    df_clean = df.dropna(subset=['Rate', 'Metascore'])

    # Off-diagonal entry of the 2x2 covariance matrix.
    covariance = df_clean[['Rate', 'Metascore']].cov().iloc[0, 1]
    print(f"Covariance between IMDb Rating and Metascore: {covariance:.2f}")

    plt.figure(figsize=(10, 6))
    sns.regplot(
        data=df_clean,
        x='Rate',
        y='Metascore',
        scatter_kws={'alpha': 0.5},
        line_kws={'color': 'red'},
    )
    plt.title('IMDb Rating vs Metascore')
    plt.xlabel('IMDb Rating')
    plt.ylabel('Metascore')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def plot_top_certificates(df):
    """Objective 6: the 10 most frequent movie certificates."""
    top_certificates = df['Certificate'].value_counts().nlargest(10)

    plt.figure(figsize=(10, 6))
    sns.barplot(x=top_certificates.values, y=top_certificates.index, palette='viridis')
    plt.title('Top 10 Most Frequent Movie Certificates')
    plt.xlabel('Number of Movies')
    plt.ylabel('Certificate')
    plt.tight_layout()
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.show()


def plot_top_genres(df):
    """Objective 7: genres with the most movies (every listed genre counts once)."""
    # One row per (movie, genre); rows without any genre explode to NaN and
    # are ignored by value_counts().
    df_exploded = df.explode('Genre')

    genre_counts = df_exploded['Genre'].value_counts().reset_index()
    genre_counts.columns = ['Genre', 'Number of Movies']

    top_genres = genre_counts.head(15)

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(x='Number of Movies', y='Genre', data=top_genres, palette='viridis')

    # Write the count at the end of each horizontal bar.
    for p in ax.patches:
        width = p.get_width()
        plt.text(
            width + 1,
            p.get_y() + p.get_height() / 2,
            f'{int(width)}',
            ha='left',
            va='center',
        )

    plt.title('Top Genres by Number of Movies in IMDb Top 1000', fontsize=16)
    plt.xlabel('Number of Movies', fontsize=12)
    plt.ylabel('Genre', fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


def plot_votes_vs_gross(df):
    """Objective 8: does popularity (votes) go with commercial success (gross)?"""
    df_votes_gross = df.dropna(subset=['Votes', 'Gross'])

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df_votes_gross, x='Votes', y='Gross', alpha=0.6)
    # Optional trendline
    sns.regplot(data=df_votes_gross, x='Votes', y='Gross', scatter=False, color='red')

    plt.title('Votes vs Gross Earnings')
    plt.xlabel('Votes (Popularity)')
    plt.ylabel('Gross Earnings (in Millions USD)')
    plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()


def plot_rating_vs_duration(df):
    """Objective 9: do longer movies tend to have higher or lower ratings?"""
    plt.figure(figsize=(10, 6))

    sns.scatterplot(data=df, x='Duration', y='Rate', alpha=0.6)
    sns.regplot(data=df, x='Duration', y='Rate', scatter=False, color='red')

    plt.title('IMDb Rating vs Duration')
    plt.xlabel('Duration (minutes)')
    plt.ylabel('IMDb Rating')
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def plot_top_directors(df):
    """Objective 10: the 10 directors with the most movies in the Top 1000."""
    # The capture runs up to the next '|', so it includes the padding before
    # that separator; strip it so names are clean bar labels.
    directors = df['Cast'].str.extract(r'Director:\s([^\|]+)', expand=False).str.strip()

    director_counts = directors.value_counts().reset_index()
    director_counts.columns = ['Director', 'Number of Movies']

    top_directors = director_counts.head(10)

    plt.figure(figsize=(12, 6))
    sns.barplot(x='Number of Movies', y='Director', data=top_directors, palette='viridis')
    plt.title('Top 10 Directors with Most Movies in IMDb Top 1000', fontsize=16)
    plt.xlabel('Number of Movies', fontsize=12)
    plt.ylabel('Director', fontsize=12)
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()


def plot_feature_correlations(df):
    """Objective 11: correlation heatmap of the numerical features."""
    corr_matrix = df[['Rate', 'Duration', 'Year', 'Votes', 'Gross']].corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        corr_matrix,
        annot=True,
        cmap='coolwarm',
        center=0,
        fmt=".2f",
        linewidths=0.5,
        cbar_kws={'label': 'Correlation Strength'},
    )
    plt.title('Correlation Between Movie Features\n(Rating, Runtime, Year, Votes, Gross)', fontsize=14)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# Objectives in the order the figures are shown.
OBJECTIVES = (
    plot_rating_vs_metascore_correlation,
    plot_average_rating_by_genre,
    plot_gross_vs_rating,
    plot_movies_per_year,
    plot_rating_metascore_covariance,
    plot_top_certificates,
    plot_top_genres,
    plot_votes_vs_gross,
    plot_rating_vs_duration,
    plot_top_directors,
    plot_feature_correlations,
)


def main():
    """Load the dataset, print a basic overview, clean it and run every objective."""
    df = pd.read_csv(_resolve) if False else load_dataset()

    # Basic overview
    print(df.head())
    df.info()  # info() prints by itself and returns None; wrapping it in print() emitted a stray "None"
    print(df.describe(include='all'))

    # Cleaning & Preprocessing
    df = preprocess(df)

    for objective in OBJECTIVES:
        objective(df)


if __name__ == "__main__":
    main()

# -------- end of /CA2.py; next entry in this dump: /MovieReport.docx: --------
https://raw.githubusercontent.com/Parsh-Kalwania/Sem4_Python_CA2_project/19aefed525a5b4038c98c988fb98332da20daf24/MovieReport.docx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 🎬 IMDb Top 1000 Movies - Exploratory Data Analysis (EDA) 2 | 3 | This project presents an Exploratory Data Analysis (EDA) of the **IMDb Top 1000 Movies** dataset. It explores trends and insights from movies based on their ratings, genre, duration, gross earnings, and other features using Python libraries like Pandas, Matplotlib, and Seaborn. 4 | 5 | --- 6 | 7 | ## 📁 Dataset 8 | 9 | - **Source**: [IMDb Top 1000 Movies dataset (CSV format)](https://data.world/melanithapa/movie-data/workspace/file?filename=IMDB_top_1000.csv) 10 | - **Size**: 1000 rows × 10 columns 11 | - **Features Include**: 12 | - `Title`: Name of the movie 13 | - `Certificate`: Age rating 14 | - `Duration`: Length of the movie 15 | - `Genre`: Movie genres 16 | - `Rate`: IMDb rating 17 | - `Metascore`: Metacritic score 18 | - `Gross`: Box office gross earnings 19 | - `Cast` and `Director` (extracted from text) 20 | 21 | --- 22 | 23 | ## 📊 Objective 24 | 25 | - Analyze and visualize trends in top-rated IMDb movies. 26 | - Understand relationships between IMDb rating, Metascore, and Gross earnings. 27 | - Identify the most common genres, certificates, and directors. 28 | - Detect outliers and explore data cleaning decisions. 29 | 30 | --- 31 | 32 | ## 📌 Key Analysis & Decisions 33 | 34 | ### 🔍 Data Cleaning 35 | - **Dropped Nulls**: Chose to drop rows with missing values instead of filling them with mean/median/forward fill to avoid introducing bias or assumptions not supported by the data. 36 | 37 | ### 📈 Insights 38 | - **Top Genres & Directors**: Visualized frequency of genres and directors with multiple top-rated movies. 
39 | - **Rating Analysis**: Found correlation between IMDb rating and Metascore. 40 | - **Gross Earnings**: Detected outliers using a boxplot, showing a few high-grossing movies skewing the distribution. 41 | 42 | ### 📦 Outlier Detection 43 | - Used **boxplot analysis** on Gross to identify high-grossing outliers. 44 | - These outliers were **retained** as they represent genuinely popular/blockbuster films. 45 | 46 | --- 47 | 48 | ## 🛠️ Tools Used 49 | 50 | - **Python 3.13** 51 | - **Pandas**, **NumPy** for data manipulation 52 | - **Seaborn**, **Matplotlib** for visualizations 53 | - **VSCode** for development 54 | 55 | --- 56 | 57 | ## 📷 Visuals 58 | 59 | - Barplots of Genre, Certificate, and Director distributions 60 | - Boxplot of Gross earnings showing outliers 61 | - Scatterplots for correlation analysis 62 | 63 | --- 64 | 65 | ## 📌 Conclusion 66 | 67 | This EDA helped uncover interesting patterns in IMDb's top-rated films. The analysis highlights the dominance of certain genres and directors, the skewed nature of box office revenues, and the reasonable correlation between critic and user ratings. 68 | --------------------------------------------------------------------------------