"""Exploratory plots for the movies dataset.

Reads the movies CSV into a DataFrame and displays eight figures in order:

1. Histogram of movie runtimes.
2. Bar chart of the ten most frequent genres.
3. Scatter plot (with regression line) of tomatometer vs audience rating.
4. Bar chart of the ten studios with the highest mean tomatometer rating.
5. Line chart of the number of releases per year.
6. Correlation heatmap of the numeric rating/count/runtime columns.
7. Pie chart of the five most frequent genres plus an "Other" slice.
8. Box plot of audience rating for the five most frequent genres.

Each section drops the rows that lack the columns it needs, so the
sections are independent of one another.
"""

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
DATA_PATH = "C:/Users/garg0/Downloads/movies.csv"
df = pd.read_csv(DATA_PATH)

# Clean column names
df.columns = df.columns.str.strip()

# Set seaborn style
sns.set(style="whitegrid")

# ========== 1. Runtime Distribution ==========
df1 = df.dropna(subset=["runtime_in_minutes"])
plt.figure(figsize=(8, 6))
sns.histplot(df1["runtime_in_minutes"], bins=30, kde=True, color="skyblue")  # type: ignore
plt.title("Distribution of Movie Runtime")
plt.xlabel("Runtime (Minutes)")
plt.tight_layout()
plt.show()

# ========== 2. Top Genres by Count ==========
df2 = df.dropna(subset=["genre"])
top_genres = df2["genre"].value_counts().head(10)

plt.figure(figsize=(10, 5))
# NOTE: seaborn >= 0.13 emits a FutureWarning when `palette` is given
# without `hue`; the call is left unchanged so older releases keep working.
sns.barplot(x=top_genres.values, y=top_genres.index, palette="viridis")
plt.title("Top 10 Genres by Movie Count")
plt.xlabel("Number of Movies")
plt.tight_layout()
plt.show()

# ========== 3. Tomatometer Rating vs Audience Rating ==========
df3 = df.dropna(subset=["tomatometer_rating", "audience_rating"])
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df3, x="tomatometer_rating", y="audience_rating", alpha=0.6)
# Overlay only the fitted line; the points are already drawn above.
sns.regplot(data=df3, x="tomatometer_rating", y="audience_rating", scatter=False, color="red")
plt.title("Tomatometer vs Audience Rating")
plt.xlabel("Tomatometer Rating (%)")
plt.ylabel("Audience Rating (%)")
plt.tight_layout()
plt.show()

# ========== 4. Average Tomatometer by Studio (Top 10) ==========
df4 = df.dropna(subset=["studio_name", "tomatometer_rating"])
top_studios = (
    df4.groupby("studio_name")["tomatometer_rating"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

plt.figure(figsize=(10, 5))
sns.barplot(x=top_studios.values, y=top_studios.index, palette="magma")
plt.title("Top 10 Studios by Avg Tomatometer Rating")
plt.xlabel("Average Tomatometer Rating (%)")
plt.tight_layout()
plt.show()

# ========== 5. Movie Releases Over Time ==========
# Build the frame with assign() instead of writing columns into the result
# of dropna(); that avoids pandas' SettingWithCopyWarning. Missing and
# unparseable dates both become NaT and are dropped together.
df5 = df.assign(
    in_theaters_date=pd.to_datetime(df["in_theaters_date"], errors="coerce")
).dropna(subset=["in_theaters_date"])
df5 = df5.assign(release_year=df5["in_theaters_date"].dt.year)

yearly_counts = df5["release_year"].value_counts().sort_index()

plt.figure(figsize=(10, 6))
sns.lineplot(x=yearly_counts.index, y=yearly_counts.values, marker="o")
plt.title("Number of Movie Releases Per Year")
plt.xlabel("Year")
plt.ylabel("Number of Movies")
plt.tight_layout()
plt.show()

# ========== 6. Correlation Heatmap ==========
# Select relevant numeric columns
numeric_cols = [
    'tomatometer_rating',
    'tomatometer_count',
    'audience_rating',
    'audience_count',
    'runtime_in_minutes',
]

# Drop rows with missing values in these columns
df_numeric = df[numeric_cols].dropna()

# Compute the correlation matrix
corr_matrix = df_numeric.corr()

# Plot the heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap of Movie Metrics")
plt.tight_layout()
plt.show()

# ========== 7. Genre Share (Top 5 + Other) ==========
# Count genres (value_counts sorts by frequency, most common first)
genre_counts = df["genre"].value_counts()

# Top 5 genres, group rest as 'Other'; .iloc makes the positional split explicit
top_genres = genre_counts.iloc[:5]
other_genres = genre_counts.iloc[5:].sum()
genre_pie = pd.concat([top_genres, pd.Series({"Other": other_genres})])

# Plot pie chart
plt.figure(figsize=(8, 8))
plt.pie(genre_pie, labels=genre_pie.index, autopct="%1.1f%%", startangle=140, colors=plt.cm.Paired.colors)  # type: ignore
plt.title("Top 5 Genres and Others")
plt.tight_layout()
plt.show()

# ========== 8. Audience Rating by Genre (Top 5) ==========
# The top five genres are ranked among rows that have an audience rating.
df_box = df.dropna(subset=["genre", "audience_rating"])
top_5_genres = df_box["genre"].value_counts().index[:5]
df_box = df_box[df_box["genre"].isin(top_5_genres)]

# Plot boxplot
plt.figure(figsize=(10, 6))
sns.boxplot(data=df_box, x="genre", y="audience_rating", palette="Set2")
plt.title("Audience Rating Distribution by Genre")
plt.xlabel("Genre")
plt.ylabel("Audience Rating (%)")
plt.tight_layout()
plt.show()