└── movies.py


/movies.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import matplotlib.pyplot as plt
  3 | import seaborn as sns
  4 | 
  5 | # Load dataset
  6 | df = pd.read_csv("C:/Users/garg0/Downloads/movies.csv")
  7 | 
  8 | # Clean column names
  9 | df.columns = df.columns.str.strip()
 10 | 
 11 | # Set seaborn style
 12 | sns.set(style="whitegrid")
 13 | 
 14 | # ========== 1. Runtime Distribution ==========
 15 | df1 = df.dropna(subset=["runtime_in_minutes"])
 16 | plt.figure(figsize=(8,6))
 17 | sns.histplot(df1["runtime_in_minutes"], bins=30, kde=True, color="skyblue") # type: ignore
 18 | plt.title("Distribution of Movie Runtime")
 19 | plt.xlabel("Runtime (Minutes)")
 20 | plt.tight_layout()
 21 | plt.show()
 22 | 
 23 | # ========== 2. Top Genres by Count ==========
 24 | df2 = df.dropna(subset=["genre"])
 25 | top_genres = df2["genre"].value_counts().head(10)
 26 | 
 27 | plt.figure(figsize=(10,5))
 28 | sns.barplot(x=top_genres.values, y=top_genres.index, palette="viridis")
 29 | plt.title("Top 10 Genres by Movie Count")
 30 | plt.xlabel("Number of Movies")
 31 | plt.tight_layout()
 32 | plt.show()
 33 | 
 34 | # ========== 3. Tomatometer Rating vs Audience Rating ==========
 35 | df3 = df.dropna(subset=["tomatometer_rating", "audience_rating"])
 36 | plt.figure(figsize=(8,6))
 37 | sns.scatterplot(data=df3, x="tomatometer_rating", y="audience_rating", alpha=0.6)
 38 | sns.regplot(data=df3, x="tomatometer_rating", y="audience_rating", scatter=False, color="red")
 39 | plt.title("Tomatometer vs Audience Rating")
 40 | plt.xlabel("Tomatometer Rating (%)")
 41 | plt.ylabel("Audience Rating (%)")
 42 | plt.tight_layout()
 43 | plt.show()
 44 | 
 45 | # ========== 4. Average Tomatometer by Studio (Top 10) ==========
 46 | df4 = df.dropna(subset=["studio_name", "tomatometer_rating"])
 47 | top_studios = df4.groupby("studio_name")["tomatometer_rating"].mean().sort_values(ascending=False).head(10)
 48 | 
 49 | plt.figure(figsize=(10,5))
 50 | sns.barplot(x=top_studios.values, y=top_studios.index, palette="magma")
 51 | plt.title("Top 10 Studios by Avg Tomatometer Rating")
 52 | plt.xlabel("Average Tomatometer Rating (%)")
 53 | plt.tight_layout()
 54 | plt.show()
 55 | 
 56 | # ========== 5. Movie Releases Over Time ==========
 57 | df5 = df.dropna(subset=["in_theaters_date"])
 58 | df5["in_theaters_date"] = pd.to_datetime(df5["in_theaters_date"], errors="coerce")
 59 | df5 = df5.dropna(subset=["in_theaters_date"])
 60 | df5["release_year"] = df5["in_theaters_date"].dt.year
 61 | 
 62 | yearly_counts = df5["release_year"].value_counts().sort_index()
 63 | 
 64 | plt.figure(figsize=(10,6))
 65 | sns.lineplot(x=yearly_counts.index, y=yearly_counts.values, marker="o")
 66 | plt.title("Number of Movie Releases Per Year")
 67 | plt.xlabel("Year")
 68 | plt.ylabel("Number of Movies")
 69 | plt.tight_layout()
 70 | plt.show()
 71 | 
 72 | 
 73 | # Select relevant numeric columns
 74 | numeric_cols = [
 75 |     'tomatometer_rating',
 76 |     'tomatometer_count',
 77 |     'audience_rating',
 78 |     'audience_count',
 79 |     'runtime_in_minutes'
 80 | ]
 81 | 
 82 | # Drop rows with missing values in these columns
 83 | df_numeric = df[numeric_cols].dropna()
 84 | 
 85 | # Compute the correlation matrix
 86 | corr_matrix = df_numeric.corr()
 87 | 
 88 | # Plot the heatmap
 89 | plt.figure(figsize=(10, 6))
 90 | sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
 91 | plt.title("Correlation Heatmap of Movie Metrics")
 92 | plt.tight_layout()
 93 | plt.show()
 94 | 
 95 | genre_counts = df["genre"].value_counts()
 96 | 
 97 | # Top 5 genres, group rest as 'Other'
 98 | # Count genres
 99 | genre_counts = df["genre"].value_counts()
100 | 
101 | # Top 5 genres, group rest as 'Other'
102 | top_genres = genre_counts[:5]
103 | other_genres = genre_counts[5:].sum()
104 | genre_pie = pd.concat([top_genres, pd.Series({"Other": other_genres})])
105 | 
106 | # Plot pie chart
107 | plt.figure(figsize=(8,8))
108 | plt.pie(genre_pie, labels=genre_pie.index, autopct="%1.1f%%", startangle=140, colors=plt.cm.Paired.colors) # type: ignore
109 | plt.title("Top 5 Genres and Others")
110 | plt.tight_layout()
111 | plt.show()
112 | 
113 | 
# Audience-rating distribution for the five most frequent genres.
# Rows missing either the genre or the audience rating are excluded before
# the genres are ranked.
df_box = df.dropna(subset=["genre", "audience_rating"])
top_5_genres = df_box["genre"].value_counts().index[:5]
df_box = df_box[df_box["genre"].isin(top_5_genres)]

# Plot boxplot
plt.figure(figsize=(10, 6))
# seaborn 0.13 deprecated passing `palette` without `hue`. Colour the boxes
# through `hue` instead: `dodge=False` keeps one full-width box per genre, and
# the redundant legend is removed by hand so the call does not depend on the
# `legend=` keyword that only newer seaborn releases accept.
box_ax = sns.boxplot(
    data=df_box,
    x="genre",
    y="audience_rating",
    hue="genre",
    palette="Set2",
    dodge=False,
)
box_legend = box_ax.get_legend()
if box_legend is not None:
    box_legend.remove()
plt.title("Audience Rating Distribution by Genre")
plt.xlabel("Genre")
plt.ylabel("Audience Rating (%)")
plt.tight_layout()
plt.show()


--------------------------------------------------------------------------------