├── README.md └── CA2PYTHON.py /README.md: -------------------------------------------------------------------------------- 1 | # DATA-ANALYSIS 2 | 1. Abstract - 3 | This project performs Exploratory Data Analysis (EDA) on a movie dataset containing gross revenue, runtime, Meta scores, IMDb ratings, genres, and certificates. The principal goals are cleaning the dataset, analyzing revenue trends, visualizing the genre distribution, and identifying revenue outliers using Python tools and data science methodologies. 4 | 5 | 2. Introduction - 6 | Movie industry statistics reflect box-office performance, runtime, and genre trends. Using a real movie dataset, this project extracts insights with Python libraries such as Pandas, NumPy, Matplotlib, and Seaborn. 7 | 8 | 3. Methodology - 9 | • Load dataset using Pandas 10 | • Clean the data (remove missing values) 11 | • Use summary statistics to understand the revenue and rating distribution 12 | • Create visualizations to explore genres and gross revenue 13 | • Detect outliers using the IQR method 14 | 15 | 16 | 4. Objectives - 17 | Understand and load the dataset: Familiarize yourself with the organization and composition of the movie dataset with Pandas and preview initial records. 18 | 19 | Clean and prepare the data: Detect missing or invalid values and get the dataset ready for analysis through simple data wrangling procedures. 20 | 21 | Analyze revenue distribution: Utilize descriptive statistics and graphical tools such as histograms and boxplots to discover how gross revenue is distributed within the dataset. 22 | 23 | Investigate genres and their trends in revenue: Determine the most frequent genres and contrast how gross revenue differs across them through visualizations such as bar charts and pie charts. 
24 | 25 | Find outliers and make conclusions: Use IQR-based outlier detection to find atypical revenue values that could influence general analysis, and provide key patterns and findings of the EDA. 26 | 27 | 5. Results and Analysis - 28 | The dataset was free of any missing values post-cleaning. 29 | 30 | Summary statistics reported a broad array of gross revenue values. 31 | 32 | A few genres occur more than others. 33 | 34 | Outliers existed in gross revenue data that affect average-based inferences. 35 | 36 | 37 | -------------------------------------------------------------------------------- /CA2PYTHON.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | # Load data 7 | df2 = pd.read_csv("/Users/anushka./Downloads/imdb_top_1000 2.csv") 8 | print("Original columns:", df2.columns.tolist()) 9 | 10 | # Include all required columns 11 | required_columns = ['Gross', 'Runtime', 'Meta_score', 'IMDB_Rating', 'Genre', 'Certificate'] 12 | df2 = df2[required_columns].dropna() 13 | 14 | # Convert 'Gross' from string to float (remove commas, scale to crores) 15 | df2['Gross'] = df2['Gross'].str.replace(',', '').astype(float) / 10000000 # in crores 16 | 17 | # Clean and convert 'Runtime' (e.g., "142 min" → 142.0) 18 | df2['Runtime'] = df2['Runtime'].str.extract('(\d+)').astype(float) 19 | 20 | # Normalize Meta_score to 0–10 scale 21 | df2['Meta_score'] = df2['Meta_score'].astype(float) / 10 22 | 23 | # Extract first genre only 24 | df2['Genre'] = df2['Genre'].str.split(',').str[0] 25 | 26 | # Check for missing values 27 | print("Missing values after cleaning:\n", df2.isnull().sum()) 28 | print("Cleaned Data Shape:", df2.shape) 29 | print("Sample Cleaned Data:\n", df2.head()) 30 | print("Basic Statistics:") 31 | print(df2.describe()) 32 | 33 | # Histogram of Gross Revenue 34 | plt.figure(figsize=(8, 5)) 35 | 
# Draw the gross-revenue histogram on the figure created just before this point.
sns.histplot(df2['Gross'], bins=20, kde=True, color='skyblue')
plt.title('Gross Revenue Distribution (in Crores)')
plt.xlabel('Gross Revenue (₹ Crores)')
plt.ylabel('Frequency')
plt.grid(True)
plt.tight_layout()
plt.show()

# Bar plot of the 10 genres with the highest mean gross revenue
plt.figure(figsize=(8, 6))
genre_revenue = df2.groupby('Genre')['Gross'].mean().sort_values(ascending=False).head(10)
genre_revenue.plot(kind='bar')
plt.title('Top 10 Genres by Average Gross Revenue')
plt.xlabel('Genre')
plt.ylabel('Average Revenue (₹ Crores)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Pie chart of Certificate distribution
plt.figure(figsize=(10, 10))

# Keep the top 5 certificates and fold the remainder into a single "Other" slice.
certificate_counts = df2['Certificate'].value_counts()
top_n = 5
# .copy() so adding the "Other" label never writes into a slice of
# certificate_counts.
top_certificates = certificate_counts.head(top_n).copy()
others_count = certificate_counts.iloc[top_n:].sum()
# Skip the extra slice when there is nothing left over; a zero-width wedge
# would only add a stray "0.0%" label and legend entry.
if others_count > 0:
    top_certificates['Other'] = others_count

# One colour per possible slice (top 5 + "Other")
colors = ['#FF9999', '#66B2FF', '#99FF99', '#FFCC99', '#FF99CC', '#CCCCFF']

# labeldistance=None hides the per-wedge text labels (the legend carries the
# names). pctdistance > 1 places the percentage text outside the pie.
# This replaces a manual annotate() loop that positioned labels from
# Wedge.center: that attribute is the centre of the whole pie, so every label
# was drawn stacked on the origin, duplicating what autopct already prints.
plt.pie(top_certificates, labels=top_certificates.index, autopct='%1.1f%%',
        colors=colors, startangle=90, pctdistance=1.12, labeldistance=None)
plt.title('Distribution of Certificate Ratings', pad=20)
plt.axis('equal')

# Legend to the right of the pie
plt.legend(top_certificates.index, loc="center left", bbox_to_anchor=(1, 0.5),
           fontsize=10, title="Certificates")
plt.tight_layout()
plt.show()

# Scatter plot of Gross Revenue vs Runtime (the drawing calls follow this figure creation)
plt.figure(figsize=(8, 6))
# Draw the runtime-vs-gross scatter on the figure created just before this point.
plt.plot(df2['Runtime'], df2['Gross'], 'o', alpha=0.5)
plt.title('Gross Revenue vs Runtime')
plt.xlabel('Runtime (minutes)')
plt.ylabel('Gross Revenue (₹ Crores)')
plt.grid(True)
plt.tight_layout()
plt.show()

# Correlation heatmap of the numeric columns
numeric_cols = ['Gross', 'Runtime', 'Meta_score', 'IMDB_Rating']
correlation_matrix = df2[numeric_cols].corr()
plt.figure(figsize=(10, 8))
# Fixed -1..1 range centred on 0 so the diverging colormap is symmetric.
sns.heatmap(correlation_matrix, annot=True, cmap='RdBu', vmin=-1, vmax=1, center=0)
plt.title('Correlation Heatmap of Numeric Features')
plt.tight_layout()
plt.show()

# Bar plot of average Gross Revenue for every genre, sorted ascending
plt.figure(figsize=(10, 6))
genre_revenue = df2.groupby('Genre')['Gross'].mean().sort_values()
genre_revenue.plot(kind='bar')
plt.title('Average Gross Revenue by Genre')
plt.xlabel('Genre')
plt.ylabel('Average Gross Revenue (₹ Crores)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Passing palette= without hue= is deprecated in seaborn 0.13, so take the
# first colour of the same palette and pass it as an explicit box colour.
box_color = sns.color_palette('Set2')[0]

# Box plot for Gross with outliers
plt.figure(figsize=(10, 6))
sns.boxplot(y=df2['Gross'], color=box_color, fliersize=8)
plt.title('Box Plot of Gross Revenue (with Outliers)')
plt.ylabel('Gross Revenue (₹ Crores)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()

# Filter outliers using IQR: keep rows within 1.5 * IQR of the quartiles.
Q1 = df2['Gross'].quantile(0.25)
Q3 = df2['Gross'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# between() includes both bounds, matching a >= / <= comparison.
df_no_outliers = df2[df2['Gross'].between(lower_bound, upper_bound)]

# Box plot of the filtered data
plt.figure(figsize=(10, 6))
sns.boxplot(y=df_no_outliers['Gross'], color=box_color)
plt.title('Box Plot of Gross Revenue (Outliers Removed)')
plt.ylabel('Gross Revenue (₹ Crores)')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
--------------------------------------------------------------------------------