# Repository layout (from the project listing):
#   ├── dataset.xlsx
#   ├── project report.docx
#   └── ca2project.py
#
# /dataset.xlsx:
#   https://raw.githubusercontent.com/vatsalrustagi277/Data-Visualization/HEAD/dataset.xlsx
# /project report.docx:
#   https://raw.githubusercontent.com/vatsalrustagi277/Data-Visualization/HEAD/project report.docx
#
# /ca2project.py:
"""Exploratory analysis and charts for the GST revenue workbook.

The script reads ``dataset.xlsx`` from the current working directory, prints
summary statistics, mean-imputes missing numeric values, reports correlation,
covariance and IQR-based outlier counts, and then draws a series of
matplotlib/seaborn figures for the CGST, SGST, IGST and CESS columns.
"""

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Short names used for the four revenue columns once they have been renamed.
TAX_COLUMNS = ['CGST', 'SGST', 'IGST', 'CESS']

# Original workbook headers -> short names used by the rest of the script.
COLUMN_ALIASES = {
    'Central Goods and Services Tax ( CGST ) Revenue': 'CGST',
    'State Goods and Services Tax ( SGST )Revenue': 'SGST',
    'Integrated Goods and Services Tax ( IGST )Revenue': 'IGST',
    'CESS Tax Revenue': 'CESS',
    'srcStateName': 'State',
    'Month': 'MonthFull',
}

# --- 1. Load and inspect -------------------------------------------------
df = pd.read_excel("dataset.xlsx")

# Preview of the first rows.
print(df.head())

# Summary over every column, numeric or not.
summary = df.describe(include='all')
print(summary)

# Per-column count of missing cells.
null_values = df.isna().sum()
print(null_values)

# Replace missing numeric cells with their column mean.
numeric_cols = df.select_dtypes(include=[np.number]).columns
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(column_means)

# Snapshot of the (now imputed) numeric columns for the statistics below.
numeric_frame = df[numeric_cols]

# Pairwise correlation of the numeric columns.
correlation_matrix = numeric_frame.corr()
print(correlation_matrix)

# Pairwise covariance of the numeric columns.
covariance_matrix = numeric_frame.cov()
print(covariance_matrix)

# Quartiles and inter-quartile range per numeric column.
Q1 = numeric_frame.quantile(0.25)
Q3 = numeric_frame.quantile(0.75)
IQR = Q3 - Q1

# Fences at 1.5 * IQR beyond the quartiles.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Count, per column, the cells falling outside the fences.
below_fence = numeric_frame.lt(lower_bound)
above_fence = numeric_frame.gt(upper_bound)
outliers = (below_fence | above_fence).sum()
print("Outliers based on IQR method:\n", outliers)

# Switch to the short column names for everything that follows.
df = df.rename(columns=COLUMN_ALIASES)

# --- 2. Summary Stats ---
print("\n🔹 Descriptive Statistics:\n")
print(df[TAX_COLUMNS].describe())

# --- 3. Correlation Heatmap ---
plt.figure(figsize=(8, 6))
corr = df[TAX_COLUMNS].corr()
sns.heatmap(corr, annot=True, cmap='Greens')
plt.title("Correlation Heatmap of GST Revenues")
plt.tight_layout()
plt.show()

# --- 4. Revenue Distributions ---
# One histogram (with KDE overlay) per tax column on a 2x2 grid.
plt.figure(figsize=(14, 8))
for position, tax_name in enumerate(TAX_COLUMNS, start=1):
    plt.subplot(2, 2, position)
    sns.histplot(df[tax_name], bins=30, kde=True, color='cyan')
    plt.title(f"{tax_name} Distribution")
plt.tight_layout()
plt.show()

# --- 5. Top 10 States ---
# Sum each tax per state, add a row-wise total, keep the ten largest totals.
state_totals = (
    df.groupby('State')[TAX_COLUMNS]
    .sum()
    .assign(Total=lambda totals: totals.sum(axis=1))
)
top_states = state_totals.sort_values(by='Total', ascending=False).iloc[:10]

plt.figure(figsize=(10, 6))
# NOTE(review): recent seaborn releases warn when `palette` is given without
# `hue` — confirm against the seaborn version this project targets.
sns.barplot(x=top_states['Total'], y=top_states.index, palette='pastel')
plt.title("Top 10 States by Total GST Revenue")
plt.xlabel("Total GST Revenue")
plt.ylabel("State")
plt.tight_layout()
plt.show()

# --- 6. Monthly Trends ---
# NOTE(review): groupby sorts its keys; if 'MonthFull' holds month-name
# strings the x-axis will presumably be alphabetical rather than
# chronological — verify against the data.
monthly_trend = df.groupby('MonthFull', as_index=False)[TAX_COLUMNS].sum()

plt.figure(figsize=(12, 6))
for tax_name in TAX_COLUMNS:
    sns.lineplot(x='MonthFull', y=tax_name, data=monthly_trend, label=tax_name)
plt.title("Monthly GST Revenue Trends")
plt.xticks(rotation=45)
plt.ylabel("Revenue")
plt.legend()
plt.tight_layout()
plt.show()

# --- 7. Overall GST Type Share ---
total_share = df[TAX_COLUMNS].sum()
plt.figure(figsize=(6, 6))
plt.pie(
    total_share,
    labels=total_share.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=sns.color_palette("muted"),
)
plt.title("Overall GST Revenue Share by Type")
plt.tight_layout()
plt.show()