├── dataset.xlsx
├── project report.docx
└── ca2project.py


/dataset.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vatsalrustagi277/Data-Visualization/HEAD/dataset.xlsx


--------------------------------------------------------------------------------
/project report.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vatsalrustagi277/Data-Visualization/HEAD/project report.docx


--------------------------------------------------------------------------------
/ca2project.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import seaborn as sns
  4 | import matplotlib.pyplot as plt
  5 | from scipy import stats
  6 | 
  7 | # Load the dataset
  8 | df = pd.read_excel("dataset.xlsx")
  9 | 
 10 | # Display the first few rows of the dataset
 11 | print(df.head())
 12 | 
 13 | # Get a summary of the dataset
 14 | summary = df.describe(include='all')
 15 | print(summary)
 16 | 
 17 | # Check for null values
 18 | null_values = df.isnull().sum()
 19 | print(null_values)
 20 | 
 21 | # Fill numerical null values with the mean
 22 | numeric_cols = df.select_dtypes(include=[np.number]).columns
 23 | df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())
 24 | 
 25 | 
 26 | # Calculate correlation matrix only for numeric columns
 27 | correlation_matrix = df[numeric_cols].corr()
 28 | print(correlation_matrix)
 29 | 
 30 | # Calculate covariance matrix only for numeric columns
 31 | covariance_matrix = df[numeric_cols].cov()
 32 | print(covariance_matrix)
 33 | 
 34 | # Calculate IQR for numerical columns
 35 | Q1 = df[numeric_cols].quantile(0.25)
 36 | Q3 = df[numeric_cols].quantile(0.75)
 37 | IQR = Q3 - Q1
 38 | 
 39 | # Define outlier bounds
 40 | lower_bound = Q1 - 1.5 * IQR
 41 | upper_bound = Q3 + 1.5 * IQR
 42 | 
 43 | # Identify outliers
 44 | outliers = ((df[numeric_cols] < lower_bound) | (df[numeric_cols] > upper_bound)).sum()
 45 | print("Outliers based on IQR method:\n", outliers)
 46 | 
 47 | df.rename(columns={
 48 |     'Central Goods and Services Tax ( CGST ) Revenue': 'CGST',
 49 |     'State Goods and Services Tax ( SGST )Revenue': 'SGST',
 50 |     'Integrated Goods and Services Tax ( IGST )Revenue': 'IGST',
 51 |     'CESS Tax Revenue': 'CESS',
 52 |     'srcStateName': 'State',
 53 |     'Month': 'MonthFull'
 54 | }, inplace=True)
 55 | 
 56 | # --- 2. Summary Stats ---
 57 | print("\n🔹 Descriptive Statistics:\n")
 58 | print(df[['CGST', 'SGST', 'IGST', 'CESS']].describe())
 59 | 
 60 | # --- 3. Correlation Heatmap ---
 61 | plt.figure(figsize=(8, 6))
 62 | corr = df[['CGST', 'SGST', 'IGST', 'CESS']].corr()
 63 | sns.heatmap(corr, annot=True, cmap='Greens')
 64 | plt.title("Correlation Heatmap of GST Revenues")
 65 | plt.tight_layout()
 66 | plt.show()
 67 | 
 68 | # --- 4. Revenue Distributions ---
 69 | plt.figure(figsize=(14, 8))
 70 | for i, col in enumerate(['CGST', 'SGST', 'IGST', 'CESS']):
 71 |     plt.subplot(2, 2, i + 1)
 72 |     sns.histplot(df[col], bins=30, kde=True, color='cyan')
 73 |     plt.title(f"{col} Distribution")
 74 | plt.tight_layout()
 75 | plt.show()
 76 | 
 77 | state_totals = df.groupby('State')[['CGST', 'SGST', 'IGST', 'CESS']].sum()
 78 | state_totals['Total'] = state_totals.sum(axis=1)
 79 | top_states = state_totals.sort_values('Total', ascending=False).head(10)
 80 | 
 81 | plt.figure(figsize=(10, 6))
 82 | sns.barplot(x=top_states['Total'], y=top_states.index, palette='pastel')
 83 | plt.title("Top 10 States by Total GST Revenue")
 84 | plt.xlabel("Total GST Revenue")
 85 | plt.ylabel("State")
 86 | plt.tight_layout()
 87 | plt.show()
 88 | 
 89 | # --- 6. Monthly Trends ---
 90 | monthly_trend = df.groupby('MonthFull')[['CGST', 'SGST', 'IGST', 'CESS']].sum()
 91 | monthly_trend = monthly_trend.reset_index()
 92 | 
 93 | plt.figure(figsize=(12, 6))
 94 | for tax in ['CGST', 'SGST', 'IGST', 'CESS']:
 95 |     sns.lineplot(x='MonthFull', y=tax, data=monthly_trend, label=tax)
 96 | plt.title("Monthly GST Revenue Trends")
 97 | plt.xticks(rotation=45)
 98 | plt.ylabel("Revenue")
 99 | plt.legend()
100 | plt.tight_layout()
101 | plt.show()
102 | 
103 | # --- 7. Overall GST Type Share ---
104 | total_share = df[['CGST', 'SGST', 'IGST', 'CESS']].sum()
105 | plt.figure(figsize=(6, 6))
106 | plt.pie(total_share, labels=total_share.index, autopct='%1.1f%%', startangle=140, colors=sns.color_palette("muted"))
107 | plt.title("Overall GST Revenue Share by Type")
108 | plt.tight_layout()
109 | plt.show()
110 | 


--------------------------------------------------------------------------------