├── pollution_data.xlsx
├── README.md
└── index.py


/pollution_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/whokunalupadhyay/eda_project/HEAD/pollution_data.xlsx


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## 📊 Project Overview
 2 | 
 3 | This project performs exploratory data analysis (EDA) on pollution data. The goal is to preprocess the dataset, visualize it, and draw meaningful insights from it. Key steps include data cleaning, handling missing values, and creating informative visualizations using libraries like `pandas`, `matplotlib`, and `seaborn`.
 4 | 
 5 | ## 🔧 Technologies Used
 6 | 
 7 | - Python 3.11
 8 | - pandas
 9 | - matplotlib & seaborn
10 | - Git & GitHub
11 | 
12 | **Author**: Kunal Upadhyay


--------------------------------------------------------------------------------
/index.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import seaborn as sns
  5 | 
  6 | 
  7 | df=pd.read_excel("pollution_data.xlsx")
  8 | 
  9 | # Removing whitespace from column names for ease
 10 | df.columns = df.columns.str.strip()  
 11 | 
 12 | #data summary
 13 | print(df.isnull().sum())
 14 | 
 15 | print("Top 5 rows of dataset\n")
 16 | print(df.head())
 17 | 
 18 | print("Last 5 rows of dataset\n")
 19 | print(df.tail())
 20 | 
 21 | print("Information about the dataset\n")
 22 | print(df.info())
 23 | 
 24 | print("Available columns\n")
 25 | print(df.columns.tolist(),"\n")
 26 | 
 27 | #checking for duplicates
 28 | duplicates=df.duplicated().sum()
 29 | print(duplicates)
 30 | 
 31 | print("Statistical summary\n")
 32 | print(df.describe())
 33 | 
 34 | # Convert 'last_update' to datetime
 35 | df['last_update'] = pd.to_datetime(df['last_update'], errors='coerce')
 36 | 
 37 | # Drop rows with missing avg values
 38 | df = df.dropna(subset=['pollutant_avg'])
 39 | 
 40 | # Group by date and pollutant
 41 | trend_df = df.groupby([df['last_update'].dt.date, 'pollutant_id'])['pollutant_avg'].mean().unstack()
 42 | 
 43 | # Check if data exists
 44 | print("Grouped data preview:")
 45 | print(trend_df.head())
 46 | 
 47 | # Plot
 48 | heatmap_data = df.groupby([df['last_update'].dt.date, 'pollutant_id'])['pollutant_avg'].mean().unstack()
 49 | 
 50 | # Plot heatmap
 51 | plt.figure(figsize=(12, 6))
 52 | sns.heatmap(heatmap_data.T, cmap="YlGnBu", linewidths=0.5, linecolor='gray')
 53 | 
 54 | plt.title("Pollutant Levels Over Time (Heatmap)")
 55 | plt.xlabel("Date")
 56 | plt.ylabel("Pollutant")
 57 | plt.xticks(rotation=45)
 58 | plt.tight_layout()
 59 | plt.show()
 60 | 
 61 | # Group by city and pollutant, then find average values
 62 | hotspot_df = df.groupby(['city', 'pollutant_id'])['pollutant_avg'].mean().unstack()
 63 | 
 64 | # Find cities with highest average PM2.5
 65 | top_cities = hotspot_df.sort_values(by='PM2.5', ascending=False).head(10)
 66 | 
 67 | # Plot
 68 | top_cities.plot(kind='bar', figsize=(12,6), title="Top 10 Pollution Hotspots by PM2.5")
 69 | plt.ylabel("Average PM2.5 Level")
 70 | plt.xticks(rotation=45, ha='right')
 71 | plt.tight_layout()
 72 | plt.show()
 73 | 
 74 | 
 75 | # Average pollutant levels per city
 76 | pollutant_comparison = df.groupby(['city', 'pollutant_id'])['pollutant_avg'].mean().unstack()
 77 | 
 78 | # Plot for selected cities
 79 | selected_cities = ['Delhi', 'Mumbai', 'Chennai', 'Kolkata']
 80 | pollutant_comparison.loc[selected_cities].plot(kind='bar', figsize=(10,6))
 81 | plt.title("Pollutant Comparison Across Major Cities")
 82 | plt.ylabel("Average Level")
 83 | plt.xticks(rotation=45)
 84 | plt.tight_layout()
 85 | plt.show()
 86 | 
 87 | 
 88 | 
 89 | # Extract month from datetime
 90 | df['month'] = df['last_update'].dt.month_name()
 91 | 
 92 | # Boxplot: Seasonal variation for PM2.5
 93 | plt.figure(figsize=(12,6))
 94 | sns.boxplot(x='month', y='pollutant_avg', data=df[df['pollutant_id'] == 'PM2.5'])
 95 | plt.title("Monthly Variation in PM2.5 Levels")
 96 | plt.ylabel("PM2.5")
 97 | plt.xticks(rotation=45)
 98 | plt.tight_layout()
 99 | plt.show()
100 | 
101 | 
102 | # Heatmap of average pollutants per city
103 | 
104 | plt.figure(figsize=(14,8))
105 | sns.heatmap(hotspot_df.fillna(0).T, cmap="Reds", annot=False)
106 | plt.title("Heatmap: Average Pollutant Levels Across Cities")
107 | plt.xlabel("City")
108 | plt.ylabel("Pollutant")
109 | plt.tight_layout()
110 | plt.show()
111 | 
112 | 


--------------------------------------------------------------------------------