"""Exploratory data analysis (EDA) of pollution data.

Repository layout::

    ├── pollution_data.xlsx
    ├── README.md
    └── index.py

Data file:
https://raw.githubusercontent.com/whokunalupadhyay/eda_project/HEAD/pollution_data.xlsx

Project overview (from README.md)
---------------------------------
This project involves exploratory data analysis (EDA) on pollution-related
data. The goal is to preprocess, visualize, and draw meaningful insights from
the dataset. Key steps include data cleaning, handling missing values, and
creating informative visualizations using libraries like `pandas` and
`matplotlib`.

Technologies used
-----------------
- Python 3.11
- pandas
- matplotlib
- Git & GitHub

Author: Kunal Upadhyay

What this script does
---------------------
1. Loads ``pollution_data.xlsx`` from the current working directory and prints
   a textual summary (missing values, head/tail, info, columns, duplicates,
   descriptive statistics).
2. Parses ``last_update`` as datetimes and drops rows without
   ``pollutant_avg``.
3. Shows five figures: pollutant-by-date heatmap, top-10 cities ranked by
   PM2.5, a comparison of selected major cities, monthly PM2.5 boxplot, and a
   pollutant-by-city heatmap.

The script reads the columns ``last_update``, ``pollutant_avg``,
``pollutant_id`` and ``city`` from the workbook.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


# English month names in calendar order. These match the strings produced by
# ``Series.dt.month_name()`` when no locale is passed, and are used to put the
# monthly boxplot categories in calendar order.
CALENDAR_MONTHS = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]


df = pd.read_excel("pollution_data.xlsx")

# Removing whitespace from column names for ease
df.columns = df.columns.str.strip()

# ---------------------------------------------------------------------------
# Data summary
# ---------------------------------------------------------------------------

# Number of missing values per column
print(df.isnull().sum())

print("Top 5 rows of dataset\n")
print(df.head())

print("Last 5 rows of dataset\n")
print(df.tail())

print("Information about the dataset\n")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" line.
df.info()

print("Available columns\n")
print(df.columns.tolist(), "\n")

# Checking for duplicates (count of fully duplicated rows)
duplicates = df.duplicated().sum()
print(duplicates)

print("Statistical summary\n")
print(df.describe())

# ---------------------------------------------------------------------------
# Cleaning
# ---------------------------------------------------------------------------

# Convert 'last_update' to datetime; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
df['last_update'] = pd.to_datetime(df['last_update'], errors='coerce')

# Drop rows with missing avg values
df = df.dropna(subset=['pollutant_avg'])

# ---------------------------------------------------------------------------
# Pollutant levels over time
# ---------------------------------------------------------------------------

# Group by date and pollutant: rows are dates, columns are pollutant ids,
# values are the mean of 'pollutant_avg'.
trend_df = (
    df.groupby([df['last_update'].dt.date, 'pollutant_id'])['pollutant_avg']
    .mean()
    .unstack()
)

# Check if data exists
print("Grouped data preview:")
print(trend_df.head())

# The heatmap uses exactly the table computed above, so it is reused rather
# than running the same groupby a second time.
heatmap_data = trend_df

# Plot heatmap (transposed so pollutants are rows and dates are columns)
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data.T, cmap="YlGnBu", linewidths=0.5, linecolor='gray')

plt.title("Pollutant Levels Over Time (Heatmap)")
plt.xlabel("Date")
plt.ylabel("Pollutant")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Pollution hotspots
# ---------------------------------------------------------------------------

# Group by city and pollutant, then find average values
hotspot_df = df.groupby(['city', 'pollutant_id'])['pollutant_avg'].mean().unstack()

# Find cities with highest average PM2.5
top_cities = hotspot_df.sort_values(by='PM2.5', ascending=False).head(10)

# Plot
# NOTE(review): this draws one bar per pollutant column for each of the ten
# cities, while the y-label below names PM2.5 only. Confirm whether the chart
# is meant to show all pollutants or just the PM2.5 column.
top_cities.plot(kind='bar', figsize=(12,6), title="Top 10 Pollution Hotspots by PM2.5")
plt.ylabel("Average PM2.5 Level")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Comparison across selected major cities
# ---------------------------------------------------------------------------

# Average pollutant levels per city. This is the same city x pollutant table
# as hotspot_df and is only read below, so the table is reused.
pollutant_comparison = hotspot_df

# Plot for selected cities
selected_cities = ['Delhi', 'Mumbai', 'Chennai', 'Kolkata']

# .loc with a list raises KeyError if any label is absent from the index, so
# restrict the selection to the cities that actually occur in the data.
available_cities = [c for c in selected_cities if c in pollutant_comparison.index]
missing_cities = [c for c in selected_cities if c not in pollutant_comparison.index]
if missing_cities:
    print(f"Selected cities not found in dataset (skipped): {missing_cities}")

if available_cities:
    pollutant_comparison.loc[available_cities].plot(kind='bar', figsize=(10,6))
    plt.title("Pollutant Comparison Across Major Cities")
    plt.ylabel("Average Level")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("None of the selected cities are present; skipping city comparison plot.")

# ---------------------------------------------------------------------------
# Monthly variation
# ---------------------------------------------------------------------------

# Extract month from datetime
df['month'] = df['last_update'].dt.month_name()

# Rows for PM2.5 only
pm25_df = df[df['pollutant_id'] == 'PM2.5']

# Put the months that occur in the data in calendar order; without an explicit
# order the categories would follow their order of appearance in the data.
months_present = set(pm25_df['month'].dropna())
month_order = [m for m in CALENDAR_MONTHS if m in months_present]

# Boxplot: Seasonal variation for PM2.5
plt.figure(figsize=(12,6))
# `or None` falls back to seaborn's default ordering when no month is present.
sns.boxplot(x='month', y='pollutant_avg', data=pm25_df, order=month_order or None)
plt.title("Monthly Variation in PM2.5 Levels")
plt.ylabel("PM2.5")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Heatmap of average pollutants per city
# ---------------------------------------------------------------------------

plt.figure(figsize=(14,8))
# City/pollutant combinations with no data are drawn as 0 here (fillna(0)).
sns.heatmap(hotspot_df.fillna(0).T, cmap="Reds", annot=False)
plt.title("Heatmap: Average Pollutant Levels Across Cities")
plt.xlabel("City")
plt.ylabel("Pollutant")
plt.tight_layout()
plt.show()