"""Exploratory analysis of an air-quality CSV.

Loads the dataset, drops rows without location information, and then
produces a series of summary plots (by location, area type, season and
year) plus a Welch t-test between the "Urban" and "Suburban" area types.

Usage:
    python script.py [path/to/Air_Quality.csv]

When no path is given, ``DEFAULT_CSV_PATH`` is used.
"""
import re
import sys

import numpy as np
import pandas as pd
from scipy.stats import ttest_ind

# Location used when no CSV path is passed on the command line.
DEFAULT_CSV_PATH = r"C:\Users\yusuf raja\Downloads\Air_Quality.csv"

# A standalone four-digit number, e.g. the "2008" in "Winter 2008-09".
_YEAR_PATTERN = re.compile(r"(?<!\d)(\d{4})(?!\d)")


def clean_air_quality(raw):
    """Return a cleaned copy of *raw* without modifying it.

    Column names are stripped of surrounding whitespace and rows missing
    either "Geo Place Name" or "Geo Join ID" are dropped.
    """
    renamed = raw.rename(columns=str.strip)
    return renamed.dropna(subset=["Geo Place Name", "Geo Join ID"]).copy()


def load_clean_data(csv_path):
    """Read the CSV at *csv_path* and return it cleaned (see clean_air_quality)."""
    return clean_air_quality(pd.read_csv(csv_path))


def extract_season(time_period):
    """Return the first word of a multi-word period label.

    Labels without a space, missing values, and non-string values (for
    example a numeric year) are returned unchanged instead of raising.
    """
    if isinstance(time_period, str) and " " in time_period:
        tokens = time_period.split()
        # A whitespace-only label has no tokens; fall through and keep it.
        if tokens:
            return tokens[0]
    return time_period


def extract_year(time_period):
    """Return the first four-digit year found in a period label, or None.

    Unlike taking the last whitespace-separated token, this also handles
    labels with no space at all ("2005") and ranges ("Winter 2008-09",
    "2005-2007"); for ranges the starting year is used.
    """
    if pd.isna(time_period):
        return None
    found = _YEAR_PATTERN.search(str(time_period))
    return int(found.group(1)) if found else None


def urban_suburban_ttest(df_clean):
    """Print and return a Welch t-test of "Data Value" for Urban vs Suburban.

    Returns:
        ``(t_stat, p_value)`` when each group has at least two values,
        otherwise ``None`` (a message is printed in that case).
    """
    urban = df_clean[df_clean["Geo Type Name"] == "Urban"]["Data Value"].dropna()
    suburban = df_clean[df_clean["Geo Type Name"] == "Suburban"]["Data Value"].dropna()

    print(f"Urban sample size: {len(urban)}")
    print(f"Suburban sample size: {len(suburban)}")

    if len(urban) < 2 or len(suburban) < 2:
        print("❌ Not enough data to perform T-test.")
        return None

    # equal_var=False selects Welch's t-test (no equal-variance assumption).
    t_stat, p_value = ttest_ind(urban, suburban, equal_var=False)
    print("\nT-Test: Urban vs Suburban Air Pollution Levels")
    print(f"T-statistic: {t_stat:.4f}")
    print(f"P-value: {p_value:.4f}")
    if p_value < 0.05:
        print("→ Significant difference between Urban and Suburban air pollution levels.")
    else:
        print("→ No significant difference between Urban and Suburban air pollution levels.")
    return t_stat, p_value


def main(csv_path=DEFAULT_CSV_PATH):
    """Run the full analysis on the CSV at *csv_path*, showing each plot in turn."""
    # Imported here rather than at module level so the data helpers above
    # can be imported without a plotting stack installed.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # ===============================
    # 1. Data Cleaning
    # ===============================
    df_clean = load_clean_data(csv_path)
    print("Missing values:\n", df_clean.isnull().sum())

    # ===============================
    # 2. Compare Air Quality in Different Locations
    # ===============================
    plt.figure(figsize=(14, 6))
    top_locations = (
        df_clean.groupby("Geo Place Name")["Data Value"]
        .mean()
        .sort_values(ascending=False)
        .head(10)
    )
    sns.barplot(
        x=top_locations.values,
        y=top_locations.index,
        hue=top_locations.index,
        palette="coolwarm",
        dodge=False,
        legend=False,
    )
    plt.title("Top 10 Locations with Highest Average Air Pollution")
    plt.xlabel("Average Pollution Level")
    plt.ylabel("Location")
    plt.tight_layout()
    plt.show()

    # ===============================
    # 3. Spot Areas with the Worst Air Pollution
    # ===============================
    worst = df_clean.sort_values("Data Value", ascending=False).head(10)
    print(
        "Top 10 Most Polluted Records:\n",
        worst[["Geo Place Name", "Name", "Data Value", "Time Period"]],
    )

    # ===============================
    # 4. Compare Pollution Between Area Types
    # ===============================
    plt.figure(figsize=(12, 6))
    sns.boxplot(x="Geo Type Name", y="Data Value", data=df_clean)
    plt.title("Air Pollution Comparison by Area Type")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # ===============================
    # 5. Seasonal Pollution Trends
    # ===============================
    df_clean["Season"] = df_clean["Time Period"].apply(extract_season)

    plt.figure(figsize=(10, 5))
    sns.boxplot(x="Season", y="Data Value", data=df_clean)
    plt.title("Pollution Levels by Season")
    plt.tight_layout()
    plt.show()

    # ===============================
    # Additional Analysis 1: T-Test between two area types
    # ===============================
    urban_suburban_ttest(df_clean)

    # ===============================
    # Additional Analysis 2: Heatmap of Season vs Geo Type Name
    # ===============================
    pivot_table = df_clean.pivot_table(
        values="Data Value", index="Geo Type Name", columns="Season", aggfunc="mean"
    )

    plt.figure(figsize=(10, 6))
    sns.heatmap(pivot_table, annot=True, fmt=".1f", cmap="YlGnBu", linewidths=.5)
    plt.title("Average Pollution by Season and Area Type")
    plt.tight_layout()
    plt.show()

    # ===============================
    # Additional Analysis 3: Pollution trend by year
    # ===============================
    # to_numeric turns the None results (no year found) into NaN.
    df_clean["Year"] = pd.to_numeric(
        df_clean["Time Period"].apply(extract_year), errors="coerce"
    )

    plt.figure(figsize=(12, 5))
    sns.scatterplot(x="Year", y="Data Value", data=df_clean, alpha=0.4)
    sns.lineplot(
        x="Year",
        y="Data Value",
        data=df_clean,
        estimator="mean",
        errorbar=None,
        color="red",
        label="Mean Trend",
    )
    plt.title("Pollution Levels Over Time")
    plt.tight_layout()
    plt.show()

    # ===============================
    # Additional Analysis 4: Scatter Plot – Pollution vs Year by Area Type
    # ===============================
    plt.figure(figsize=(12, 6))
    sns.scatterplot(
        data=df_clean.dropna(subset=["Year", "Data Value"]),
        x="Year",
        y="Data Value",
        hue="Geo Type Name",
        alpha=0.6,
        palette="Set2",
    )
    plt.title("Scatter Plot: Pollution Level vs Year by Area Type")
    plt.xlabel("Year")
    plt.ylabel("Pollution Level (Data Value)")
    plt.legend(title="Area Type")
    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    main(sys.argv[1] if len(sys.argv) > 1 else DEFAULT_CSV_PATH)