└── README.md


/README.md:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import matplotlib.pyplot as plt
  4 | import seaborn as sns
  5 | from scipy.stats import ttest_ind
  6 | 
  7 | # Load dataset
  8 | df = pd.read_csv(r"C:\Users\yusuf raja\Downloads\Air_Quality.csv")
  9 | 
 10 | # Optional: Clean column names
 11 | df.columns = df.columns.str.strip()
 12 | 
 13 | # ===============================
 14 | # 1.Data Cleaning
 15 | # ===============================
 16 | # Drop rows with missing location info
 17 | df_clean = df.dropna(subset=["Geo Place Name", "Geo Join ID"]).copy()
 18 | 
 19 | # Check for missing values
 20 | print("Missing values:\n", df_clean.isnull().sum())
 21 | 
 22 | # ===============================
 23 | # 2.Compare Air Quality in Different Locations
 24 | # ===============================
 25 | plt.figure(figsize=(14, 6))
 26 | top_locations = df_clean.groupby("Geo Place Name")["Data Value"].mean().sort_values(ascending=False).head(10)
 27 | 
 28 | sns.barplot(x=top_locations.values, y=top_locations.index, hue=top_locations.index, palette="coolwarm", dodge=False, legend=False)
 29 | plt.title("Top 10 Locations with Highest Average Air Pollution")
 30 | plt.xlabel("Average Pollution Level")
 31 | plt.ylabel("Location")
 32 | plt.tight_layout()
 33 | plt.show()
 34 | 
 35 | # ===============================
 36 | # 3.Spot Areas with the Worst Air Pollution
 37 | # ===============================
 38 | worst = df_clean.sort_values("Data Value", ascending=False).head(10)
 39 | print("Top 10 Most Polluted Records:\n", worst[["Geo Place Name", "Name", "Data Value", "Time Period"]])
 40 | 
 41 | # ===============================
 42 | # 4.Compare Pollution Between Area Types
 43 | # ===============================
 44 | plt.figure(figsize=(12, 6))
 45 | sns.boxplot(x="Geo Type Name", y="Data Value", data=df_clean)
 46 | plt.title("Air Pollution Comparison by Area Type")
 47 | plt.xticks(rotation=45)
 48 | plt.tight_layout()
 49 | plt.show()
 50 | 
 51 | # ===============================
 52 | # 5.Seasonal Pollution Trends
 53 | # ===============================
 54 | df_clean["Season"] = df_clean["Time Period"].apply(lambda x: x.split()[0] if pd.notnull(x) and " " in x else x)
 55 | 
 56 | plt.figure(figsize=(10, 5))
 57 | sns.boxplot(x="Season", y="Data Value", data=df_clean)
 58 | plt.title("Pollution Levels by Season")
 59 | plt.tight_layout()
 60 | plt.show()
 61 | 
 62 | # ===============================
 63 | # Additional Analysis 1: T-Test between two area types
 64 | # ===============================
 65 | urban = df_clean[df_clean["Geo Type Name"] == "Urban"]["Data Value"].dropna()
 66 | suburban = df_clean[df_clean["Geo Type Name"] == "Suburban"]["Data Value"].dropna()
 67 | 
 68 | print(f"Urban sample size: {len(urban)}")
 69 | print(f"Suburban sample size: {len(suburban)}")
 70 | 
 71 | if len(urban) >= 2 and len(suburban) >= 2:
 72 |     t_stat, p_value = ttest_ind(urban, suburban, equal_var=False)
 73 |     print("\nT-Test: Urban vs Suburban Air Pollution Levels")
 74 |     print(f"T-statistic: {t_stat:.4f}")
 75 |     print(f"P-value: {p_value:.4f}")
 76 |     if p_value < 0.05:
 77 |         print("→ Significant difference between Urban and Suburban air pollution levels.")
 78 |     else:
 79 |         print("→ No significant difference between Urban and Suburban air pollution levels.")
 80 | else:
 81 |     print("❌ Not enough data to perform T-test.")
 82 | 
 83 | # ===============================
 84 | # Additional Analysis 2: Heatmap of Season vs Geo Type Name
 85 | # ===============================
 86 | pivot_table = df_clean.pivot_table(values="Data Value", index="Geo Type Name", columns="Season", aggfunc="mean")
 87 | 
 88 | plt.figure(figsize=(10, 6))
 89 | sns.heatmap(pivot_table, annot=True, fmt=".1f", cmap="YlGnBu", linewidths=.5)
 90 | plt.title("Average Pollution by Season and Area Type")
 91 | plt.tight_layout()
 92 | plt.show()
 93 | 
 94 | # ===============================
 95 | # Additional Analysis 3: Met-style plot – Pollution trend by year
 96 | # ===============================
 97 | df_clean["Year"] = df_clean["Time Period"].apply(lambda x: x.split()[-1] if pd.notnull(x) and " " in x else None)
 98 | df_clean["Year"] = pd.to_numeric(df_clean["Year"], errors='coerce')
 99 | 
100 | plt.figure(figsize=(12, 5))
101 | sns.scatterplot(x="Year", y="Data Value", data=df_clean, alpha=0.4)
102 | sns.lineplot(x="Year", y="Data Value", data=df_clean, estimator='mean', errorbar=None, color='red', label="Mean Trend")
103 | plt.title("Pollution Levels Over Time")
104 | plt.tight_layout()
105 | plt.show()
106 | 
# ===============================
# Additional Analysis 4: Scatter Plot – Pollution vs Year by Area Type
# ===============================
# Only rows with both coordinates present are plotted; relies on the "Year"
# column having been added to df_clean beforehand.
plottable = df_clean.dropna(subset=["Year", "Data Value"])

fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=plottable,
    x="Year",
    y="Data Value",
    hue="Geo Type Name",
    alpha=0.6,
    palette="Set2",
    ax=ax,
)
ax.set_title("Scatter Plot: Pollution Level vs Year by Area Type")
ax.set_xlabel("Year")
ax.set_ylabel("Pollution Level (Data Value)")
# Re-create the legend so it carries a readable title.
plt.legend(title="Area Type")
fig.tight_layout()
plt.show()
118 | 
119 | 
120 | 


--------------------------------------------------------------------------------