# /Build/code.py  (repository layout: Build/code.py, ProjectP.csv, README.md)
"""Project topic: 🌍 Air Quality Trends Analysis in India.

Loads the ProjectP.csv dataset into ``df`` and prints a first exploratory
overview: sample rows, column names, a structural summary, summary statistics
and the missing-value counts.
"""

from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# The dataset sits at the repository root, one directory above this script
# (Build/code.py). Prefer that copy so the script runs on any machine; fall
# back to the original author's absolute path when the repo copy is absent.
_REPO_DATASET = Path(__file__).resolve().parent.parent / "ProjectP.csv"
_LEGACY_DATASET = "C:\\Users\\hp\\Desktop\\ProjectP.csv"
DATA_PATH = _REPO_DATASET if _REPO_DATASET.is_file() else _LEGACY_DATASET

df = pd.read_csv(DATA_PATH)
print(df.head())      # first rows
print(df.tail())      # last rows
print(df.columns)     # all column names
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; printing the bare attribute only showed the method repr.
df.info()
print(df.describe())  # summary statistics of the numeric columns


print("\n")
# Missing values: the full boolean mask, the count per column, then the total.
print(df.isnull())
print(df.isnull().sum())
print(df.isnull().sum().sum())


print("\n")
# Dropping vs. filling the missing values:
# filling is preferred over dropna here, because dropna would throw away a
# whole record (and its useful fields) for the sake of a single null value.
# Forward fill: every missing cell takes the previous row's value from the
# same column. Print the total null count again to confirm what is left.
df = df.ffill()
remaining_nulls = df.isnull().sum().sum()
print(remaining_nulls)


print("\n")
# Duplicates: report how many fully duplicated rows exist, then remove them.
duplicate_flags = df.duplicated()
print(duplicate_flags.sum())
df = df.drop_duplicates()


print("\n")
# Unique values: the distinct states, and how often each pollutant appears.
state_names = df["state"].unique()
print(state_names)
pollutant_counts = df["pollutant_id"].value_counts()
print(pollutant_counts)

print("\n")
print(df.shape)
print(df.size)

# Everything above is the exploratory data analysis (EDA) part of the script.

# Objective 1:
# Analyse air quality trends across cities and states in India over multiple
# years: identify which cities have the highest and lowest AQI values and how
# pollution levels changed over time.

# NOTE(review): to_datetime is left to infer the date format; if last_update
# holds day-first text, confirm the inferred parse is the intended one.
parsed_updates = pd.to_datetime(df["last_update"])
df["last_update"] = parsed_updates
df["year"] = parsed_updates.dt.year  # keep only the year of each reading
# Mean pollutant_avg for every (city, year) pair, then the ten highest and the
# ten lowest of those means.
city_year_avg = (
    df.groupby(['city', 'year'])['pollutant_avg']
    .mean()
    .reset_index()
)
highest = city_year_avg.sort_values(by='pollutant_avg', ascending=False).head(10)
lowest = city_year_avg.sort_values(by='pollutant_avg').head(10)

# Both rankings share one figure: the most polluted on top, the least below.
ranking_panels = (
    (highest, 'Reds_r', 'Top 10 Most Polluted Cities (Highest Average AQI)'),
    (lowest, 'Greens', 'Top 10 Least Polluted Cities (Lowest Average AQI)'),
)
ranking_fig, ranking_axes = plt.subplots(2, 1, figsize=(12, 7))
for panel_ax, (ranking, panel_palette, panel_title) in zip(ranking_axes, ranking_panels):
    sns.barplot(data=ranking, x='pollutant_avg', y='city',
                palette=panel_palette, hue='city', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Average AQI')
    panel_ax.set_ylabel('City')

# tight_layout adjusts the spacing so the two panels do not overlap.
ranking_fig.tight_layout()
plt.show()

# One more chart for this objective: a grouped (clustered) bar chart.

# Keep only the rows of the five cities with the highest mean pollutant_avg.
city_means = df.groupby('city')['pollutant_avg'].mean()
top_cities = city_means.sort_values(ascending=False).head(5).index
filtered_df = df[df['city'].isin(top_cities)]

grouped_fig, grouped_ax = plt.subplots(figsize=(12, 6))
chart = sns.barplot(data=filtered_df, x='city', y='pollutant_avg',
                    hue='state', palette='deep', ax=grouped_ax)

chart.set_title("Grouped Bar Chart - Top 5 Most Polluted Cities by State", fontsize=14)
chart.set_xlabel("City")
chart.set_ylabel("Average Pollution Level")
# Anchor the legend outside the axes, to the right of the bars.
chart.legend(title='State', bbox_to_anchor=(1.05, 1), loc='upper left')
grouped_fig.tight_layout()

plt.show()



# Objective 2:
# Compare the contribution of different pollutants such as PM2.5, NO2, CO and
# SO2 to overall air pollution and determine which pollutant has the most
# significant impact on AQI levels.

pollutant_data = df.groupby('pollutant_id')['pollutant_avg'].mean()
colors = ['indianred', 'cornflowerblue', 'mediumseagreen', 'sandybrown', 'mediumorchid']

# Pie chart with percentage labels, rotated with startangle for readability.
donut_fig, donut_ax = plt.subplots(figsize=(7, 7))
donut_ax.pie(
    pollutant_data,
    labels=pollutant_data.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
)

# A white disc over the centre turns the pie into a donut.
circle = plt.Circle((0, 0), 0.30, color='white')
donut_ax.add_artist(circle)

donut_ax.set_title("Pollution by Type", fontsize=13, color='darkblue',
                   backgroundcolor='lightblue')

donut_fig.tight_layout()
plt.show()


# Objective 3:
# To study seasonal variations in air pollution by analysing AQI data across
# the seasons (winter, summer, monsoon and post-monsoon) and determine which
# season has the highest pollution levels.


# Calendar month (1-12) of every reading. last_update is converted to datetime
# earlier in the script, so this column serves both the box plot and the
# heatmap below without parsing the dates a second time.
df['month'] = df['last_update'].dt.month

# Month number -> season label. The monsoon bucket covers June through
# September; only October and November count as post-monsoon.
_SEASON_BY_MONTH = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Summer', 4: 'Summer', 5: 'Summer',
    6: 'Monsoon', 7: 'Monsoon', 8: 'Monsoon', 9: 'Monsoon',
    10: 'Post-Monsoon', 11: 'Post-Monsoon',
}


def get_season(month):
    """Return the season label for a calendar month number.

    Args:
        month: Month number, 1 (January) through 12 (December). Float month
            values such as 9.0 are matched like their integer counterparts.

    Returns:
        'Winter', 'Summer', 'Monsoon' or 'Post-Monsoon'; ``None`` when
        ``month`` is missing (NaN) or is not a month number, so that such
        rows are not silently filed under a real season.
    """
    return _SEASON_BY_MONTH.get(month)


# Season label for every row.
df['season'] = df['month'].apply(get_season)

# Box plot of pollutant_avg per season.
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='season', y='pollutant_avg', palette='muted',
            hue='season', legend=False)

plt.title("Seasonal Variation in Air Pollution (AQI)", fontsize=16,
          backgroundcolor='lightblue',
          fontweight='bold',
          pad=20
          )
plt.xlabel("Season")
plt.ylabel("Average AQI")

plt.tight_layout()
plt.show()



# A second view for this objective: heatmap of the mean pollutant_avg for each
# (state, month) pair, with states as rows and months as columns.
monthly_avg = df.groupby(['state', 'month'])['pollutant_avg'].mean().unstack()

plt.figure(figsize=(12, 8))
sns.heatmap(monthly_avg, cmap='coolwarm', linewidths=0.3, annot=True, fmt=".1f")

plt.title("Heatmap of Monthly Average AQI by State", fontsize=14)
plt.xlabel("Month")
plt.ylabel("State")
plt.tight_layout()
plt.show()


# Objective 4:
# To rank Indian states by air quality index (AQI): compute the average AQI of
# each state and identify the most and the least polluted states.

state_means = df.groupby('state')['pollutant_avg'].mean()
state_avg = state_means.sort_values(ascending=False)

# Horizontal bar chart, most polluted state first.
rank_fig, rank_ax = plt.subplots(figsize=(10, 7))
sns.barplot(x=state_avg.values, y=state_avg.index, palette='YlGnBu',
            hue=state_avg.index, legend=False, ax=rank_ax)

rank_ax.set_title("Average AQI by Indian States", fontsize=14, backgroundcolor='lightblue')
rank_ax.set_xlabel("Average AQI")
rank_ax.set_ylabel("State")

rank_fig.tight_layout()
plt.show()

# Objective (small part):
# Show how often different AQI levels occur through the overall distribution
# of pollutant_avg, drawn as a histogram with a KDE curve.
hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x='pollutant_avg', kde=True, bins=90, color='yellow', ax=hist_ax)
hist_ax.set_title("Histogram of Average AQI", backgroundcolor='lightblue')
hist_ax.set_xlabel("Average AQI")
hist_ax.set_ylabel("Frequency")
hist_fig.tight_layout()
plt.show()


# Objective 5:
# Compare the spread and distribution of AQI levels across Indian states with
# a violin plot.
violin_fig, violin_ax = plt.subplots(figsize=(12, 6))

sns.violinplot(data=df, x='state', y='pollutant_avg', palette='pastel',
               hue='state', legend=False, ax=violin_ax)

violin_ax.set_title("AQI Distribution Across Indian States",
                    backgroundcolor='lightblue', fontsize=14)
violin_ax.set_xlabel("State")
violin_ax.set_ylabel("Average AQI")

# Rotate the state names and right-align them so they do not run into each
# other.
plt.xticks(rotation=60, ha='right')

violin_fig.tight_layout()
plt.show()

# Objective 6:
# To compare different air quality values
# (like minimum, maximum, and average pollution) for each state in India, and
# to see if there are any patterns, similarities, or differences between them.

sns.set_palette('pastel')

# pairplot draws every numeric column against every other one. 'year' and
# 'month' are helper columns this script derives from last_update, not air
# quality readings, so they are left out of the grid; errors="ignore" keeps
# this step working even when those helper columns are absent.
indicator_df = df.drop(columns=['year', 'month'], errors='ignore')

pair_plot = sns.pairplot(
    indicator_df,
    hue='state',        # categorical column used to colour the points
    height=1.7,
    diag_kind='kde',
)

# Title above the grid. `figure` is the supported accessor for the underlying
# matplotlib Figure; `fig` is its deprecated alias.
pair_plot.figure.suptitle('Pairwise Relationships of Air Quality Indicators', y=1.02)

plt.show()

# Objective 7:
# Visualise the relationship between the average and the maximum AQI values
# across Indian states with a scatter plot.

plt.figure(figsize=(8, 6))

sns.scatterplot(data=df, x='pollutant_avg', y='pollutant_max', hue='state',
                palette='colorblind')

plt.title("Scatter Plot: Avg vs Max AQI by State", fontsize=14, backgroundcolor='lightblue')
plt.xlabel("Average AQI")
plt.ylabel("Maximum AQI")
plt.tight_layout()
plt.show()

# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# 1 | # Air-Pollution-Analysis-Python-Project
# 2 | This project analyzes air quality data from various cities and states across India. It uses Python libraries like Pandas, Matplotlib, and Seaborn to perform Exploratory Data Analysis (EDA) and derive insights about air pollution trends, seasonal variations, state and city comparisons, and pollutant contributions.
# 3 | 📁 Dataset
# 4 | File used: ProjectP.csv
# 5 |
# 6 | The dataset contains information about:
# 7 | City and state names
# 8 | Different pollutants like PM2.5, NO2, CO, SO2, etc.
9 | Air Quality Index (AQI) values (average, min, and max) 10 | Last updated date for each record 11 | 12 | 🧪 Tools & Libraries 13 | pandas – for data manipulation 14 | matplotlib – for creating visualizations 15 | seaborn – for enhanced statistical plots 16 | 17 | 🧼 Data Preprocessing 18 | Displayed initial rows using head() and tail() 19 | Checked column names, null values, and summary statistics 20 | Filled missing values using forward fill (ffill) 21 | Removed duplicate records 22 | Extracted new columns like year, month, and season 23 | 24 | 🎯 Objectives and Insights 25 | 26 | ✅ Objective 1: Analyze Air Quality Trends Over Time 27 | Visualized cities with the highest and lowest AQI over multiple years. 28 | Identified the top 10 most polluted and the top 10 least polluted cities. 29 | Created grouped bar charts for comparison across cities and states. 30 | 31 | ✅ Objective 2: Contribution of Different Pollutants 32 | Used a donut-style pie chart to compare pollutants like PM2.5, NO2, CO, and SO2. 33 | Found out which pollutants have a greater contribution to AQI. 34 | 35 | ✅ Objective 3: Seasonal Variation in Air Pollution 36 | Categorized months into seasons: Winter, Summer, Monsoon, Post-Monsoon. 37 | Visualized seasonal AQI levels using boxplots and heatmaps. 38 | Found which season has the highest pollution levels. 39 | 40 | ✅ Objective 4: Ranking Indian States by AQI 41 | Calculated average AQI per state. 42 | Visualized rankings using horizontal bar charts. 43 | 44 | ✅ Objective 5: Distribution of AQI Levels 45 | Used a histogram to show how frequently different AQI levels occur. 46 | 47 | ✅ Objective 6: Spread of AQI Across States 48 | Used violin plots to visualize AQI distribution across different Indian states. 49 | 50 | ✅ Objective 7: Compare Air Quality Stats (Min, Max, Avg) 51 | Created a pairplot to study relationships between different AQI stats for each state. 52 | Used a scatter plot to explore the relation between average and maximum AQI values. 
53 | 54 | 55 | 📌 Key Insights 56 | Certain cities consistently report higher AQI levels across years. 57 | Winter tends to have the worst air quality due to various environmental and human factors. 58 | PM2.5 contributes the most to overall pollution in many cities. 59 | States like Delhi and Uttar Pradesh show consistently high average AQI. 60 | 61 | 62 | 📷 Visualizations 63 | The project includes: 64 | Bar plots 65 | Pie chart (donut style) 66 | Boxplots and violin plots 67 | Heatmaps 68 | Pair plots 69 | Scatter plots 70 | All plots are designed to highlight trends, make comparisons, and reveal patterns in air pollution data. 71 | 72 | 📚 How to Run 73 | Make sure you have Python installed. 74 | 75 | Install required libraries: 76 | pip install pandas matplotlib seaborn 77 | Place the dataset ProjectP.csv in the specified path or update the path in the code. 78 | Run the Python script in your IDE or terminal. 79 | 80 | 81 | 🙋‍♀️ Author 82 | Easha Sharma 83 | Student of BTech (Computer Science and Engineering) 84 | Passionate about data analysis and environmental awareness. 85 | --------------------------------------------------------------------------------