├── README.md ├── Screenshot 2024-09-24 161731.png ├── Screenshot 2025-04-11 234358.png ├── Screenshot 2025-04-11 234411.png ├── Screenshot 2025-04-11 234423.png ├── Screenshot 2025-04-11 234436.png └── project-code2.py /README.md: -------------------------------------------------------------------------------- 1 | # Data-Analysis-using-Python -------------------------------------------------------------------------------- /Screenshot 2024-09-24 161731.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2024-09-24 161731.png -------------------------------------------------------------------------------- /Screenshot 2025-04-11 234358.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2025-04-11 234358.png -------------------------------------------------------------------------------- /Screenshot 2025-04-11 234411.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2025-04-11 234411.png -------------------------------------------------------------------------------- /Screenshot 2025-04-11 234423.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2025-04-11 234423.png -------------------------------------------------------------------------------- /Screenshot 2025-04-11 234436.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2025-04-11 234436.png -------------------------------------------------------------------------------- /project-code2.py: 
"""Exploratory analysis of an air-pollution readings CSV.

The script reads the columns ``state``, ``city``, ``station``, ``latitude``,
``longitude``, ``pollutant_id``, ``pollutant_min``, ``pollutant_max``,
``pollutant_avg`` and ``last_update`` and produces five figures:

1. Average pollution level per state (horizontal bar chart).
2. The ten least and ten most polluted cities (vertical bar chart).
3. Pollution intensity per city location (scatter plot).
4. Share of readings per pollutant type (donut chart).
5. Linear regression of ``pollutant_avg`` on ``pollutant_min`` and
   ``pollutant_max``; the figure is also saved as ``pollution_regression.png``.
"""
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


# Load data
file_path = r"C:\Users\akhil\OneDrive\Desktop\INT-375\project\PROJECT.csv"
df = pd.read_csv(file_path)

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()
print(df.describe())

# Handle Missing Values
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df[...] selection: the chained in-place form is deprecated and does not
# update `df` when pandas Copy-on-Write is active.
for column in ('pollutant_id', 'station'):
    # Categorical columns: fill with the most frequent value.
    df[column] = df[column].fillna(df[column].mode()[0])
for column in ('pollutant_avg', 'pollutant_min', 'pollutant_max'):
    # Numeric columns: fill with the column mean.
    df[column] = df[column].fillna(df[column].mean())
# Unparseable timestamps become NaT rather than raising.
df['last_update'] = pd.to_datetime(df['last_update'], errors='coerce')


# Objective 1: Statewise Average Pollution Levels (Bar Chart)
state_avg = df.groupby('state')['pollutant_avg'].mean().sort_values()

# Plot horizontal bar chart
plt.figure(figsize=(10, 8))
state_avg.plot(kind='barh', color='skyblue')
plt.title('Average Pollution Levels by State')
plt.xlabel('Average Pollutant Level')
plt.ylabel('State')
plt.tight_layout()
plt.show()

# Objective 2: Top and Bottom Cities by Pollution Level (Vertical Bar Chart)
LOW_COLOR = '#1f77b4'
HIGH_COLOR = '#ff7f0e'
city_avg = df.groupby('city')['pollutant_avg'].mean().sort_values()
top_10 = city_avg.tail(10)
bottom_10 = city_avg.head(10)
combined = pd.concat([bottom_10, top_10])
# Size the colour list from the actual group lengths so the colouring stays
# correct when the data holds fewer than ten cities per group.
bar_colors = [LOW_COLOR] * len(bottom_10) + [HIGH_COLOR] * len(top_10)
plt.figure(figsize=(14, 7))
bars = plt.bar(combined.index, combined.values,
               color=bar_colors, edgecolor='black')
plt.title('Top 10 and Bottom 10 Cities by Average Pollution Level',
          fontsize=16, weight='bold', pad=20)
plt.xlabel('City', fontsize=14)
plt.ylabel('Average Pollutant Level', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=10)
# Print each bar's value just above it.
for bar in bars:
    height = bar.get_height()
    plt.text(bar.get_x() + bar.get_width() / 2, height + 0.5, f'{height:.1f}',
             ha='center', va='bottom', fontsize=10)
# Use explicit proxy patches: a label-only legend pairs the labels with the
# first artists on the axes, so both entries would show the bottom-10 colour.
legend_handles = [
    plt.Rectangle((0, 0), 1, 1, facecolor=LOW_COLOR, edgecolor='black'),
    plt.Rectangle((0, 0), 1, 1, facecolor=HIGH_COLOR, edgecolor='black'),
]
plt.legend(legend_handles,
           ['Bottom 10 (Low Pollution)', 'Top 10 (High Pollution)'],
           fontsize=12, loc='upper left')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.tight_layout()
plt.show()

# Objective 3: Pollution Intensity Map (Geographical Scatter Plot)
geo_data = (
    df.groupby(['city', 'latitude', 'longitude'])['pollutant_avg']
    .mean()
    .reset_index()
)
plt.figure(figsize=(12, 8))
# Marker size and colour both encode the average pollutant level.
scatter = plt.scatter(geo_data['longitude'], geo_data['latitude'],
                      s=geo_data['pollutant_avg'] * 20,
                      c=geo_data['pollutant_avg'],
                      cmap='crest', alpha=0.7, edgecolor='black')
plt.colorbar(scatter, label='Average Pollutant Level', shrink=0.8)
plt.title('Pollution Intensity Across Cities', fontsize=16, weight='bold', pad=20)
plt.xlabel('Longitude', fontsize=14)
plt.ylabel('Latitude', fontsize=14)
plt.gca().set_facecolor('#f5f5f5')
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.tight_layout()
plt.show()

# Objective 4: Pollutant Distribution by Type (Donut Chart)
pollutant_counts = df['pollutant_id'].value_counts()
plt.figure(figsize=(10, 10))
wedges, texts, autotexts = plt.pie(pollutant_counts, labels=pollutant_counts.index,
                                   autopct='%1.1f%%', startangle=90,
                                   colors=sb.color_palette('Set2'),
                                   pctdistance=0.85, textprops={'fontsize': 12})
# A white disc over the pie centre turns it into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
plt.gca().add_artist(centre_circle)
plt.title('Distribution of Pollutant Types', fontsize=16, weight='bold', pad=20)
total_readings = pollutant_counts.sum()
plt.text(0, 0, f'Total\n{total_readings}', ha='center', va='center',
         fontsize=14, weight='bold')
plt.axis('equal')
plt.tight_layout()
plt.show()

# Objective 5: Predictive Analysis with Regression Plot
# Prepare features and target
X = df[['pollutant_min', 'pollutant_max']]
y = df['pollutant_avg']

# Fit linear regression
model = LinearRegression()
model.fit(X, y)

# Predict (on the training data itself; the score below is an in-sample fit)
y_pred = model.predict(X)

# Calculate R² score
r2 = r2_score(y, y_pred)
print(f'R² Score: {r2:.4f}')

# Plot scatter with regression line for pollutant_max
plt.figure(figsize=(10, 6))
sb.scatterplot(x=df['pollutant_max'], y=df['pollutant_avg'], alpha=0.5, color='#FF7F50')
# Compute regression line for pollutant_max, holding pollutant_min at its mean.
# The model is  avg = intercept_ + coef_[0] * min + coef_[1] * max,  so fixing
# min at its mean ADDS coef_[0] * mean(min) to the intercept.
slope = model.coef_[1]
intercept = model.intercept_ + model.coef_[0] * df['pollutant_min'].mean()
x_range = np.array([df['pollutant_max'].min(), df['pollutant_max'].max()])
y_range = slope * x_range + intercept
plt.plot(x_range, y_range, color='red', label='Regression Line')
# Add city name annotations for top 5 pollutant_avg values
top_cities = df.nlargest(5, 'pollutant_avg')
for _, row in top_cities.iterrows():
    plt.annotate(row['city'], (row['pollutant_max'], row['pollutant_avg']),
                 xytext=(5, 5), textcoords='offset points', fontsize=10)
plt.title('Regression Plot: Pollutant Level vs Maximum Levels')
plt.xlabel('Maximum Pollutant Level')
plt.ylabel('Average Pollutant Level')
plt.legend()
plt.tight_layout()

# Save the plot for LinkedIn (before show(), which may clear the figure)
plt.savefig('pollution_regression.png', dpi=300, bbox_inches='tight')
plt.show()