├── README.md
├── Screenshot 2024-09-24 161731.png
├── Screenshot 2025-04-11 234358.png
├── Screenshot 2025-04-11 234411.png
├── Screenshot 2025-04-11 234423.png
├── Screenshot 2025-04-11 234436.png
└── project-code2.py


/README.md:
--------------------------------------------------------------------------------
1 | # Data-Analysis-using-Python


--------------------------------------------------------------------------------
/Screenshot 2024-09-24 161731.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2024-09-24 161731.png


--------------------------------------------------------------------------------
/Screenshot 2025-04-11 234358.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2025-04-11 234358.png


--------------------------------------------------------------------------------
/Screenshot 2025-04-11 234411.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2025-04-11 234411.png


--------------------------------------------------------------------------------
/Screenshot 2025-04-11 234423.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2025-04-11 234423.png


--------------------------------------------------------------------------------
/Screenshot 2025-04-11 234436.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AkhilSappa/Data-Analysis-using-Python/HEAD/Screenshot 2025-04-11 234436.png


--------------------------------------------------------------------------------
/project-code2.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import matplotlib.pyplot as plt
  3 | import seaborn as sb
  4 | import numpy as np
  5 | from sklearn.linear_model import LinearRegression
  6 | from sklearn.metrics import r2_score
  7 | 
  8 | 
  9 | # Load data
 10 | file_path = r"C:\Users\akhil\OneDrive\Desktop\INT-375\project\PROJECT.csv"
 11 | df = pd.read_csv(file_path)
 12 | 
 13 | print(df.info())
 14 | print(df.describe())
 15 | 
 16 | #  Handle Missing Values 
 17 | df['pollutant_id'].fillna(df['pollutant_id'].mode()[0], inplace=True)
 18 | df['pollutant_avg'].fillna(df['pollutant_avg'].mean(), inplace=True)
 19 | df['pollutant_min'].fillna(df['pollutant_min'].mean(), inplace=True)
 20 | df['pollutant_max'].fillna(df['pollutant_max'].mean(), inplace=True)
 21 | df['station'].fillna(df['station'].mode()[0], inplace=True)
 22 | df['last_update'] = pd.to_datetime(df['last_update'], errors='coerce')
 23 | 
 24 | 
 25 | #Objective 1: Statewise Average Pollution Levels (Bar Chart)
 26 | state_avg = df.groupby('state')['pollutant_avg'].mean().sort_values()
 27 | 
 28 | # Plot horizontal bar chart
 29 | plt.figure(figsize=(10, 8))
 30 | state_avg.plot(kind='barh', color='skyblue')
 31 | plt.title('Average Pollution Levels by State')
 32 | plt.xlabel('Average Pollutant Level')
 33 | plt.ylabel('State')
 34 | plt.tight_layout()
 35 | plt.show()
 36 | 
 37 | # Objective 2: Top and Bottom Cities by Pollution Level (Vertical Bar Chart)
 38 | city_avg = df.groupby('city')['pollutant_avg'].mean().sort_values()
 39 | top_10 = city_avg.tail(10)
 40 | bottom_10 = city_avg.head(10)
 41 | combined = pd.concat([bottom_10, top_10])
 42 | plt.figure(figsize=(14, 7))
 43 | bars = plt.bar(combined.index, combined.values, 
 44 |                color=['#1f77b4']*10 + ['#ff7f0e']*10, edgecolor='black')
 45 | plt.title('Top 10 and Bottom 10 Cities by Average Pollution Level', fontsize=16, weight='bold', pad=20)
 46 | plt.xlabel('City', fontsize=14)
 47 | plt.ylabel('Average Pollutant Level', fontsize=14)
 48 | plt.xticks(rotation=45, ha='right', fontsize=10)
 49 | for bar in bars:
 50 |     height = bar.get_height()
 51 |     plt.text(bar.get_x() + bar.get_width()/2, height + 0.5, f'{height:.1f}', 
 52 |              ha='center', va='bottom', fontsize=10)
 53 | plt.legend(['Bottom 10 (Low Pollution)', 'Top 10 (High Pollution)'], fontsize=12, loc='upper left')
 54 | plt.gca().spines['top'].set_visible(False)
 55 | plt.gca().spines['right'].set_visible(False)
 56 | plt.tight_layout()
 57 | plt.show()
 58 | 
 59 | # Objective 3: Pollution Intensity Map (Geographical Scatter Plot)
 60 | geo_data = df.groupby(['city', 'latitude', 'longitude'])['pollutant_avg'].mean().reset_index()
 61 | plt.figure(figsize=(12, 8))
 62 | scatter = plt.scatter(geo_data['longitude'], geo_data['latitude'], 
 63 |                      s=geo_data['pollutant_avg']*20, c=geo_data['pollutant_avg'], 
 64 |                      cmap='crest', alpha=0.7, edgecolor='black')
 65 | plt.colorbar(scatter, label='Average Pollutant Level', shrink=0.8)
 66 | plt.title('Pollution Intensity Across Cities', fontsize=16, weight='bold', pad=20)
 67 | plt.xlabel('Longitude', fontsize=14)
 68 | plt.ylabel('Latitude', fontsize=14)
 69 | plt.gca().set_facecolor('#f5f5f5')
 70 | plt.gca().spines['top'].set_visible(False)
 71 | plt.gca().spines['right'].set_visible(False)
 72 | plt.tight_layout()
 73 | plt.show()
 74 | 
 75 | # Objective 4: Pollutant Distribution by Type (Donut Chart)
 76 | pollutant_counts = df['pollutant_id'].value_counts()
 77 | plt.figure(figsize=(10, 10))
 78 | wedges, texts, autotexts = plt.pie(pollutant_counts, labels=pollutant_counts.index, 
 79 |                                    autopct='%1.1f%%', startangle=90, 
 80 |                                    colors=sb.color_palette('Set2'), 
 81 |                                    pctdistance=0.85, textprops={'fontsize': 12})
 82 | centre_circle = plt.Circle((0, 0), 0.70, fc='white')
 83 | plt.gca().add_artist(centre_circle)
 84 | plt.title('Distribution of Pollutant Types', fontsize=16, weight='bold', pad=20)
 85 | total_readings = pollutant_counts.sum()
 86 | plt.text(0, 0, f'Total\n{total_readings}', ha='center', va='center', fontsize=14, weight='bold')
 87 | plt.axis('equal')
 88 | plt.tight_layout()
 89 | plt.show()
 90 | 
 91 | # Objective 5: Predictive Analysis with Regression Plot
 92 | # Prepare features and target
 93 | X = df[['pollutant_min', 'pollutant_max']]
 94 | y = df['pollutant_avg']
 95 | 
 96 | # Fit linear regression
 97 | model = LinearRegression()
 98 | model.fit(X, y)
 99 | 
100 | # Predict
101 | y_pred = model.predict(X)
102 | 
103 | # Calculate R² score
104 | r2 = r2_score(y, y_pred)
105 | print(f'R² Score: {r2:.4f}')
106 | 
107 | # Plot scatter with regression line for pollutant_max
108 | plt.figure(figsize=(10, 6))
109 | sb.scatterplot(x=df['pollutant_max'], y=df['pollutant_avg'], alpha=0.5, color='#FF7F50')
110 | # Compute regression line for pollutant_max, holding pollutant_min at mean
111 | slope = model.coef_[1]
112 | intercept = model.intercept_ - model.coef_[0] * df['pollutant_min'].mean()
113 | x_range = np.array([df['pollutant_max'].min(), df['pollutant_max'].max()])
114 | y_range = slope * x_range + intercept
115 | plt.plot(x_range, y_range, color='red', label='Regression Line')
116 | # Add city name annotations for top 5 pollutant_avg values
117 | top_cities = df.nlargest(5, 'pollutant_avg')
118 | for i, row in top_cities.iterrows():
119 |     plt.annotate(row['city'], (row['pollutant_max'], row['pollutant_avg']), 
120 |                  xytext=(5, 5), textcoords='offset points', fontsize=10)
121 | plt.title('Regression Plot: Pollutant Level vs Maximum Levels')
122 | plt.xlabel('Maximum Pollutant Level')
123 | plt.ylabel('Average Pollutant Level')
124 | plt.legend()
125 | plt.tight_layout()
126 | 
127 | # Save the plot for LinkedIn
128 | plt.savefig('pollution_regression.png', dpi=300, bbox_inches='tight')
129 | plt.show()


--------------------------------------------------------------------------------