├── Dataset.csv
├── Pollution_Data_Analysis_Presentation.pdf
├── README.md
├── code.ipynb
├── python ca2 report.docx
└── python code


/Pollution_Data_Analysis_Presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ManaswiMuppidi/Manaswi-Python/44c291a1dc637cb25bff8926b6470480356de6c8/Pollution_Data_Analysis_Presentation.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Pollution_Data_analysis
2 | 
3 | 
4 | 
5 | 
6 | 


--------------------------------------------------------------------------------
/python ca2 report.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ManaswiMuppidi/Manaswi-Python/44c291a1dc637cb25bff8926b6470480356de6c8/python ca2 report.docx


--------------------------------------------------------------------------------
/python code:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import seaborn as sns
 4 | import matplotlib.pyplot as plt
 5 | from scipy.stats import zscore
 6 | 
 7 | # Step 1: Import the dataset 
 8 | file_path = ("C:\\Users\\MANASWI\\OneDrive\\Desktop\\Dataset.csv")
 9 | df = pd.read_csv(file_path, encoding='utf-8')
10 | 
11 | # Step 2: Get info
12 | print(df.info())
13 | 
14 | # Step 3: Handle missing data
15 | df.replace(" ", np.nan, inplace=True)
16 | df.dropna(subset=['pollutant_min', 'pollutant_max', 'pollutant_avg'], inplace=True)
17 | 
18 | # Convert to numeric just in case
19 | df['pollutant_min'] = pd.to_numeric(df['pollutant_min'], errors='coerce')
20 | df['pollutant_max'] = pd.to_numeric(df['pollutant_max'], errors='coerce')
21 | df['pollutant_avg'] = pd.to_numeric(df['pollutant_avg'], errors='coerce')
22 | df.dropna(inplace=True)
23 | 
24 | # Step 4: Objectives
25 | 
26 | ## Objective 1: Analyze Student Performance Trends (pollutant_avg over time)
27 | plt.figure(figsize=(10,6))
28 | sns.lineplot(data=df, x='last_update', y='pollutant_avg')
29 | plt.xticks(rotation=45)
30 | plt.title("Trend of Pollutant Averages Over Time")
31 | plt.tight_layout()
32 | plt.show()
33 | 
34 | ## Objective 2: Evaluate Impact of Infrastructure (by Station)
35 | plt.figure(figsize=(12,6))
36 | sns.boxplot(data=df, x='station', y='pollutant_avg')
37 | plt.xticks(rotation=90)
38 | plt.title("Pollutant Levels by Station")
39 | plt.tight_layout()
40 | plt.show()
41 | 
42 | ## Objective 3: Compare Performance Across States
43 | plt.figure(figsize=(12,6))
44 | sns.barplot(data=df, x='state', y='pollutant_avg', estimator=np.mean)
45 | plt.xticks(rotation=90)
46 | plt.title("Average Pollutant by State")
47 | plt.tight_layout()
48 | plt.show()
49 | 
50 | ## Objective 4: Key Predictors (correlation heatmap)
51 | plt.figure(figsize=(6,4))
52 | sns.heatmap(df[['pollutant_min', 'pollutant_max', 'pollutant_avg']].corr(), annot=True, cmap='coolwarm')
53 | plt.title("Correlation Between Pollutant Metrics")
54 | plt.show()
55 | 
56 | ## Objective 5: Track Type Disparities (by pollutant type)
57 | plt.figure(figsize=(12,6))
58 | sns.boxplot(data=df, x='pollutant_id', y='pollutant_avg')
59 | plt.title("Pollutant Average by Type")
60 | plt.tight_layout()
61 | plt.show()
62 | 
63 | # Step 5: Relationship between variables - Pairplot
64 | sns.pairplot(df[['pollutant_min', 'pollutant_max', 'pollutant_avg']])
65 | plt.show()
66 | 
67 | # Step 6: Heatmap
68 | plt.figure(figsize=(10,6))
69 | sns.heatmap(df[['pollutant_min', 'pollutant_max', 'pollutant_avg']].corr(), annot=True, cmap='viridis')
70 | plt.title("Heatmap of Pollutant Data")
71 | plt.show()
72 | 
73 | # Step 7: Outliers
74 | 
75 | ## Boxplot
76 | plt.figure(figsize=(8,5))
77 | sns.boxplot(data=df[['pollutant_avg']])
78 | plt.title("Boxplot for Pollutant Average")
79 | plt.show()
80 | 
81 | ## Z-score
82 | z_scores = zscore(df[['pollutant_avg']])
83 | outliers = df[(np.abs(z_scores) > 3).any(axis=1)]
84 | print(f"Total Records: {df.shape[0]}, Outliers Detected: {outliers.shape[0]}")
85 | 


--------------------------------------------------------------------------------