# Repository layout (from the listing that accompanies this script):
#   ├── Dataset.csv
#   ├── Pollution_Data_Analysis_Presentation.pdf
#   ├── README.md
#   ├── code.ipynb
#   ├── python ca2 report.docx
#   └── python code
#
# /Pollution_Data_Analysis_Presentation.pdf:
#   https://raw.githubusercontent.com/ManaswiMuppidi/Manaswi-Python/44c291a1dc637cb25bff8926b6470480356de6c8/Pollution_Data_Analysis_Presentation.pdf
# /README.md:
#   Pollution_Data_analysis
# /python ca2 report.docx:
#   https://raw.githubusercontent.com/ManaswiMuppidi/Manaswi-Python/44c291a1dc637cb25bff8926b6470480356de6c8/python ca2 report.docx
# /python code:
#   (the script below)

"""Exploratory analysis of the pollution dataset.

The script loads the CSV, cleans the three pollutant measurement columns,
draws a series of seaborn/matplotlib charts (trend, per-station, per-state,
correlation, per-pollutant, pairplot, boxplot) and finally reports how many
rows are z-score outliers on ``pollutant_avg``.
"""

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import zscore

# The measurement columns that are cleaned, correlated and pair-plotted.
POLLUTANT_COLUMNS = ['pollutant_min', 'pollutant_max', 'pollutant_avg']

# A row is reported as an outlier when |z-score| of pollutant_avg exceeds this.
Z_SCORE_THRESHOLD = 3

# Step 1: Import the dataset
# NOTE(review): this is a machine-specific absolute path; the repository
# listing shows a Dataset.csv next to the script -- confirm which file is meant.
file_path = "C:\\Users\\MANASWI\\OneDrive\\Desktop\\Dataset.csv"
df = pd.read_csv(file_path, encoding='utf-8')

# Step 2: Get info
# DataFrame.info() prints the summary itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

# Step 3: Handle missing data
# Single-space cells are treated as missing values.
df.replace(" ", np.nan, inplace=True)
df.dropna(subset=POLLUTANT_COLUMNS, inplace=True)

# Convert to numeric just in case; unparseable values become NaN.
for column in POLLUTANT_COLUMNS:
    df[column] = pd.to_numeric(df[column], errors='coerce')
# NOTE(review): this drops rows with a missing value in ANY column, not only
# the pollutant columns -- confirm that is intended.
df.dropna(inplace=True)

# Correlation matrix of the pollutant columns; df is not modified after this
# point, so the same matrix serves both heatmaps below.
pollutant_corr = df[POLLUTANT_COLUMNS].corr()

# Step 4: Objectives

## Objective 1: Trend of pollutant_avg across last_update
# NOTE(review): last_update is plotted exactly as loaded from the CSV; if it
# is stored as text the x-axis may not be in chronological order -- consider
# parsing it with pd.to_datetime once the timestamp format is confirmed.
plt.figure(figsize=(10, 6))
sns.lineplot(data=df, x='last_update', y='pollutant_avg')
plt.xticks(rotation=45)
plt.title("Trend of Pollutant Averages Over Time")
plt.tight_layout()
plt.show()

## Objective 2: Distribution of pollutant_avg per station
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='station', y='pollutant_avg')
plt.xticks(rotation=90)
plt.title("Pollutant Levels by Station")
plt.tight_layout()
plt.show()

## Objective 3: Mean pollutant_avg per state
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x='state', y='pollutant_avg', estimator=np.mean)
plt.xticks(rotation=90)
plt.title("Average Pollutant by State")
plt.tight_layout()
plt.show()

## Objective 4: Correlation between the pollutant metrics (heatmap)
plt.figure(figsize=(6, 4))
sns.heatmap(pollutant_corr, annot=True, cmap='coolwarm')
plt.title("Correlation Between Pollutant Metrics")
plt.show()

## Objective 5: Distribution of pollutant_avg per pollutant_id
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x='pollutant_id', y='pollutant_avg')
plt.title("Pollutant Average by Type")
plt.tight_layout()
plt.show()

# Step 5: Relationship between variables - Pairplot
sns.pairplot(df[POLLUTANT_COLUMNS])
plt.show()

# Step 6: Heatmap (same correlation matrix, larger figure and viridis palette)
plt.figure(figsize=(10, 6))
sns.heatmap(pollutant_corr, annot=True, cmap='viridis')
plt.title("Heatmap of Pollutant Data")
plt.show()

# Step 7: Outliers

## Boxplot
plt.figure(figsize=(8, 5))
sns.boxplot(data=df[['pollutant_avg']])
plt.title("Boxplot for Pollutant Average")
plt.show()

## Z-score
# zscore is applied to a one-column frame, so the result is two-dimensional;
# .any(axis=1) collapses it to one boolean per row for indexing df.
z_scores = zscore(df[['pollutant_avg']])
outliers = df[(np.abs(z_scores) > Z_SCORE_THRESHOLD).any(axis=1)]
print(f"Total Records: {df.shape[0]}, Outliers Detected: {outliers.shape[0]}")