"""Exploratory analysis of the water supply scheme dataset (Int375-Python-Project).

The script loads a CSV file, normalises its column names (trimmed, lower-cased,
spaces replaced by underscores), prints summary statistics and shows one or
more charts for each analysis objective.

Usage:
    python "int 375 ca2.py" [path/to/data.csv]

When no path is given, DEFAULT_CSV_PATH is used.
"""

# Import required libraries
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Location used when no CSV path is passed on the command line.
DEFAULT_CSV_PATH = "C:\\Users\\ACER\\Desktop\\CA2 INT375.csv"


def find_column(df, keyword):
    """Return the first column of *df* whose name contains *keyword*, or None."""
    return next((col for col in df.columns if keyword in col), None)


def has_columns(df, required, chart):
    """Return True if every name in *required* is a column of *df*.

    When something is missing, a message naming *chart* and the missing
    columns is printed so the caller can skip the chart instead of raising
    KeyError.
    """
    missing = [name for name in required if name not in df.columns]
    if missing:
        print(f"Skipping {chart}: missing column(s) {', '.join(missing)}.")
        return False
    return True


def load_dataset(file_path):
    """Read the CSV at *file_path* and return it with cleaned column names."""
    df = pd.read_csv(file_path)
    # Clean column names so later look-ups can rely on lower-case snake_case.
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    return df


# ------------------------------------------
# Objective 1: Distribution across Panchayats & Villages + HEATMAP
# ------------------------------------------
def analyse_distribution(df):
    """Print and plot the number of schemes per panchayat and village."""
    if 'panchayat_name' not in df.columns or 'village_name' not in df.columns:
        print("Panchayat or Village columns not found.")
        return

    # One groupby feeds both the printed table and the heatmap.
    scheme_counts = df.groupby(['panchayat_name', 'village_name']).size()
    distribution = scheme_counts.reset_index(name='scheme_count')
    print("\nDistribution of schemes:\n", distribution.head())

    # Bar plot by Panchayat
    plt.figure(figsize=(12, 6))
    sns.countplot(
        data=df,
        y='panchayat_name',
        order=df['panchayat_name'].value_counts().index,
    )
    plt.title('Water Supply Schemes by Panchayat')
    plt.xlabel('Number of Schemes')
    plt.ylabel('Panchayat')
    plt.tight_layout()
    plt.show()

    # Heatmap
    heatmap_data = scheme_counts.unstack(fill_value=0)
    plt.figure(figsize=(14, 10))
    sns.heatmap(heatmap_data, cmap='Blues', linewidths=0.5, linecolor='gray')
    plt.title('Heatmap of Scheme Distribution Across Panchayats and Villages')
    plt.xlabel('Village Name')
    plt.ylabel('Panchayat Name')
    plt.tight_layout()
    plt.show()


# ------------------------------------------
# Objective 2: Financial Analysis
# ------------------------------------------
def analyse_financials(df):
    """Print and plot the expenditure / estimated-cost ratio per scheme.

    The matched cost and expenditure columns of *df* are converted to numeric
    in place (unparseable values become NaN).
    """
    print("\nAvailable columns for financial analysis:\n", df.columns)

    # Try to find matching column names for cost/expenditure
    cost_col = find_column(df, 'estimated_cost')
    exp_col = find_column(df, 'expenditure')
    if cost_col is None or exp_col is None:
        print("Financial columns not found.")
        return

    df[cost_col] = pd.to_numeric(df[cost_col], errors='coerce')
    df[exp_col] = pd.to_numeric(df[exp_col], errors='coerce')

    # Work on an explicit copy so adding a column does not write into a
    # slice of df (which pandas reports as SettingWithCopyWarning).
    fin_df = df.dropna(subset=[cost_col, exp_col]).copy()
    ratio = fin_df[exp_col] / fin_df[cost_col]
    # A zero estimated cost yields +/-inf, which would distort the statistics
    # and give the histogram a non-finite range; treat those rows as missing.
    fin_df['utilization_ratio'] = ratio.replace([np.inf, -np.inf], np.nan)
    fin_df = fin_df.dropna(subset=['utilization_ratio'])
    if fin_df.empty:
        print("No schemes with a usable estimated cost and expenditure.")
        return

    print("\nBudget Utilization Statistics:\n", fin_df['utilization_ratio'].describe())

    plt.figure(figsize=(10, 6))
    sns.histplot(fin_df['utilization_ratio'], bins=20, kde=True)
    plt.axvline(1, color='red', linestyle='--', label='Fully Utilized')
    plt.title('Budget Utilization Ratio (Expenditure / Estimated Cost)')
    plt.xlabel('Utilization Ratio')
    plt.ylabel('Number of Schemes')
    plt.legend()
    plt.tight_layout()
    plt.show()


# ------------------------------------------
# Objective 3: Implementation Timeline
# ------------------------------------------
def analyse_timeline(df):
    """Print and plot the days between commencement and completion.

    The matched date columns are converted to datetimes in place and a
    'duration_days' column is added to *df*.
    """
    # Try to find matching date columns
    comm_col = find_column(df, 'commencement')
    comp_col = find_column(df, 'completion')
    if comm_col is None or comp_col is None:
        print("Commencement or Completion date columns not found.")
        return

    # NOTE(review): dates are parsed with pandas' default format inference;
    # confirm the CSV's day/month order.
    df[comm_col] = pd.to_datetime(df[comm_col], errors='coerce')
    df[comp_col] = pd.to_datetime(df[comp_col], errors='coerce')
    df['duration_days'] = (df[comp_col] - df[comm_col]).dt.days

    print("\nProject Duration Statistics:\n", df['duration_days'].describe())

    durations = df['duration_days'].dropna()
    if durations.empty:
        print("No schemes with both a valid commencement and completion date.")
        return

    plt.figure(figsize=(10, 6))
    sns.histplot(durations, bins=20, kde=True)
    plt.title('Implementation Duration of Schemes')
    plt.xlabel('Duration (Days)')
    plt.tight_layout()
    plt.show()


# ------------------------------------------
# Objective 4: Water Source Type Analysis
# ------------------------------------------
def analyse_water_sources(df):
    """Print and plot how often each water source value occurs."""
    source_col = find_column(df, 'source')
    if source_col is None:
        print("Source of Water column not found.")
        return

    source_counts = df[source_col].value_counts()
    print("\nWater Source Type Frequency:\n", source_counts)

    plt.figure(figsize=(10, 5))
    sns.barplot(x=source_counts.index, y=source_counts.values)
    plt.title('Types of Water Sources Used in Schemes')
    plt.ylabel('Number of Schemes')
    plt.xlabel('Water Source Type')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


# ------------------------------------------
# Objective 5: Geographical Coverage by Habitation
# ------------------------------------------
def analyse_habitations(df):
    """Print the number of distinct habitations and plot the ten most frequent."""
    habit_col = find_column(df, 'habitation_id')
    if habit_col is None:
        print("Habitation ID column not found.")
        return

    unique_habitations = df[habit_col].nunique()
    print(f"\nTotal Unique Habitations Covered: {unique_habitations}")

    habitation_counts = df[habit_col].value_counts()

    plt.figure(figsize=(12, 5))
    habitation_counts.head(10).plot(kind='bar')
    plt.title('Top 10 Habitations by Number of Schemes')
    plt.xlabel('Habitation ID')
    plt.ylabel('Scheme Count')
    plt.tight_layout()
    plt.show()


# ------------------------------------------
# Additional charts
# ------------------------------------------
def plot_schemes_per_year_bar(scheme_per_year):
    """Bar chart of the number of schemes sanctioned in each year."""
    plt.figure()
    # NOTE(review): newer seaborn releases warn when `palette` is passed
    # without `hue`; left unchanged so older versions render the same chart.
    sns.barplot(x=scheme_per_year.index, y=scheme_per_year.values, palette="viridis")
    plt.title('Number of Schemes Sanctioned per Year')
    plt.xlabel('Sanction Year')
    plt.ylabel('Number of Schemes')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


def plot_cost_vs_expenditure(df):
    """Scatter plot of estimated cost against expenditure, coloured by source type."""
    required = ('estimated_cost', 'expenditure', 'source_type')
    if not has_columns(df, required, 'cost vs expenditure scatter plot'):
        return

    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='estimated_cost', y='expenditure', hue='source_type', alpha=0.7)
    plt.title('Scatter Plot: Estimated Cost vs Expenditure')
    plt.xlabel('Estimated Cost (in Crores)')
    plt.ylabel('Expenditure (in Crores)')
    plt.legend(title='Source Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


def plot_estimated_cost_histogram(df):
    """Histogram of the estimated scheme cost."""
    if not has_columns(df, ('estimated_cost',), 'estimated cost histogram'):
        return

    plt.figure(figsize=(10, 6))
    sns.histplot(df['estimated_cost'], bins=30, color='steelblue', kde=True)
    plt.title('Histogram: Distribution of Estimated Scheme Cost')
    plt.xlabel('Estimated Cost (in Crores)')
    plt.ylabel('Number of Schemes')
    plt.tight_layout()
    plt.show()


def plot_expenditure_by_source(df):
    """Box plot of expenditure for each source type."""
    if not has_columns(df, ('source_type', 'expenditure'), 'expenditure box plot'):
        return

    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x='source_type', y='expenditure', palette='pastel')
    plt.title('Box Plot: Expenditure by Source Type')
    plt.xlabel('Source Type')
    plt.ylabel('Expenditure (in Crores)')
    plt.xticks(rotation=30)
    plt.tight_layout()
    plt.show()


def plot_schemes_per_year_line(yearly_schemes):
    """Line chart of the number of schemes sanctioned in each year."""
    plt.figure(figsize=(10, 6))
    sns.lineplot(x=yearly_schemes.index, y=yearly_schemes.values, marker='o', color='teal')
    plt.title('Line Chart: Number of Schemes Sanctioned Per Year')
    plt.xlabel('Sanction Year')
    plt.ylabel('Number of Schemes')
    plt.grid(True)
    plt.tight_layout()
    # Without show() this figure is never displayed when run as a script.
    plt.show()


# ------------------------------------------
# Summary
# ------------------------------------------
def print_summary():
    """Print the closing summary.

    NOTE: these lines are fixed text; they are not derived from the
    statistics computed by the analysis functions.
    """
    print("\nProject Analysis Complete")
    print("Summary of Insights:")
    print("- Schemes are unevenly distributed across panchayats and villages.")
    print("- Budget utilization varies significantly across projects.")
    print("- Many schemes are delayed beyond estimated timelines.")
    print("- Deep Tubewells or similar water sources dominate usage.")
    print("- Some habitations have more schemes, others are underserved.")


def main(argv=None):
    """Run every analysis step in order.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). Defaults to ``sys.argv[1:]``. The first argument, when
            present, is the CSV path; otherwise DEFAULT_CSV_PATH is used.
    """
    args = sys.argv[1:] if argv is None else argv
    file_path = args[0] if args else DEFAULT_CSV_PATH

    # Display settings (applied once; every figure below inherits them)
    pd.set_option('display.max_columns', None)
    sns.set(style='whitegrid')

    # Load the dataset
    df = load_dataset(file_path)

    # View data structure
    print("First 5 rows of the dataset:\n")
    print(df.head())
    print("\nColumn names:\n", df.columns)

    analyse_distribution(df)
    analyse_financials(df)
    analyse_timeline(df)
    analyse_water_sources(df)
    analyse_habitations(df)

    # Year-wise counts are computed once and shared by the bar and line charts.
    yearly_counts = None
    if has_columns(df, ('sanction_year',), 'year-wise scheme charts'):
        yearly_counts = df['sanction_year'].value_counts().sort_index()

    if yearly_counts is not None:
        plot_schemes_per_year_bar(yearly_counts)
    plot_cost_vs_expenditure(df)
    plot_estimated_cost_histogram(df)
    plot_expenditure_by_source(df)
    if yearly_counts is not None:
        plot_schemes_per_year_line(yearly_counts)

    print_summary()


if __name__ == "__main__":
    main()