├── README.md
└── int 375 ca2.py


/README.md:
--------------------------------------------------------------------------------
1 | # Int375-Python-Project


--------------------------------------------------------------------------------
/int 375 ca2.py:
--------------------------------------------------------------------------------
  1 | # Import required libraries
  2 | import pandas as pd
  3 | import numpy as np
  4 | import matplotlib.pyplot as plt
  5 | import seaborn as sns
  6 | 
  7 | # Display settings
  8 | pd.set_option('display.max_columns', None)
  9 | sns.set(style='whitegrid')
 10 | 
 11 | # Load the dataset
 12 | file_path = "C:\\Users\\ACER\\Desktop\\CA2 INT375.csv"  # Updated path
 13 | df = pd.read_csv(file_path)
 14 | 
 15 | # Clean column names
 16 | df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
 17 | 
 18 | # View data structure
 19 | print("First 5 rows of the dataset:\n")
 20 | print(df.head())
 21 | print("\nColumn names:\n", df.columns)
 22 | 
 23 | # ------------------------------------------
 24 | # Objective 1: Distribution across Panchayats & Villages + HEATMAP
 25 | # ------------------------------------------
 26 | if 'panchayat_name' in df.columns and 'village_name' in df.columns:
 27 |     distribution = df.groupby(['panchayat_name', 'village_name']).size().reset_index(name='scheme_count')
 28 |     print("\nDistribution of schemes:\n", distribution.head())
 29 | 
 30 |     # Bar plot by Panchayat
 31 |     plt.figure(figsize=(12, 6))
 32 |     sns.countplot(data=df, y='panchayat_name', order=df['panchayat_name'].value_counts().index)
 33 |     plt.title('Water Supply Schemes by Panchayat')
 34 |     plt.xlabel('Number of Schemes')
 35 |     plt.ylabel('Panchayat')
 36 |     plt.tight_layout()
 37 |     plt.show()
 38 | 
 39 |     # Heatmap
 40 |     heatmap_data = df.groupby(['panchayat_name', 'village_name']).size().unstack(fill_value=0)
 41 |     plt.figure(figsize=(14, 10))
 42 |     sns.heatmap(heatmap_data, cmap='Blues', linewidths=0.5, linecolor='gray')
 43 |     plt.title('Heatmap of Scheme Distribution Across Panchayats and Villages')
 44 |     plt.xlabel('Village Name')
 45 |     plt.ylabel('Panchayat Name')
 46 |     plt.tight_layout()
 47 |     plt.show()
 48 | else:
 49 |     print("Panchayat or Village columns not found.")
 50 | 
 51 | # ------------------------------------------
 52 | # Objective 2: Financial Analysis
 53 | # ------------------------------------------
 54 | # Identify correct column names
 55 | print("\nAvailable columns for financial analysis:\n", df.columns)
 56 | 
 57 | # Try to find matching column names for cost/expenditure
 58 | cost_col = next((col for col in df.columns if 'estimated_cost' in col), None)
 59 | exp_col = next((col for col in df.columns if 'expenditure' in col), None)
 60 | 
 61 | if cost_col and exp_col:
 62 |     df[cost_col] = pd.to_numeric(df[cost_col], errors='coerce')
 63 |     df[exp_col] = pd.to_numeric(df[exp_col], errors='coerce')
 64 | 
 65 |     fin_df = df.dropna(subset=[cost_col, exp_col])
 66 |     fin_df['utilization_ratio'] = fin_df[exp_col] / fin_df[cost_col]
 67 | 
 68 |     print("\nBudget Utilization Statistics:\n", fin_df['utilization_ratio'].describe())
 69 | 
 70 |     plt.figure(figsize=(10, 6))
 71 |     sns.histplot(fin_df['utilization_ratio'], bins=20, kde=True)
 72 |     plt.axvline(1, color='red', linestyle='--', label='Fully Utilized')
 73 |     plt.title('Budget Utilization Ratio (Expenditure / Estimated Cost)')
 74 |     plt.xlabel('Utilization Ratio')
 75 |     plt.ylabel('Number of Schemes')
 76 |     plt.legend()
 77 |     plt.tight_layout()
 78 |     plt.show()
 79 | else:
 80 |     print("Financial columns not found.")
 81 | 
 82 | # ------------------------------------------
 83 | # Objective 3: Implementation Timeline
 84 | # ------------------------------------------
 85 | # Try to find matching date columns
 86 | comm_col = next((col for col in df.columns if 'commencement' in col), None)
 87 | comp_col = next((col for col in df.columns if 'completion' in col), None)
 88 | 
 89 | if comm_col and comp_col:
 90 |     df[comm_col] = pd.to_datetime(df[comm_col], errors='coerce')
 91 |     df[comp_col] = pd.to_datetime(df[comp_col], errors='coerce')
 92 |     df['duration_days'] = (df[comp_col] - df[comm_col]).dt.days
 93 | 
 94 |     print("\nProject Duration Statistics:\n", df['duration_days'].describe())
 95 | 
 96 |     plt.figure(figsize=(10, 6))
 97 |     sns.histplot(df['duration_days'].dropna(), bins=20, kde=True)
 98 |     plt.title('Implementation Duration of Schemes')
 99 |     plt.xlabel('Duration (Days)')
100 |     plt.tight_layout()
101 |     plt.show()
102 | else:
103 |     print("Commencement or Completion date columns not found.")
104 | 
# ------------------------------------------
# Objective 4: Water Source Type Analysis
# ------------------------------------------
# First column whose name mentions "source", or None when there is none
source_col = next((name for name in df.columns if 'source' in name), None)

if not source_col:
    print("Source of Water column not found.")
else:
    # Frequency of each distinct value in the source column
    source_counts = df[source_col].value_counts()
    print("\nWater Source Type Frequency:\n", source_counts)

    source_labels = source_counts.index
    scheme_totals = source_counts.values

    plt.figure(figsize=(10, 5))
    sns.barplot(x=source_labels, y=scheme_totals)
    plt.title('Types of Water Sources Used in Schemes')
    plt.ylabel('Number of Schemes')
    plt.xlabel('Water Source Type')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
124 | 
# ------------------------------------------
# Objective 5: Geographical Coverage by Habitation
# ------------------------------------------
# First column whose name contains "habitation_id", or None when absent
habit_col = next((name for name in df.columns if 'habitation_id' in name), None)

if not habit_col:
    print("Habitation ID column not found.")
else:
    habitation_ids = df[habit_col]

    unique_habitations = habitation_ids.nunique()
    print(f"\nTotal Unique Habitations Covered: {unique_habitations}")

    # Rows per habitation, most frequent first; chart only the first ten
    habitation_counts = habitation_ids.value_counts()
    top_ten = habitation_counts.head(10)

    plt.figure(figsize=(12, 5))
    top_ten.plot.bar()
    plt.title('Top 10 Habitations by Number of Schemes')
    plt.xlabel('Habitation ID')
    plt.ylabel('Scheme Count')
    plt.tight_layout()
    plt.show()
145 | 
# ------------------------------------------
# Year-wise Scheme Implementation Trend (bar chart)
# ------------------------------------------
# Guarded like the objective sections above: a missing column prints a
# message instead of raising KeyError and leaving an empty figure open.
if 'sanction_year' in df.columns:
    # Number of rows per sanction year, in chronological order
    scheme_per_year = df['sanction_year'].value_counts().sort_index()

    plt.figure()
    sns.barplot(x=scheme_per_year.index, y=scheme_per_year.values, palette="viridis")
    plt.title('Number of Schemes Sanctioned per Year')
    plt.xlabel('Sanction Year')
    plt.ylabel('Number of Schemes')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("Sanction Year column not found.")
# Scatter plot: Estimated Cost vs Expenditure, coloured by source type
# The plot needs all three columns; check first so a differently named
# dataset reports what is missing instead of failing inside seaborn.
scatter_cols = ['estimated_cost', 'expenditure', 'source_type']
missing_scatter_cols = [name for name in scatter_cols if name not in df.columns]

if not missing_scatter_cols:
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=df, x='estimated_cost', y='expenditure', hue='source_type', alpha=0.7)
    plt.title('Scatter Plot: Estimated Cost vs Expenditure')
    plt.xlabel('Estimated Cost (in Crores)')
    plt.ylabel('Expenditure (in Crores)')
    # Anchor the legend outside the axes, to the right of the plot area
    plt.legend(title='Source Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()
else:
    print("Scatter plot skipped; columns not found:", missing_scatter_cols)
# Histogram of Estimated Cost
# Guarded so a dataset without this exact column name prints a message
# instead of raising KeyError.
if 'estimated_cost' in df.columns:
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.histplot(df['estimated_cost'], bins=30, color='steelblue', kde=True)
    plt.title('Histogram: Distribution of Estimated Scheme Cost')
    plt.xlabel('Estimated Cost (in Crores)')
    plt.ylabel('Number of Schemes')
    plt.tight_layout()
    plt.show()
else:
    print("Estimated Cost column not found.")
# Box plot: Expenditure by Source Type
# Both columns are required; check first so a missing one is reported
# rather than failing inside seaborn.
if 'source_type' in df.columns and 'expenditure' in df.columns:
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.boxplot(data=df, x='source_type', y='expenditure', palette='pastel')
    plt.title('Box Plot: Expenditure by Source Type')
    plt.xlabel('Source Type')
    plt.ylabel('Expenditure (in Crores)')
    plt.xticks(rotation=30)
    plt.tight_layout()
    plt.show()
else:
    print("Source Type or Expenditure column not found.")
# Line chart: number of schemes sanctioned per year
# Guarded so a missing column prints a message instead of raising KeyError.
if 'sanction_year' in df.columns:
    # Number of rows per sanction year, in chronological order
    yearly_schemes = df['sanction_year'].value_counts().sort_index()

    plt.figure(figsize=(10, 6))
    sns.lineplot(x=yearly_schemes.index, y=yearly_schemes.values, marker='o', color='teal')
    plt.title('Line Chart: Number of Schemes Sanctioned Per Year')
    plt.xlabel('Sanction Year')
    plt.ylabel('Number of Schemes')
    plt.grid(True)
    # The original stopped at a truncated "#plt.tight_layo" comment, so this
    # figure was built but never laid out or displayed.
    plt.tight_layout()
    plt.show()
else:
    print("Sanction Year column not found.")
195 | 
# ------------------------------------------
# Summary
# ------------------------------------------
# Fixed closing text: these lines are printed as written and are not
# computed from the data analysed above.
summary_lines = (
    "\nProject Analysis Complete",
    "Summary of Insights:",
    "- Schemes are unevenly distributed across panchayats and villages.",
    "- Budget utilization varies significantly across projects.",
    "- Many schemes are delayed beyond estimated timelines.",
    "- Deep Tubewells or similar water sources dominate usage.",
    "- Some habitations have more schemes, others are underserved.",
)
for summary_line in summary_lines:
    print(summary_line)
206 | 


--------------------------------------------------------------------------------