├── 1.png ├── 10.png ├── 11.png ├── 12.png ├── 13.png ├── 14.png ├── 15.png ├── 16.png ├── 17.png ├── 18.png ├── 19.png ├── 2.png ├── 20.png ├── 21.png ├── 22.png ├── 23.png ├── 24.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 8.png ├── 9.png ├── README.md ├── dataset.csv └── project_python_file.py /1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/1.png -------------------------------------------------------------------------------- /10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/10.png -------------------------------------------------------------------------------- /11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/11.png -------------------------------------------------------------------------------- /12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/12.png -------------------------------------------------------------------------------- /13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/13.png -------------------------------------------------------------------------------- /14.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/14.png -------------------------------------------------------------------------------- /15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/15.png -------------------------------------------------------------------------------- /16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/16.png -------------------------------------------------------------------------------- /17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/17.png -------------------------------------------------------------------------------- /18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/18.png -------------------------------------------------------------------------------- /19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/19.png -------------------------------------------------------------------------------- /2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/2.png 
-------------------------------------------------------------------------------- /20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/20.png -------------------------------------------------------------------------------- /21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/21.png -------------------------------------------------------------------------------- /22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/22.png -------------------------------------------------------------------------------- /23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/23.png -------------------------------------------------------------------------------- /24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/24.png -------------------------------------------------------------------------------- /3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/3.png -------------------------------------------------------------------------------- /4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/4.png -------------------------------------------------------------------------------- /5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/5.png -------------------------------------------------------------------------------- /6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/6.png -------------------------------------------------------------------------------- /7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/7.png -------------------------------------------------------------------------------- /8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/8.png -------------------------------------------------------------------------------- /9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/9.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sector-wise Revenue Expenditure Analysis of Indian States and UTs 2 | This project explores, analyzes, and visualizes sector-wise revenue expenditure across Indian states and union 
territories. Using Python's data science stack, the goal is to uncover trends, compare regional allocations, test statistical hypotheses, and detect relationships in spending patterns. 3 | 4 | ## 📌 Objectives 5 | 1. Generate descriptive statistics and identify key trends, distributions, and inconsistencies in sector-wise revenue expenditure. 6 | 2. Design and implement visual representations using Matplotlib and Seaborn for comparing sector-wise allocations across years and regions. 7 | 3. Conduct statistical hypothesis testing to evaluate significant differences in revenue expenditure patterns between selected sectors or regions. 8 | 4. Uncover relationships and correlations among sectors through numerical and visual statistical analysis. 9 | 5. Analyze year-on-year changes in sector-wise revenue expenditure to detect growth patterns, stagnation, or shifts in spending priorities. 10 | 11 | ## 🛠️ Technologies Used 12 | - Python 13 | - Pandas 14 | - NumPy 15 | - Matplotlib 16 | - Seaborn 17 | - SciPy / StatsModels 18 | 19 | 20 | -------------------------------------------------------------------------------- /project_python_file.py: -------------------------------------------------------------------------------- 1 | ### EDA ### 2 | #--------------------------------------------------------------------------------------------------------------- 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | 9 | df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv") 10 | 11 | print(df.columns) # Column names 12 | print(df.shape) # Number of rows and columns 13 | print(df.info()) # Data types and non-null counts 14 | print(df.describe) # Summary for numerical columns 15 | print(df.head()) # First five records 16 | print(df.tail()) # Last five records 17 | print(df.isnull().sum()) # Total missing values per column 18 | print(df.duplicated().sum()) # Check for duplicate rows 19 | 
# EDA (continued): previews of two missing-value strategies.
# Both calls return new DataFrames; `df` itself is left unchanged.
print(df.dropna())              # Rows that remain after dropping every row with a missing value
print(df.ffill())               # Missing values forward-filled from the preceding row


### Objective 1 ###
# --------------------------------------------------------------------------------------------------------------
# To generate descriptive statistics and identify key trends, distributions, and inconsistencies in
# sector-wise revenue expenditure across states and union territories.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")

# NOTE(review): assumes the sector columns sit at positions 3..77 of the CSV - confirm against the file.
sector_columns = data.columns[3:78]

# 1. Generate basic summary statistics
summary = data[sector_columns].describe()
print("Sector-wise Descriptive Statistics:\n", summary)

# 2. Analyze missing values (only sectors that actually have gaps, largest first)
missing_info = data[sector_columns].isnull().sum()
missing_info = missing_info[missing_info > 0].sort_values(ascending=False)
print("\nMissing Value Count by Sector:\n", missing_info)

# 3. Visualize distributions of selected expenditure sectors
selected = [
    "Education, sports, art and culture",
    "Medical and public health",
    "Agriculture and allied activities",
    "Energy",
    "Transport and communications",
]

data[selected].hist(bins=25, figsize=(8, 6), color='red', edgecolor='black')
plt.suptitle("Distribution of Expenditures in Key Sectors")
plt.tight_layout()
plt.show()

# 4. Trend analysis: Mean total expenditure by year
# Years are cast to str so they are plotted as categorical labels.
data["srcYear"] = data["srcYear"].astype(str)
avg_expenditure_by_year = data.groupby("srcYear")["Total expenditure"].mean()

plt.figure(figsize=(8, 6))
sns.lineplot(x=avg_expenditure_by_year.index, y=avg_expenditure_by_year.values, marker='o', color='blue')
plt.title("Mean Total Expenditure Across Financial Years")
plt.xlabel("Financial Year")
plt.ylabel("Average Expenditure")
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()


### Objective 2 ###
#---------------------------------------------------------------------------------------------------------------
## To design and implement visual representations using Matplotlib and Seaborn for
## comparing sector-wise allocations across years and regions.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")

# Filter to use 'Accounts' for consistency; .copy() so the column assignment
# below modifies an independent frame rather than a slice of `df`.
filtered_data = df[df["Budget type"] == "Accounts"].copy()

# Ensure 'srcYear' is string type for plotting
filtered_data["srcYear"] = filtered_data["srcYear"].astype(str)

# Set a few important sectors for visual comparison
important_sectors = [
    "Education, sports, art and culture",
    "Medical and public health",
    "Agriculture and allied activities",
]

# 1. Barplot: Year-wise sector expenditure
sector_yearly = filtered_data.groupby("srcYear")[important_sectors].mean()
sector_yearly.plot(kind='bar', figsize=(12, 6), colormap="Set2")
plt.title("Average Sector-wise Expenditure Over Years")
plt.ylabel("Average Expenditure")
plt.xlabel("Financial Year")
plt.show()

# 2. Lineplot: Trend in one sector over years across states
selected_states = ['MAHARASHTRA', 'BIHAR', 'KARNATAKA', 'TAMIL NADU']
plt.figure(figsize=(8, 6))
for state in selected_states:
    state_data = filtered_data[filtered_data["srcStateName"] == state]
    sns.lineplot(
        x="srcYear",
        y="Education, sports, art and culture",
        data=state_data,
        label=state,
        marker='o',
    )

plt.title("Education Sector Trend Over Time (Selected States)")
plt.xlabel("Financial Year")
plt.ylabel("Expenditure")
plt.xticks(rotation=45)
plt.legend(title="State")
plt.grid(True)
plt.tight_layout()
plt.show()


### Objective 3 ###
#-------------------------------------------------------------------------------------------------------------------
## To conduct statistical hypothesis testing to evaluate significant differences in
## revenue expenditure patterns between selected sectors or regions.

import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt


df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")
df = df[df["Budget type"] == "Accounts"]

# Drop rows with missing data for target sectors
df_clean = df.dropna(subset=[
    "Education, sports, art and culture",
    "Medical and public health",
    "Agriculture and allied activities",
])

# 1. T-Test: Education vs Medical Expenditure
edu = df_clean["Education, sports, art and culture"]
med = df_clean["Medical and public health"]

# equal_var=False selects Welch's t-test (no equal-variance assumption).
# NOTE(review): both series come from the same rows, so the samples are paired;
# a paired test (stats.ttest_rel) may be the better fit - confirm intent.
t_stat1, p_val1 = stats.ttest_ind(edu, med, equal_var=False)
print("T-Test: Education vs Medical Expenditure")
print(f"T-Statistic: {t_stat1:.3f}, P-Value: {p_val1:.4f}")
if p_val1 < 0.05:
    print("Significant difference exists between education and medical sector spending.\n")
else:
    print("No significant difference found between education and medical sector spending.\n")

# Boxplot: Education vs Medical
plt.figure(figsize=(8, 6))
sns.boxplot(data=df_clean[["Education, sports, art and culture", "Medical and public health"]], orient='h')
plt.title("Boxplot: Education vs Medical Expenditure")
plt.xlabel("Expenditure")
plt.yticks(ticks=[0, 1], labels=["Education", "Medical"])
plt.tight_layout()
plt.show()

# 2. T-Test: Agriculture in Tamil Nadu vs Karnataka
agri1 = df_clean[df_clean["srcStateName"] == "TAMIL NADU"]["Agriculture and allied activities"]
agri2 = df_clean[df_clean["srcStateName"] == "KARNATAKA"]["Agriculture and allied activities"]

print("T-Test: Agriculture Expenditure - Tamil Nadu vs Karnataka")
if len(agri1) < 2 or len(agri2) < 2:
    # With fewer than two observations in a group the statistic is NaN, and the
    # comparison below would wrongly fall through to "no significant difference".
    print("Not enough observations for one or both states; test skipped.\n")
else:
    t_stat2, p_val2 = stats.ttest_ind(agri1, agri2, equal_var=False)
    print(f"T-Statistic: {t_stat2:.3f}, P-Value: {p_val2:.4f}")
    if p_val2 < 0.05:
        print("Significant difference in agriculture spending between Tamil Nadu and Karnataka.\n")
    else:
        print("No significant difference in agriculture spending between the two states.\n")


### Objective 4 ###
#------------------------------------------------------------------------------------------------------------------
## To uncover relationships and correlations among sectors based on
## revenue allocation patterns through numerical and visual statistical analysis.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")
df = df[df["Budget type"] == "Accounts"]

sectors_to_analyze = [
    "Education, sports, art and culture",
    "Medical and public health",
    "Agriculture and allied activities",
    "Energy",
    "Transport and communications",
    "Social security and welfare",
    "Urban development",
]

# Drop rows with missing values for the selected sectors
df_corr = df[sectors_to_analyze].dropna()

# 1. Compute the correlation matrix
correlation_matrix = df_corr.corr()
print("Correlation Matrix:\n")
print(correlation_matrix)

# 2. Heatmap of correlations
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title("Correlation Heatmap: Sector-wise Revenue Allocation")
plt.tight_layout()
plt.show()

# 3. Pairplot (optional for deep exploration)
sns.pairplot(df_corr)
plt.suptitle("Pairwise Sector Relationships", y=1.02)
plt.show()

# 4. Find top 3 positively and negatively correlated pairs
# Flatten matrix, remove self-correlations, sort
corr_pairs = correlation_matrix.unstack().reset_index()
corr_pairs.columns = ['Sector 1', 'Sector 2', 'Correlation']
# .copy() so that adding the helper column below writes to an independent
# frame instead of a filtered view (avoids SettingWithCopyWarning).
filtered_pairs = corr_pairs[corr_pairs['Sector 1'] != corr_pairs['Sector 2']].copy()

# Drop duplicate pairs: (A, B) and (B, A) share the same order-independent key
filtered_pairs['pair_key'] = filtered_pairs[['Sector 1', 'Sector 2']].apply(
    lambda row: tuple(sorted(row)), axis=1
)
filtered_pairs = filtered_pairs.drop_duplicates('pair_key').drop(columns='pair_key')

# Top correlations
print("\nTop 3 Positive Correlations:")
print(filtered_pairs.sort_values(by='Correlation', ascending=False).head(3))

# These are the three lowest coefficients; they are negative only if such pairs exist.
print("\nTop 3 Negative Correlations:")
print(filtered_pairs.sort_values(by='Correlation', ascending=True).head(3))


### Objective 5 ###
#----------------------------------------------------------------------------------------------------------------
## To analyze year-on-year changes in sector-wise revenue expenditure to
## detect growth patterns, stagnation, or sudden shifts in government spending priorities.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")
# .copy() so the 'srcYear' assignment below does not write into a slice of the
# loaded frame (same pattern as Objective 2).
df = df[df["Budget type"] == "Accounts"].copy()
df["srcYear"] = df["srcYear"].astype(str)

# Choose sectors to analyze
sectors = [
    "Education, sports, art and culture",
    "Medical and public health",
    "Agriculture and allied activities",
    "Energy",
    "Transport and communications",
]

# Absolute year-on-year change (in percent) above which a shift is reported
SHIFT_THRESHOLD_PCT = 20

# Group by year and calculate average expenditure
yearly_avg = df.groupby("srcYear")[sectors].mean()

# Compute year-on-year % change
yearly_pct_change = yearly_avg.pct_change() * 100

# 1. Line plots for actual year-on-year expenditure
plt.figure(figsize=(12, 6))
for sector in sectors:
    sns.lineplot(x=yearly_avg.index, y=yearly_avg[sector], label=sector)

plt.title("Year-on-Year Average Expenditure by Sector")
plt.xlabel("Financial Year")
plt.ylabel("Average Expenditure")
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 2. Line plots for % change (growth/stagnation detection)
plt.figure(figsize=(12, 6))
for sector in sectors:
    sns.lineplot(x=yearly_pct_change.index, y=yearly_pct_change[sector], label=sector)

plt.title("Year-on-Year % Change in Expenditure by Sector")
plt.xlabel("Financial Year")
plt.ylabel("Percent Change (%)")
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# 3. Identify major shifts (jumps/drops)
print("\nSudden Year-on-Year Changes:")
for sector in sectors:
    sector_change = yearly_pct_change[sector]
    big_changes = sector_change[sector_change.abs() > SHIFT_THRESHOLD_PCT]
    if not big_changes.empty:
        print(f"\n{sector}:")
        print(big_changes.round(2))