├── 1.png
├── 10.png
├── 11.png
├── 12.png
├── 13.png
├── 14.png
├── 15.png
├── 16.png
├── 17.png
├── 18.png
├── 19.png
├── 2.png
├── 20.png
├── 21.png
├── 22.png
├── 23.png
├── 24.png
├── 3.png
├── 4.png
├── 5.png
├── 6.png
├── 7.png
├── 8.png
├── 9.png
├── README.md
├── dataset.csv
└── project_python_file.py


/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/1.png


--------------------------------------------------------------------------------
/10.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/10.png


--------------------------------------------------------------------------------
/11.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/11.png


--------------------------------------------------------------------------------
/12.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/12.png


--------------------------------------------------------------------------------
/13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/13.png


--------------------------------------------------------------------------------
/14.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/14.png


--------------------------------------------------------------------------------
/15.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/15.png


--------------------------------------------------------------------------------
/16.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/16.png


--------------------------------------------------------------------------------
/17.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/17.png


--------------------------------------------------------------------------------
/18.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/18.png


--------------------------------------------------------------------------------
/19.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/19.png


--------------------------------------------------------------------------------
/2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/2.png


--------------------------------------------------------------------------------
/20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/20.png


--------------------------------------------------------------------------------
/21.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/21.png


--------------------------------------------------------------------------------
/22.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/22.png


--------------------------------------------------------------------------------
/23.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/23.png


--------------------------------------------------------------------------------
/24.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/24.png


--------------------------------------------------------------------------------
/3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/3.png


--------------------------------------------------------------------------------
/4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/4.png


--------------------------------------------------------------------------------
/5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/5.png


--------------------------------------------------------------------------------
/6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/6.png


--------------------------------------------------------------------------------
/7.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/7.png


--------------------------------------------------------------------------------
/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/8.png


--------------------------------------------------------------------------------
/9.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShubhamKumar284/Data_Science-with-python-library/6373e5b1fdd2cd9ac71da9346f205eb7d74b0244/9.png


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Sector-wise Revenue Expenditure Analysis of Indian States and UTs
 2 | This project explores, analyzes, and visualizes sector-wise revenue expenditure across Indian states and union territories. Using Python's data science stack, the goal is to uncover trends, compare regional allocations, test statistical hypotheses, and detect relationships in spending patterns.
 3 | 
 4 | ## 📌 Objectives
 5 | 1. Generate descriptive statistics and identify key trends, distributions, and inconsistencies in sector-wise revenue expenditure.
 6 | 2. Design and implement visual representations using Matplotlib and Seaborn for comparing sector-wise allocations across years and regions.
 7 | 3. Conduct statistical hypothesis testing to evaluate significant differences in revenue expenditure patterns between selected sectors or regions.
 8 | 4. Uncover relationships and correlations among sectors through numerical and visual statistical analysis.
 9 | 5. Analyze year-on-year changes in sector-wise revenue expenditure to detect growth patterns, stagnation, or shifts in spending priorities.
10 | 
11 | ## 🛠️ Technologies Used
12 | - Python
13 | - Pandas
14 | - NumPy
15 | - Matplotlib
16 | - Seaborn
17 | - SciPy / StatsModels
18 | 
19 | 
20 | 


--------------------------------------------------------------------------------
/project_python_file.py:
--------------------------------------------------------------------------------
  1 | ### EDA ###
  2 | #---------------------------------------------------------------------------------------------------------------
  3 | 
  4 | import pandas as pd
  5 | import numpy as np
  6 | import matplotlib.pyplot as plt
  7 | import seaborn as sns
  8 | 
  9 | df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")
 10 | 
 11 | print(df.columns)                  # Column names
 12 | print(df.shape)                    # Number of rows and columns
 13 | print(df.info())                   # Data types and non-null counts
 14 | print(df.describe)                 # Summary for numerical columns
 15 | print(df.head())                   # First five records
 16 | print(df.tail())                   # Last five records
 17 | print(df.isnull().sum())           # Total missing values per column
 18 | print(df.duplicated().sum())       # Check for duplicate rows
 19 | print(df.dropna())                 # Remove missing/duplicate values
 20 | print(df.fillna(method='ffill'))   # Fill missing/duplicate values
 21 | 
 22 | 
 23 | 
 24 | 
 25 | 
 26 | ### Objective 1 ###
 27 | # --------------------------------------------------------------------------------------------------------------
 28 | # To generate descriptive statistics and identify key trends, distributions, and inconsistencies in 
 29 | # sector-wise revenue expenditure across states and union territories.
 30 | 
 31 | import numpy as np
 32 | import pandas as pd
 33 | import matplotlib.pyplot as plt
 34 | import seaborn as sns
 35 | 
 36 | # Load the dataset
 37 | data = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")
 38 | 
 39 | sector_columns = data.columns[3:78]
 40 | 
 41 | # 1. Generate basic summary statistics
 42 | summary = data[sector_columns].describe()
 43 | print("Sector-wise Descriptive Statistics:\n", summary)
 44 | 
 45 | # 2. Analyze missing values
 46 | missing_info = data[sector_columns].isnull().sum()
 47 | missing_info = missing_info[missing_info > 0].sort_values(ascending=False)
 48 | print("\nMissing Value Count by Sector:\n", missing_info)
 49 | 
 50 | # 3. Visualize distributions of selected expenditure sectors
 51 | selected = [
 52 |     "Education, sports, art and culture",
 53 |     "Medical and public health",
 54 |     "Agriculture and allied activities",
 55 |     "Energy",
 56 |     "Transport and communications"
 57 | ]
 58 | 
 59 | data[selected].hist(bins=25, figsize=(8, 6), color='red', edgecolor='black')
 60 | plt.suptitle("Distribution of Expenditures in Key Sectors")
 61 | plt.tight_layout()
 62 | plt.show()
 63 | 
 64 | # 5. Trend analysis: Mean total expenditure by year
 65 | data["srcYear"] = data["srcYear"].astype(str)
 66 | avg_expenditure_by_year = data.groupby("srcYear")["Total expenditure"].mean()
 67 | 
 68 | plt.figure(figsize=(8, 6))
 69 | sns.lineplot(x=avg_expenditure_by_year.index, y=avg_expenditure_by_year.values, marker='o', color='blue')
 70 | plt.title("Mean Total Expenditure Across Financial Years")
 71 | plt.xlabel("Financial Year")
 72 | plt.ylabel("Average Expenditure")
 73 | plt.xticks(rotation=45)
 74 | plt.grid(True)
 75 | plt.tight_layout()
 76 | plt.show()
 77 | 
 78 | 
 79 | 
 80 | 
 81 | 
 82 | ### Objective 2 ###
 83 | #---------------------------------------------------------------------------------------------------------------
 84 | ## To design and implement visual representations using Matplotlib and Seaborn for 
 85 | ## comparing sector-wise allocations across years and regions.
 86 | 
 87 | import numpy as np
 88 | import pandas as pd
 89 | import matplotlib.pyplot as plt
 90 | import seaborn as sns
 91 | 
 92 | # Load the data
 93 | df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")
 94 | 
 95 | # Filter to use 'Accounts' for consistency
 96 | filtered_data = df[df["Budget type"] == "Accounts"].copy()
 97 | 
 98 | # Ensure 'srcYear' is string type for plotting
 99 | filtered_data["srcYear"] = filtered_data["srcYear"].astype(str)
100 | 
101 | # Set a few important sectors for visual comparison
102 | important_sectors = [
103 |     "Education, sports, art and culture",
104 |     "Medical and public health",
105 |     "Agriculture and allied activities"
106 | ]
107 | 
108 | # 1. Barplot: Year-wise sector expenditure
109 | 
110 | sector_yearly = filtered_data.groupby("srcYear")[important_sectors].mean()
111 | sector_yearly.plot(kind='bar', figsize=(12, 6), colormap="Set2")
112 | plt.title("Average Sector-wise Expenditure Over Years")
113 | plt.ylabel("Average Expenditure")
114 | plt.xlabel("Financial Year")
115 | plt.show()
116 | 
117 | # 3. Lineplot: Trend in one sector over years across states
118 | 
119 | selected_states = ['MAHARASHTRA', 'BIHAR', 'KARNATAKA', 'TAMIL NADU']
120 | plt.figure(figsize=(8, 6))
121 | for state in selected_states:
122 |     state_data = filtered_data[filtered_data["srcStateName"] == state]
123 |     sns.lineplot(
124 |         x="srcYear",
125 |         y="Education, sports, art and culture",
126 |         data=state_data,
127 |         label=state,
128 |         marker='o'
129 |     )
130 | 
131 | plt.title("Education Sector Trend Over Time (Selected States)")
132 | plt.xlabel("Financial Year")
133 | plt.ylabel("Expenditure")
134 | plt.xticks(rotation=45)
135 | plt.legend(title="State")
136 | plt.grid(True)
137 | plt.tight_layout()
138 | plt.show()
139 | 
140 | 
141 | 
142 | 
143 | ### Objective 3 ###
144 | #-------------------------------------------------------------------------------------------------------------------
145 | ## To conduct statistical hypothesis testing to evaluate significant differences in 
146 | ## revenue expenditure patterns between selected sectors or regions.
147 | 
148 | import numpy as np
149 | import pandas as pd
150 | import scipy.stats as stats
151 | import seaborn as sns
152 | import matplotlib.pyplot as plt
153 | 
154 | 
155 | df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")
156 | df = df[df["Budget type"] == "Accounts"]
157 | 
158 | # Drop rows with missing data for target sectors
159 | df_clean = df.dropna(subset=[
160 |     "Education, sports, art and culture",
161 |     "Medical and public health",
162 |     "Agriculture and allied activities"
163 | ])
164 | 
165 | # 1. T-Test: Education vs Medical Expenditure
166 | edu = df_clean["Education, sports, art and culture"]
167 | med = df_clean["Medical and public health"]
168 | 
169 | t_stat1, p_val1 = stats.ttest_ind(edu, med, equal_var=False)
170 | print("T-Test: Education vs Medical Expenditure")
171 | print(f"T-Statistic: {t_stat1:.3f}, P-Value: {p_val1:.4f}")
172 | if p_val1 < 0.05:
173 |     print("Significant difference exists between education and medical sector spending.\n")
174 | else:
175 |     print("No significant difference found between education and medical sector spending.\n")
176 | 
177 | # Boxplot: Education vs Medical
178 | plt.figure(figsize=(8, 6))
179 | sns.boxplot(data=df_clean[["Education, sports, art and culture", "Medical and public health"]], orient='h')
180 | plt.title("Boxplot: Education vs Medical Expenditure")
181 | plt.xlabel("Expenditure")
182 | plt.yticks(ticks=[0, 1], labels=["Education", "Medical"])
183 | plt.tight_layout()
184 | plt.show()
185 | 
186 | # 2. T-Test: Agriculture in Tamil Nadu vs Karnataka
187 | agri1 = df_clean[df_clean["srcStateName"] == "TAMIL NADU"]["Agriculture and allied activities"]
188 | agri2 = df_clean[df_clean["srcStateName"] == "KARNATAKA"]["Agriculture and allied activities"]
189 | 
190 | t_stat2, p_val2 = stats.ttest_ind(agri1, agri2, equal_var=False)
191 | print("T-Test: Agriculture Expenditure - Tamil Nadu vs Karnataka")
192 | print(f"T-Statistic: {t_stat2:.3f}, P-Value: {p_val2:.4f}")
193 | if p_val2 < 0.05:
194 |     print("Significant difference in agriculture spending between Tamil Nadu and Karnataka.\n")
195 | else:
196 |     print("No significant difference in agriculture spending between the two states.\n")
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | ### Objective 4 ###
204 | #------------------------------------------------------------------------------------------------------------------
205 | ## To uncover relationships and correlations among sectors based on 
206 | ## revenue allocation patterns through numerical and visual statistical analysis.
207 | 
208 | import numpy as np
209 | import pandas as pd
210 | import seaborn as sns
211 | import matplotlib.pyplot as plt
212 | 
213 | df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")
214 | df = df[df["Budget type"] == "Accounts"]
215 | 
216 | sectors_to_analyze = [
217 |     "Education, sports, art and culture",
218 |     "Medical and public health",
219 |     "Agriculture and allied activities",
220 |     "Energy",
221 |     "Transport and communications",
222 |     "Social security and welfare",
223 |     "Urban development"
224 | ]
225 | 
226 | # Drop rows with missing values for the selected sectors
227 | df_corr = df[sectors_to_analyze].dropna()
228 | 
229 | # 1. Compute the correlation matrix
230 | correlation_matrix = df_corr.corr()
231 | print("Correlation Matrix:\n")
232 | print(correlation_matrix)
233 | 
234 | # 2. Heatmap of correlations
235 | plt.figure(figsize=(8, 6))
236 | sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
237 | plt.title("Correlation Heatmap: Sector-wise Revenue Allocation")
238 | plt.tight_layout()
239 | plt.show()
240 | 
241 | # 3. Pairplot (optional for deep exploration)
242 | sns.pairplot(df_corr)
243 | plt.suptitle("Pairwise Sector Relationships", y=1.02)
244 | plt.show()
245 | 
246 | # 4. Find top 3 positively and negatively correlated pairs
247 | # Flatten matrix, remove self-correlations, sort
248 | corr_pairs = correlation_matrix.unstack().reset_index()
249 | corr_pairs.columns = ['Sector 1', 'Sector 2', 'Correlation']
250 | filtered_pairs = corr_pairs[corr_pairs['Sector 1'] != corr_pairs['Sector 2']]
251 | 
252 | # Drop duplicate pairs
253 | filtered_pairs['pair_key'] = filtered_pairs[['Sector 1', 'Sector 2']].apply(lambda row: tuple(sorted(row)), axis=1)
254 | filtered_pairs = filtered_pairs.drop_duplicates('pair_key').drop(columns='pair_key')
255 | 
256 | # Top correlations
257 | print("\nTop 3 Positive Correlations:")
258 | print(filtered_pairs.sort_values(by='Correlation', ascending=False).head(3))
259 | 
260 | print("\nTop 3 Negative Correlations:")
261 | print(filtered_pairs.sort_values(by='Correlation', ascending=True).head(3))
262 | 
263 | 
264 | 
265 | 
266 | ### Objective 5 ###
267 | #----------------------------------------------------------------------------------------------------------------
268 | ## To analyze year-on-year changes in sector-wise revenue expenditure to 
269 | ## detect growth patterns, stagnation, or sudden shifts in government spending priorities.
270 | 
271 | import numpy as np
272 | import pandas as pd
273 | import matplotlib.pyplot as plt
274 | import seaborn as sns
275 | 
276 | df = pd.read_csv(r"C:\Users\shubh\Downloads\7567_all_files\ZIP\7567\7567_source_data.csv")
277 | df = df[df["Budget type"] == "Accounts"]
278 | df["srcYear"] = df["srcYear"].astype(str)
279 | 
280 | # Choose sectors to analyze
281 | sectors = [
282 |     "Education, sports, art and culture",
283 |     "Medical and public health",
284 |     "Agriculture and allied activities",
285 |     "Energy",
286 |     "Transport and communications"
287 | ]
288 | 
289 | # Group by year and calculate average expenditure
290 | yearly_avg = df.groupby("srcYear")[sectors].mean()
291 | 
292 | # Compute year-on-year % change
293 | yearly_pct_change = yearly_avg.pct_change() * 100
294 | 
295 | # 1. Line plots for actual year-on-year expenditure
296 | plt.figure(figsize=(12, 6))
297 | for sector in sectors:
298 |     sns.lineplot(x=yearly_avg.index, y=yearly_avg[sector], label=sector)
299 | 
300 | plt.title("Year-on-Year Average Expenditure by Sector")
301 | plt.xlabel("Financial Year")
302 | plt.ylabel("Average Expenditure")
303 | plt.legend()
304 | plt.xticks(rotation=45)
305 | plt.tight_layout()
306 | plt.show()
307 | 
308 | # 2. Line plots for % change (growth/stagnation detection)
309 | plt.figure(figsize=(12, 6))
310 | for sector in sectors:
311 |     sns.lineplot(x=yearly_pct_change.index, y=yearly_pct_change[sector], label=sector)
312 | 
313 | plt.title("Year-on-Year % Change in Expenditure by Sector")
314 | plt.xlabel("Financial Year")
315 | plt.ylabel("Percent Change (%)")
316 | plt.legend()
317 | plt.xticks(rotation=45)
318 | plt.tight_layout()
319 | plt.show()
320 | 
321 | # 3. Identify major shifts (jumps/drops)
322 | print("\nSudden Year-on-Year Changes:")
323 | for sector in sectors:
324 |     big_changes = yearly_pct_change[abs(yearly_pct_change[sector]) > 20][sector]
325 |     if not big_changes.empty:
326 |         print(f"\n{sector}:")
327 |         print(big_changes.round(2))
328 | 


--------------------------------------------------------------------------------