"""Python EDA on GDP per 1000 hours worked.

This project explores global economic productivity through GDP per 1000
hours worked, with a focus on India and world comparisons.

Files included
--------------
- ``PythonEDA.py``: Python script for data analysis and visualization.
- ``gdp_over_hours_worked.csv``: dataset containing GDP and working-hours data.

Features
--------
- Data cleaning and transformation
- Visualizations (bar plots, scatter plots)
- Regression modeling
- Country-level comparisons (with emphasis on India)

How to run
----------
Run the script using Python::

    python PythonEDA.py

By default the dataset is read from the directory containing this script; a
different CSV and analysis year can be supplied::

    python PythonEDA.py path/to/gdp_over_hours_worked.csv --year 2021
"""

import argparse
from pathlib import Path

import numpy as np
import pandas as pd

# matplotlib, seaborn and scikit-learn are imported inside the functions that
# need them so the data-preparation helpers below can be imported (and tested)
# without a plotting backend or scikit-learn being installed.

# The dataset is expected next to this script unless a path is given.
DEFAULT_CSV = Path(__file__).resolve().with_name("gdp_over_hours_worked.csv")

# Year used for the single-year rankings and comparisons.
ANALYSIS_YEAR = 2021

# Number of countries shown in the top/bottom productivity rankings.
TOP_N = 10

PRODUCTIVITY_COL = "gdp_over_k_hours_worked"

# Columns whose missing values are replaced by the column median.
NUMERIC_COLUMNS = [
    "gdp", "hours_worked", "pop", "gdp_over_k_hours_worked",
    "gdp_ppp_c", "gdp_ppp", "gdp_c", "unemployment_r",
    "pop_over_65", "working_age_pop_pct", "employment_rate",
    "hours_per_employed", "employed", "total_hours_alternative",
    "gdp_ppp_over_k_hours_worked", "gdp_over_pop",
    "gdp_ppp_over_pop", "gdp_ppp_over_labor_force",
    "gdp_ppp_over_pop_c", "gdp_over_pop_c",
    "gdp_ppp_over_k_hours_worked_c",
    "labor_force",
]

REGRESSION_FEATURES = ["gdp", "hours_worked", "pop", "labor_force"]
WORLD_CORRELATION_COLUMNS = ["gdp", "hours_worked", "gdp_over_k_hours_worked", "pop"]

# NOTE(review): "UK" only matches if the dataset uses that exact spelling
# (rather than "United Kingdom") -- confirm against the CSV.
DEVELOPED = ["United States", "Germany", "UK", "France", "Japan", "Canada"]
DEVELOPING = ["India", "Nigeria", "Bangladesh", "Pakistan", "Kenya"]
GDP_COMPARISON_COUNTRIES = ["India", "China", "United States", "Germany"]


# --------------------------------------------------------------------------
# Data preparation (pure pandas; no plotting, no printing)
# --------------------------------------------------------------------------

def load_dataset(csv_path):
    """Read the CSV at *csv_path* and parse its ``year`` column as datetimes."""
    df = pd.read_csv(csv_path)
    df["year"] = pd.to_datetime(df["year"], format="%Y")
    return df


def fill_missing_with_median(df, columns):
    """Return a copy of *df* with NaNs in *columns* replaced by column medians.

    Only numeric columns are touched; names in *columns* that are absent from
    *df* are skipped. The input frame is left unmodified.
    """
    filled = df.copy()
    for col in columns:
        if col not in filled.columns:
            continue
        series = filled[col]
        if pd.api.types.is_numeric_dtype(series) and series.isna().any():
            # Assign back explicitly: ``df[col].fillna(..., inplace=True)``
            # operates on a temporary and does not update the frame under
            # pandas Copy-on-Write.
            filled[col] = series.fillna(series.median())
    return filled


def rank_countries_by_productivity(df, year, n=TOP_N, lowest=False):
    """Return the *n* most (or, with ``lowest=True``, least) productive countries.

    Rows are restricted to *year*, rows with missing values are dropped, and
    each country appears once. Sorting happens before de-duplication so the
    row kept for a duplicated country is its most extreme one in the
    requested direction.
    """
    in_year = df.loc[df["year"].dt.year == year, ["country", PRODUCTIVITY_COL]]
    ranked = in_year.dropna().sort_values(by=PRODUCTIVITY_COL, ascending=lowest)
    return ranked.drop_duplicates(subset="country").head(n)


def developed_vs_developing(df):
    """Return rows for the DEVELOPED/DEVELOPING countries with derived columns.

    Adds ``gdp_per_capita`` (``gdp / pop``) and ``group`` ("Developed" or
    "Developing") to a copy of the selected rows; *df* itself is not modified.
    """
    subset = df[df["country"].isin(DEVELOPED + DEVELOPING)].copy()
    subset["gdp_per_capita"] = subset["gdp"] / subset["pop"]
    subset["group"] = np.where(
        subset["country"].isin(DEVELOPED), "Developed", "Developing"
    )
    return subset


def gdp_share_of_world(df, country="India"):
    """Return *country*'s yearly GDP as a percentage of the summed GDP in *df*.

    The result has columns ``year``, ``gdp``, ``global_gdp`` and ``share``
    (in percent), ordered by year.
    """
    world = df.groupby("year")["gdp"].sum().reset_index(name="global_gdp")
    own = df.loc[df["country"] == country, ["year", "gdp"]].sort_values("year")
    share = own.merge(world, on="year")
    share["share"] = share["gdp"] / share["global_gdp"] * 100
    return share


def numeric_correlation(frame):
    """Correlate the float64/int64 columns of *frame* that vary and have no NaNs."""
    numeric = frame.select_dtypes(include=["float64", "int64"])
    varying = numeric.loc[:, numeric.nunique() > 1].dropna(axis=1, how="any")
    return varying.corr()


# --------------------------------------------------------------------------
# Modelling
# --------------------------------------------------------------------------

def evaluate_regression(df):
    """Fit a linear model for GDP per 1000 hours and print MSE and R-squared.

    Returns ``(y_test, y_pred)`` for the held-out 20% of rows.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    X_train, X_test, y_train, y_test = train_test_split(
        df[REGRESSION_FEATURES], df[PRODUCTIVITY_COL],
        test_size=0.2, random_state=42,
    )

    # Fit the scaler on the training rows only so no statistics from the
    # held-out rows leak into preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print("\n--- Linear Regression Evaluation ---")
    print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
    print("R-squared:", r2_score(y_test, y_pred))
    return y_test, y_pred


# --------------------------------------------------------------------------
# Plotting
# --------------------------------------------------------------------------

def _plotting():
    """Import the plotting stack on first use and return ``(plt, sb)``."""
    import matplotlib.pyplot as plt
    import seaborn as sb

    return plt, sb


def plot_actual_vs_predicted(y_test, y_pred):
    """Scatter the regression's predictions against the actual values."""
    plt, _ = _plotting()
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, alpha=0.7, color="teal", marker="x")
    plt.xlabel("Actual GDP per 1000 Hours")
    plt.ylabel("Predicted GDP per 1000 Hours")
    plt.title("Actual vs Predicted GDP/1000 Hours (Regression)")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


def plot_top_countries(top, year):
    """Horizontal bar chart of the most productive countries in *year*."""
    plt, sb = _plotting()
    plt.figure(figsize=(10, 5))
    sb.barplot(data=top, y="country", x=PRODUCTIVITY_COL, palette="Greens")
    plt.title(f"Top {TOP_N} Countries by GDP per 1000 Hours ({year})")
    plt.xlabel("GDP per 1000 Hours")
    plt.tight_layout()
    plt.show()


def plot_bottom_countries(bottom, year):
    """Vertical bar chart of the least productive countries in *year*."""
    plt, sb = _plotting()
    plt.figure(figsize=(10, 5))
    sb.barplot(data=bottom, x="country", y=PRODUCTIVITY_COL, palette="Reds")
    plt.title(f"Bottom {TOP_N} Countries by GDP per 1000 Hours ({year})")
    plt.xlabel("Country")
    plt.ylabel("GDP per 1000 Hours Worked")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


def plot_group_comparison(compare):
    """Line plot of GDP per capita over time for the two country groups."""
    plt, sb = _plotting()
    plt.figure(figsize=(10, 5))
    sb.lineplot(data=compare, x="year", y="gdp_per_capita", hue="group", palette="Set2")
    plt.title("GDP per Capita: Developed vs Developing")
    plt.tight_layout()
    plt.show()


def plot_gdp_comparison(selected, year):
    """Bar chart of GDP in *year* for the selected countries."""
    plt, sb = _plotting()
    plt.figure(figsize=(8, 6))
    sb.barplot(data=selected, x="country", y="gdp", palette="viridis")
    plt.title(f"GDP Comparison ({year})")
    plt.ylabel("GDP (USD)")
    plt.xlabel("Country")
    plt.tight_layout()
    plt.show()


def plot_time_series(years, values, title, ylabel, marker_color=None, **line_kwargs):
    """Plot *values* against *years* as a line, optionally with point markers.

    Extra keyword arguments go to ``plt.plot``; a legend is drawn only when a
    ``label`` is supplied.
    """
    plt, _ = _plotting()
    plt.figure(figsize=(10, 5))
    plt.plot(years, values, **line_kwargs)
    if marker_color is not None:
        plt.scatter(years, values, color=marker_color, s=50)
    plt.title(title)
    plt.xlabel("Year")
    plt.ylabel(ylabel)
    plt.grid(True)
    if "label" in line_kwargs:
        plt.legend()
    plt.tight_layout()
    plt.show()


def plot_correlation(corr, title, figsize, **heatmap_kwargs):
    """Draw an annotated heatmap of the correlation matrix *corr*."""
    if corr.empty:
        # An empty matrix cannot be drawn; report it instead of crashing.
        print(f"Skipping '{title}': no columns to correlate.")
        return
    plt, sb = _plotting()
    plt.figure(figsize=figsize)
    sb.heatmap(corr, annot=True, cmap="coolwarm", linewidths=0.5, **heatmap_kwargs)
    plt.title(title)
    plt.tight_layout()
    plt.show()


# --------------------------------------------------------------------------
# Entry point
# --------------------------------------------------------------------------

def _parse_args(argv):
    """Parse command-line arguments and check that the dataset exists."""
    parser = argparse.ArgumentParser(
        description="Exploratory analysis of GDP per 1000 hours worked."
    )
    parser.add_argument(
        "csv", nargs="?", type=Path, default=DEFAULT_CSV,
        help="path to gdp_over_hours_worked.csv (default: next to this script)",
    )
    parser.add_argument(
        "--year", type=int, default=ANALYSIS_YEAR,
        help="year used for rankings and the GDP comparison (default: 2021)",
    )
    args = parser.parse_args(argv)
    if not args.csv.is_file():
        parser.error(f"dataset not found: {args.csv}")
    return args


def main(argv=None):
    """Run the full analysis: cleaning report, regression, then all figures."""
    args = _parse_args(argv)
    year = args.year
    df = load_dataset(args.csv)

    # Data cleaning
    print("--- Missing Values Before Cleaning ---")
    print(df.isnull().sum())
    df = fill_missing_with_median(df, NUMERIC_COLUMNS)
    print("--- Missing Values After Cleaning ---")
    print(df.isnull().sum())

    # Sorted by year so the India line plots are drawn chronologically.
    india = df[df["country"] == "India"].sort_values("year")
    latest = df[df["year"].dt.year == year]

    # World: regression model
    y_test, y_pred = evaluate_regression(df)
    plot_actual_vs_predicted(y_test, y_pred)

    # World: efficiency ranking
    plot_top_countries(rank_countries_by_productivity(df, year), year)
    plot_bottom_countries(rank_countries_by_productivity(df, year, lowest=True), year)

    # World: developed vs developing GDP per capita
    plot_group_comparison(developed_vs_developing(df))

    # GDP comparison for the analysis year
    selected = latest[latest["country"].isin(GDP_COMPARISON_COUNTRIES)]
    plot_gdp_comparison(selected, year)

    # India: GDP trend over time
    plot_time_series(
        india["year"], india["gdp"], "India GDP Over the Years", "GDP",
        marker_color="navy", label="GDP", color="skyblue", linewidth=2,
    )

    # Correlation heatmaps (world, then India)
    plot_correlation(
        df[WORLD_CORRELATION_COLUMNS].corr(), "Correlation Matrix (World)", (8, 6)
    )
    plot_correlation(
        numeric_correlation(india), "Correlation Matrix (India)", (10, 6),
        vmin=-1, vmax=1,
    )

    # India: share of global GDP over time
    share = gdp_share_of_world(df, "India")
    plot_time_series(
        share["year"], share["share"],
        "India's Share of Global GDP Over Time (%)", "Share (%)",
        color="purple",
    )

    # India: unemployment rate (plotted instead of the employment rate, which
    # the original analysis found to be flat).
    # NOTE(review): the * 100 assumes unemployment_r is stored as a fraction
    # (0-1) -- confirm against the dataset.
    plot_time_series(
        india["year"], india["unemployment_r"] * 100,
        "India: Unemployment Rate Over Time", "Percentage",
        label="Unemployment Rate (%)", color="red", linewidth=2,
    )


if __name__ == "__main__":
    main()