# Repository layout:
#   ├── README.md
#   └── Project.py
#
# /README.md:
#   # Election-Data-Analysis
#   This project involves an in-depth analysis of a synthetic Indian election
#   dataset to uncover insights about voter behavior, party performance, and
#   regional election outcomes.
#
# /Project.py:
"""Exploratory analysis of a constituency data summary report.

The script reads a CSV with a ``Category`` column and gender-wise count
columns (``Men``, ``Women``, ``Third Gender``), then:

1. reports and fills missing gender counts with the column median,
2. lists IQR outliers and duplicate rows (duplicates are dropped),
3. prints basic statistics and draws box plots of the gender columns,
4. plots five category-based views: candidate categories, voter categories,
   deducted votes, contesting candidates by gender, and NOTA votes.

Set the ``ELECTION_DATA_CSV`` environment variable to read the data from a
location other than the default path.
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Default input location (machine-specific); override with ELECTION_DATA_CSV.
DEFAULT_DATA_PATH = 'C:/Users/aryan/OneDrive/Desktop/temp/32_Constituency_Data_Summary_Report.csv'
DATA_PATH = os.environ.get('ELECTION_DATA_CSV', DEFAULT_DATA_PATH)

# Gender-wise count columns used by every analysis step below.
GENDER_COLUMNS = ['Men', 'Women', 'Third Gender']


def detect_outliers(df, column):
    """Return the rows of *df* whose *column* value is an IQR outlier.

    A value is an outlier when it lies below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th/75th percentiles of the
    column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to inspect.
        column: Name of the numeric column to test.

    Returns:
        A DataFrame containing only the outlier rows (possibly empty).
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return df[(df[column] < lower_bound) | (df[column] > upper_bound)]


# --- Load and clean the data from the CSV file ---

df = pd.read_csv(DATA_PATH)

print("Initial Data Overview:")
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() would emit a stray "None" line.
df.info()

# 1. Missing Values
print("\nMissing Values:")
print(df.isnull().sum())

# Assign the filled column back instead of calling fillna(inplace=True) on
# df[column]: the in-place form acts on a temporary selection, which pandas
# 2.x warns about and which leaves df unchanged under Copy-on-Write.
for column in GENDER_COLUMNS:
    df[column] = df[column].fillna(df[column].median())

# 2. Data Integrity Checks
print("\nData Types Integrity:")
print(df.dtypes)

# 3. Outlier Detection using IQR method
for column in GENDER_COLUMNS:
    print(f"\nOutliers in '{column}' column:")
    print(detect_outliers(df, column))

# 4. Duplicate Check
duplicates = df[df.duplicated()]
print("\nDuplicate Rows:")
print(duplicates)

df = df.drop_duplicates()

# 5. EDA - Basic Statistics
print("\nBasic Statistics:")
print(df.describe())

# Visualize outliers in Men, Women, and Third Gender columns
plt.figure(figsize=(12, 6))
for position, column in enumerate(GENDER_COLUMNS, start=1):
    plt.subplot(1, len(GENDER_COLUMNS), position)
    plt.boxplot(df[column])
    plt.title(f'Outlier Detection: {column}')
plt.tight_layout()
plt.show()

# --- Objective 1: Candidate Category Analysis ---

# Filter relevant candidate categories
candidate_categories = [
    'Candidates - Nominated',
    'Candidates - Nomination Rejected',
    'Candidates - Withdrawn',
    'Candidates - Contested',
]

candidates_df = df[df['Category'].isin(candidate_categories)]

# Group by category and sum gender-wise counts
grouped = candidates_df.groupby('Category')[GENDER_COLUMNS].sum()

# Plotting an empty frame raises, so only plot when a category matched.
if not grouped.empty:
    # Blue for Men, Orange for Women, Red for Third Gender
    colors = ['#1f77b4', '#ff7f0e', '#d62728']
    grouped.plot(kind='bar', stacked=True, figsize=(10, 6), color=colors)
    plt.title('Gender-wise Distribution of Candidates by Category')
    plt.ylabel('Number of Candidates')
    plt.xlabel('Candidate Category')
    plt.xticks(rotation=45)
    plt.legend(title='Gender')
    plt.tight_layout()
    plt.show()
else:
    print("\nNo data available for the selected candidate categories.")

# --- Objective 2: Voter Data Analysis (General, Overseas, Proxy, Postal) ---

# Filter the data for Voters categories. na=False treats a missing Category
# as "no match" instead of producing a mask that cannot be used for indexing.
voters_data = df[df['Category'].str.contains('Voters', na=False)]

# Select the gender columns before summing so non-numeric columns are never
# aggregated.
voters_data_grouped = voters_data.groupby('Category')[GENDER_COLUMNS].sum()

# Calculate the total voters (sum of Men, Women, Third Gender)
voters_data_grouped['Total Voters'] = voters_data_grouped.sum(axis=1)

# Filter out categories with no voters
filtered_voters_data = voters_data_grouped[voters_data_grouped['Total Voters'] > 0]

# Display the filtered result for better clarity
print("\nFiltered Total Voters by Category:")
print(filtered_voters_data)

# Plot only when at least one category has voters; an empty bar plot raises.
if not filtered_voters_data.empty:
    plt.figure(figsize=(10, 6))

    # Plotting the total voters for each category
    filtered_voters_data['Total Voters'].plot(kind='bar', color='lightcoral')

    # Adding a title and labels
    plt.title('Total Number of Voters by Category')
    plt.xlabel('Category')
    plt.ylabel('Total Voters')

    # Keep the y-axis on a linear scale
    plt.yscale('linear')

    # Adjusting the xticks for better readability
    plt.xticks(rotation=45)

    # Adding data labels to bars for clarity
    for index, value in enumerate(filtered_voters_data['Total Voters']):
        plt.text(index, value, str(int(value)), ha='center', va='bottom')

    # Display the plot
    plt.tight_layout()
    plt.show()
else:
    print("\nNo data available for Voters categories.")

# --- Objective 3: Invalid & Deducted Votes Analysis ---

# Categories of interest
invalid_deducted_categories = [
    'Votes - Total Deducted Votes From Evm',
    'Votes - Postal Votes Deducted',
]

# Filter relevant data
invalid_deducted_df = df[df['Category'].isin(invalid_deducted_categories)]

# Calculate total invalid/deducted votes per category (all genders combined)
invalid_deducted_total = (
    invalid_deducted_df.groupby('Category')[GENDER_COLUMNS].sum().sum(axis=1)
)

# Display or show a fallback message
print("\nTotal Invalid & Deducted Votes by Category (Without Gender Breakdown):")
print(invalid_deducted_total)

# Plot if data is available
if invalid_deducted_total.sum() > 0:
    plt.figure(figsize=(10, 6))
    invalid_deducted_total.plot(kind='bar', color='lightcoral')

    plt.title('Total Invalid & Deducted Votes by Category')
    plt.xlabel('Category')
    plt.ylabel('Total Invalid Votes')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    print("\nNo data available for Invalid or Deducted Votes in the selected categories.")

# --- Objective 4: Gender Representation in Elections ---

# Filter data only for "Candidates - Contested"
contested_candidates = df[df['Category'] == 'Candidates - Contested']

# Sum gender-wise contested candidates
gender_representation = contested_candidates[GENDER_COLUMNS].sum()

# Display the result
print("\nTotal Candidates Contested by Gender:")
print(gender_representation)

# Plotting
plt.figure(figsize=(8, 6))
colors = ['skyblue', 'orange', 'red']
gender_representation.plot(kind='bar', color=colors)

plt.title('Gender Representation Among Contesting Candidates')
plt.xlabel('Gender')
plt.ylabel('Number of Candidates')
plt.xticks(rotation=0)
plt.tight_layout()

# Show the plot
plt.show()

# --- Objective 5: NOTA Votes Analysis ---

# Filter rows for NOTA votes (case-insensitive; missing categories never match)
nota_df = df[df['Category'].str.contains("Votes - Votes Polled For 'Nota'", case=False, na=False)]

# Check if data is available
if not nota_df.empty:
    # Sum NOTA votes across all matching rows, per gender
    total_nota_votes = nota_df[GENDER_COLUMNS].sum()

    # Calculate total and display
    total_nota_votes['Total NOTA Votes'] = total_nota_votes.sum()
    print("\nTotal NOTA Votes by Gender:")
    print(total_nota_votes)

    # Plot the per-gender values only; the grand total would dwarf the bars
    plt.figure(figsize=(8, 6))
    total_nota_votes.drop('Total NOTA Votes').plot(kind='bar', color=['skyblue', 'lightgreen', 'salmon'])
    plt.title("NOTA Votes by Gender")
    plt.ylabel("Number of Votes")
    plt.xlabel("Gender")
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()
else:
    print("\n No data available for NOTA votes in the dataset.")