├── README.md
└── Project.py


/README.md:
--------------------------------------------------------------------------------
1 | # Election-Data-Analysis
2 | This project involves an in-depth analysis of a synthetic Indian election dataset to uncover insights about voter behavior, party performance, and regional election outcomes.
3 | 


--------------------------------------------------------------------------------
/Project.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import matplotlib.pyplot as plt
  3 | import seaborn as sns
  4 | import numpy as np
  5 | 
  6 | 
  7 | #load and clean the data from xlsx file
  8 | 
  9 | df = pd.read_csv('C:/Users/aryan/OneDrive/Desktop/temp/32_Constituency_Data_Summary_Report.csv')
 10 | 
 11 | print("Initial Data Overview:")
 12 | print(df.info())
 13 | 
 14 | # 1. Missing Values
 15 | print("\nMissing Values:")
 16 | print(df.isnull().sum())
 17 | 
 18 | df['Men'].fillna(df['Men'].median(), inplace=True)
 19 | df['Women'].fillna(df['Women'].median(), inplace=True)
 20 | df['Third Gender'].fillna(df['Third Gender'].median(), inplace=True)
 21 | 
 22 | # 2. Data Integrity Checks
 23 | print("\nData Types Integrity:")
 24 | print(df.dtypes)
 25 | 
 26 | # 3. Outlier Detection using IQR method
 27 | def detect_outliers(df, column):
 28 |     Q1 = df[column].quantile(0.25) 
 29 |     Q3 = df[column].quantile(0.75)
 30 |     IQR = Q3 - Q1
 31 |     lower_bound = Q1 - 1.5 * IQR
 32 |     upper_bound = Q3 + 1.5 * IQR
 33 |     return df[(df[column] < lower_bound) | (df[column] > upper_bound)]
 34 | 
 35 | outliers_men = detect_outliers(df, 'Men')
 36 | outliers_women = detect_outliers(df, 'Women')
 37 | outliers_third_gender = detect_outliers(df, 'Third Gender')
 38 | 
 39 | print("\nOutliers in 'Men' column:")
 40 | print(outliers_men)
 41 | print("\nOutliers in 'Women' column:")
 42 | print(outliers_women)
 43 | print("\nOutliers in 'Third Gender' column:")
 44 | print(outliers_third_gender)
 45 | 
 46 | 
 47 | 
 48 | # 4. Duplicate Check
 49 | duplicates = df[df.duplicated()]
 50 | print("\nDuplicate Rows:")
 51 | print(duplicates)
 52 | 
 53 | df.drop_duplicates(inplace=True)
 54 | 
 55 | # 5. EDA - Basic Statistics
 56 | print("\nBasic Statistics:")
 57 | print(df.describe())
 58 | 
 59 | # Visualize outliers in Men, Women, and Third Gender columns
 60 | plt.figure(figsize=(12, 6))
 61 | 
 62 | plt.subplot(1, 3, 1)
 63 | plt.boxplot(df['Men'])
 64 | plt.title('Outlier Detection: Men')
 65 | 
 66 | plt.subplot(1, 3, 2)
 67 | plt.boxplot(df['Women'])
 68 | plt.title('Outlier Detection: Women')
 69 | 
 70 | plt.subplot(1, 3, 3)
 71 | plt.boxplot(df['Third Gender'])
 72 | plt.title('Outlier Detection: Third Gender')
 73 | 
 74 | plt.tight_layout()
 75 | plt.show()
 76 | 
 77 | # --- Objective 1: Candidate Category Analysis ---
 78 | 
 79 | # Filter relevant candidate categories
 80 | candidate_categories = [
 81 |     'Candidates - Nominated',
 82 |     'Candidates - Nomination Rejected',
 83 |     'Candidates - Withdrawn',
 84 |     'Candidates - Contested'
 85 | ]
 86 | 
 87 | candidates_df = df[df['Category'].isin(candidate_categories)]
 88 | 
 89 | # Group by category and sum gender-wise counts
 90 | grouped = candidates_df.groupby('Category')[['Men', 'Women', 'Third Gender']].sum()
 91 | 
 92 | # Plotting with custom colors
 93 | colors = ['#1f77b4', '#ff7f0e', '#d62728']  # Blue for Men, Orange for Women, Red for Third Gender
 94 | grouped.plot(kind='bar', stacked=True, figsize=(10, 6), color=colors)
 95 | plt.title('Gender-wise Distribution of Candidates by Category')
 96 | plt.ylabel('Number of Candidates')
 97 | plt.xlabel('Candidate Category')
 98 | plt.xticks(rotation=45)
 99 | plt.legend(title='Gender')
100 | plt.tight_layout()
101 | plt.show()
102 | 
# --- Objective 2: Voter Data Analysis (General, Overseas, Proxy, Postal) --- #

# Keep the rows whose Category mentions 'Voters'. na=False counts a missing
# Category as "no match"; without it the mask would contain NaN and boolean
# indexing would raise. regex=False matches the text literally.
voters_data = df[df['Category'].str.contains('Voters', na=False, regex=False)]

# Select the gender columns before summing, so that any other columns the
# CSV may carry are not aggregated along with them.
voters_data_grouped = voters_data.groupby('Category')[['Men', 'Women', 'Third Gender']].sum()

# Total voters per category = Men + Women + Third Gender
voters_data_grouped['Total Voters'] = voters_data_grouped.sum(axis=1)

# Drop categories with no voters
filtered_voters_data = voters_data_grouped[voters_data_grouped['Total Voters'] > 0]

print("\nFiltered Total Voters by Category:")
print(filtered_voters_data)

# Plot only when at least one category has voters; this mirrors the
# fallback handling used for the other objectives in this script.
if not filtered_voters_data.empty:
    plt.figure(figsize=(10, 6))
    filtered_voters_data['Total Voters'].plot(kind='bar', color='lightcoral')

    plt.title('Total Number of Voters by Category')
    plt.xlabel('Category')
    plt.ylabel('Total Voters')
    plt.xticks(rotation=45)

    # Write each bar's value just above it; bars sit at x = 0, 1, 2, ...
    for index, value in enumerate(filtered_voters_data['Total Voters']):
        plt.text(index, value, str(int(value)), ha='center', va='bottom')

    plt.tight_layout()
    plt.show()
else:
    print("\nNo data available for Voters in the selected categories.")
145 | 
# --- Objective 3: Invalid & Deducted Votes Analysis ---

# Categories of interest
invalid_deducted_categories = [
    'Votes - Total Deducted Votes From Evm',
    'Votes - Postal Votes Deducted',
]

# Keep only the rows that belong to those categories
invalid_deducted_df = df[df['Category'].isin(invalid_deducted_categories)]

# Sum the gender columns per category, then collapse them into one total
# per category (no gender breakdown).
invalid_deducted_total = (
    invalid_deducted_df
    .groupby('Category')[['Men', 'Women', 'Third Gender']]
    .sum()
    .sum(axis=1)
)

print("\nTotal Invalid & Deducted Votes by Category (Without Gender Breakdown):")
print(invalid_deducted_total)

# Plot when there is something to show; otherwise print a fallback message.
# The sum of an empty result is 0, so a missing category also lands in the
# else branch.
if invalid_deducted_total.sum() > 0:
    plt.figure(figsize=(10, 6))
    invalid_deducted_total.plot(kind='bar', color='lightcoral')

    plt.title('Total Invalid & Deducted Votes by Category')
    plt.xlabel('Category')
    plt.ylabel('Total Invalid Votes')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
else:
    # This line must be indented under the else; at column 0 the script
    # fails to parse with an IndentationError.
    print("\nNo data available for Invalid or Deducted Votes in the selected categories.")
177 | 
178 | 
# --- Objective 4: Gender Representation in Elections ---

# Rows describing candidates who actually contested
is_contested = df['Category'].eq('Candidates - Contested')
contested_candidates = df[is_contested]

# Total contested candidates per gender column
gender_columns = ['Men', 'Women', 'Third Gender']
gender_representation = contested_candidates[gender_columns].sum()

print("\nTotal Candidates Contested by Gender:", gender_representation, sep="\n")

# One bar per gender, each in its own color
plt.figure(figsize=(8, 6))
colors = ['skyblue', 'orange', 'red']
ax = gender_representation.plot(kind='bar', color=colors)
ax.set_title('Gender Representation Among Contesting Candidates')
ax.set_xlabel('Gender')
ax.set_ylabel('Number of Candidates')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()
205 | 
# --- Objective 5: NOTA Votes Analysis ---

# Rows whose Category matches the NOTA label; the match ignores case and
# treats a missing Category as no match.
is_nota_row = df['Category'].str.contains("Votes - Votes Polled For 'Nota'", case=False, na=False)
nota_df = df[is_nota_row]

if nota_df.empty:
    print("\n No data available for NOTA votes in the dataset.")
else:
    # Per-gender NOTA totals; these three values are what gets plotted.
    nota_by_gender = nota_df[['Men', 'Women', 'Third Gender']].sum()

    # The printed report is the per-gender totals plus a grand-total entry.
    total_nota_votes = nota_by_gender.copy()
    total_nota_votes['Total NOTA Votes'] = nota_by_gender.sum()
    print("\nTotal NOTA Votes by Gender:")
    print(total_nota_votes)

    # Bar chart of the per-gender totals only (grand total left out)
    plt.figure(figsize=(8, 6))
    nota_by_gender.plot(kind='bar', color=['skyblue', 'lightgreen', 'salmon'])
    plt.title("NOTA Votes by Gender")
    plt.ylabel("Number of Votes")
    plt.xlabel("Gender")
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.show()
233 | 
234 | 
235 | 


--------------------------------------------------------------------------------