├── README.md
├── hello.txt
├── new.py
├── pythonproject.py.py
├── pythonproject1.py.py
└── pythonproject2.py.py


/README.md:
--------------------------------------------------------------------------------
1 | # Python-EDA  
2 | 


--------------------------------------------------------------------------------
/hello.txt:
--------------------------------------------------------------------------------
1 | This is my first GitHub push
2 | 


--------------------------------------------------------------------------------
/new.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import seaborn as sns
 3 | import matplotlib.pyplot as plt
 4 | from scipy.stats import pearsonr
 5 | 
 6 | # Load dataset
 7 | df = pd.read_csv("33_Constituency_Wise_Detailed_Result.csv")
 8 | 
 9 | # Fix: Remove leading/trailing spaces from all column names
10 | df.columns = df.columns.str.strip()
11 | 
12 | # Now define correct column names (no space needed at end)
13 | x = 'Total Votes Polled In The Constituency'
14 | y = 'Total Electors'
15 | 
16 | # Scatter Plot with Regression Line
17 | plt.figure(figsize=(8, 5))
18 | sns.regplot(x=df[x], y=df[y], scatter_kws={'color':'blue'}, line_kws={"color":"red"})
19 | plt.title(f"Scatter Plot with Linear Regression ({x} vs {y})")
20 | plt.xlabel(x)
21 | plt.ylabel(y)
22 | plt.grid(True)
23 | plt.show()
24 | 
25 | # Pearson Correlation Coefficient
26 | correlation, p_value = pearsonr(df[x].dropna(), df[y].dropna())
27 | print(f"Pearson Correlation Coefficient between {x} and {y}: {correlation:.3f}")
28 | print(f"P-value: {p_value:.5f}")
29 | 


--------------------------------------------------------------------------------
/pythonproject.py.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import seaborn as sns
 5 | 
 6 | df = pd.read_csv("33_Constituency_Wise_Detailed_Result.csv")
 7 | print("First 5 Rows:")
 8 | print(df.head())
 9 | print("\nLast 5 Rows:")
10 | print(df.tail())
11 | print("\nShape of Dataset:", df.shape)
12 | print("\nData Types and Non-Null Counts:")
13 | print(df.info())
14 | print("\nSummary Statistics:")
15 | print(df.describe(include='all'))
16 | 
17 | print("\nColumn Names:")
18 | print(df.columns.tolist())
19 | print("\nMissing Values:")
20 | print(df.isnull().sum())
21 | 
22 | # Visualizing missing value plt.figure(figsize=(12, 6))
23 | sns.heatmap(df.isnull(), cmap="YlGnBu", cbar=False)
24 | plt.title("Missing Values Heatmap")
25 | plt.show()
26 | # Fill 'Gender' and 'Category' with mode
27 | df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)
28 | df['Category'].fillna(df['Category'].mode()[0], inplace=True)
29 | 
30 | # Fill 'Age' with median
31 | df['Age'].fillna(df['Age'].median(), inplace=True)
32 | 
33 | # Confirm no missing values remain
34 | print(df.isnull().sum())
35 | 
36 | duplicate_count = df.duplicated().sum()
37 | print(f"Number of duplicate rows: {duplicate_count}")
38 | 
39 | gender_counts = df['Gender'].value_counts()
40 | plt.figure(figsize=(6, 6))
41 | plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=140)
42 | plt.title("Gender Distribution")
43 | plt.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle.
44 | plt.show()
45 | 
46 | plt.figure(figsize=(8, 4))
47 | sns.histplot(df['Age'], kde=True, bins=20)
48 | plt.title("Distribution of Age")
49 | plt.xlabel("Age")
50 | plt.ylabel("Count")
51 | plt.show()
52 | 


--------------------------------------------------------------------------------
/pythonproject1.py.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import seaborn as sns
 5 | 
 6 | 
 7 | df = pd.read_csv("33_Constituency_Wise_Detailed_Result.csv")
 8 | numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
 9 | 
10 | # Histograms
11 | df[numerical_cols].hist(figsize=(15, 10), bins=20)
12 | plt.suptitle("Histograms of Numerical Features")
13 | plt.show()
14 | 
15 | # Boxplots
16 | for col in numerical_cols:
17 |     plt.figure(figsize=(6, 1.5))
18 |     sns.boxplot(x=df[col])
19 |     plt.title(f"Boxplot of {col}")
20 |     plt.show()
21 | categorical_cols = df.select_dtypes(include='object').columns.tolist()
22 | 
23 | for col in categorical_cols:
24 |     plt.figure(figsize=(8, 4))
25 |     sns.countplot(y=df[col], order=df[col].value_counts().index)
26 |     plt.title(f"Count Plot of {col}")
27 |     plt.tight_layout()
28 |     plt.show()
29 | 
30 | 
31 | plt.figure(figsize=(10, 6))
32 | sns.heatmap(df[numerical_cols].corr(), annot=True, cmap='coolwarm', fmt=".2f")
33 | plt.title("Correlation Heatmap")
34 | plt.show()
35 | 
36 | for cat in categorical_cols:
37 |     for num in numerical_cols:
38 |         plt.figure(figsize=(8, 4))
39 |         sns.boxplot(x=df[cat], y=df[num])
40 |         plt.title(f"{num} by {cat}")
41 |         plt.xticks(rotation=45)
42 |         plt.tight_layout()
43 |         plt.show()
44 | 


--------------------------------------------------------------------------------
/pythonproject2.py.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import matplotlib.pyplot as plt
 4 | import seaborn as sns
 5 | 
 6 | df = pd.read_csv("33_Constituency_Wise_Detailed_Result.csv")
 7 | 
 8 | print("Shape of dataset:", df.shape)
 9 | print("Data types:\n", df.dtypes)
10 | print("Numerical columns:\n", df.select_dtypes(include=np.number).columns.tolist())
11 | 
12 | Q1 = df['Age'].quantile(0.25)
13 | Q3 = df['Age'].quantile(0.75)
14 | IQR = Q3 - Q1
15 | lower = Q1 - 1.5 * IQR
16 | upper = Q3 + 1.5 * IQR
17 | 
18 | df_filtered = df[(df['Age'] >= lower) & (df['Age'] <= upper)]
19 | 
20 | df['Age'] = np.where(df['Age'] < lower, lower,
21 |               np.where(df['Age'] > upper, upper, df['Age']))
22 | 
23 | df['Log_Age'] = np.log1p(df['Age'])
24 | 
25 | plt.figure(figsize=(6, 1.5))
26 | sns.boxplot(x=df_filtered['Age'])
27 | plt.title("Boxplot Before Handling Outliers in Age")
28 | plt.show()
29 | 
30 | plt.figure(figsize=(6, 1.5))
31 | sns.boxplot(x=df['Age'])
32 | plt.title("Boxplot After Handling Outliers in Age")
33 | plt.show()
34 | 
35 | plt.figure(figsize=(6, 1.5))
36 | sns.boxplot(x=df['Log_Age'])
37 | plt.title("Boxplot of Log-Transformed Age")
38 | plt.show()
39 | 
40 | print("Skewness Before (filtered):", df_filtered['Age'].skew())
41 | print("Skewness After (capped):", df['Age'].skew())
42 | print("Skewness After Log Transform:", df['Log_Age'].skew())
43 | 


--------------------------------------------------------------------------------