├── README.md ├── hello.txt ├── new.py ├── pythonproject.py.py ├── pythonproject1.py.py └── pythonproject2.py.py /README.md: -------------------------------------------------------------------------------- 1 | # Python-EDA 2 | -------------------------------------------------------------------------------- /hello.txt: -------------------------------------------------------------------------------- 1 | This is my first Github push 2 | -------------------------------------------------------------------------------- /new.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import seaborn as sns 3 | import matplotlib.pyplot as plt 4 | from scipy.stats import pearsonr 5 | 6 | # Load dataset 7 | df = pd.read_csv("33_Constituency_Wise_Detailed_Result.csv") 8 | 9 | # Fix: Remove leading/trailing spaces from all column names 10 | df.columns = df.columns.str.strip() 11 | 12 | # Now define correct column names (no space needed at end) 13 | x = 'Total Votes Polled In The Constituency' 14 | y = 'Total Electors' 15 | 16 | # Scatter Plot with Regression Line 17 | plt.figure(figsize=(8, 5)) 18 | sns.regplot(x=df[x], y=df[y], scatter_kws={'color':'blue'}, line_kws={"color":"red"}) 19 | plt.title(f"Scatter Plot with Linear Regression ({x} vs {y})") 20 | plt.xlabel(x) 21 | plt.ylabel(y) 22 | plt.grid(True) 23 | plt.show() 24 | 25 | # Pearson Correlation Coefficient 26 | correlation, p_value = pearsonr(df[x].dropna(), df[y].dropna()) 27 | print(f"Pearson Correlation Coefficient between {x} and {y}: {correlation:.3f}") 28 | print(f"P-value: {p_value:.5f}") 29 | -------------------------------------------------------------------------------- /pythonproject.py.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | df = pd.read_csv("33_Constituency_Wise_Detailed_Result.csv") 7 | print("First 5 Rows:") 8 | print(df.head()) 9 | print("\nLast 5 Rows:") 10 | print(df.tail()) 11 | print("\nShape of Dataset:", df.shape) 12 | print("\nData Types and Non-Null Counts:") 13 | print(df.info()) 14 | print("\nSummary Statistics:") 15 | print(df.describe(include='all')) 16 | 17 | print("\nColumn Names:") 18 | print(df.columns.tolist()) 19 | print("\nMissing Values:") 20 | print(df.isnull().sum()) 21 | 22 | # Visualizing missing value plt.figure(figsize=(12, 6)) 23 | sns.heatmap(df.isnull(), cmap="YlGnBu", cbar=False) 24 | plt.title("Missing Values Heatmap") 25 | plt.show() 26 | # Fill 'Gender' and 'Category' with mode 27 | df['Gender'].fillna(df['Gender'].mode()[0], inplace=True) 28 | df['Category'].fillna(df['Category'].mode()[0], inplace=True) 29 | 30 | # Fill 'Age' with median 31 | df['Age'].fillna(df['Age'].median(), inplace=True) 32 | 33 | # Confirm no missing values remain 34 | print(df.isnull().sum()) 35 | 36 | duplicate_count = df.duplicated().sum() 37 | print(f"Number of duplicate rows: {duplicate_count}") 38 | 39 | gender_counts = df['Gender'].value_counts() 40 | plt.figure(figsize=(6, 6)) 41 | plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=140) 42 | plt.title("Gender Distribution") 43 | plt.axis('equal') # Equal aspect ratio ensures the pie is drawn as a circle. 44 | plt.show() 45 | 46 | plt.figure(figsize=(8, 4)) 47 | sns.histplot(df['Age'], kde=True, bins=20) 48 | plt.title("Distribution of Age") 49 | plt.xlabel("Age") 50 | plt.ylabel("Count") 51 | plt.show() 52 | -------------------------------------------------------------------------------- /pythonproject1.py.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | 7 | df = pd.read_csv("33_Constituency_Wise_Detailed_Result.csv") 8 | numerical_cols = df.select_dtypes(include=np.number).columns.tolist() 9 | 10 | # Histograms 11 | df[numerical_cols].hist(figsize=(15, 10), bins=20) 12 | plt.suptitle("Histograms of Numerical Features") 13 | plt.show() 14 | 15 | # Boxplots 16 | for col in numerical_cols: 17 | plt.figure(figsize=(6, 1.5)) 18 | sns.boxplot(x=df[col]) 19 | plt.title(f"Boxplot of {col}") 20 | plt.show() 21 | categorical_cols = df.select_dtypes(include='object').columns.tolist() 22 | 23 | for col in categorical_cols: 24 | plt.figure(figsize=(8, 4)) 25 | sns.countplot(y=df[col], order=df[col].value_counts().index) 26 | plt.title(f"Count Plot of {col}") 27 | plt.tight_layout() 28 | plt.show() 29 | 30 | 31 | plt.figure(figsize=(10, 6)) 32 | sns.heatmap(df[numerical_cols].corr(), annot=True, cmap='coolwarm', fmt=".2f") 33 | plt.title("Correlation Heatmap") 34 | plt.show() 35 | 36 | for cat in categorical_cols: 37 | for num in numerical_cols: 38 | plt.figure(figsize=(8, 4)) 39 | sns.boxplot(x=df[cat], y=df[num]) 40 | plt.title(f"{num} by {cat}") 41 | plt.xticks(rotation=45) 42 | plt.tight_layout() 43 | plt.show() 44 | -------------------------------------------------------------------------------- /pythonproject2.py.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import seaborn as sns 5 | 6 | df = pd.read_csv("33_Constituency_Wise_Detailed_Result.csv") 7 | 8 | print("Shape of dataset:", df.shape) 9 | print("Data types:\n", df.dtypes) 10 | print("Numerical columns:\n", df.select_dtypes(include=np.number).columns.tolist()) 11 | 12 | Q1 = df['Age'].quantile(0.25) 13 | Q3 = df['Age'].quantile(0.75) 14 | IQR = Q3 - Q1 15 | lower = Q1 - 1.5 * IQR 16 | upper = Q3 + 1.5 * IQR 17 | 18 | df_filtered = df[(df['Age'] >= lower) & (df['Age'] <= upper)] 19 | 20 | df['Age'] = np.where(df['Age'] < lower, lower, 21 | np.where(df['Age'] > upper, upper, df['Age'])) 22 | 23 | df['Log_Age'] = np.log1p(df['Age']) 24 | 25 | plt.figure(figsize=(6, 1.5)) 26 | sns.boxplot(x=df_filtered['Age']) 27 | plt.title("Boxplot Before Handling Outliers in Age") 28 | plt.show() 29 | 30 | plt.figure(figsize=(6, 1.5)) 31 | sns.boxplot(x=df['Age']) 32 | plt.title("Boxplot After Handling Outliers in Age") 33 | plt.show() 34 | 35 | plt.figure(figsize=(6, 1.5)) 36 | sns.boxplot(x=df['Log_Age']) 37 | plt.title("Boxplot of Log-Transformed Age") 38 | plt.show() 39 | 40 | print("Skewness Before (filtered):", df_filtered['Age'].skew()) 41 | print("Skewness After (capped):", df['Age'].skew()) 42 | print("Skewness After Log Transform:", df['Log_Age'].skew()) 43 | --------------------------------------------------------------------------------