# Repository listing (kept verbatim from the original dump):
#   ├── Complete EDA_PROJECT code.py
#   ├── EDA Report.pdf
#   └── README.md
#
# /Complete EDA_PROJECT code.py:
# --------------------------------------------------------------------------------
"""Exploratory data analysis of a vehicle-crash CSV.

Running the script loads the CSV, prints a textual overview (head, info,
describe, null counts), fills missing labels and then shows a series of
charts: crashes over time, driver fault, light condition, speed limit,
vehicle body type, a correlation heatmap, and age/injury/weather breakdowns.
"""
import numpy as np
import pandas as pd

# CSV location used when no other path is given.
DATA_PATH = 'D:/data.csv'

# Label written into missing non-numeric cells so they show up as their own
# category in the count-based charts.
MISSING_LABEL = '0'


def print_overview(df):
    """Print the head, structure, summary statistics and null counts of *df*."""
    print("Dataset: ")
    print(df.head())
    print()

    print("Basic info: ")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df.info()
    print()

    print("Dataset Description: ")
    print(df.describe())
    print()

    print("Total null values: ")
    print(df.isnull().sum())
    print()


def fill_missing_categories(df, placeholder=MISSING_LABEL):
    """Return a copy of *df* with missing non-numeric values set to *placeholder*.

    Numeric (and datetime) columns are deliberately left alone: writing a
    string into them would turn the whole column into object dtype, which
    removes it from ``corr(numeric_only=True)`` and breaks the numeric plots.
    Their NaN values are skipped by pandas, matplotlib and seaborn instead.

    Args:
        df: Input frame; it is not modified.
        placeholder: Label used for missing non-numeric cells.

    Returns:
        A new DataFrame with the placeholder applied.
    """
    label_columns = df.select_dtypes(exclude=[np.number, 'datetime']).columns
    return df.fillna({column: placeholder for column in label_columns})


def plot_crash_analysis(df):
    """Show every chart of the analysis, one window after another.

    Args:
        df: Cleaned frame. It must contain the columns 'Crash Date' (already
            converted to datetime), 'Driver At Fault', 'Light', 'Speed Limit',
            'Vehicle Body Type', 'Age', 'Gender', 'Injury Severity' and
            'Weather'.
    """
    # Imported here rather than at module level so that the data helpers above
    # stay importable on machines without the plotting libraries.
    import matplotlib.pyplot as plt
    import seaborn as sns

    # Crashes per month ('M' can be changed to 'D' for daily or 'Y' for yearly).
    crash_trend = df['Crash Date'].dt.to_period('M').value_counts().sort_index()
    plt.figure(figsize=(12, 6))
    crash_trend.plot(kind='line', marker='o')
    plt.title('Crash Count Over Time')
    plt.xlabel('Month')
    plt.ylabel('Number of Crashes')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Share of crashes per 'Driver At Fault' value.
    fault_counts = df['Driver At Fault'].value_counts()
    plt.figure(figsize=(8, 8))
    plt.pie(
        fault_counts,
        labels=fault_counts.index,
        autopct='%1.1f%%',
        startangle=90,
        colors=['red', 'green', 'orange'],
    )
    plt.title("Crash Distribution Based on Driver's Fault")
    plt.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle
    plt.show()

    # Crash count per light condition.
    light_counts = df['Light'].value_counts()
    plt.figure(figsize=(10, 6))
    light_counts.plot(kind='bar', color='skyblue', edgecolor='black')
    plt.title('Crash count based on Light Condition')
    plt.xlabel('Light Condition')
    plt.ylabel('Number of Crashes')
    plt.xticks(rotation=45)
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

    # Distribution of the posted speed limit; missing limits are dropped.
    plt.figure(figsize=(10, 6))
    plt.hist(df['Speed Limit'].dropna(), bins=20, color='cornflowerblue', edgecolor='black')
    plt.title('Crash count based on Speed Limit')
    plt.xlabel('Speed Limit')
    plt.ylabel('Number of Crashes')
    plt.grid(axis='y')
    plt.tight_layout()
    plt.show()

    # Crash count per vehicle body type (horizontal bars).
    body_type_counts = df['Vehicle Body Type'].value_counts()
    plt.figure(figsize=(10, 6))
    body_type_counts.plot(kind='barh', color='mediumseagreen', edgecolor='black')
    plt.title('Crash Count by Vehicle Type')
    plt.xlabel('Number of Crashes')
    plt.ylabel('Vehicle Type')
    plt.grid(axis='x')
    plt.tight_layout()
    plt.show()

    # Correlation between the numeric columns only.
    corr_matrix = df.corr(numeric_only=True)
    plt.figure(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Correlation Heatmap of Numerical Features')
    plt.tight_layout()
    plt.show()

    # Driver age against the speed limit at the crash location.
    plt.figure(figsize=(10, 6))
    plt.scatter(df['Age'], df['Speed Limit'], alpha=0.6, color='teal', edgecolor='black')
    plt.title('Scatter Plot: Speed Limit vs Age of Drivers')
    plt.xlabel('Age')
    plt.ylabel('Speed Limit')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Age distribution per gender.
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='Gender', y='Age', data=df, palette='Set2')
    plt.title('Box Plot: Age Distribution by Gender')
    plt.xlabel('Gender')
    plt.ylabel('Age')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Age distribution per injury severity.
    plt.figure(figsize=(12, 6))
    sns.violinplot(x='Injury Severity', y='Age', data=df, palette='Pastel1')
    plt.title('Violin Plot: Age Distribution by Injury Severity')
    plt.xlabel('Injury Severity')
    plt.ylabel('Age')
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Crosstab: rows = Weather, columns = Injury Severity, so each stacked
    # segment (and therefore each legend entry) is an injury severity.
    severity_by_weather = pd.crosstab(df['Weather'], df['Injury Severity'])
    severity_by_weather.plot(
        kind='bar', stacked=True, figsize=(12, 6), colormap='Set3', edgecolor='black'
    )
    plt.title('Stacked Bar Plot: Injury Severity according to Weather')
    plt.xlabel('Weather')
    plt.ylabel('Number of Crashes')
    plt.xticks(rotation=45)
    plt.legend(title='Injury Severity')
    plt.tight_layout()
    plt.show()


def main(csv_path=DATA_PATH):
    """Load the CSV at *csv_path*, print its overview and show all charts."""
    df = pd.read_csv(csv_path)
    print_overview(df)

    df = fill_missing_categories(df)
    # Unparseable dates (including the missing-value label) become NaT and are
    # ignored by the monthly count.
    df['Crash Date'] = pd.to_datetime(df['Crash Date'], errors='coerce')

    plot_crash_analysis(df)


if __name__ == '__main__':
    main()

# --------------------------------------------------------------------------------
# /EDA Report.pdf:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/Deepakjaat23/EDA-PROJECT/3658570b1537b1fd30a3ef1558b5aba9321ebd7e/EDA Report.pdf
# --------------------------------------------------------------------------------
# /README.md:
# --------------------------------------------------------------------------------
# # EDA-PROJECT
# This repository contains a comprehensive Exploratory Data Analysis (EDA) report that provides in-depth insights into a given dataset.
The report includes data preprocessing, statistical summaries, visualizations, and key findings to help understand the data structure, patterns, and relationships. It serves as a foundational step for further data modeling and machine learning tasks. 3 | --------------------------------------------------------------------------------