├── Complete EDA_PROJECT code.py
├── EDA Report.pdf
└── README.md


/Complete EDA_PROJECT code.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import matplotlib.pyplot as plt
  4 | import seaborn as sns
  5 | 
  6 | 
  7 | # load the dataset
  8 | df=pd.read_csv('D:/data.csv')
  9 | 
 10 | # view the data
 11 | print("Dataset: ")
 12 | print(df.head())
 13 | print()
 14 | 
 15 | # Basic information
 16 | print("Basic info: ")
 17 | print(df.info())
 18 | print()
 19 | 
 20 | # Describe the data
 21 | print("Dataset Description: ")
 22 | print(df.describe())
 23 | print()
 24 | 
 25 | 
 26 | # find null values
 27 | print("Total null values: ")
 28 | print(df.isnull().sum())
 29 | print()
 30 | # replace null values
 31 | df.replace(np.nan, '0',inplace=True)
 32 | 
 33 | 
 34 | # Convert to datetime format
 35 | df['Crash Date'] = pd.to_datetime(df['Crash Date'], errors='coerce')
 36 | # Group by month (you can change to 'D' for daily or 'Y' for yearly)
 37 | crash_trend = df['Crash Date'].dt.to_period('M').value_counts().sort_index()
 38 | # Plotting
 39 | plt.figure(figsize=(12, 6))
 40 | crash_trend.plot(kind='line', marker='o')
 41 | plt.title('Crash Count Over Time')
 42 | plt.xlabel('Month')
 43 | plt.ylabel('Number of Crashes')
 44 | plt.grid(True)
 45 | plt.tight_layout()
 46 | plt.show()
 47 | 
 48 | 
 49 | # Replace 'Driver At Fault' with the actual column name if it's different
 50 | gender_counts = df['Driver At Fault'].value_counts()
 51 | # Plotting the pie chart
 52 | plt.figure(figsize=(8, 8))
 53 | plt.pie(gender_counts, labels=gender_counts.index, autopct='%1.1f%%', startangle=90, colors=['red', 'green', 'orange'])
 54 | plt.title("Crash Distribution Based on Driver's Fault")
 55 | plt.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle
 56 | plt.show()
 57 | 
 58 | 
 59 | # Replace 'License Type' with the actual column name in your dataset
 60 | license_counts = df['Light'].value_counts()
 61 | # Plotting the bar chart
 62 | plt.figure(figsize=(10, 6))
 63 | license_counts.plot(kind='bar', color='skyblue', edgecolor='black')
 64 | plt.title('Crash count based on Light Condition')
 65 | plt.xlabel('Light Condition')
 66 | plt.ylabel('Number of Crashes')
 67 | plt.xticks(rotation=45)
 68 | plt.grid(axis='y')
 69 | plt.tight_layout()
 70 | plt.show()
 71 | 
 72 | 
 73 | # Replace 'Speed Limit' with the actual column name if different
 74 | plt.figure(figsize=(10, 6))
 75 | plt.hist(df['Speed Limit'].dropna(), bins=20, color='cornflowerblue', edgecolor='black')
 76 | plt.title('Crash count based on Speed Limit')
 77 | plt.xlabel('Speed Limit')
 78 | plt.ylabel('Number of Drivers')
 79 | plt.grid(axis='y')
 80 | plt.tight_layout()
 81 | plt.show()
 82 | 
 83 | 
 84 | # Replace 'Vehicle Type' with your actual column name
 85 | violation_counts = df['Vehicle Body Type'].value_counts()
 86 | # Plotting horizontal bar chart
 87 | plt.figure(figsize=(10, 6))
 88 | violation_counts.plot(kind='barh', color='mediumseagreen', edgecolor='black')
 89 | plt.title('Crash Count by Vehicle Type')
 90 | plt.xlabel('Number of Crashes')
 91 | plt.ylabel('Vehicle Type')
 92 | plt.grid(axis='x')
 93 | plt.tight_layout()
 94 | plt.show()
 95 | 
 96 | 
 97 | # Compute the correlation matrix
 98 | # This will automatically select numerical columns
 99 | corr_matrix = df.corr(numeric_only=True)
100 | # Plotting the heatmap
101 | plt.figure(figsize=(10, 8))
102 | sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
103 | plt.title('Correlation Heatmap of Numerical Features')
104 | plt.tight_layout()
105 | plt.show()
106 | 
107 | 
108 | # Replace 'Speed' and 'Age' with actual column names if different
109 | plt.figure(figsize=(10, 6))
110 | plt.scatter(df['Age'], df['Speed Limit'], alpha=0.6, color='teal', edgecolor='black')
111 | plt.title('Scatter Plot: Speed vs Age of Drivers')
112 | plt.xlabel('Age')
113 | plt.ylabel('Speed at Time of Crash')
114 | plt.grid(True)
115 | plt.tight_layout()
116 | plt.show()
117 | 
118 | 
119 | # Replace 'Age' and 'Gender' with the actual column names in your dataset
120 | plt.figure(figsize=(10, 6))
121 | sns.boxplot(x='Gender', y='Age', data=df, palette='Set2')
122 | plt.title('Box Plot: Age Distribution by Gender')
123 | plt.xlabel('Gender')
124 | plt.ylabel('Age')
125 | plt.grid(True)
126 | plt.tight_layout()
127 | plt.show()
128 | 
129 | 
130 | # Replace 'Age' and 'Injury Severity' with actual column names
131 | plt.figure(figsize=(12, 6))
132 | sns.violinplot(x='Injury Severity', y='Age', data=df, palette='Pastel1')
133 | plt.title('Violin Plot: Age Distribution by Violation Type')
134 | plt.xlabel('Injury Severity')
135 | plt.ylabel('Speed at Time of Crash')
136 | plt.xticks(rotation=45)
137 | plt.grid(True)
138 | plt.tight_layout()
139 | plt.show()
140 | 
141 | 
142 | # Replace with actual column names if different
143 | # Create a crosstab: rows = Weather, columns = Injury Severity
144 | crosstab = pd.crosstab(df['Weather'], df['Injury Severity'])
145 | # Plotting the stacked bar chart
146 | crosstab.plot(kind='bar', stacked=True, figsize=(12, 6), colormap='Set3', edgecolor='black')
147 | plt.title('Stacked Bar Plot: Injury Severity according to Weather')
148 | plt.xlabel('Weather')
149 | plt.ylabel('Number of Crashes')
150 | plt.xticks(rotation=45)
151 | plt.legend(title='Weather')
152 | plt.tight_layout()
153 | plt.show()
154 | 


--------------------------------------------------------------------------------
/EDA Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Deepakjaat23/EDA-PROJECT/3658570b1537b1fd30a3ef1558b5aba9321ebd7e/EDA Report.pdf


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # EDA-PROJECT
2 | This repository contains a comprehensive Exploratory Data Analysis (EDA) report that provides in-depth insights into a given dataset. The report includes data preprocessing, statistical summaries, visualizations, and key findings to help understand the data structure, patterns, and relationships. It serves as a foundational step for further data modeling and machine learning tasks.
3 | 


--------------------------------------------------------------------------------