├── Figure_1.png
├── Figure_2.png
├── Figure_3.png
├── T-ouput2.png
├── T-output.png
└── DSM5.py

/Figure_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rahat-karim/PRODIGY_TrackCode_DS_Task5/HEAD/Figure_1.png
--------------------------------------------------------------------------------

/Figure_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rahat-karim/PRODIGY_TrackCode_DS_Task5/HEAD/Figure_2.png
--------------------------------------------------------------------------------

/Figure_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rahat-karim/PRODIGY_TrackCode_DS_Task5/HEAD/Figure_3.png
--------------------------------------------------------------------------------

/T-ouput2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rahat-karim/PRODIGY_TrackCode_DS_Task5/HEAD/T-ouput2.png
--------------------------------------------------------------------------------

/T-output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Rahat-karim/PRODIGY_TrackCode_DS_Task5/HEAD/T-output.png
--------------------------------------------------------------------------------

/DSM5.py:
--------------------------------------------------------------------------------
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Path to the CSV file
file_path = 'US_Accidents_March23.csv'

# Load the dataset
try:
    df = pd.read_csv(file_path, on_bad_lines='skip')
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")
    raise SystemExit(1)
except FileNotFoundError as e:
    print(f"FileNotFoundError: {e}")
    raise SystemExit(1)

# Report missing values per column
print(df.isnull().sum())

# Drop rows missing the columns this analysis depends on
df.dropna(subset=['Start_Time', 'End_Time', 'Weather_Condition'], inplace=True)

# Convert date columns to datetime; unparseable values become NaT
df['Start_Time'] = pd.to_datetime(df['Start_Time'], errors='coerce')
df['End_Time'] = pd.to_datetime(df['End_Time'], errors='coerce')

# Drop rows where datetime conversion failed
df.dropna(subset=['Start_Time', 'End_Time'], inplace=True)

# Distribution of accidents over time
df['Hour'] = df['Start_Time'].dt.hour
sns.histplot(df['Hour'], bins=24, kde=True)
plt.title('Distribution of Accidents by Hour')
plt.xlabel('Hour of the Day')
plt.ylabel('Number of Accidents')
plt.show()

# Distribution by weather condition, restricted to the 20 most frequent
# conditions (the full set has too many categories to label readably)
top_weather = df['Weather_Condition'].value_counts().head(20).index
sns.countplot(x='Weather_Condition', data=df, order=top_weather)
plt.title('Distribution of Accidents by Weather Condition')
plt.xticks(rotation=90)
plt.xlabel('Weather Condition')
plt.ylabel('Number of Accidents')
plt.tight_layout()
plt.show()

# Visualizing accident hotspots
# Work on a 10,000-row sample to keep the HTML map responsive
df_sample = df.sample(n=10000, random_state=1)

# Create a base map centred on the sample's mean coordinates
base_map = folium.Map(
    location=[df_sample['Start_Lat'].mean(), df_sample['Start_Lng'].mean()],
    zoom_start=5,
)

# Add a heatmap layer of accident start coordinates
heat_data = df_sample[['Start_Lat', 'Start_Lng']].dropna().values.tolist()
HeatMap(heat_data).add_to(base_map)

# Save map
base_map.save("accident_hotspots.html")

# Correlation analysis: Hour and Severity are already numeric, so only
# Weather_Condition needs factorizing into integer codes (factorizing the
# numeric columns too would scramble their ordering)
corr_df = df[['Hour', 'Severity']].copy()
corr_df['Weather_Condition'] = pd.factorize(df['Weather_Condition'])[0]
corr = corr_df.corr()
sns.heatmap(corr, annot=True, cmap='coolwarm')
plt.title('Correlation between Factors')
plt.show()

# Further analysis and machine learning (optional)

# Feature engineering
df['Day_of_Week'] = df['Start_Time'].dt.dayofweek

# Select features and target variable: keep the numeric columns as-is and
# encode only the categorical weather column
features = df[['Hour', 'Day_of_Week']].copy()
features['Weather_Condition'] = pd.factorize(df['Weather_Condition'])[0]
target = df['Severity']

# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Train model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict and evaluate
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))
--------------------------------------------------------------------------------
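
A minimal follow-up sketch, not part of DSM5.py: once clf is fitted, scikit-learn's impurity-based feature_importances_ attribute shows how much each of the three encoded features contributes to the forest's splits. The snippet assumes the clf and features objects from the script above.

# Hypothetical follow-up (assumes clf and features from DSM5.py):
# rank the model's inputs by impurity-based importance.
import pandas as pd

importances = pd.Series(clf.feature_importances_, index=features.columns)
print(importances.sort_values(ascending=False))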
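
One design note on the modelling step: US_Accidents_March23.csv runs to several million rows, so fitting 100 trees on a 70% split can be slow on a laptop. A sketch of one workaround, assuming the same features and target frames; the 200,000-row cap is an illustrative choice, not from the original script.

# Sketch only: train on a capped random sample instead of the full split.
sample_idx = features.sample(n=min(len(features), 200_000), random_state=42).index
X_tr, X_te, y_tr, y_te = train_test_split(
    features.loc[sample_idx], target.loc[sample_idx],
    test_size=0.3, random_state=42,
)
clf_small = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
clf_small.fit(X_tr, y_tr)
print(classification_report(y_te, clf_small.predict(X_te)))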