# Repository layout (from the original dump):
#   ├── Crime_Data.csv
#   ├── README.md
#   └── pyproject.py
#
# /README.md:
#   # Crime-Data-Analysis
#   A data analysis and visualization project that explores crime patterns in
#   US using Python, numpy, pandas, seaborn, and matplotlib.
#
# /pyproject.py:
"""Exploratory analysis and visualisation of the records in ``Crime_Data.csv``.

The script loads the CSV, fills missing values with placeholders, drops
duplicate rows, renders eight figures (one per "objective" section below) and
finally writes the cleaned frame to ``crime_data_cleaned.csv``.
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set Seaborn style for compelling visuals
sns.set_style("whitegrid")
sns.set_palette("husl")
plt.rcParams.update({'font.size': 12, 'axes.titlesize': 16, 'axes.labelsize': 12})

# Load the dataset
df = pd.read_csv("Crime_Data.csv")

# --- Data Cleaning ---
# Placeholder written into each column wherever the value is missing.
fill_values = {
    "Mocodes": "Null",
    "Vict Sex": "Unknown",
    "Vict Descent": "Unknown",
    "Premis Desc": "Unknown",
    "Weapon Desc": "Unknown",
    "Weapon Used Cd": "N/A",
    "Crm Cd 1": "N/A",
    "Crm Cd 2": "N/A",
    "Crm Cd 3": "N/A",
}
# Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
# the in-place form operates on a temporary Series, which raises a
# FutureWarning on pandas 2.x and silently leaves `df` untouched once
# Copy-on-Write is enabled.
for column, placeholder in fill_values.items():
    df[column] = df[column].fillna(placeholder)

# Remove duplicate rows
df = df.drop_duplicates()

# OBJECTIVE 1: Analyze and Visualize Top 10 Crime Categories
crime_counts = df['Crm Cd Desc'].value_counts()
top_crimes = crime_counts.head(10)

short_labels = {
    'VEHICLE - STOLEN': 'Stolen Vehicle',
    'BATTERY - SIMPLE ASSAULT': 'Battery Assault',
    'BURGLARY FROM VEHICLE': 'Burglary (Vehicle)',
    'VANDALISM - FELONY ($400 & OVER, ALL CHURCH VANDALISMS)': 'Felony Vandalism',
    'ASSAULT WITH DEADLY WEAPON, AGGRAVATED ASSAULT': 'Aggr. Assault w/ Weapon',
    'INTIMATE PARTNER - SIMPLE ASSAULT': 'IP Assault',
    'BURGLARY': 'Burglary',
    'THEFT PLAIN - PETTY ($950 & UNDER)': 'Petty Theft',
    'THEFT FROM MOTOR VEHICLE - PETTY ($950 & UNDER)': 'Theft from Vehicle',
    'VANDALISM - MISDEMEANOR ($399 OR UNDER)': 'Misdemeanor Vandalism'
}
# Fall back to the full description when a top-10 category has no short
# label; mapping with the bare dict would turn such a category into NaN and
# it would lose its bar label.
top_crimes.index = top_crimes.index.map(lambda name: short_labels.get(name, name))

# Top 10 Crime Categories Bar Plot (Enhanced)
plt.figure(figsize=(10, 6))
sns.barplot(x=top_crimes.values, y=top_crimes.index, palette="cubehelix", width=0.5)
plt.title("Top 10 Crime Categories", fontsize=20, weight='bold')
plt.xlabel("Number of Crimes", fontsize=16)
plt.ylabel("Crime Type", fontsize=16)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14, fontweight='bold')
plt.tight_layout()
plt.grid(True, axis='x', linestyle='--', alpha=0.7)
plt.show()

# OBJECTIVE 2: Time-Series Analysis of Crime Trends by Month and Year
df['DATE OCC'] = pd.to_datetime(df['DATE OCC'], errors='coerce')
df['Year'] = df['DATE OCC'].dt.year
df['Month'] = df['DATE OCC'].dt.month
df['Day'] = df['DATE OCC'].dt.day

# Keep only rows with a plausible year; rows whose date failed to parse have
# a NaN year and are excluded by the comparisons as well.
df_filtered = df[(df['Year'] >= 2000) & (df['Year'] <= pd.Timestamp.now().year)]
# Group the *filtered* rows (the filter above was previously computed and then
# ignored) and reindex to all twelve months so the fixed tick labels below
# always line up, even when a month has no records.
crime_trends_by_month = (
    df_filtered.groupby(df_filtered['Month'].astype(int))
    .size()
    .reindex(range(1, 13), fill_value=0)
)

# Enhanced Time Series Analysis (Average Crime Trends by Month)
# NOTE: the bars show the total number of records per calendar month summed
# over all years, not a per-year average.
plt.figure(figsize=(10, 6))
sns.barplot(x=crime_trends_by_month.index, y=crime_trends_by_month.values, palette="crest", width=0.7)
plt.title("Average Crime Trends by Month", fontsize=20, weight='bold')
plt.xlabel("Month", fontsize=16)
plt.ylabel("Number of Crimes", fontsize=16)
plt.xticks(ticks=np.arange(12), labels=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                                        'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'], fontsize=14)
plt.yticks(fontsize=14, fontweight='bold')
plt.tight_layout()

# Adding gridlines for a more refined look
plt.grid(True, axis='y', linestyle='--', alpha=0.7)
plt.show()

# OBJECTIVE 3: Crime Hotspot Detection Using Grid-Based Heatmap
# Drop rows without usable coordinates before computing the grid bounds.
# NOTE(review): the column names match the LAPD open-data export, which
# records unknown locations as (0, 0); such rows would stretch the grid to
# the origin and collapse every real record into one cell. Confirm that this
# convention applies to Crime_Data.csv.
located = df[["LAT", "LON"]].dropna()
located = located[~((located["LAT"] == 0) & (located["LON"] == 0))]

lat_min, lat_max = located["LAT"].min(), located["LAT"].max()
lon_min, lon_max = located["LON"].min(), located["LON"].max()

# Four edges per axis -> a 3 x 3 grid of cells.
lat_bins = np.linspace(lat_min, lat_max, 4)
lon_bins = np.linspace(lon_min, lon_max, 4)

# histogram2d puts its first argument (latitude) on the rows of the result.
crime_density, _, _ = np.histogram2d(located["LAT"], located["LON"], bins=[lat_bins, lon_bins])

# Enhanced Heatmap with Light Shades
plt.figure(figsize=(10, 6))
# imshow draws rows along the vertical axis, so the (lat, lon) array is passed
# as-is; transposing it would swap the axes relative to `extent` and the
# Longitude/Latitude labels.
plt.imshow(crime_density, cmap='Blues', origin='lower', aspect='auto',
           extent=[lon_min, lon_max, lat_min, lat_max])
plt.colorbar(label='Crime Density')
plt.title("Crime Hotspots (Enhanced Heatmap)", fontsize=20, weight='bold')
plt.xlabel("Longitude", fontsize=16)
plt.ylabel("Latitude", fontsize=16)
plt.tight_layout()

# Adding gridlines and annotations for better clarity
plt.grid(True, linestyle='-', color='black', alpha=0.3)
plt.show()

# OBJECTIVE 4: Victim Age Distribution by Crime Type and Gender
# Ages outside (0, 100] are treated as invalid and excluded.
df_valid = df[(df['Vict Age'] > 0) & (df['Vict Age'] <= 100)]
top5_crimes = df_valid['Crm Cd Desc'].value_counts().head(5).index
df_top5 = df_valid[df_valid['Crm Cd Desc'].isin(top5_crimes)]

# Stripplot for Victim Age Distribution by Crime Type and Gender
plt.figure(figsize=(12, 6))
sns.stripplot(data=df_top5, x='Crm Cd Desc', y='Vict Age', hue='Vict Sex',
              jitter=True, dodge=True, alpha=0.7, palette='coolwarm')
plt.title("Victim Age Distribution by Crime Type and Gender", fontsize=20, weight='bold')
plt.xlabel("Crime Type", fontsize=16)
plt.ylabel("Victim Age", fontsize=16)
plt.xticks(rotation=25, fontsize=14)
plt.legend(title='Victim Sex', fontsize=14)
plt.tight_layout()
plt.show()

# OBJECTIVE 5: Analyze Top 5 Weapons Used in Crimes
weapon_counts = df['Weapon Desc'].value_counts().head(5)

# Enhanced Pie Chart (Exploded View & Color Palette)
plt.figure(figsize=(8, 8))  # Increased size of the pie chart
# One offset per slice; sized from the data because plt.pie raises when
# `explode` and the values differ in length (fewer than five categories).
explode = [0.1] * len(weapon_counts)

# Create the pie chart without labels, displaying only percentages
colors = sns.color_palette("Spectral", n_colors=len(weapon_counts))  # Ensure correct number of colors
plt.pie(weapon_counts, autopct='%1.1f%%', startangle=120, colors=colors,
        textprops={'fontsize': 12, 'fontweight': 'bold'}, explode=explode, labels=None, pctdistance=0.85)

plt.title("Top 5 Weapons Used in Crimes", fontsize=18, weight='bold')
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle

# Create a color-coded legend on the right side
labels = weapon_counts.index.tolist()  # Convert Index to list of labels
plt.legend(handles=[plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=color, markersize=10)
                    for color in colors], labels=labels, title="Weapon Descriptions", loc='center left',
           bbox_to_anchor=(1.05, 0.5), fontsize=12)

plt.tight_layout()
plt.show()

# OBJECTIVE 6: Victim Age Distribution (Histogram)
df_valid = df[(df['Vict Age'] > 0) & (df['Vict Age'] <= 100)]

# Histogram for Victim Age Distribution
plt.figure(figsize=(10, 6))
sns.histplot(df_valid['Vict Age'], bins=20, kde=True, color="green")
plt.title("Victim Age Distribution", fontsize=20, weight='bold')
plt.xlabel("Age", fontsize=16)
plt.ylabel("Frequency", fontsize=16)
plt.tight_layout()
plt.show()

# OBJECTIVE 7: Analyze Crime Distribution by Status Description
status_desc_counts = df['Status Desc'].value_counts()

# Bar Plot for Crime Distribution by Status Description
plt.figure(figsize=(10, 6))
status_desc_counts.plot(kind='bar', stacked=True, color=sns.color_palette("Set2"))
plt.title("Distribution of Crimes by Status Description", fontsize=20, weight='bold')
plt.xlabel("Status", fontsize=16)
plt.ylabel("Number of Crimes", fontsize=16)
plt.xticks(rotation=0, fontsize=14)
plt.tight_layout()
plt.show()

# OBJECTIVE 8: Monthly Crime Trends by Area in 2020
# From here on `Month` holds the abbreviated month name instead of the month
# number; this is also what ends up in the saved CSV.
df['Month'] = df['DATE OCC'].dt.strftime('%b')
# Work on an explicit copy so the categorical conversion below does not write
# into a slice of `df` (SettingWithCopyWarning).
df_2020 = df[df['Year'] == 2020].copy()
month_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
df_2020['Month'] = pd.Categorical(df_2020['Month'], categories=month_order, ordered=True)

# observed=False keeps every calendar month in the result (zero-filled) and
# states the choice explicitly rather than relying on the pandas default.
monthly_crimes = (
    df_2020.groupby(['Month', 'AREA NAME'], observed=False)
    .size()
    .unstack(fill_value=0)
)

# Lineplot for Monthly Crime Trends by Area (2020)
plt.figure(figsize=(10, 8))
month_labels = monthly_crimes.index.astype(str).tolist()
for area in monthly_crimes.columns:
    # Draw the points in calendar order. A seaborn lineplot with the counts on
    # x sorts along x, which would join the months in order of crime count
    # instead of chronologically.
    plt.plot(monthly_crimes[area].to_numpy(), month_labels, marker='o', label=area)

plt.title("Monthly Crime Trends by Area (2020, Horizontal View)", fontsize=20, weight='bold')
plt.xlabel("Number of Crimes", fontsize=16)
plt.ylabel("Month", fontsize=16)
plt.legend(title="Area", bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=14)
plt.tight_layout()
plt.show()

# Save the cleaned data (including the derived Year/Month/Day columns) to CSV
df.to_csv("crime_data_cleaned.csv")
print("-----Analysis Done!-------")