├── README.md
└── predicting_crime.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Predicting-Crime-in-Toronto

Using open data from the Toronto Police Service (http://data.torontopolice.on.ca/pages/open-data), I construct a multi-class classification model using a Random Forest classifier to predict the type of major crime committed based on features such as time of day, neighbourhood, police division, year, and month. The dataset includes every major crime reported in the city of Toronto from 2014 to 2017, with detailed information about the location and time of each offence. Because the data contains only categorical variables, the modeling process tests both numeric encoding and one-hot encoding, with some improvement from the latter approach.

The model performs reasonably well on F1-score (precision and recall) for a five-class classification problem. Although the dataset is somewhat imbalanced towards assault (the highest-volume class), balancing class weights does not materially affect model performance.

--------------------------------------------------------------------------------
/predicting_crime.py:
--------------------------------------------------------------------------------

#--------------------------------------------------#

#1) IMPORT LIBRARIES

#Computation and Structuring:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#Modeling:

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#Testing:

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#--------------------------------------------------#

#2) DATA IMPORT AND PRE-PROCESSING

#import the full data set
df = pd.read_csv('MCI_2014_to_2017.csv', sep=',')

#list of relevant columns for the model
col_list = ['occurrenceyear', 'occurrencemonth', 'occurrenceday', 'occurrencedayofyear',
            'occurrencedayofweek', 'occurrencehour', 'MCI', 'Division', 'Hood_ID', 'premisetype']

#dataframe created from the list of relevant columns (.copy() avoids
#SettingWithCopyWarning on the column assignments below)

df2 = df[col_list].copy()
df2 = df2[df2['occurrenceyear'] > 2013] #drop "stale" crimes that occurred before 2014; the data set is filtered on reported date, so these are old offences reported late, and we ignore them
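#Optional sanity check (a small sketch): confirm the class balance noted in
#the README before the labels are encoded. Assault should be the largest class.

print(df2['MCI'].value_counts())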
#Factorize the dependent variable column:

crime_var = pd.factorize(df2['MCI']) #encodes the crime labels as int64 codes
df2['MCI'] = crime_var[0]
definition_list_MCI = crime_var[1] #index reference so we know which crimes are coded to which factors

#Factorize the independent variables in one loop, keeping an index reference
#for each column:

feature_cols = ['premisetype', 'occurrenceyear', 'occurrencemonth', 'occurrenceday',
                'occurrencedayofweek', 'Division', 'Hood_ID', 'occurrencehour',
                'occurrencedayofyear']
definition_lists = {}
for col in feature_cols:
    codes, index = pd.factorize(df2[col])
    df2[col] = codes
    definition_lists[col] = index

#set X and y:

X = df2.drop(['MCI'], axis=1).values #sets X and converts to an array
print(X[:5]) #X is a numpy array, so slice instead of .head() to preview rows

y = df2['MCI'].values #sets y and converts to an array

#split the data into train and test sets for the numeric-encoded dataset:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=21)

#need to one-hot encode all the X variables for input into the classification
#model (on scikit-learn >= 1.2, pass sparse_output=False instead of sparse=False):

binary_encoder = OneHotEncoder(sparse=False)
encoded_X = binary_encoder.fit_transform(X)

X_train_OH, X_test_OH, y_train_OH, y_test_OH = train_test_split(encoded_X, y, test_size=0.25, random_state=21)

#--------------------------------------------------#

#3) MODELING AND TESTING:

#Numeric Encoded Model w/ SKLEARN:

classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test) #predicting the test set results

print(accuracy_score(y_test, y_pred)) #accuracy at 0.63
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=definition_list_MCI))

#Theft Over is pulling down the results; pretty good on Assault (largest
#sample size) and Break and Enter.
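#Optional: a minimal sketch of which features drive the numeric-encoded
#forest, using the fitted classifier's feature_importances_ attribute (this
#inspection is an addition, not part of the original pipeline):

feature_names = df2.drop(['MCI'], axis=1).columns
importances = sorted(zip(feature_names, classifier.feature_importances_),
                     key=lambda pair: pair[1], reverse=True)
for name, importance in importances:
    print(name, round(importance, 3))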
#One Hot Encoded Model w/ SKLEARN:

classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
classifier.fit(X_train_OH, y_train_OH)
y_pred_OH = classifier.predict(X_test_OH) #predicting the test set results

print(accuracy_score(y_test_OH, y_pred_OH)) #modest improvement to 0.648
print(confusion_matrix(y_test_OH, y_pred_OH))
print(classification_report(y_test_OH, y_pred_OH, target_names=definition_list_MCI)) #modest improvement

#Balanced class weights don't make a big difference to the results:

classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42, class_weight='balanced')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred)) #accuracy still at 0.63
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=definition_list_MCI))

#--------------------------------------------------#

#GradientBoostingClassifier performs poorly relative to the Random Forest
#(only 10 estimators here, so it is likely undertrained):

grad_class = GradientBoostingClassifier(learning_rate=0.1, n_estimators=10, random_state=42)
grad_class.fit(X_train_OH, y_train_OH)
y_pred_OH = grad_class.predict(X_test_OH) #predicting the test set results

print(accuracy_score(y_test_OH, y_pred_OH)) #accuracy trails both Random Forest models
print(confusion_matrix(y_test_OH, y_pred_OH))
print(classification_report(y_test_OH, y_pred_OH, target_names=definition_list_MCI))
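#A possible follow-up (a sketch only, reusing the arrays defined above): the
#gradient boosting run uses just 10 estimators, so a fairer comparison would
#cross-validate both models with matched estimator counts, e.g.:

from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

for name, model in [('Random Forest', rf), ('Gradient Boosting', gb)]:
    scores = cross_val_score(model, encoded_X, y, cv=3, scoring='f1_weighted')
    print(name, scores.mean())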