├── README.md
└── predicting_crime.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Predicting-Crime-in-Toronto

Using open data from the Toronto Police Service (http://data.torontopolice.on.ca/pages/open-data), I construct a multi-class classification model using a Random Forest classifier to predict the type of major crime committed based on features such as time of day, neighbourhood, police division, year, and month. The dataset includes every major crime reported in the city of Toronto from 2014 to 2017, with detailed information about the location and time of each offence. Because the data contains only categorical variables, the modeling process tests both numeric encoding and one-hot encoding, with some improvement from the latter approach.

The model performs reasonably well on F1-score (precision and recall) for a five-class classification problem. Although the dataset is somewhat imbalanced towards assault (the highest-volume class), balancing class weights does not materially affect model performance.

--------------------------------------------------------------------------------
/predicting_crime.py:
--------------------------------------------------------------------------------

#--------------------------------------------------#

#1) IMPORT LIBRARIES

#Computation and Structuring:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

#Modeling:

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

#Testing:

from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

#--------------------------------------------------#

#2) DATA IMPORT AND PRE-PROCESSING

#import the full data set
df = pd.read_csv('MCI_2014_to_2017.csv', sep=',')

#list of relevant columns for the model
col_list = ['occurrenceyear', 'occurrencemonth', 'occurrenceday', 'occurrencedayofyear',
            'occurrencedayofweek', 'occurrencehour', 'MCI', 'Division', 'Hood_ID', 'premisetype']

#dataframe created from the list of relevant columns (.copy() avoids
#SettingWithCopyWarning on the column assignments below)

df2 = df[col_list].copy()
df2 = df2[df2['occurrenceyear'] > 2013] #drop "stale" crimes that occurred before 2014; the data set is filtered on reported date, so these are old offences reported late, and we ignore them
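#Optional sanity check (a small sketch): confirm the class balance noted in
#the README before the labels are encoded. Assault should be the largest class.

print(df2['MCI'].value_counts())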
#Factorize the dependent variable column:

crime_var = pd.factorize(df2['MCI']) #encodes the crime labels as int64 codes
df2['MCI'] = crime_var[0]
definition_list_MCI = crime_var[1] #index reference so we know which crimes are coded to which factors

#Factorize the independent variables in one loop, keeping an index reference
#for each column:

feature_cols = ['premisetype', 'occurrenceyear', 'occurrencemonth', 'occurrenceday',
                'occurrencedayofweek', 'Division', 'Hood_ID', 'occurrencehour',
                'occurrencedayofyear']
definition_lists = {}
for col in feature_cols:
    codes, index = pd.factorize(df2[col])
    df2[col] = codes
    definition_lists[col] = index

#set X and y:

X = df2.drop(['MCI'], axis=1).values #sets X and converts to an array
print(X[:5]) #X is a numpy array, so slice instead of .head() to preview rows

y = df2['MCI'].values #sets y and converts to an array

#split the data into train and test sets for the numeric-encoded dataset:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=21)

#need to one-hot encode all the X variables for input into the classification
#model (on scikit-learn >= 1.2, pass sparse_output=False instead of sparse=False):

binary_encoder = OneHotEncoder(sparse=False)
encoded_X = binary_encoder.fit_transform(X)

X_train_OH, X_test_OH, y_train_OH, y_test_OH = train_test_split(encoded_X, y, test_size=0.25, random_state=21)

#--------------------------------------------------#

#3) MODELING AND TESTING:

#Numeric Encoded Model w/ SKLEARN:

classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test) #predicting the test set results

print(accuracy_score(y_test, y_pred)) #accuracy at 0.63
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=definition_list_MCI))

#Theft Over is pulling down the results; pretty good on Assault (largest
#sample size) and Break and Enter.
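#Optional: a minimal sketch of which features drive the numeric-encoded
#forest, using the fitted classifier's feature_importances_ attribute (this
#inspection is an addition, not part of the original pipeline):

feature_names = df2.drop(['MCI'], axis=1).columns
importances = sorted(zip(feature_names, classifier.feature_importances_),
                     key=lambda pair: pair[1], reverse=True)
for name, importance in importances:
    print(name, round(importance, 3))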
#One Hot Encoded Model w/ SKLEARN:

classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
classifier.fit(X_train_OH, y_train_OH)
y_pred_OH = classifier.predict(X_test_OH) #predicting the test set results

print(accuracy_score(y_test_OH, y_pred_OH)) #modest improvement to 0.648
print(confusion_matrix(y_test_OH, y_pred_OH))
print(classification_report(y_test_OH, y_pred_OH, target_names=definition_list_MCI)) #modest improvement

#Balanced class weights don't make a big difference to the results:

classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42, class_weight='balanced')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(accuracy_score(y_test, y_pred)) #accuracy still at 0.63
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=definition_list_MCI))

#--------------------------------------------------#

#GradientBoostingClassifier performs poorly relative to the Random Forest
#(only 10 estimators here, so it is likely undertrained):

grad_class = GradientBoostingClassifier(learning_rate=0.1, n_estimators=10, random_state=42)
grad_class.fit(X_train_OH, y_train_OH)
y_pred_OH = grad_class.predict(X_test_OH) #predicting the test set results

print(accuracy_score(y_test_OH, y_pred_OH)) #accuracy trails both Random Forest models
print(confusion_matrix(y_test_OH, y_pred_OH))
print(classification_report(y_test_OH, y_pred_OH, target_names=definition_list_MCI))
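#A possible follow-up (a sketch only, reusing the arrays defined above): the
#gradient boosting run uses just 10 estimators, so a fairer comparison would
#cross-validate both models with matched estimator counts, e.g.:

from sklearn.model_selection import cross_val_score

rf = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=42)
gb = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100, random_state=42)

for name, model in [('Random Forest', rf), ('Gradient Boosting', gb)]:
    scores = cross_val_score(model, encoded_X, y, cv=3, scoring='f1_weighted')
    print(name, scores.mean())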