├── .gitignore
├── README.md
├── allstate-code.R
├── allstate-paper.md
├── allstate-presentation.pdf
├── allstate-presentation.pptx
├── notes
│   ├── allstate-ideas.md
│   └── allstate-option-dependencies.md
└── viz
    ├── allstate-viz-1.png
    ├── allstate-viz-2.png
    ├── allstate-viz-3.png
    ├── allstate-viz-4.png
    └── allstate-viz-5.png
/.gitignore:
--------------------------------------------------------------------------------
1 | *.csv
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Allstate Purchase Prediction Challenge
2 |
3 | For my [General Assembly Data Science](https://generalassemb.ly/education/data-science) class project, I competed in [Kaggle's Allstate Purchase Prediction Challenge](http://www.kaggle.com/c/allstate-purchase-prediction-challenge).
4 |
5 | * [Project paper](allstate-paper.md) (Markdown document)
6 | * [R code](allstate-code.R)
7 | * [Blog post](http://www.dataschool.io/kaggle-allstate-purchase-prediction-challenge/)
8 | * Presentation
9 | * [Recording on YouTube](https://www.youtube.com/watch?v=HGr1yQV3Um0)
10 | * [Slides on Speaker Deck](https://speakerdeck.com/justmarkham/allstate-purchase-prediction-challenge-on-kaggle)
11 | * [PDF](allstate-presentation.pdf) or [PowerPoint](allstate-presentation.pptx)
12 |
13 | There is a thread on the Kaggle forums in which competitors are [sharing their solutions](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/forums/t/8218/solution-sharing), and another thread in which you can [provide feedback on my paper](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/forums/t/8222/i-wrote-a-paper-on-the-allstate-competition-feedback-is-welcome/)!
14 |
15 | Per the competition rules surrounding the sharing of code, I created this as a private repository and did not make it public until after the competition ended on May 19.
16 |
--------------------------------------------------------------------------------
/allstate-code.R:
--------------------------------------------------------------------------------
1 | ### Kaggle Competition: Allstate Purchase Prediction Challenge
2 | ### Code by Kevin Markham
3 | ### https://github.com/justmarkham/kaggle-allstate
4 |
5 | # Notes about this file:
6 | # 1. Most of the code is explained in my paper:
7 | # https://github.com/justmarkham/kaggle-allstate/blob/master/allstate-paper.md
8 | # 2. For the sake of brevity, a lot of my exploratory code is not included.
9 | # 3. Many of the variables are poorly named; my apologies.
10 | # 4. The code assumes that "train.csv" and "test_v2.csv" are in your working
11 | # directory.
12 | # 5. To generate my best set of predictions, simply load the required libraries
13 | # and then run the "Loading Data" section and "Model Building Part 10". All
14 | # other code is irrelevant.
15 |
16 |
17 | # Load required libraries
18 | library(dplyr)
19 | library(caret)
20 | library(ROCR)
21 | library(randomForest)
22 | library(nnet)
23 | library(ggplot2)   # note: map_data() in the "Viz 5" section below also requires the maps package
24 |
25 |
26 | ## LOADING DATA AND ADDING FEATURES ##
27 |
28 | # Define column classes for reading in the data sets
29 | colClasses <- c(rep("integer", 4), "character", rep("factor", 2),
30 | "integer", "factor", "integer", "factor", rep("integer", 3),
31 | rep("factor", 2), "integer", rep("factor", 7), "integer")
32 |
33 | # Function for pre-processing and feature engineering
34 | preprocess <- function(data) {
35 |
36 | # add features
37 | data$plan <- paste0(data$A, data$B, data$C, data$D, data$E, data$F, data$G)
38 | data$hour <- as.integer(substr(data$time, 1, 2))
39 | data$timeofday <- as.factor(ifelse(data$hour %in% 6:15, "day",
40 | ifelse(data$hour %in% 16:18, "evening",
41 | "night")))
42 | data$weekend <- as.factor(ifelse(data$day %in% 0:4, "No", "Yes"))
43 | data$family <- as.factor(ifelse(data$group_size > 2 & data$age_youngest <
44 | 25 & data$married_couple==1, "Yes", "No"))
45 | data$agediff <- data$age_oldest-data$age_youngest
46 | data$individual <- as.factor(ifelse(data$agediff==0 & data$group_size==1,
47 | "Yes", "No"))
48 | data$stategroup <- as.factor(ifelse(data$state %in% c("SD","ND"), "g1",
49 | ifelse(data$state %in% c("AL","WY"), "g2",
50 | ifelse(data$state %in% c("OK","ME","AR",
51 | "WV"), "g3",
52 | ifelse(data$state %in% c("DC","NE","GA"),
53 | "g5", "g4")))))
54 |
55 | # fix NA's for duration_previous and C_previous
56 | data$duration_previous[is.na(data$duration_previous)] <- 0
57 | levels(data$C_previous) <- c("1", "2", "3", "4", "none")
58 | data$C_previous[is.na(data$C_previous)] <- "none"
59 |
60 | # replace risk_factor NA's by predicting a value
61 | datanorisk <- data[is.na(data$risk_factor), ]
62 | datarisk <- data[!is.na(data$risk_factor), ]
63 | lm.fit <- lm(risk_factor ~ age_youngest*group_size+married_couple+
64 | homeowner, data=datarisk)
65 | lm.pred <- predict(lm.fit, newdata=datanorisk)
66 | data$risk_factor[is.na(data$risk_factor)] <- round(lm.pred, 0)
67 |
68 | # for car_age greater than 30, "round down" to 30
69 | data$car_age[data$car_age > 30] <- 30
70 |
71 | return(data)
72 | }
73 |
74 | # read in training set
75 | train <- read.csv("train.csv", colClasses=colClasses)
76 | train <- preprocess(train)
77 |
78 | # trainsub is subset of train that only includes purchases
79 | trainsub <- train[!duplicated(train$customer_ID, fromLast=TRUE), ]
80 |
81 | # trainex is subset of train that excludes purchases
82 | trainex <- train[duplicated(train$customer_ID, fromLast=TRUE), ]
83 |
84 | # trainex2 only includes last quote before purchase
85 | trainex2 <- trainex[!duplicated(trainex$customer_ID, fromLast=TRUE), ]
86 |
87 | # changed is anyone who changed from their last quote
88 | changed <- ifelse(trainsub$plan == trainex2$plan, "No", "Yes")
89 | changelog <- trainsub$plan != trainex2$plan
90 | trainsub$changed <- as.factor(changed)
91 | trainex2$changed <- as.factor(changed)
92 | trainsub$changelog <- changelog
93 | trainex2$changelog <- changelog
94 |
95 | # compute "stability" feature from trainex and add to trainex2
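# (note: "%.%" was the chaining operator in early dplyr; versions 0.2+ use "%>%")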
96 | customerstability <- trainex %.% group_by(customer_ID) %.%
97 | summarise(quotes=n(), uniqueplans=n_distinct(plan),
98 | stability=(quotes-uniqueplans+1)/quotes)
99 | trainex2$stability <- customerstability$stability
100 |
101 | # compute "planfreq" feature on trainex2
102 | nrowtrainex2 <- nrow(trainex2)
103 | planfreqs <- trainex2 %.% group_by(plan) %.%
104 | summarise(planfreq=n()/nrowtrainex2)
105 | trainex2 <- left_join(trainex2, planfreqs)
106 |
107 | # trainex3 is identical to trainex2 but also includes purchases
108 | trainex3 <- cbind(trainex2, Apurch=trainsub$A, Bpurch=trainsub$B,
109 | Cpurch=trainsub$C, Dpurch=trainsub$D, Epurch=trainsub$E, Fpurch=trainsub$F,
110 | Gpurch=trainsub$G, planpurch=trainsub$plan, stringsAsFactors=FALSE)
111 |
112 | # read in test set
113 | test <- read.csv("test_v2.csv", colClasses=colClasses)
114 | test <- preprocess(test)
115 |
116 | # fix locations that are NA
117 | s <- split(test$location, test$state)
118 | s2 <- sapply(s, function(x) x[1])
119 | NAstates <- test[is.na(test$location), "state"]
120 | NAlocations <- s2[NAstates]
121 | test$location[is.na(test$location)] <- NAlocations
122 |
123 | # add "changed" variable and default to No
124 | test$changed <- factor(rep("No", nrow(test)), levels=c("No", "Yes"))
125 |
126 | # testsub only shows last (known) quote before purchase
127 | testsub <- test[!duplicated(test$customer_ID, fromLast=TRUE), ]
128 |
129 | # compute "stability" feature from test and add to testsub
130 | customerstability <- test %.% group_by(customer_ID) %.% summarise(quotes=n(),
131 | uniqueplans=n_distinct(plan), stability=(quotes-uniqueplans+1)/quotes)
132 | testsub$stability <- customerstability$stability
133 |
134 | # compute "planfreq" feature on testsub
135 | nrowtestsub <- nrow(testsub)
136 | planfreqs <- testsub %.% group_by(plan) %.% summarise(planfreq=n()/nrowtestsub)
137 | testsub <- left_join(testsub, planfreqs)
138 |
139 |
140 | ## DATA EXPLORATION ##
141 |
142 | # check for NA values
143 | sapply(train, function(x) mean(is.na(x)))
144 | # risk_factor, C_previous, duration_previous
145 | sapply(test, function(x) mean(is.na(x)))
146 | # risk_factor, C_previous, duration_previous, location
147 |
148 | uniquetrainplan <- unique(train$plan)
149 | uniquetestplan <- unique(test$plan)
150 | # plans in train: 1809
151 | # plans in test: 1596
152 | # union: 1878 (69 plans in test that are not in train)
153 | # intersection: 1527
154 |
155 |
156 | ## VISUALIZATIONS ##
157 |
158 | # Viz 1: Number of shopping points
159 | shoptrain <- data.frame(maxpoint=trainex2$shopping_pt, dataset=rep("train",
160 | nrow(trainex2)))
161 | shoptest <- data.frame(maxpoint=testsub$shopping_pt, dataset=rep("test",
162 | nrow(testsub)))
163 | shoppingpoints <- rbind(shoptrain, shoptest)
164 | shoppingpoints$dataset <- as.factor(shoppingpoints$dataset)
165 | ggplot(shoppingpoints) + aes(factor(maxpoint)) + geom_histogram() +
166 | facet_grid(dataset ~ .) + labs(x="Number of Shopping Points",
167 | y="Frequency", title="Comparing Number of Shopping Points in
168 | Training vs Test Sets")
169 | ggsave("allstate-viz-1.png")
170 |
171 | # Viz 2: Predictive power of final quote before purchase
172 | s <- split(trainex2, trainex2$shopping_pt)
173 | s2 <- sapply(s, function(x) sum(x$changed=="No")/nrow(x))
174 | s2b <- sapply(s, nrow)
175 | acclastentry <- data.frame(numshoppoints=as.integer(names(s2)), accuracy=s2,
176 | Observations=s2b)
177 | ggplot(acclastentry) + aes(numshoppoints, accuracy, size=Observations) +
178 | geom_point() + geom_line(size=0.5) + scale_x_continuous(breaks=1:12) +
179 | theme(panel.grid.minor=element_blank()) + labs(x="Number of Shopping
180 | Points", y="Prediction Accuracy", title="Effect of Number of Shopping
181 | Points on Predictive Power of Last Quote")
182 | ggsave("allstate-viz-2.png")
183 |
184 | # Viz 3: Effect of purchase hour on the predictive power of the final quote
185 | s3 <- split(trainsub, trainsub$hour)
186 | s4 <- sapply(s3, function(x) sum(x$changed=="Yes")/nrow(x))
187 | s5 <- as.data.frame(table(trainsub$hour))$Freq
188 | changebyhour <- data.frame(hour=as.integer(names(s4)),
189 | percentchanged=s4, count=s5)
190 | ggplot(changebyhour) + aes(hour, percentchanged, color=count) +
191 | geom_point(size=4) + labs(x="Hour of Purchase", y="Percent Changed",
192 | title="Effect of Purchase Hour on Likelihood of Changing from Last Quote")
193 | ggsave("allstate-viz-3.png")
194 |
195 | # Viz 4: Dependencies between options
196 | C_names <- list("1"="C=1", "2"="C=2", "3"="C=3", "4"="C=4")
197 | C_labeller <- function(variable, value){ return(C_names[value]) }
198 | ggplot(trainsub, aes(D)) + geom_bar() + facet_grid(. ~ C,
199 | labeller=C_labeller) + labs(x="Customer Selection of Option D (1, 2, 3)",
200 | y="Frequency", title="Customer selection of Option D based on their
201 | selection for Option C")
202 | ggsave("allstate-viz-4.png")
203 |
204 | # Viz 5: Clustering of states
205 | # based on: http://is-r.tumblr.com/post/37708137014/us-state-maps-using-map-data
206 | states <- map_data("state")
207 | states$grp <- as.factor(ifelse(states$region %in% c("south dakota",
208 | "north dakota"), "1 (least likely)",
209 | ifelse(states$region %in% c("alabama","wyoming"), "2",
210 | ifelse(states$region %in% c("oklahoma","maine","arkansas","west virginia"),
211 | "3",
212 | ifelse(states$region %in% c("colorado","connecticut","delaware","florida",
213 | "iowa","idaho","indiana","kansas","kentucky","maryland","missouri",
214 | "mississippi","montana","new hampshire","new mexico","nevada",
215 | "new york","ohio","oregon","pennsylvania","rhode island","tennessee",
216 | "utah","washington","wisconsin"), "4",
217 | ifelse(states$region %in% c("district of columbia","nebraska","georgia"),
218 | "5 (most likely)", "unassigned"))))))
219 | ggplot(states) + aes(x=long, y=lat, group=group, fill=grp) +
220 | geom_polygon(color="black") + theme_bw() +
221 | theme(panel.border=element_blank()) + scale_y_continuous(breaks=c()) +
222 | scale_x_continuous(breaks=c()) + labs(title="Clustering of States
223 | Based on Customer Likelihood of Changing from Last Quote", fill="Cluster",
224 | x="", y="") + scale_fill_brewer(palette="Pastel1")
225 | ggsave("allstate-viz-5.png")
226 |
227 |
228 | ## MODEL BUILDING ##
229 | ## Note: Each "PART" is explained in the paper
230 |
231 | # PART 1:
232 |
233 | # Submit the baseline
234 | pred <- data.frame(customer_ID = testsub$customer_ID, plan = testsub$plan)
235 | write.csv(pred, file="submit1.csv", row.names=FALSE, quote=FALSE) # 0.53793
236 |
237 | # PART 2:
238 |
239 | # Rule-based predictions
240 | # Example: if C=4, then change D to 3
241 | testsub$D[testsub$C==4] <- 3
242 | filter(testsub, C==4, D!=3)
243 | pred <- data.frame(customer_ID = testsub$customer_ID,
244 | plan = paste0(testsub$A, testsub$B, testsub$C, testsub$D,
245 | testsub$E, testsub$F, testsub$G))
246 | write.csv(pred, file="submit2.csv", row.names=FALSE, quote=FALSE) # 0.53769
247 |
248 | # PART 4:
249 |
250 | # logistic regression for predicting "changed"
251 | glm.fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
252 | car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
253 | duration_previous+stability+planfreq, data=trainex2, family=binomial)
254 | summary(glm.fit)
255 | glm.probs <- predict(glm.fit, type="response")
256 | glm.pred <- ifelse(glm.probs>0.5, "Yes", "No")
257 | confusionMatrix(glm.pred, trainex2$changed, "Yes")
258 | predob <- prediction(glm.probs, trainex2$changed)
259 | acc <- performance(predob, "acc"); plot(acc)
260 | prec <- performance(predob, "prec"); plot(prec)
261 | roc <- performance(predob, "tpr", "fpr"); plot(roc)
262 |
263 | # 5-fold CV for logistic regression
264 | set.seed(5)
265 | folds <- sample(rep(1:5, length = nrow(trainex2)))
266 | for(k in 1:5) {
267 | fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
268 | car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
269 | duration_previous+stability, data=trainex2[folds!=k, ],
270 | family=binomial)
271 | probs <- predict(fit, newdata=trainex2[folds==k, ], type="response")
272 | pred <- ifelse(probs>0.5, "Yes", "No")
273 | print(mean(pred==trainex2$changed[folds==k]))
274 | }
275 |
276 | # random forests for predicting "changed"
277 | rf.fit <- randomForest(changed ~ stategroup+cost+A+C+D+E+F+G+age_oldest+
278 | age_youngest+car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+
279 | C_previous+duration_previous+stability+planfreq, data=trainex2, mtry=5)
280 | rf.pred <- ifelse(rf.fit$votes[, 2]>0.5, "Yes", "No")
281 | confusionMatrix(rf.pred, trainex2$changed, "Yes")
282 |
283 | # PART 5:
284 |
285 | # 5-fold CV for logistic regression - for precision at 0.85 cutoff
286 | set.seed(5)
287 | folds <- sample(rep(1:5, length = nrow(trainex2)))
288 | predyestotal <- 0
289 | tptotal <- 0
290 | for(k in 1:5) {
291 | fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
292 | car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
293 | duration_previous+stability+planfreq, data=trainex2[folds!=k, ],
294 | family=binomial)
295 | probs <- predict(fit, newdata=trainex2[folds==k, ], type="response")
296 | pred <- as.factor(ifelse(probs>0.85, "Yes", "No"))
297 | predyes <- sum(pred=="Yes")
298 | predyestotal <- predyestotal + predyes
299 | actual <- trainex2$changed[folds==k]
300 | actualwhenpredyes <- actual[pred=="Yes"]
301 | tp <- sum(actualwhenpredyes=="Yes")
302 | tptotal <- tptotal + tp
303 | }
304 | print(tptotal)
305 | print(predyestotal)
306 | print(tptotal/predyestotal)
307 |
308 | # train model on trainex2 and predict changed on testsub
309 | glm.fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
310 | car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
311 | duration_previous+stability, data=trainex2, family=binomial)
312 | glm.probs <- predict(glm.fit, newdata=testsub, type="response")
313 | glm.pred <- ifelse(glm.probs>0.85, "Yes", "No")
314 |
315 | # update "changed" variable on testsub to reflect prediction
316 | testsub$changed <- as.factor(glm.pred)
317 |
318 | # for records predicting changed, planpred = 9999999
319 | testsub$planpred <- ifelse(testsub$changed=="Yes", "9999999", testsub$plan)
320 | pred <- data.frame(customer_ID = testsub$customer_ID, plan = testsub$planpred)
321 | write.csv(pred, file="submit6.csv", row.names=FALSE, quote=FALSE)
322 | # 52 changes, 16 affect leaderboard
323 | # 0.53769 means 4 below baseline
324 |
325 | # PART 6:
326 |
327 | # 5-fold CV for random forests - for precision at 0.85 cutoff
328 | set.seed(5)
329 | folds <- sample(rep(1:5, length = nrow(trainex2)))
330 | predyestotal <- 0
331 | tptotal <- 0
332 | for(k in 1:5) {
333 | fit <- randomForest(changed ~ stategroup+cost+A+C+D+E+F+G+age_oldest+
334 | age_youngest+car_value+car_age+shopping_pt+timeofday+weekend+
335 | risk_factor+C_previous+duration_previous+stability+planfreq,
336 | data=trainex2[folds!=k, ], mtry=5)
337 | votes <- predict(fit, newdata=trainex2[folds==k, ], type="vote")
338 | pred <- as.factor(ifelse(votes[, 2]>0.85, "Yes", "No"))
339 | predyes <- sum(pred=="Yes")
340 | predyestotal <- predyestotal + predyes
341 | actual <- trainex2$changed[folds==k]
342 | actualwhenpredyes <- actual[pred=="Yes"]
343 | tp <- sum(actualwhenpredyes=="Yes")
344 | tptotal <- tptotal + tp
345 | }
346 | print(tptotal)
347 | print(predyestotal)
348 | print(tptotal/predyestotal)
349 |
350 | # PART 7 and PART 8:
351 |
352 | # multinom to predict A (repeat for each option)
353 | a.mn.fit <- multinom(A ~ .-customer_ID-record_type-day-time-location-plan-
354 | hour-agediff-stategroup, data=trainex2)
355 | a.mn.pred <- predict(a.mn.fit, newdata=trainex2, type="class")
356 | confusionMatrix(a.mn.pred, trainsub$A)
357 |
358 | # multinom to predict Apurch (repeat for each option)
359 | a.mn.fit <- multinom(Apurch ~ .-customer_ID-record_type-day-time-location-
360 | plan-hour-agediff-stategroup-Bpurch-Cpurch-Dpurch-Epurch-Fpurch-Gpurch-
361 | planpurch, data=trainex3)
362 | a.mn.pred <- predict(a.mn.fit, newdata=trainex3, type="class")
363 | mean(a.mn.pred==trainex3$Apurch)
364 | mean(a.mn.pred==trainex3$A)
365 |
366 | # random forests to predict A (repeat for each option)
367 | a.rf.fit <- randomForest(A ~ .-customer_ID-record_type-day-time-location-
368 | plan-hour-agediff-state, data=trainex2, importance=FALSE)
369 | mean(a.rf.fit$predicted==trainex2$A)
370 | mean(a.rf.fit$predicted==trainsub$A)
371 |
372 | # random forests to predict Apurch (repeat for each option)
373 | a.rf.fit <- randomForest(Apurch ~ .-customer_ID-record_type-day-time-location-
374 | plan-hour-agediff-state-Bpurch-Cpurch-Dpurch-Epurch-Fpurch-Gpurch-
375 | planpurch, data=trainex3, importance=FALSE)
376 | mean(a.rf.fit$predicted==trainex3$Apurch)
377 | mean(a.rf.fit$predicted==trainex3$A)
378 |
379 | # random forests to predict Apurch, trained only on subset for which change is
380 | # predicted (repeat for each option)
381 | trainex4 <- trainex3[trainex3$changed=="Yes", ]
382 | a.rf.fit <- randomForest(Apurch ~ .-customer_ID-record_type-day-time-location-
383 | plan-hour-agediff-state-Bpurch-Cpurch-Dpurch-Epurch-Fpurch-Gpurch-
384 | planpurch, data=trainex4, importance=FALSE)
385 | mean(a.rf.fit$predicted==trainex4$Apurch)
386 | mean(a.rf.fit$predicted==trainex4$A)
387 |
388 | # combine option predictions for A through G
389 | pred.plan <- paste0(a.rf.fit$predicted, b.rf.fit$predicted, c.rf.fit$predicted,
390 | d.rf.fit$predicted, e.rf.fit$predicted, f.rf.fit$predicted,
391 | g.rf.fit$predicted)
392 |
393 | # for people I predict changed in train, did I get it right?
394 | predchange.ix <- which(glm.pred=="Yes")
395 | predchange.cid <- trainex2[predchange.ix, "customer_ID"]
396 | for (i in 1:length(predchange.cid)) {
397 | # customer_ID
398 | print(predchange.cid[i])
399 | # final quote
400 | print(trainex2[trainex2$customer_ID==predchange.cid[i], "plan"])
401 | # predicted purchase
402 | print(pred.plan[predchange.ix[i]])
403 | # actual purchase
404 | print(trainsub[trainsub$customer_ID==predchange.cid[i], "plan"])
405 | }
406 |
407 | # PART 9:
408 |
409 | # train model on trainex2 and predict changed on testsub
410 | glm.fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
411 | car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
412 | duration_previous+stability+planfreq, data=trainex2, family=binomial)
413 | glm.probs <- predict(glm.fit, newdata=testsub, type="response")
414 | glm.pred <- ifelse(glm.probs>0.9, "Yes", "No")
415 |
416 | # update changed variable on testsub to reflect prediction
417 | testsub$changed <- as.factor(glm.pred)
418 |
419 | # for records predicting changed, start by predicting plan, then modify each one
420 | testsub$planpred <- testsub$plan
421 | testsub$planpred[testsub$customer_ID=="10006040"] <- "1033021"
422 | testsub$planpred[testsub$customer_ID=="10011049"] <- "1133112"
423 | testsub$planpred[testsub$customer_ID=="10026297"] <- "0011002"
424 | testsub$planpred[testsub$customer_ID=="10027789"] <- "0012002"
425 | testsub$planpred[testsub$customer_ID=="10054016"] <- "0011002"
426 | testsub$planpred[testsub$customer_ID=="10054825"] <- "0011001"
427 | testsub$planpred[testsub$customer_ID=="10068571"] <- "0011002"
428 | testsub$planpred[testsub$customer_ID=="10113734"] <- "2022032"
429 | testsub$planpred[testsub$customer_ID=="10116125"] <- "0012002"
430 |
431 | # submit
432 | pred <- data.frame(customer_ID = testsub$customer_ID, plan = testsub$planpred)
433 | write.csv(pred, file="submit11.csv", row.names=FALSE, quote=FALSE)
434 | # 9 changes, 3 affect leaderboard
435 | # 0.53793 means 0 below baseline
436 |
437 | # PART 10:
438 |
439 | # calculate change likelihood of plans
440 | rec <- train %.% group_by(plan) %.% summarise(planpur=mean(record_type),
441 | plancnt=n()) %.% arrange(planpur, desc(plancnt))
442 |
443 | # function to "fix" plans based upon thresholds
444 | fixplans <- function(planpurmax, plancntmin, commonmin) {
445 |
446 | # make list of fixes
447 | rectop <- rec[rec$planpur<=planpurmax & rec$plancnt>=plancntmin, "plan"]
448 | rectopbest <- vector(mode="character", length=length(rectop))
449 | rectopcommon <- vector(mode="numeric", length=length(rectop))
450 |
451 | for (i in 1:length(rectop)) {
452 | # vector of unique customers that looked at that plan
453 | cust <- unique(train[train$plan==rectop[i], "customer_ID"])
454 | # what are all the plans that those customers purchased?
455 | purplan <- train[train$customer_ID %in% cust & train$record_type==1,
456 | "plan"]
457 | # what was the most common purchased?
458 | rectopbest[i] <- names(sort(table(purplan), decreasing=TRUE))[1]
459 | # how common was it?
460 | rectopcommon[i] <- sort(table(purplan),
461 | decreasing=TRUE)[1]/length(purplan)
462 | }
463 |
464 | fixes <- data.frame(old=rectop[rectopcommon>=commonmin],
465 | new=rectopbest[rectopcommon>=commonmin], stringsAsFactors=FALSE)
466 | fixes <- fixes[fixes$old!=fixes$new, ]
467 | print(nrow(fixes))
468 | print(fixes)
469 |
470 | # reset testsub, and check how many fixes will be made
471 | testsub <- test[!duplicated(test$customer_ID, fromLast=TRUE), ]
472 | testsub$planpred <- testsub$plan
473 | print(sum(testsub$planpred %in% fixes$old))
474 |
475 | # make fixes
476 | nrowtestsub <- nrow(testsub)
477 | for (i in 1:nrowtestsub) {
478 | if (testsub$planpred[i] %in% fixes$old) {
479 | testsub$planpred[i] <-
480 | fixes$new[which(fixes$old==testsub$planpred[i])]
481 | }
482 | }
483 |
484 | return(testsub)
485 | }
486 |
487 | # submission that had my best PUBLIC leaderboard score
488 | # 5 plans, 322 fixes
489 | # public score: 0.53853, above baseline by 10 picks
490 | # private score: 0.53266, below baseline by 1 pick
491 | ts <- fixplans(0.05, 70, 0.05)
492 | pred <- data.frame(customer_ID = ts$customer_ID, plan = ts$planpred)
493 | write.csv(pred, file="submit34.csv", row.names=FALSE, quote=FALSE)
494 |
495 | # submission that had my best PRIVATE leaderboard score
496 | # 2 plans, 305 fixes
497 | # public score: 0.53847, above baseline by 9 picks
498 | # private score: 0.53277, above baseline by 3 picks
499 | ts <- fixplans(0.05, 500, 0.05)
500 | pred <- data.frame(customer_ID = ts$customer_ID, plan = ts$planpred)
501 | write.csv(pred, file="submit25.csv", row.names=FALSE, quote=FALSE)
502 |
503 | # PART 11:
504 |
505 | # predict change (for 12403 people, 100+ plans, 98 fixes)
506 | glm.fit <- glm(changed ~ state+cost+A+C+D+E+F+G+age_oldest+age_youngest+
507 | car_value+car_age+shopping_pt+timeofday+weekend+risk_factor+C_previous+
508 | duration_previous+stability+planfreq, data=trainex2, family=binomial)
509 | glm.probs <- predict(glm.fit, newdata=testsub, type="response")
510 | glm.pred <- ifelse(glm.probs>0.5, "Yes", "No")
511 | table(glm.pred)
512 | ts <- fixplans(0.2, 10, 0.05)
513 | ts$changed <- as.factor(glm.pred)
514 | nrow(ts[ts$planpred!=ts$plan & ts$changed=="Yes", ])
515 | ts$planpred <- ifelse(ts$changed=="No", ts$plan, ts$planpred)
516 | nrow(ts[ts$planpred!=ts$plan,])
517 |
518 | # submit: 0.53787, 1 below baseline
519 | pred <- data.frame(customer_ID = ts$customer_ID, plan = ts$planpred)
520 | write.csv(pred, file="submit24.csv", row.names=FALSE, quote=FALSE)
521 |
--------------------------------------------------------------------------------
/allstate-paper.md:
--------------------------------------------------------------------------------
1 | ## Problem Statement
2 |
3 | For my project, I entered the "[Allstate Purchase Prediction Challenge](http://www.kaggle.com/c/allstate-purchase-prediction-challenge)" on Kaggle. In this competition, the goal is to predict the exact car insurance options purchased by individual customers. The data available for training a model is the history of car insurance quotes that each customer reviewed before making a purchase, the options they actually purchased (the "purchase point"), and data about the customer and their car. The data available for testing was identical to the training set, except it was a different set of customers, the purchase point was excluded (since it was the value to be predicted), and an unknown number of quotes were also excluded.
4 |
5 | For a prediction on a given customer to be counted as correct, one must successfully predict the value for all seven options, and each option has 2 to 4 possible values. Therefore, one could treat this as a classification problem with over 2,000 possible classes.
6 |
7 |
8 | ## Hypotheses
9 |
10 | At the start of the competition, I came up with two hypotheses:
11 |
12 | 1. I hypothesized that smart feature engineering and feature selection would be more important than the use of advanced machine learning techniques. This hypothesis was partially based on [readings](http://homes.cs.washington.edu/~pedrod/papers/cacm12.pdf) from the course, and partially based on necessity (my toolbox of machine learning techniques is somewhat limited!).
13 |
14 | 2. I hypothesized that there would be patterns in the data that I could use to my advantage, which would not necessarily even require machine learning. Here are some examples of patterns that I hypothesized:
15 | * Customers buying the last set of options they were quoted
16 |     * Customers reviewing a bunch of different options, and simply choosing the set of options that they viewed the most times
17 | * Customers reviewing a bunch of different options, and simply choosing the set of options that was the cheapest
18 | * Individual options that are highly correlated (e.g., if A=1, then perhaps B is almost always 0)
19 | * Sets of options that are "illegal" (e.g., if C=1, then perhaps D cannot be 2)
20 | * Sets of options that are extremely common for a given customer characteristic (e.g., families with young kids always choose E=0 and F=2)
21 |
22 |
23 | ## Data Exploration and Visualization
24 |
25 | Here are some of my key findings from the exploratory process, and what I concluded from those findings.
26 |
27 | 1. **Missing values:**
28 |     * risk_factor was NA in 36.1% of the training set and 38.0% of the test set. Because I considered risk_factor to be a potentially useful predictor, I decided to impute it for those customers using a linear regression model based on other customer characteristics.
29 |     * C_previous and duration_previous were NA for 2.8% of the training set and 4.9% of the test set. I decided that those NA values were probably indicative of new customers, and thus I imputed values of 0 for duration_previous and "none" (a new factor level) for C_previous.
30 | * location was NA for 0.3% of the test set. I decided to impute the location for each customer by copying the location from another customer in the same state.
31 |
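    A condensed sketch of the risk_factor imputation, adapted from the `preprocess()` function in [allstate-code.R](allstate-code.R) (here `data` stands for either the training or the test set):

    ```r
    # predict missing risk_factor values from other customer characteristics
    known   <- data[!is.na(data$risk_factor), ]
    unknown <- data[is.na(data$risk_factor), ]
    lm.fit  <- lm(risk_factor ~ age_youngest*group_size + married_couple + homeowner,
                  data=known)
    data$risk_factor[is.na(data$risk_factor)] <- round(predict(lm.fit, newdata=unknown))
    ```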
32 | 2. **Unique plans:**
33 |     * Out of the 2,304 possible combinations of the 7 options, the training set included 1,809 unique plans and the test set included 1,596 unique plans. The union of those two sets included 1,878 unique plans, indicating that 69 plans in the test set never appeared in the training set.
34 | * Because more than 80% of the possible combinations did actually appear in the data, and because the number of plan combinations is so large, I concluded that it was better to predict the 7 individual options for each customer and combine them, rather than try to predict the entire plan (all 7 options at once) using a single model.
35 |
36 | 3. **Number of shopping points:**
37 |     * As seen in the plot below, the training set contained a roughly normal distribution of "shopping points" (the number of quotes a customer reviewed), whereas the test set contained a very different distribution.

![Number of shopping points in the training vs. test sets](viz/allstate-viz-1.png)
38 | * I concluded that the number of shopping points was probably deliberately truncated in the test set in order to limit the information available to competitors and make the problem more challenging. (Kaggle later [confirmed](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/forums/t/7240/is-the-test-data-truncation-a-trim-or-thinning-out) that the test set was truncated, but [declined](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/forums/t/7119/recreating-a-truncated-test) to provide details on the truncation algorithm.) I also concluded that it might be useful to similarly truncate the training set (for cross-validation) to provide more accurate estimates of test error during the modeling process.
39 |
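    A minimal sketch of one way to truncate the training histories for cross-validation, assuming a uniformly random cutoff point per customer (the actual truncation algorithm was never disclosed, so this is only a guess):

    ```r
    # keep a random-length prefix of each customer's quote history
    set.seed(1)
    cutoff <- tapply(trainex$shopping_pt, trainex$customer_ID,
                     function(pts) sample(pts, 1))
    truncated <- trainex[trainex$shopping_pt <= cutoff[as.character(trainex$customer_ID)], ]
    ```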
40 | 4. **Predictive power of final quote before purchase:**
41 |     * As seen in the plot below, the final quote a customer requests before the "purchase point" is hugely predictive (in the training set) of which options they will actually purchase. The final quote correctly predicted the purchased options 50% to 75% of the time, with that percentage steadily increasing as customers review more quotes. (Note that the right side of the plot is unstable due to the small number of observations with 10 or more shopping points.)

![Predictive power of the final quote by number of shopping points](viz/allstate-viz-2.png)
42 | * I concluded that using the final quote in the test set as a predictor of the purchase would be an excellent baseline strategy, and indeed this method was used as the "benchmark" on the Kaggle leaderboard (producing a score of 0.53793 on the public leaderboard).
43 | * I also concluded that this is precisely why the number of shopping points was truncated in the test set; otherwise, the baseline strategy would likely have worked about 75% of the time on the test set.
44 |
45 | 5. **Effect of purchase hour on the predictive power of the final quote:**
46 |     * I hypothesized that the time of day might affect the likelihood that a given customer would change their options between the final quote and the purchase point. As seen in the plot below, customers making a purchase between 9am and 4pm tended to change from their final quote about 30% of the time, whereas customers purchasing in the evening (or especially overnight) tended to change their options 35% of the time (or more).

![Effect of purchase hour on likelihood of changing from the last quote](viz/allstate-viz-3.png)
47 | * I concluded that the time of day would be a useful feature to include in my models. I also concluded that binning the time into a few distinct categories might create an even more useful feature, since the variability during the overnight hours (as seen in the plot) would cause the model to overfit the training data for those individual hours, and thus an "overnight" category (averaging those values) would be more stable.
48 |
49 | 6. **Dependencies between options:**
50 |     * I created dozens of plots to explore the relationships between the 7 different options (for the purchase point only). One example is below, in which I'm plotting the 3 options for D faceted against the 4 options for C. As you can see, there are clear patterns in the data. D=1 is only likely if C=1; D=3 is very likely if C=3, and is basically guaranteed if C=4.

![Customer selection of option D by selection of option C](viz/allstate-viz-4.png)
51 | * I compiled [a set of "rules"](notes/allstate-option-dependencies.md) similar to this across all of the options, and concluded that I might be able to use those rules to "fix" any predicted combinations in the test set which seemed unlikely.
52 |
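    These dependencies can be read off a row-normalized cross-tabulation of the purchase points; a minimal sketch (`trainsub` is the purchases-only subset defined in [allstate-code.R](allstate-code.R)):

    ```r
    # proportion of each D selection within each C selection (purchases only)
    round(prop.table(table(C=trainsub$C, D=trainsub$D), margin=1), 2)
    ```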
53 |
54 | ## Feature Transformation and Engineering
55 |
56 | The [dataset provided by Kaggle](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/data) included 25 features. I used some of those features as-is, and I engineered additional features using transformations or combinations of features. The competition rules did not allow the use of supplementary datasets, and so no other data was used.
57 |
58 | 1. **Features used as-is:** I used the following features as-is, treating each as either a continuous or a categorical variable. For the final model, the A through G options were treated as the response variables with all other variables used as predictors, though I also built an intermediate model (described further in the "Model Building" section) in which A through G were used as predictors instead.
59 | * Continuous variables:
60 | * group_size: number of people covered by the policy
61 | * car_age: age of customer's car
62 | * risk_factor: 1 through 4 assessment of customer risk
63 | * age_oldest: age of oldest person covered by the policy
64 | * age_youngest: age of youngest person covered by the policy
65 | * duration_previous: years covered by previous issuer
66 | * cost: cost of quoted options
67 | * Categorical variables:
68 | * state: customer location
69 | * location: ID of customer location (more specific than state)
70 | * homeowner: yes or no
71 | * car_value: value of customer's car when new, expressed only as letters 'a' through 'i'
72 | * married_couple: yes or no
73 |         * C_previous: what the customer previously had for option C
74 | * A/B/C/D/E/F/G: coverage options
75 |
76 | 2. **"Simplified" features:** I created "simpler" versions of certain features, under the theory that simpler features might be less noisy and could possibly prevent my models from overfitting the training set.
77 | * Instead of using "time" as a feature, I created an "hour" categorical feature by truncating the minutes from the exact time. I also created a "timeofday" categorical feature using the data from exploration #5 above: day (6am-3pm), evening (4pm-6pm), and night (7pm-5am).
78 | * Instead of using "day" as a feature, I created a "weekend" categorical feature (yes=weekend, no=weekday).
79 |     * There was a very small number of cars with a "car_age" of 30 to 75 years. Since I was using car_age as a continuous feature, I decided to round all car ages over 30 down to exactly 30, under the theory that the purchase behavior of those customers might be similar.
80 |     * During the model building process (discussed in detail below), I noticed that the customer's state appeared to affect their likelihood of changing options between the last quote and the purchase point. States in which that likelihood was very low (North Dakota, South Dakota, Alabama, and Wyoming) also happened to be the states that I think of as the least technologically advanced. If outside data had been allowed, I would have tried adding a new feature to reflect the technological level of each state; since it was not, I created a "stategroup" categorical feature by clustering states into 5 groups using their "likelihood of change" coefficient from one of my models. The group assignments are shown below; note that 15 states have not been assigned a group since they are not present in the dataset.

![Clustering of states by likelihood of changing from the last quote](viz/allstate-viz-5.png)
81 |
82 | 3. **"Conceptual" features:** I created a few features to represent "concepts" by combining different features, under the theory that the concepts might have better predictive power than the individual features (in a way that might not be captured by an interaction term).
83 | * I created a "family" categorical feature for any customer that was listed as married, had a group size larger than 2, and the age of the youngest covered individual (presumably their child) was less than 25.
84 | * I created an "agediff" continuous feature that was simply the age difference between the youngest and oldest covered individuals.
85 | * I created an "individual" categorical feature for any customer whose group size was 1 and the "agediff" was 0.
86 |
87 | 4. **Features to represent past quotes:** When anticipating the model building process, I knew from item #4 of data exploration (above) that the final quote before purchase would have the best predictive power of the actual purchase. Given that I only wanted to make a single prediction per customer, my plan was to only use that final quote before purchase (for each customer) as the input to the model. That seemed to waste a lot of available (and potentially useful) data, but I had a difficult time conceptualizing how to effectively integrate the not-final-quote data into the model. I came up with two solutions:
88 | * I used "shopping_pt" as a continuous feature, since it represented the number of quotes a customer requested before purchasing. My theory (based on data exploration #4) was that a higher shopping_pt indicated a greater likelihood that the customer would simply choose the last quote, making shopping_pt a useful predictor.
89 |     * I created a new continuous feature called "stability", which was a number between 0 and 1 that represented how much a given customer changed their plan options during the quoting process. I created the formula stability=(numquotes - uniqueplansviewed + 1)/numquotes. For example, a customer who requested 8 quotes but only looked at 3 different plan combinations would have a stability of (8-3+1)/8 = 0.75, whereas if they had looked at 8 different plan combinations, their stability would be (8-8+1)/8 = 0.125. My theory was that a low stability would indicate a high likelihood of changing options between a customer's final quote and actual purchase. (A condensed code sketch of this computation follows this list.)
90 |
91 | 5. **Feature to represent plan frequency:**
92 | * I created a "planfreq" continous feature that was simply the frequency with which a given plan occurred across all customers. My theory was that a plan with a low frequency might indicate a greater likelihood of switching options, since perhaps that combination of options is unpopular for a reason.
93 |
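As referenced in item 4 above, here is the stability computation as it appears in [allstate-code.R](allstate-code.R) (`trainex` holds the non-purchase quote rows; `%.%` was dplyr's chaining operator at the time, now `%>%`):

```r
# stability = (quotes - unique plans viewed + 1) / quotes, per customer
customerstability <- trainex %.% group_by(customer_ID) %.%
    summarise(quotes=n(), uniqueplans=n_distinct(plan),
              stability=(quotes-uniqueplans+1)/quotes)
```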
94 |
95 | ## Challenges with the Data
96 |
97 | 1. The biggest challenge with the dataset was that there were over 2,000 possible plan combinations, and your prediction is only scored as correct if all 7 options are correct. Additionally, the Kaggle system does not provide any feedback on "how wrong" your predictions are, making it impossible to differentiate between a prediction in which 6/7 options are correct and a prediction in which 1/7 options are correct.
98 |
99 | 2. Another major challenge (closely related to challenge #1) is that there is a huge "risk" when predicting any plan other than the last quote. As discussed in data exploration #4, you can obtain roughly 50% accuracy on the test set simply by using the last quote as your prediction. Thus, if you predict anything other than that last quote for a given customer, you have a 50% chance of "breaking" an already correct prediction. The only way to mitigate that risk is to develop a predictive model that is more than 50% accurate. And since I had decided to predict each of the 7 options individually (based on data exploration #2), those 7 predictive models would each have to be at least 90% accurate in order for the combined prediction to be at least 50% accurate (since 0.90^7 roughly equals 0.50). 90% accuracy for 7 different models is quite a high bar!
100 |
101 | 3. As discussed in feature engineering #4, it was challenging to determine how to use the not-final-quote data.
102 |
103 | 4. One of my hypotheses was that customers who do change from their final quote might simply be changing to a set of options that they looked at previously. If this was often the case, it could make prediction significantly easier, and eliminate the need to predict each option individually. Unfortunately, when I examined the quote history of 15 random customers (in the training set) that did change from their final quote, I found that every single one of them purchased a combination of options that they never looked at during the quoting process.
104 |
105 | 5. Another challenge with the dataset is that the car insurance options are not identified in any meaningful way, preventing you from making educated guesses about which options are correlated and which variables might influence each option.
106 |
107 | 6. As discussed in data exploration #3, the test set was substantially truncated in terms of number of quotes per customer, making it more challenging to build models that work well on both the training set and the test set.
108 |
109 |
110 | ## Model Building
111 |
112 | Below is a description of the model building process I went through. Because I'm most fluent in R, I built all of the models (and did all of the visualization and feature engineering) in R.
113 |
114 | 1. As discussed in data exploration #4, my baseline strategy was to predict the purchase options for each customer simply by using their last quote. That produced a score on the public leaderboard of 0.53793, which represents your accuracy on 30% of the test set. (Since around half of the competitors [obtained](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/leaderboard) that exact score, one can infer that this strategy was widely used.) All of my follow-up models simply revised the baseline predictions, rather than predicting every customer "from scratch".
115 |
116 | 2. As discussed in data exploration #6, I noticed correlations between certain options. For example, D is nearly always 3 if C equals 4 for a given customer. Using these and other "[rules](notes/allstate-option-dependencies.md)" that I developed during the exploratory process, I revised the baseline predictions by simply converting any pairs of options that seemed unlikely. In other words, if one of my baseline predictions had C equals 4 and D not equal to 3, I changed D to 3. I submitted a variety of these rule-based predictions, and every time my score on the public leaderboard decreased. I realized the flaw in this approach: For any customer where the baseline was predicting incorrectly, it's impossible to know how many of their options are incorrect. Thus the rule-based approach may fix a single incorrect option, but it also "breaks" baseline predictions that were already correct at a much higher rate.
117 |
118 | 3. My key insight from the previous model building step was that it was critical to not break existing baseline predictions that were already correct, and only attempt to "fix" baseline predictions that were already incorrect (since there would be no risk of making those predictions worse). Thus, I decided to use a multi-stage approach, in which I first predicted which customers were going to change from their final quote, and then would predict new options only for that smaller set of customers.
119 |
120 | 4. For predicting who would change, I began with logistic regression on the training set and created a 5-fold cross-validation framework to estimate test set accuracy. I also tried random forests, but stuck with logistic regression for the time being because it ran much faster and thus allowed me to iterate more quickly through different models.
121 |
122 | 5. My prediction accuracy (of which customers would change) barely increased over the null error rate, regardless of which features I included in the model. Therefore, I decided to instead optimize my model for precision and set a high threshold for predicting change. In other words, I would only be predicting change for a very small number of customers, but I would be highly confident that those customers would change. I created a new 5-fold cross-validation framework to calculate the precision of my "change" predictions (sketched below), and managed to reach 91% precision. I then tested this method by predicting change for the test set, changing my baseline predictions for that small number of customers to "9999999" (definitely incorrect), and then seeing how my public leaderboard score was affected. It appeared that this method was about 75% accurate on the test set, which validated this approach.
123 |
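    A minimal sketch of that precision calculation; `precision_at()` is a hypothetical helper written for illustration, not a function from the actual code (the full cross-validation loop is in PART 5 of [allstate-code.R](allstate-code.R)):

    ```r
    # of the customers predicted to change at a high cutoff, what fraction actually changed?
    # probs = predicted change probabilities; actual = factor with levels "No"/"Yes"
    precision_at <- function(probs, actual, cutoff=0.85) {
        predyes <- probs > cutoff
        sum(actual[predyes] == "Yes") / sum(predyes)
    }
    ```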
124 | 6. I repeated step #5 of the model building process using random forests instead of logistic regression, to see if that would improve my precision. I again created a 5-fold cross-validation framework and set a high threshold for predicting change by examining the fraction of out-of-bag "votes" that predicted change. Despite some effort at tuning the model, I was not able to improve upon the results from logistic regression.
125 |
126 | 7. I moved on to the second stage of model building, namely predicting the new set of options for those customers who I'm predicting will change from their last quote. As discussed in data exploration #2, I had decided to predict each option individually, rather than try to predict the set of options as a whole. Since most of the 7 options have more than 2 classes, I explored different R packages for multinomial classification. I first considered the `mlogit` and `mnlogit` packages, but found the documentation confusing. I tried using the `glmnet` package for regularized multinomial classification, but it took an exceptionally long time to run. I ended up using both random forests (from the `randomForest` package) and the multinom function (from the `nnet` package), both of which ran relatively quickly.
127 |
128 | 8. During the multinomial classification process, I tried three different approaches. To explain the approaches, we can use the "A" option as an example:
129 | * For approach #1, I tried to predict A (for each customer) by giving the model every feature other than A in that customer's final quote. That approach only produced 60%-80% accuracy on the training set (across each of the 7 options), making this approach too inaccurate to be useful.
130 | * For approach #2, I gave the model the same features as approach #1 but also gave it "current A" as a feature, and asked it to predict "final A". The training set accuracy for approach #2 rose significantly, but only because the model simply predicted "final A" to be equal to "current A" 99.9% of the time, making it a useless model.
131 | * For approach #3, I revised approach #2 by only training the model on the subset of data for which change was predicted. Unfortunately, this approach performed no better than approach #1.
132 |
133 | 9. Since my attempts at multinomial classification were unsuccessful at improving upon the baseline strategy, I decided to supplement the machine learning approach with manual adjustments:
134 | * First, I set a very high threshold for my logistic regression model for predicting which customers would change from their final quote, resulting in a prediction that only 9 customers would change.
135 | * Second, I manually adjusted the options for those 9 customers using the [set of "rules"](notes/allstate-option-dependencies.md) I had created in data exploration #6. Essentially, I was looking for any unlikely option combinations in their final quote and changing them to a more likely combination.
136 | * Third, I tweaked those manual option predictions by comparing them against the predictions generated by my random forest model for multinomial classification.
137 | * Although this was a time-intensive process, I decided that if it worked, I could potentially figure out how to scale it. (Per the competition rules, "human prediction" is not allowed, and thus I would only use this strategy if I could figure out how to convert it into a pure machine learning model.)
138 | * When I submitted my new predictions to Kaggle, my public leaderboard score was identical to the score for the baseline approach. That indicated that either none of those 9 customers were part of the public leaderboard, or that those few customers who were part of the public leaderboard were not corrected by this process.
139 |
140 | 10. Based on a [tip from the Kaggle forums](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/forums/t/8119/how-to-get-above-baseline/44398#post44398), I decided to try a completely different approach. Rather than trying to predict the plans for specific customers based upon their characteristics and behavior, the nature of this approach was to seek out complete plans (combinations of 7 options) that were viewed but rarely purchased in the training set, and if any of those plans were predicted for the test set by the baseline approach, simply replace those predictions with "more likely" plans. My theory is that there are combinations of options that "don't make sense" (probably from a financial perspective), and that anyone who looks at one of these combinations is unlikely to actually purchase it. Note that this approach completely ignores the customer characteristics!
141 |     * The first step was to determine which plans are "unlikely" to be purchased. I decided to use the full training set, and calculate for each plan the percentage of times it was purchased out of all the times it was viewed. In other words, if a plan appeared in the training set 100 times, and 10 of those times were purchases (the other 90 were quotes), then the purchase percentage was 10%. (A condensed sketch of this computation follows this list.)
142 | * The second step was to determine which plan should be substituted for any "unlikely" plan. I decided to build a vector of unique customers that looked at each unlikely plan, and use as its substitute the plan that was most commonly purchased by that vector of customers.
143 | * The third step was to adjust the "thresholds" that would determine how many plans would be considered "unlikely" and thus "fixed" in the test set predictions. I decided upon three thresholds:
144 | * purchase percentage: The lower the percentage (10% in the example above), the more likely it is that a given plan won't be purchased.
145 | * plan count: The more times a plan is viewed (100 in the example above), the more confidence I had that the purchase percentage is representative.
146 | * replacement plan commonality: If 90 times out of 100 a plan is not purchased, but those 90 customers end up choosing 85 different plans, I would have very little confidence that the "most common" of those 85 plans is a "good enough" substitute prediction for the original. So, I created a threshold of how common the most common replacement plan had to be (among the actual replacement plans) in order for that substitution to be counted as "useful". For example, if the most common replacement plan for those 90 customers was chosen by 18 of those customers, the "commonality" score was 18/90=20%.
147 | * Once I determined the thresholds, I created a function that would allow me to input different threshold values and see how many different predictions would be replaced, as well as see the table of the unlikely plans and their replacements.
148 | * I experimented with different threshold values and used submissions to the public leaderboard as a gauge of which values made the most sense. The threshold values which produced the best results were a maximum purchase percentage of 5%, a minimum plan count of 70, and a minimum commonality score of 5%. This "fixed" 5 different plans that were originally predicted for 322 customers (about 0.6% of customers), and produced a score of 0.53853. That score is only a small improvement over the baseline (0.53793), but was enough to move me up to the top 20% of the leaderboard.
149 |
150 | 11. Since I now had a strategy that made marginal improvements over the baseline approach, I wanted to improve this approach by taking advantage of the previous models I had built. Specifically, instead of replacing all instances of a set of "unlikely" plans, a potentially smarter approach would be to replace only some instances. Therefore, I tried predicting which customers would change from their final quote, and then only replacing their predicted plan if they had an unlikely plan. Unfortunately, the intersection between the set of "customers who are very likely to change" and the set of "customers who have very unlikely plans" was tiny. The only way to generate an intersection containing a meaningful number of customers (around 100) was to significantly reduce the "change prediction threshold" in my logistic regression model. This approach did not end up improving my best model, probably because my change prediction model was only performing well at a high threshold (as discussed in model building #5).
151 |
152 | 12. Although I came up with [many more ideas](notes/allstate-ideas.md) for how to improve my predictive model, I ultimately ran out of time due to the end of the competition period.
153 |
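As referenced in item 10 above, the purchase-percentage calculation from PART 10 of [allstate-code.R](allstate-code.R), shown with the thresholds from my best public submission:

```r
# for each plan: fraction of its appearances that were purchases (record_type==1),
# and how many times it appeared at all
rec <- train %.% group_by(plan) %.%
    summarise(planpur=mean(record_type), plancnt=n()) %.%
    arrange(planpur, desc(plancnt))
# candidate "unlikely" plans: viewed at least 70 times, purchased at most 5% of the time
subset(rec, planpur <= 0.05 & plancnt >= 70)
```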
154 |
155 | ## Business Applications and Implementation
156 |
157 | The [Kaggle description of this challenge](http://www.kaggle.com/c/allstate-purchase-prediction-challenge) summarized the goal of this competition as follows: "If the eventual purchase can be predicted sooner in the shopping window, the quoting process is shortened and the issuer is less likely to lose the customer's business." The techniques described throughout this paper could likely be applied to any business in which customers are deciding between multiple products (or multiple options for the same product). If the business can gain an insight into which product or option a customer is likely to end up choosing, they could nudge the customer toward that product (in order to increase their conversion rate), or instead nudge the customer toward a slightly more expensive product (in order to maximize their profit from that sale).
158 |
159 | The implementation details of such a system would depend upon the specific application, but the two critical factors would be how to gather data about the user before (or during) the shopping process, and how to predict the most likely purchase in a real-time manner. Two ideas for addressing the latter issue are precalculating the predictions for clusters of customers or encoding the prediction algorithm into a tree-like set of rules, either of which would allow predictions to be "served up" to the user in a near real-time fashion.
160 |
161 |
162 | ## Results and Key Learnings
163 |
164 | Because the public leaderboard only reflects a competitor's score on 30% of the test set, it is impossible to know what the final results will be until the competition closes on May 19. However, it seems very likely that the final rankings will be similar to the public rankings.
165 |
166 | As of May 18, the public leaderboard includes 1,568 competitors. 714 of those competitors have a score exactly matching the baseline approach (0.53793), with 432 below the baseline and 422 above the baseline. With my best score of 0.53853, I am currently ranked 253rd. The top competitor has beaten the baseline by less than 1% (0.54571), clearly indicating that this is a very challenging predictive problem.
167 |
168 | I learned many lessons about the predictive modeling process by participating in this competition:
169 |
170 | 1. **Early in the competition, try many different approaches:** Although I did iterate through a variety of different approaches during the course of the competition, it was not until the final week of the competition that I tried a completely different approach that ended up actually beating the baseline score. Had I spent more time earlier on iterating through different approaches, I may have had much more time to refine and build upon the approach that ended up working.
171 |
172 | 2. **Smarter strategies trump more modeling and more data:** Although this problem appeared to be one of predicting insurance options based upon customer data and behavior, a key insight was that it was actually a problem of predicting option combinations that were unlikely to be purchased. Certainly the best competitors are tackling both problems at once, but I would argue that the second problem was the less obvious but more important problem to tackle. Solving that problem does not actually require most of the provided data, and (at least in my case) does not use any traditional machine learning models. This demonstrates that you don't have to use data just because you have it, and you don't have to feed your data into machine learning models just because you know how to use them.
173 |
174 | 3. **Real-world data is hard to work with:** Even though the data provided by Kaggle was relatively clean, that did not mean it was easy to work with. Simply figuring out how to train a model on the available data was a challenge in itself, because it is not obvious how to "learn" something from each quote a customer requests. I would imagine that this is often a major issue in the real world, and it is a sharp contrast with a textbook problem, in which the data has already been simplified for you!
175 |
176 | 4. **Algorithms and processes that allow for rapid iteration are priceless:** Although random forests can often outperform other algorithms in predictive accuracy, I found myself mostly using logistic regression because I could iterate through different approaches in seconds or minutes rather than hours. In addition, the reusable functions that I built for data pre-processing, cross-validation, and other tasks were time well spent, because they allowed me to iterate quickly with minimal code changes.
177 |
178 | 5. **Learn from others around you:** Some of my key insights (and ideas for new approaches) came from paying close attention to the Kaggle forums. Because there are so many ways to go about solving a given problem, it is crucial to learn from those around you in order to be exposed to different ways of thinking.
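As the sketch promised in learning #2, here is a simplified R illustration of the "unlikely combinations" idea: flag any predicted plan that was never actually purchased in the training set, and replace it with the most frequently purchased plan that differs in only one option. The zero-purchase threshold and the one-option neighborhood are assumptions made for this sketch, not the exact rules I used.

```r
# count how often each 7-character plan was actually purchased
# (record_type == 1 marks the purchase row for each customer)
purchase_counts <- table(train$plan[train$record_type == 1])

replace_unlikely <- function(pred_plan) {
    # keep the prediction if that exact plan has been purchased before
    if (pred_plan %in% names(purchase_counts)) return(pred_plan)
    # otherwise, find purchased plans differing in exactly one option
    diffs <- sapply(names(purchase_counts), function(p)
        sum(strsplit(p, "")[[1]] != strsplit(pred_plan, "")[[1]]))
    candidates <- names(purchase_counts)[diffs == 1]
    if (length(candidates) == 0) return(pred_plan)  # no close neighbor; keep as-is
    # return the most frequently purchased neighboring plan
    candidates[which.max(purchase_counts[candidates])]
}
```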
179 |
180 |
181 | ## Postscript
182 |
183 | I finalized this paper on May 18, the day before this competition ended. When it closed on May 19, my standing on the [public leaderboard](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/leaderboard/public) was 263rd, with a score of 0.53853 (10 "picks" above the last quoted plan benchmark of 0.53793). When the [private leaderboard](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/leaderboard/private) was unveiled, I unfortunately dropped to 1039th, with a score of 0.53266 (1 "pick" below the benchmark of 0.53269).
184 |
185 | In retrospect, I should have cross-validated my submissions before deciding which two submissions to select as the ones that "counted", especially because I did have one submission that beat the private leaderboard benchmark. I naively assumed that even if I was overfitting the public leaderboard slightly, I was far enough above the benchmark that I wouldn't drop below it on the private leaderboard.
186 |
187 | I will be closely watching the ["solution sharing" thread](http://www.kaggle.com/c/allstate-purchase-prediction-challenge/forums/t/8218/solution-sharing) on the Kaggle forums, to learn about the approaches used by other competitors!
188 |
--------------------------------------------------------------------------------
/allstate-presentation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/kaggle-allstate/ffc6b1be246ab3e7d10bc541e0dcc5679d8020dd/allstate-presentation.pdf
--------------------------------------------------------------------------------
/allstate-presentation.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/kaggle-allstate/ffc6b1be246ab3e7d10bc541e0dcc5679d8020dd/allstate-presentation.pptx
--------------------------------------------------------------------------------
/notes/allstate-ideas.md:
--------------------------------------------------------------------------------
1 | ## Ideas for improving my Allstate predictive models
2 |
3 | As mentioned in my [paper](../allstate-paper.md) in Model Building section 12, I came up with many ideas for how to improve my predictive models that I didn't actually have time to execute. My rough notes are listed below:
4 |
5 | 1. Rather than coming up with a list of "unlikely" plans and always predicting a replacement plan, selectively predict whether to replace that plan for each customer based upon their characteristics.
6 | 2. Rather than always predicting the same replacement plan for a given "unlikely" plan, change which replacement plan to predict based upon the characteristics of each customer.
7 | 3. Experiment with adding interaction terms to the models.
8 | 4. Use a more sophisticated approach to feature selection (R packages: `relaimpo`, `bestglm`).
9 | 5. Use the quote just before the final quote as a set of additional features (A/B/C/D/E/F/G, day, time, cost).
10 | 6. Spend more time tuning the randomForest model (variable selection, ntree, mtry).
11 | 7. Create an ensemble of predictive models.
12 | 8. For predicting which customers will change between the last quote and the purchase point: Rather than using a high probability cutoff and then only predicting new options for that small group of customers, instead pass the "change probability" as a feature to the next model (see the sketch after this list).
13 | 9. Truncate the training set to match the test set distribution.
14 | 10. Cluster customers into groups, and make predictions separately for each group.
15 | 11. Use PCA to discover latent features, and use those features instead of the raw or engineered features.
16 | 12. Come up with more ways to incorporate all of the training data (and not just the final quote) into the predictive models.
17 | 13. Come up with more ways to make use of the interactions and dependencies between individual options.
18 |
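To make idea #8 a bit more concrete, here is a rough two-stage sketch in R. The column names (`changed`, `shopping_pt`, `cost`), the model formulas, and the `train_final` data frame (assumed to hold one row per customer, i.e. the final quote) are illustrative assumptions, not the models from `allstate-code.R`.

```r
library(nnet)  # for multinom()

# stage 1: model the probability that the purchased plan differs from the
# last quoted plan ("changed" is an assumed 0/1 column)
change_model <- glm(changed ~ shopping_pt + car_age + cost,
                    data = train_final, family = binomial)
train_final$change_prob <- predict(change_model, type = "response")

# stage 2: pass the predicted probability downstream as a feature when
# predicting an individual option (option G shown here)
g_model <- multinom(G ~ change_prob + car_age + cost, data = train_final)
```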
--------------------------------------------------------------------------------
/notes/allstate-option-dependencies.md:
--------------------------------------------------------------------------------
1 | ## Allstate Option Dependencies
2 |
3 | As discussed in my [paper](../allstate-paper.md) in Data Exploration section 6, I compiled a loose set of "rules" that attempted to capture the dependencies between different options. In Model Building section 9 of the paper, I used these rules to manually "fix" the predicted plan for customers who my model predicted had a strong likelihood of changing options between their final quote and the purchase point. (A sketch of how a few of these rules might be encoded appears after the list.)
4 |
5 | * Option A (0, 1, 2)
6 | * if B=1 or C=3 or D=3 or E=1 or F=1/2, more likely to choose A=1
7 | * if E=1, almost never choose A=0
8 | * if F=0, more likely to choose A=0
9 | * if F=3, never choose A=1
10 | * Option B (0, 1)
11 | * if A=0 or C=1 or E=0 or F=0, more likely to choose B=0
12 | * if E=1, more likely to choose B=1
13 | * Option C (1, 2, 3, 4)
14 | * if A=1 or D=3 or E=1 or F=1, more likely to choose C=3
15 | * if D=1, almost always choose C=1
16 | * if D=2, never choose C=4
17 | * Option D (1, 2, 3)
18 | * if A=1 or C=3, more likely to choose D=3
19 | * if C=2/3, almost never choose D=1
20 | * if C=4, always choose D=3
21 | * Option E (0, 1)
22 | * if A=0, almost always choose E=0
23 | * if B=0 or C=1 or F=0, more likely to choose E=0
24 | * Option F (0, 1, 2, 3)
25 | * if A=0, almost always choose F=0
26 | * if A=1, never choose F=3
27 | * Option G (1, 2, 3, 4)
28 | * no patterns
29 |
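The deterministic rules above ("always"/"never") are the easiest to act on. Below is a hypothetical R encoding of a few of them as a fix-up function applied to a predicted plan; the replacement values (e.g., substituting F=2 when F=3 is forbidden) and the order in which the rules fire are assumptions for the sketch, not the paper's actual implementation.

```r
# apply a few of the hard rules to a predicted plan, given as a named
# integer vector of the seven options A-G
fix_plan <- function(p) {
    if (p["C"] == 4) p["D"] <- 3                  # if C=4, always choose D=3
    if (p["D"] == 1) p["C"] <- 1                  # if D=1, almost always choose C=1
    if (p["A"] == 1 && p["F"] == 3) p["F"] <- 2   # if A=1, never choose F=3 (F=2 is an arbitrary fallback)
    if (p["E"] == 1 && p["A"] == 0) p["A"] <- 1   # if E=1, almost never choose A=0
    p
}

# example: a plan that violates the C=4 rule gets D corrected to 3
fix_plan(c(A = 1, B = 0, C = 4, D = 2, E = 0, F = 2, G = 3))
```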
--------------------------------------------------------------------------------
/viz/allstate-viz-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/kaggle-allstate/ffc6b1be246ab3e7d10bc541e0dcc5679d8020dd/viz/allstate-viz-1.png
--------------------------------------------------------------------------------
/viz/allstate-viz-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/kaggle-allstate/ffc6b1be246ab3e7d10bc541e0dcc5679d8020dd/viz/allstate-viz-2.png
--------------------------------------------------------------------------------
/viz/allstate-viz-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/kaggle-allstate/ffc6b1be246ab3e7d10bc541e0dcc5679d8020dd/viz/allstate-viz-3.png
--------------------------------------------------------------------------------
/viz/allstate-viz-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/kaggle-allstate/ffc6b1be246ab3e7d10bc541e0dcc5679d8020dd/viz/allstate-viz-4.png
--------------------------------------------------------------------------------
/viz/allstate-viz-5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/justmarkham/kaggle-allstate/ffc6b1be246ab3e7d10bc541e0dcc5679d8020dd/viz/allstate-viz-5.png
--------------------------------------------------------------------------------