├── ModelBuilding.ipynb
├── README.md
├── clustering_helper_functions .ipynb
├── customer_segmentation.csv
├── order_segmentation_0.0.csv
├── ulabox_customer_segmentation_0.0.ipynb
└── ulabox_order_segmentation_0.0.ipynb
/ModelBuilding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Here we will build a gradient boosting trees model to classify the customers into corresponding classes.
\n",
8 | "
\n",
9 | "Recall that we used averages over all the orders of a customer to calculate Food%, Fresh% etc. of a customer. A customer may change from one class to another, which is natural. Birth of a new baby may make the customer new_parents, after years once the baby grows up they become normal again. A customer may turn more health concious when he gets old.
\n",
10 | "
\n",
11 | "To address this problem we can update the data of the customer with the current average after every order and put the data again to be predicted by the model. A better idea will be to use the concept of exponential moving average which is used in technical analysis of stock markets. We can have a certain number of orders as the look back period for the exponential moving average. What a moving average does is, it gives exponentially more importance to the recent data and less importance to the earlier data. Their by catching current trend in the customer's orders.
"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "Here we will just create a basic model with the data we have."
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import pandas as pd\n",
28 | "import numpy as np\n",
29 | "import matplotlib.pyplot as plt\n",
30 | "import matplotlib as mpl\n",
31 | "import seaborn as sns\n",
32 | "import scipy as sc\n",
33 | "import xgboost as xgb\n",
34 | "import itertools"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 14,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "from xgboost.sklearn import XGBClassifier"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 18,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "import warnings\n",
53 | "warnings.filterwarnings('ignore')"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 2,
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "data = pd.read_csv('customer_segmentation.csv')"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 3,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/html": [
73 | "
\n",
74 | "\n",
87 | "
\n",
88 | " \n",
89 | " \n",
90 | " | \n",
91 | " customer | \n",
92 | " order | \n",
93 | " total_items | \n",
94 | " discount% | \n",
95 | " weekday | \n",
96 | " hour | \n",
97 | " Food% | \n",
98 | " Fresh% | \n",
99 | " Drinks% | \n",
100 | " Home% | \n",
101 | " Beauty% | \n",
102 | " Health% | \n",
103 | " Baby% | \n",
104 | " Pets% | \n",
105 | " num_orders | \n",
106 | " labels | \n",
107 | " class | \n",
108 | "
\n",
109 | " \n",
110 | " \n",
111 | " \n",
112 | " 0 | \n",
113 | " 0 | \n",
114 | " 0 | \n",
115 | " 44.666667 | \n",
116 | " 14.110000 | \n",
117 | " 4 | \n",
118 | " 13 | \n",
119 | " 14.070000 | \n",
120 | " 73.203333 | \n",
121 | " 4.356667 | \n",
122 | " 6.200000 | \n",
123 | " 2.176667 | \n",
124 | " 0.000 | \n",
125 | " 0.000000 | \n",
126 | " 0.0 | \n",
127 | " 3.0 | \n",
128 | " 1 | \n",
129 | " fresh_regulars | \n",
130 | "
\n",
131 | " \n",
132 | " 1 | \n",
133 | " 1 | \n",
134 | " 3 | \n",
135 | " 31.150000 | \n",
136 | " 17.849000 | \n",
137 | " 1 | \n",
138 | " 12 | \n",
139 | " 17.762000 | \n",
140 | " 52.909000 | \n",
141 | " 17.761000 | \n",
142 | " 3.207500 | \n",
143 | " 2.314500 | \n",
144 | " 4.352 | \n",
145 | " 1.695000 | \n",
146 | " 0.0 | \n",
147 | " 20.0 | \n",
148 | " 4 | \n",
149 | " loyals | \n",
150 | "
\n",
151 | " \n",
152 | " 2 | \n",
153 | " 2 | \n",
154 | " 23 | \n",
155 | " 26.000000 | \n",
156 | " 2.970000 | \n",
157 | " 6 | \n",
158 | " 23 | \n",
159 | " 24.100000 | \n",
160 | " 22.290000 | \n",
161 | " 38.690000 | \n",
162 | " 14.920000 | \n",
163 | " 0.000000 | \n",
164 | " 0.000 | \n",
165 | " 0.000000 | \n",
166 | " 0.0 | \n",
167 | " 1.0 | \n",
168 | " 8 | \n",
169 | " grocery_regulars | \n",
170 | "
\n",
171 | " \n",
172 | " 3 | \n",
173 | " 3 | \n",
174 | " 24 | \n",
175 | " 27.782609 | \n",
176 | " 4.102174 | \n",
177 | " 1 | \n",
178 | " 10 | \n",
179 | " 23.825652 | \n",
180 | " 51.280870 | \n",
181 | " 8.220870 | \n",
182 | " 14.773478 | \n",
183 | " 0.000000 | \n",
184 | " 0.000 | \n",
185 | " 1.898696 | \n",
186 | " 0.0 | \n",
187 | " 23.0 | \n",
188 | " 4 | \n",
189 | " loyals | \n",
190 | "
\n",
191 | " \n",
192 | " 4 | \n",
193 | " 4 | \n",
194 | " 47 | \n",
195 | " 17.103448 | \n",
196 | " 4.373103 | \n",
197 | " 3 | \n",
198 | " 9 | \n",
199 | " 24.841379 | \n",
200 | " 51.082414 | \n",
201 | " 10.291034 | \n",
202 | " 13.035172 | \n",
203 | " 0.683793 | \n",
204 | " 0.000 | \n",
205 | " 0.065517 | \n",
206 | " 0.0 | \n",
207 | " 29.0 | \n",
208 | " 4 | \n",
209 | " loyals | \n",
210 | "
\n",
211 | " \n",
212 | "
\n",
213 | "
"
214 | ],
215 | "text/plain": [
216 | " customer order total_items discount% weekday hour Food% \\\n",
217 | "0 0 0 44.666667 14.110000 4 13 14.070000 \n",
218 | "1 1 3 31.150000 17.849000 1 12 17.762000 \n",
219 | "2 2 23 26.000000 2.970000 6 23 24.100000 \n",
220 | "3 3 24 27.782609 4.102174 1 10 23.825652 \n",
221 | "4 4 47 17.103448 4.373103 3 9 24.841379 \n",
222 | "\n",
223 | " Fresh% Drinks% Home% Beauty% Health% Baby% Pets% \\\n",
224 | "0 73.203333 4.356667 6.200000 2.176667 0.000 0.000000 0.0 \n",
225 | "1 52.909000 17.761000 3.207500 2.314500 4.352 1.695000 0.0 \n",
226 | "2 22.290000 38.690000 14.920000 0.000000 0.000 0.000000 0.0 \n",
227 | "3 51.280870 8.220870 14.773478 0.000000 0.000 1.898696 0.0 \n",
228 | "4 51.082414 10.291034 13.035172 0.683793 0.000 0.065517 0.0 \n",
229 | "\n",
230 | " num_orders labels class \n",
231 | "0 3.0 1 fresh_regulars \n",
232 | "1 20.0 4 loyals \n",
233 | "2 1.0 8 grocery_regulars \n",
234 | "3 23.0 4 loyals \n",
235 | "4 29.0 4 loyals "
236 | ]
237 | },
238 | "execution_count": 3,
239 | "metadata": {},
240 | "output_type": "execute_result"
241 | }
242 | ],
243 | "source": [
244 | "data.head()"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 4,
250 | "metadata": {},
251 | "outputs": [
252 | {
253 | "data": {
254 | "text/html": [
255 | "\n",
256 | "\n",
269 | "
\n",
270 | " \n",
271 | " \n",
272 | " | \n",
273 | " customer | \n",
274 | " order | \n",
275 | " total_items | \n",
276 | " discount% | \n",
277 | " weekday | \n",
278 | " hour | \n",
279 | " Food% | \n",
280 | " Fresh% | \n",
281 | " Drinks% | \n",
282 | " Home% | \n",
283 | " Beauty% | \n",
284 | " Health% | \n",
285 | " Baby% | \n",
286 | " Pets% | \n",
287 | " num_orders | \n",
288 | " labels | \n",
289 | "
\n",
290 | " \n",
291 | " \n",
292 | " \n",
293 | " count | \n",
294 | " 9354.000000 | \n",
295 | " 9354.000000 | \n",
296 | " 9354.000000 | \n",
297 | " 9354.000000 | \n",
298 | " 9354.000000 | \n",
299 | " 9354.000000 | \n",
300 | " 9354.000000 | \n",
301 | " 9354.000000 | \n",
302 | " 9354.000000 | \n",
303 | " 9354.000000 | \n",
304 | " 9354.000000 | \n",
305 | " 9354.000000 | \n",
306 | " 9354.000000 | \n",
307 | " 9354.000000 | \n",
308 | " 9354.000000 | \n",
309 | " 9354.000000 | \n",
310 | "
\n",
311 | " \n",
312 | " mean | \n",
313 | " 5022.473808 | \n",
314 | " 15025.143789 | \n",
315 | " 32.022679 | \n",
316 | " 11.857907 | \n",
317 | " 3.657473 | \n",
318 | " 15.258071 | \n",
319 | " 25.886560 | \n",
320 | " 15.173542 | \n",
321 | " 23.717923 | \n",
322 | " 15.517726 | \n",
323 | " 6.083896 | \n",
324 | " 1.280169 | \n",
325 | " 11.036040 | \n",
326 | " 1.144381 | \n",
327 | " 3.085311 | \n",
328 | " 4.904319 | \n",
329 | "
\n",
330 | " \n",
331 | " std | \n",
332 | " 2945.899928 | \n",
333 | " 8825.170543 | \n",
334 | " 18.724271 | \n",
335 | " 19.372177 | \n",
336 | " 2.181161 | \n",
337 | " 5.709821 | \n",
338 | " 24.018227 | \n",
339 | " 19.856395 | \n",
340 | " 21.745537 | \n",
341 | " 18.024529 | \n",
342 | " 11.766312 | \n",
343 | " 5.089555 | \n",
344 | " 23.515242 | \n",
345 | " 6.224596 | \n",
346 | " 3.247710 | \n",
347 | " 2.906824 | \n",
348 | "
\n",
349 | " \n",
350 | " min | \n",
351 | " 0.000000 | \n",
352 | " 0.000000 | \n",
353 | " 4.250000 | \n",
354 | " -31.820000 | \n",
355 | " 1.000000 | \n",
356 | " 0.000000 | \n",
357 | " 0.000000 | \n",
358 | " 0.000000 | \n",
359 | " 0.000000 | \n",
360 | " 0.000000 | \n",
361 | " 0.000000 | \n",
362 | " 0.000000 | \n",
363 | " 0.000000 | \n",
364 | " 0.000000 | \n",
365 | " 1.000000 | \n",
366 | " 0.000000 | \n",
367 | "
\n",
368 | " \n",
369 | " 25% | \n",
370 | " 2457.250000 | \n",
371 | " 7307.750000 | \n",
372 | " 19.446970 | \n",
373 | " 2.560000 | \n",
374 | " 2.000000 | \n",
375 | " 11.000000 | \n",
376 | " 9.608750 | \n",
377 | " 0.000000 | \n",
378 | " 7.880250 | \n",
379 | " 2.125833 | \n",
380 | " 0.000000 | \n",
381 | " 0.000000 | \n",
382 | " 0.000000 | \n",
383 | " 0.000000 | \n",
384 | " 1.000000 | \n",
385 | " 2.000000 | \n",
386 | "
\n",
387 | " \n",
388 | " 50% | \n",
389 | " 4951.500000 | \n",
390 | " 14777.500000 | \n",
391 | " 28.763889 | \n",
392 | " 5.750000 | \n",
393 | " 3.000000 | \n",
394 | " 16.000000 | \n",
395 | " 20.900000 | \n",
396 | " 4.867000 | \n",
397 | " 18.971500 | \n",
398 | " 10.501250 | \n",
399 | " 2.280417 | \n",
400 | " 0.000000 | \n",
401 | " 0.000000 | \n",
402 | " 0.000000 | \n",
403 | " 2.000000 | \n",
404 | " 5.000000 | \n",
405 | "
\n",
406 | " \n",
407 | " 75% | \n",
408 | " 7573.750000 | \n",
409 | " 22804.750000 | \n",
410 | " 40.000000 | \n",
411 | " 12.382500 | \n",
412 | " 6.000000 | \n",
413 | " 20.000000 | \n",
414 | " 33.802500 | \n",
415 | " 26.487083 | \n",
416 | " 33.567292 | \n",
417 | " 21.720000 | \n",
418 | " 7.350000 | \n",
419 | " 0.000000 | \n",
420 | " 7.097250 | \n",
421 | " 0.000000 | \n",
422 | " 4.000000 | \n",
423 | " 8.000000 | \n",
424 | "
\n",
425 | " \n",
426 | " max | \n",
427 | " 10237.000000 | \n",
428 | " 29997.000000 | \n",
429 | " 147.500000 | \n",
430 | " 100.000000 | \n",
431 | " 7.000000 | \n",
432 | " 23.000000 | \n",
433 | " 100.000000 | \n",
434 | " 100.000000 | \n",
435 | " 100.000000 | \n",
436 | " 100.000000 | \n",
437 | " 100.000000 | \n",
438 | " 100.000000 | \n",
439 | " 100.000000 | \n",
440 | " 100.000000 | \n",
441 | " 52.000000 | \n",
442 | " 9.000000 | \n",
443 | "
\n",
444 | " \n",
445 | "
\n",
446 | "
"
447 | ],
448 | "text/plain": [
449 | " customer order total_items discount% weekday \\\n",
450 | "count 9354.000000 9354.000000 9354.000000 9354.000000 9354.000000 \n",
451 | "mean 5022.473808 15025.143789 32.022679 11.857907 3.657473 \n",
452 | "std 2945.899928 8825.170543 18.724271 19.372177 2.181161 \n",
453 | "min 0.000000 0.000000 4.250000 -31.820000 1.000000 \n",
454 | "25% 2457.250000 7307.750000 19.446970 2.560000 2.000000 \n",
455 | "50% 4951.500000 14777.500000 28.763889 5.750000 3.000000 \n",
456 | "75% 7573.750000 22804.750000 40.000000 12.382500 6.000000 \n",
457 | "max 10237.000000 29997.000000 147.500000 100.000000 7.000000 \n",
458 | "\n",
459 | " hour Food% Fresh% Drinks% Home% \\\n",
460 | "count 9354.000000 9354.000000 9354.000000 9354.000000 9354.000000 \n",
461 | "mean 15.258071 25.886560 15.173542 23.717923 15.517726 \n",
462 | "std 5.709821 24.018227 19.856395 21.745537 18.024529 \n",
463 | "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n",
464 | "25% 11.000000 9.608750 0.000000 7.880250 2.125833 \n",
465 | "50% 16.000000 20.900000 4.867000 18.971500 10.501250 \n",
466 | "75% 20.000000 33.802500 26.487083 33.567292 21.720000 \n",
467 | "max 23.000000 100.000000 100.000000 100.000000 100.000000 \n",
468 | "\n",
469 | " Beauty% Health% Baby% Pets% num_orders \\\n",
470 | "count 9354.000000 9354.000000 9354.000000 9354.000000 9354.000000 \n",
471 | "mean 6.083896 1.280169 11.036040 1.144381 3.085311 \n",
472 | "std 11.766312 5.089555 23.515242 6.224596 3.247710 \n",
473 | "min 0.000000 0.000000 0.000000 0.000000 1.000000 \n",
474 | "25% 0.000000 0.000000 0.000000 0.000000 1.000000 \n",
475 | "50% 2.280417 0.000000 0.000000 0.000000 2.000000 \n",
476 | "75% 7.350000 0.000000 7.097250 0.000000 4.000000 \n",
477 | "max 100.000000 100.000000 100.000000 100.000000 52.000000 \n",
478 | "\n",
479 | " labels \n",
480 | "count 9354.000000 \n",
481 | "mean 4.904319 \n",
482 | "std 2.906824 \n",
483 | "min 0.000000 \n",
484 | "25% 2.000000 \n",
485 | "50% 5.000000 \n",
486 | "75% 8.000000 \n",
487 | "max 9.000000 "
488 | ]
489 | },
490 | "execution_count": 4,
491 | "metadata": {},
492 | "output_type": "execute_result"
493 | }
494 | ],
495 | "source": [
496 | "data.describe()"
497 | ]
498 | },
499 | {
500 | "cell_type": "code",
501 | "execution_count": 11,
502 | "metadata": {},
503 | "outputs": [],
504 | "source": [
505 | "y = data['labels'].values\n",
506 | "X = data.drop(['weekday', 'hour','labels', 'class'], axis=1).values"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": 7,
512 | "metadata": {},
513 | "outputs": [],
514 | "source": [
515 | "from sklearn.model_selection import StratifiedKFold"
516 | ]
517 | },
518 | {
519 | "cell_type": "code",
520 | "execution_count": 8,
521 | "metadata": {},
522 | "outputs": [],
523 | "source": [
524 | "skf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=42)"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 9,
530 | "metadata": {},
531 | "outputs": [],
532 | "source": [
533 | "xgbc = XGBClassifier()"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 56,
539 | "metadata": {},
540 | "outputs": [
541 | {
542 | "name": "stdout",
543 | "output_type": "stream",
544 | "text": [
545 | "0.9797441364605544\n",
546 | "0.9770544290288153\n",
547 | "0.9705724986623863\n",
548 | "0.9689507494646681\n",
549 | "0.9716122121049813\n"
550 | ]
551 | }
552 | ],
553 | "source": [
554 | "cv_scores = []\n",
555 | "for train_index, test_index in skf.split(X,y):\n",
556 | " X_train, X_test = X[train_index], X[test_index]\n",
557 | " y_train, y_test = y[train_index], y[test_index]\n",
558 | " xgbc.fit(X_train, y_train)\n",
559 | " score = xgbc.score(X_test, y_test)\n",
560 | " print(score)\n",
561 | " cv_scores.append(score)"
562 | ]
563 | },
564 | {
565 | "cell_type": "markdown",
566 | "metadata": {},
567 | "source": [
568 | "That's not a bad score, let's try to improve it by tuning parameters."
569 | ]
570 | },
571 | {
572 | "cell_type": "code",
573 | "execution_count": 13,
574 | "metadata": {},
575 | "outputs": [],
576 | "source": [
577 | "from sklearn.model_selection import GridSearchCV"
578 | ]
579 | },
580 | {
581 | "cell_type": "code",
582 | "execution_count": 44,
583 | "metadata": {},
584 | "outputs": [],
585 | "source": [
586 | "params={\n",
587 | " 'max_depth':[6,7],\n",
588 | " 'learning_rate':[0.05],\n",
589 | " 'n_estimators':[500],\n",
590 | " 'objective':['multi:softprob'],\n",
591 | " 'gamma':[0],\n",
592 | " 'max_delta_step':[1],\n",
593 | " 'subsample':[0.9,0.8],\n",
594 | " 'colsample_bytree':[1.0],\n",
595 | " 'colsample_bylevel':[1.0],\n",
596 | " 'min_child_weight':[1.0]\n",
597 | "}"
598 | ]
599 | },
600 | {
601 | "cell_type": "code",
602 | "execution_count": 45,
603 | "metadata": {},
604 | "outputs": [],
605 | "source": [
606 | "grid_search_xgb = GridSearchCV(estimator=XGBClassifier(), param_grid=params, n_jobs=-1)"
607 | ]
608 | },
609 | {
610 | "cell_type": "code",
611 | "execution_count": 46,
612 | "metadata": {
613 | "scrolled": true
614 | },
615 | "outputs": [
616 | {
617 | "data": {
618 | "text/plain": [
619 | "GridSearchCV(cv=None, error_score='raise',\n",
620 | " estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
621 | " colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,\n",
622 | " max_depth=3, min_child_weight=1, missing=None, n_estimators=100,\n",
623 | " n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,\n",
624 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
625 | " silent=True, subsample=1),\n",
626 | " fit_params=None, iid=True, n_jobs=-1,\n",
627 | " param_grid={'colsample_bylevel': [1.0], 'colsample_bytree': [1.0], 'gamma': [0], 'subsample': [0.9, 0.8], 'min_child_weight': [1.0], 'max_delta_step': [1], 'objective': ['multi:softprob'], 'n_estimators': [500], 'learning_rate': [0.05], 'max_depth': [6, 7]},\n",
628 | " pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',\n",
629 | " scoring=None, verbose=0)"
630 | ]
631 | },
632 | "execution_count": 46,
633 | "metadata": {},
634 | "output_type": "execute_result"
635 | }
636 | ],
637 | "source": [
638 | "grid_search_xgb.fit(X,y)"
639 | ]
640 | },
641 | {
642 | "cell_type": "code",
643 | "execution_count": 47,
644 | "metadata": {},
645 | "outputs": [
646 | {
647 | "name": "stdout",
648 | "output_type": "stream",
649 | "text": [
650 | "{'colsample_bylevel': 1.0, 'colsample_bytree': 1.0, 'gamma': 0, 'subsample': 0.9, 'min_child_weight': 1.0, 'max_delta_step': 1, 'objective': 'multi:softprob', 'n_estimators': 500, 'learning_rate': 0.05, 'max_depth': 6}\n"
651 | ]
652 | }
653 | ],
654 | "source": [
655 | "print(grid_search_xgb.best_params_)"
656 | ]
657 | },
658 | {
659 | "cell_type": "code",
660 | "execution_count": 48,
661 | "metadata": {},
662 | "outputs": [],
663 | "source": [
664 | "#results = pd.DataFrame(grid_search_xgb.cv_results_)\n",
665 | "results = pd.concat([results, pd.DataFrame(grid_search_xgb.cv_results_)], axis=0)"
666 | ]
667 | },
668 | {
669 | "cell_type": "code",
670 | "execution_count": 49,
671 | "metadata": {},
672 | "outputs": [
673 | {
674 | "data": {
675 | "text/html": [
676 | "\n",
677 | "\n",
690 | "
\n",
691 | " \n",
692 | " \n",
693 | " | \n",
694 | " 21 | \n",
695 | " 0 | \n",
696 | " 1 | \n",
697 | "
\n",
698 | " \n",
699 | " \n",
700 | " \n",
701 | " mean_fit_time | \n",
702 | " 30.7451 | \n",
703 | " 28.9921 | \n",
704 | " 28.5919 | \n",
705 | "
\n",
706 | " \n",
707 | " mean_score_time | \n",
708 | " 1.22534 | \n",
709 | " 2.06268 | \n",
710 | " 1.80435 | \n",
711 | "
\n",
712 | " \n",
713 | " mean_test_score | \n",
714 | " 0.972525 | \n",
715 | " 0.972525 | \n",
716 | " 0.972525 | \n",
717 | "
\n",
718 | " \n",
719 | " mean_train_score | \n",
720 | " 1 | \n",
721 | " 1 | \n",
722 | " 1 | \n",
723 | "
\n",
724 | " \n",
725 | " param_colsample_bylevel | \n",
726 | " 1 | \n",
727 | " 1 | \n",
728 | " 1 | \n",
729 | "
\n",
730 | " \n",
731 | " param_colsample_bytree | \n",
732 | " 1 | \n",
733 | " 1 | \n",
734 | " 1 | \n",
735 | "
\n",
736 | " \n",
737 | " param_gamma | \n",
738 | " 0 | \n",
739 | " 0 | \n",
740 | " 0 | \n",
741 | "
\n",
742 | " \n",
743 | " param_learning_rate | \n",
744 | " 0.05 | \n",
745 | " 0.05 | \n",
746 | " 0.05 | \n",
747 | "
\n",
748 | " \n",
749 | " param_max_delta_step | \n",
750 | " 1 | \n",
751 | " 1 | \n",
752 | " 1 | \n",
753 | "
\n",
754 | " \n",
755 | " param_max_depth | \n",
756 | " 6 | \n",
757 | " 6 | \n",
758 | " 6 | \n",
759 | "
\n",
760 | " \n",
761 | " param_min_child_weight | \n",
762 | " 1 | \n",
763 | " 1 | \n",
764 | " 1 | \n",
765 | "
\n",
766 | " \n",
767 | " param_n_estimators | \n",
768 | " 500 | \n",
769 | " 500 | \n",
770 | " 500 | \n",
771 | "
\n",
772 | " \n",
773 | " param_objective | \n",
774 | " multi:softprob | \n",
775 | " multi:softprob | \n",
776 | " multi:softprob | \n",
777 | "
\n",
778 | " \n",
779 | " param_subsample | \n",
780 | " 0.9 | \n",
781 | " 0.9 | \n",
782 | " 0.8 | \n",
783 | "
\n",
784 | " \n",
785 | " params | \n",
786 | " {'colsample_bylevel': 1.0, 'colsample_bytree':... | \n",
787 | " {'colsample_bylevel': 1.0, 'colsample_bytree':... | \n",
788 | " {'colsample_bylevel': 1.0, 'colsample_bytree':... | \n",
789 | "
\n",
790 | " \n",
791 | " rank_test_score | \n",
792 | " 1 | \n",
793 | " 1 | \n",
794 | " 1 | \n",
795 | "
\n",
796 | " \n",
797 | " split0_test_score | \n",
798 | " 0.966037 | \n",
799 | " 0.966037 | \n",
800 | " 0.966037 | \n",
801 | "
\n",
802 | " \n",
803 | " split0_train_score | \n",
804 | " 1 | \n",
805 | " 1 | \n",
806 | " 1 | \n",
807 | "
\n",
808 | " \n",
809 | " split1_test_score | \n",
810 | " 0.978198 | \n",
811 | " 0.978198 | \n",
812 | " 0.978839 | \n",
813 | "
\n",
814 | " \n",
815 | " split1_train_score | \n",
816 | " 1 | \n",
817 | " 1 | \n",
818 | " 1 | \n",
819 | "
\n",
820 | " \n",
821 | " split2_test_score | \n",
822 | " 0.973346 | \n",
823 | " 0.973346 | \n",
824 | " 0.972704 | \n",
825 | "
\n",
826 | " \n",
827 | " split2_train_score | \n",
828 | " 1 | \n",
829 | " 1 | \n",
830 | " 1 | \n",
831 | "
\n",
832 | " \n",
833 | " std_fit_time | \n",
834 | " 0.52249 | \n",
835 | " 0.455066 | \n",
836 | " 0.425876 | \n",
837 | "
\n",
838 | " \n",
839 | " std_score_time | \n",
840 | " 0.302093 | \n",
841 | " 0.588788 | \n",
842 | " 0.687544 | \n",
843 | "
\n",
844 | " \n",
845 | " std_test_score | \n",
846 | " 0.0050003 | \n",
847 | " 0.0050003 | \n",
848 | " 0.00522994 | \n",
849 | "
\n",
850 | " \n",
851 | " std_train_score | \n",
852 | " 0 | \n",
853 | " 0 | \n",
854 | " 0 | \n",
855 | "
\n",
856 | " \n",
857 | "
\n",
858 | "
"
859 | ],
860 | "text/plain": [
861 | " 21 \\\n",
862 | "mean_fit_time 30.7451 \n",
863 | "mean_score_time 1.22534 \n",
864 | "mean_test_score 0.972525 \n",
865 | "mean_train_score 1 \n",
866 | "param_colsample_bylevel 1 \n",
867 | "param_colsample_bytree 1 \n",
868 | "param_gamma 0 \n",
869 | "param_learning_rate 0.05 \n",
870 | "param_max_delta_step 1 \n",
871 | "param_max_depth 6 \n",
872 | "param_min_child_weight 1 \n",
873 | "param_n_estimators 500 \n",
874 | "param_objective multi:softprob \n",
875 | "param_subsample 0.9 \n",
876 | "params {'colsample_bylevel': 1.0, 'colsample_bytree':... \n",
877 | "rank_test_score 1 \n",
878 | "split0_test_score 0.966037 \n",
879 | "split0_train_score 1 \n",
880 | "split1_test_score 0.978198 \n",
881 | "split1_train_score 1 \n",
882 | "split2_test_score 0.973346 \n",
883 | "split2_train_score 1 \n",
884 | "std_fit_time 0.52249 \n",
885 | "std_score_time 0.302093 \n",
886 | "std_test_score 0.0050003 \n",
887 | "std_train_score 0 \n",
888 | "\n",
889 | " 0 \\\n",
890 | "mean_fit_time 28.9921 \n",
891 | "mean_score_time 2.06268 \n",
892 | "mean_test_score 0.972525 \n",
893 | "mean_train_score 1 \n",
894 | "param_colsample_bylevel 1 \n",
895 | "param_colsample_bytree 1 \n",
896 | "param_gamma 0 \n",
897 | "param_learning_rate 0.05 \n",
898 | "param_max_delta_step 1 \n",
899 | "param_max_depth 6 \n",
900 | "param_min_child_weight 1 \n",
901 | "param_n_estimators 500 \n",
902 | "param_objective multi:softprob \n",
903 | "param_subsample 0.9 \n",
904 | "params {'colsample_bylevel': 1.0, 'colsample_bytree':... \n",
905 | "rank_test_score 1 \n",
906 | "split0_test_score 0.966037 \n",
907 | "split0_train_score 1 \n",
908 | "split1_test_score 0.978198 \n",
909 | "split1_train_score 1 \n",
910 | "split2_test_score 0.973346 \n",
911 | "split2_train_score 1 \n",
912 | "std_fit_time 0.455066 \n",
913 | "std_score_time 0.588788 \n",
914 | "std_test_score 0.0050003 \n",
915 | "std_train_score 0 \n",
916 | "\n",
917 | " 1 \n",
918 | "mean_fit_time 28.5919 \n",
919 | "mean_score_time 1.80435 \n",
920 | "mean_test_score 0.972525 \n",
921 | "mean_train_score 1 \n",
922 | "param_colsample_bylevel 1 \n",
923 | "param_colsample_bytree 1 \n",
924 | "param_gamma 0 \n",
925 | "param_learning_rate 0.05 \n",
926 | "param_max_delta_step 1 \n",
927 | "param_max_depth 6 \n",
928 | "param_min_child_weight 1 \n",
929 | "param_n_estimators 500 \n",
930 | "param_objective multi:softprob \n",
931 | "param_subsample 0.8 \n",
932 | "params {'colsample_bylevel': 1.0, 'colsample_bytree':... \n",
933 | "rank_test_score 1 \n",
934 | "split0_test_score 0.966037 \n",
935 | "split0_train_score 1 \n",
936 | "split1_test_score 0.978839 \n",
937 | "split1_train_score 1 \n",
938 | "split2_test_score 0.972704 \n",
939 | "split2_train_score 1 \n",
940 | "std_fit_time 0.425876 \n",
941 | "std_score_time 0.687544 \n",
942 | "std_test_score 0.00522994 \n",
943 | "std_train_score 0 "
944 | ]
945 | },
946 | "execution_count": 49,
947 | "metadata": {},
948 | "output_type": "execute_result"
949 | }
950 | ],
951 | "source": [
952 | "results[results['mean_test_score']==results['mean_test_score'].max()].T"
953 | ]
954 | },
955 | {
956 | "cell_type": "code",
957 | "execution_count": 50,
958 | "metadata": {},
959 | "outputs": [],
960 | "source": [
961 | "selected_xgbc = XGBClassifier(learning_rate=0.05, max_depth=6, n_estimators=500, subsample=0.9)"
962 | ]
963 | },
964 | {
965 | "cell_type": "code",
966 | "execution_count": 51,
967 | "metadata": {},
968 | "outputs": [],
969 | "source": [
970 | "from sklearn.model_selection import train_test_split"
971 | ]
972 | },
973 | {
974 | "cell_type": "code",
975 | "execution_count": 52,
976 | "metadata": {},
977 | "outputs": [],
978 | "source": [
979 | "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.1)"
980 | ]
981 | },
982 | {
983 | "cell_type": "code",
984 | "execution_count": 53,
985 | "metadata": {},
986 | "outputs": [
987 | {
988 | "data": {
989 | "text/plain": [
990 | "XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n",
991 | " colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,\n",
992 | " max_depth=6, min_child_weight=1, missing=None, n_estimators=500,\n",
993 | " n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,\n",
994 | " reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,\n",
995 | " silent=True, subsample=0.9)"
996 | ]
997 | },
998 | "execution_count": 53,
999 | "metadata": {},
1000 | "output_type": "execute_result"
1001 | }
1002 | ],
1003 | "source": [
1004 | "selected_xgbc.fit(X_train, y_train)"
1005 | ]
1006 | },
1007 | {
1008 | "cell_type": "code",
1009 | "execution_count": 54,
1010 | "metadata": {},
1011 | "outputs": [
1012 | {
1013 | "data": {
1014 | "text/plain": [
1015 | "1.0"
1016 | ]
1017 | },
1018 | "execution_count": 54,
1019 | "metadata": {},
1020 | "output_type": "execute_result"
1021 | }
1022 | ],
1023 | "source": [
1024 | "selected_xgbc.score(X_train, y_train)"
1025 | ]
1026 | },
1027 | {
1028 | "cell_type": "code",
1029 | "execution_count": 55,
1030 | "metadata": {},
1031 | "outputs": [
1032 | {
1033 | "data": {
1034 | "text/plain": [
1035 | "0.9850427350427351"
1036 | ]
1037 | },
1038 | "execution_count": 55,
1039 | "metadata": {},
1040 | "output_type": "execute_result"
1041 | }
1042 | ],
1043 | "source": [
1044 | "selected_xgbc.score(X_test, y_test)"
1045 | ]
1046 | },
1047 | {
1048 | "cell_type": "markdown",
1049 | "metadata": {},
1050 | "source": [
1051 | "Well, That's an improvement! This model can be further used to predict the classes of customers."
1052 | ]
1053 | }
1054 | ],
1055 | "metadata": {
1056 | "kernelspec": {
1057 | "display_name": "Python 3",
1058 | "language": "python",
1059 | "name": "python3"
1060 | },
1061 | "language_info": {
1062 | "codemirror_mode": {
1063 | "name": "ipython",
1064 | "version": 3
1065 | },
1066 | "file_extension": ".py",
1067 | "mimetype": "text/x-python",
1068 | "name": "python",
1069 | "nbconvert_exporter": "python",
1070 | "pygments_lexer": "ipython3",
1071 | "version": "3.5.2"
1072 | }
1073 | },
1074 | "nbformat": 4,
1075 | "nbformat_minor": 2
1076 | }
1077 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # customer-segmentation-python
2 | This project applies customer segmentation to a company's customer data and derives conclusions and data-driven ideas from it.
3 | ### Dataset
4 | This dataset is the customer data of Ulabox, an online supermarket company. The dataset is available at https://github.com/ulabox/datasets
5 | ### Customer segmentation
6 | In customer segmentation we group similar customers into the same cluster and analyse them. It can:
7 | 1) reveal who the company's most valuable customers are
8 | 2) show what kinds of customers the company has
9 | 3) support targeted marketing and other marketing strategies
10 | 4) sometimes even reveal a potential white space in the marketplace that no company has yet occupied
11 | Well, we can get creative here.
12 | ### Clustering
13 | Clustering is the process of putting similar data points into the same cluster. There are many algorithms for this, for example agglomerative hierarchical clustering, k-means clustering, and Gaussian mixture models.
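For example, a minimal k-means sketch with scikit-learn might look like this (synthetic data standing in for the customer features, not the actual Ulabox dataset):
```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(42)
X = rng.rand(200, 4)                           # stand-in for features such as Food%, Fresh%, Drinks%, Home%
X_scaled = StandardScaler().fit_transform(X)   # scale features so no single one dominates the distances

kmeans = KMeans(n_clusters=3, random_state=42)
labels = kmeans.fit_predict(X_scaled)          # cluster label assigned to each customer
```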
14 | ## Map to the project
15 | 1) The ulabox_order_segmentation_0.0.ipynb notebook contains detailed notes and explanations of segmenting the orders in the data. I have also added my own ideas in it. It's a clean walkthrough, and I suggest starting there.
16 | 2) The ulabox_customer_segmentation_0.0.ipynb notebook segments the customers in the data. It is very similar to the order segmentation notebook; although it doesn't have a lot of explanation, you should be able to follow it after going through the former notebook. Towards the end it gets really interesting.
17 | 3) clustering_helper_functions .ipynb is a collection of functions that help in visualizing and finding meaningful clusters within the data. These functions provide various ways to analyse the data for clusters.
18 | 4) ModelBuilding.ipynb is where we build a model to predict the class of each customer, which can then be used to classify customers in the future. I have added some ideas there.
19 | 5) The two CSV files contain the clustering results.
20 | Thank you for your time :)
21 |
--------------------------------------------------------------------------------
/clustering_helper_functions .ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "def pca_results(pca, df):\n",
10 | " dimensions = ['dimention {}'.format(i) for i in range(1,pca.n_components_+1)]\n",
11 | " fig, ax = plt.subplots(figsize=(18,12))\n",
12 | " components = pd.DataFrame(pca.components_)\n",
13 | " components.plot(ax=ax, kind='bar');\n",
14 | " labels = [str(s) for s in df.columns]\n",
15 | " ax.legend(labels)\n",
16 | " ax.set_ylabel('Feature Weights')\n",
17 | " ax.set_xticklabels(dimensions, rotation=90)\n",
18 | " for i, ev in enumerate(np.round(pca.explained_variance_ratio_, 3)):\n",
19 | " ax.text(i-0.04, ax.get_ylim()[1]+0.05, ev)\n",
20 | " plt.show()"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "def pca_2d_plot(pca, df):\n",
30 | " fig, ax = plt.subplots(figsize=(10,10))\n",
31 | " transformed_data = pca.transform(df.values)\n",
32 | " ax.scatter(transformed_data[:,0], transformed_data[:,1], s=3)\n",
33 | " plt.show()"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 3,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "def dependant_variable_detector(df):\n",
43 | " from sklearn.preprocessing import StandardScaler\n",
44 | " from sklearn.linear_model import LinearRegression\n",
45 | " scaler = StandardScaler()\n",
46 | " lr = LinearRegression()\n",
47 | " columns = list(df.columns)\n",
48 | " for col in columns:\n",
49 | " y = scaler.fit_transform(df[col].values.reshape(-1,1))\n",
50 | " X = scaler.fit_transform(df.drop(col, axis=1).values)\n",
51 | " lr.fit(X,y)\n",
52 | " print('Using '+col+' as dependent variable R2 score is :'+str(lr.score(X,y)))"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 4,
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "def plot_corr_matrix(df):\n",
62 | " df_corr = df.corr()\n",
63 | " fig, ax = plt.subplots(figsize=(12,12))\n",
64 | " cax = ax.matshow(df_corr.values, interpolation='nearest')\n",
65 | " fig.colorbar(cax)\n",
66 | " plt.xticks(range(len(df.columns)), df.columns)\n",
67 | " plt.yticks(range(len(df.columns)), df.columns)"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 5,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "def turkey_outlier_detector(df, cols=None):\n",
77 | " if cols is None:\n",
78 | " cols = [str(s) for s in df.describe().columns]\n",
79 | " \n",
80 | " q1 = {}\n",
81 | " q3 = {}\n",
82 | " iqd = {}\n",
83 | " r_limit = {}\n",
84 | " l_limit = {}\n",
85 | " outlier_count = {}\n",
86 | " outlier_indices = {}\n",
87 | " for col in cols:\n",
88 | " q1[col] = np.percentile(df[col].values, 25)\n",
89 | " q3[col] = np.percentile(df[col].values, 75)\n",
90 | " iqd[col] = q3[col] - q1[col]\n",
91 | " r_limit[col] = q3[col] + 1.5*iqd[col]\n",
92 | " l_limit[col] = q1[col] - 1.5*iqd[col]\n",
93 | " data_outlier = df[~((df[col]l_limit[col]))]\n",
94 | " outlier_count[col] = data_outlier.shape[0]\n",
95 | " outlier_indices[col] = data_outlier.index\n",
96 | " \n",
97 | " for col in cols:\n",
98 | " print('_'*25)\n",
99 | " print(col+'-'*8+'>'+str(outlier_count[col]))\n",
100 | " \n",
101 | " return outlier_indices"
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 6,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "def hopkins_statistic(df):\n",
111 | " from sklearn.neighbors import NearestNeighbors\n",
112 | " from sklearn.preprocessing import StandardScaler\n",
113 | " n_samples = df.shape[0]\n",
114 | " num_samples = [int(f*n_samples) for f in [0.25,0.5,0.75]]\n",
115 | " states = [123,42,67,248,654]\n",
116 | " for n in num_samples:\n",
117 | " print('-'*12+str(n)+'-'*12)\n",
118 | " hopkins_statistic = []\n",
119 | " for random_state in states:\n",
120 | " data = df.sample(n=n, random_state=random_state)\n",
121 | " nbrs = NearestNeighbors(n_neighbors=2)\n",
122 | " scaler = StandardScaler()\n",
123 | " X = scaler.fit_transform(data.values)\n",
124 | " nbrs.fit(X)\n",
125 | " sample_dist = nbrs.kneighbors(X)[0][:,1]\n",
126 | " sample_dist = np.sum(sample_dist)\n",
127 | " random_data = np.random.rand(X.shape[0], X.shape[1])\n",
128 | " nbrs.fit(random_data)\n",
129 | " random_dist = nbrs.kneighbors(random_data)[0][:,1]\n",
130 | " random_dist = np.sum(random_dist)\n",
131 | " hs = sample_dist/(sample_dist+random_dist)\n",
132 | " hopkins_statistic.append(hs)\n",
133 | " print('*'*25)\n",
134 | " print('hopkins statistic :'+str(hs))\n",
135 | " print('mean hopkins statistic :'+str(np.mean(np.array(hopkins_statistic))))\n",
136 | " print('hopkins statistic standard deviation :'+str(np.std(np.array(hopkins_statistic))))"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 7,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "def kth_nearest_data_point(df, k_max):\n",
146 | " from sklearn.neighbors import NearestNeighbors\n",
147 | " from sklearn.preprocessing import StandardScaler\n",
148 | " ks = range(1,k_max+1)\n",
149 | " scaler = StandardScaler()\n",
150 | " X = scaler.fit_transform(df.values)\n",
151 | " nbrs = NearestNeighbors(n_neighbors=k_max)\n",
152 | " nbrs.fit(X)\n",
153 | " kneighbors_result = nbrs.kneighbors()[0]\n",
154 | " kth_neighbor_dist = list(np.sum(kneighbors_result, axis=0))\n",
155 | " fig, ax = plt.subplots(figsize=(12,12))\n",
156 | " ax.plot(ks, kth_neighbor_dist);\n",
157 | " plt.show()"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 8,
163 | "metadata": {},
164 | "outputs": [],
165 | "source": [
166 | "def plot_silhoutte_score(X, max_clusters=20):\n",
167 | " from sklearn.cluster import KMeans\n",
168 | " from sklearn.metrics import silhouette_score\n",
169 | " num_clusters = range(2,max_clusters+1)\n",
170 | " sil_score = []\n",
171 | " for n in num_clusters:\n",
172 | " kmeans = KMeans(n_clusters=n)\n",
173 | " kmeans.fit(X)\n",
174 | " preds = kmeans.predict(X)\n",
175 | " sil_score.append(silhouette_score(X, preds))\n",
176 | " \n",
177 | " fig, ax = plt.subplots(figsize=(12,12))\n",
178 | " ax.plot(num_clusters, sil_score)\n",
179 | " plt.show()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 9,
185 | "metadata": {},
186 | "outputs": [],
187 | "source": [
188 | "def under_partition_measure(X, k_max):\n",
189 | " from sklearn.cluster import KMeans\n",
190 | " ks = range(1,k_max+1)\n",
191 | " UPM = []\n",
192 | " for k in ks:\n",
193 | " kmeans = KMeans(n_clusters=k)\n",
194 | " kmeans.fit(X)\n",
195 | " UPM.append(kmeans.inertia_)\n",
196 | " fig, ax = plt.subplots(figsize=(14,14))\n",
197 | " ax.plot(ks, UPM);\n",
198 | " plt.show()\n",
199 | " return UPM"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": 10,
205 | "metadata": {},
206 | "outputs": [],
207 | "source": [
208 | "def over_partition_measure(X, k_max):\n",
209 | " from sklearn.cluster import KMeans\n",
210 | " from sklearn.metrics.pairwise import pairwise_distances\n",
211 | " ks = range(1,k_max+1)\n",
212 | " OPM = []\n",
213 | " for k in ks:\n",
214 | " kmeans = KMeans(n_clusters=k)\n",
215 | " kmeans.fit(X)\n",
216 | " centers = kmeans.cluster_centers_\n",
217 | " d_min = np.inf\n",
218 | " for pair in list(itertools.combinations(centers, 2)):\n",
219 | " d = pairwise_distances(pair[0].reshape(1,-1), pair[1].reshape(1,-1), metric='euclidean')\n",
220 | " if d