├── CustomerAddress_Cleaned.csv
├── CustomerDemographic_Cleaned.csv
├── Customer_Trans_RFM_Analysis.csv
├── DQA and Data Cleaning Customer Address.ipynb
├── DQA and Data Cleaning CustomerDemographic.ipynb
├── DQA and Data Cleaning NewCustomerList.ipynb
├── DQA and Data Cleaning Transactions.ipynb
├── NewCustomerList_Cleaned.csv
├── README.md
├── RFM Analysis.ipynb
├── Raw_data.xlsx
├── Transactions_Cleaned.csv
└── data visualization
├── Car Owners by State.PNG
├── Customer Segment Distribution.PNG
├── Female vs Male Bike Purchases.PNG
├── Frequency vs Monetary.PNG
├── New Customer Wealth Segment.PNG
├── New Customers Age Distribution.PNG
├── New Customers Job Industry.PNG
├── Old Customers Age Distribution.PNG
├── Old Customers Job Industry.PNG
├── Old Customers Wealth Segment.PNG
├── Recency vs Monetary.PNG
└── Sales Dashboard.gif
/DQA and Data Cleaning Customer Address.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "import seaborn as sns\n",
13 | "%matplotlib inline\n",
14 | "\n",
15 | "from datetime import datetime, date"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 2,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "# Loading the Customer Address Data from the excel file\n",
25 | "\n",
26 | "cust_address = pd.read_excel('Raw_data.xlsx' , sheet_name='CustomerAddress')"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 3,
32 | "metadata": {},
33 | "outputs": [
34 | {
35 | "data": {
36 | "text/html": [
37 | "
\n",
38 | "\n",
51 | "
\n",
52 | " \n",
53 | " \n",
54 | " \n",
55 | " customer_id \n",
56 | " address \n",
57 | " postcode \n",
58 | " state \n",
59 | " country \n",
60 | " property_valuation \n",
61 | " \n",
62 | " \n",
63 | " \n",
64 | " \n",
65 | " 0 \n",
66 | " 1 \n",
67 | " 060 Morning Avenue \n",
68 | " 2016 \n",
69 | " New South Wales \n",
70 | " Australia \n",
71 | " 10 \n",
72 | " \n",
73 | " \n",
74 | " 1 \n",
75 | " 2 \n",
76 | " 6 Meadow Vale Court \n",
77 | " 2153 \n",
78 | " New South Wales \n",
79 | " Australia \n",
80 | " 10 \n",
81 | " \n",
82 | " \n",
83 | " 2 \n",
84 | " 4 \n",
85 | " 0 Holy Cross Court \n",
86 | " 4211 \n",
87 | " QLD \n",
88 | " Australia \n",
89 | " 9 \n",
90 | " \n",
91 | " \n",
92 | " 3 \n",
93 | " 5 \n",
94 | " 17979 Del Mar Point \n",
95 | " 2448 \n",
96 | " New South Wales \n",
97 | " Australia \n",
98 | " 4 \n",
99 | " \n",
100 | " \n",
101 | " 4 \n",
102 | " 6 \n",
103 | " 9 Oakridge Court \n",
104 | " 3216 \n",
105 | " VIC \n",
106 | " Australia \n",
107 | " 9 \n",
108 | " \n",
109 | " \n",
110 | "
\n",
111 | "
"
112 | ],
113 | "text/plain": [
114 | " customer_id address postcode state country \\\n",
115 | "0 1 060 Morning Avenue 2016 New South Wales Australia \n",
116 | "1 2 6 Meadow Vale Court 2153 New South Wales Australia \n",
117 | "2 4 0 Holy Cross Court 4211 QLD Australia \n",
118 | "3 5 17979 Del Mar Point 2448 New South Wales Australia \n",
119 | "4 6 9 Oakridge Court 3216 VIC Australia \n",
120 | "\n",
121 | " property_valuation \n",
122 | "0 10 \n",
123 | "1 10 \n",
124 | "2 9 \n",
125 | "3 4 \n",
126 | "4 9 "
127 | ]
128 | },
129 | "execution_count": 3,
130 | "metadata": {},
131 | "output_type": "execute_result"
132 | }
133 | ],
134 | "source": [
135 | "# Checking first 5 records from Customer Address Data\n",
136 | "\n",
137 | "cust_address.head(5)"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 4,
143 | "metadata": {},
144 | "outputs": [
145 | {
146 | "name": "stdout",
147 | "output_type": "stream",
148 | "text": [
149 | "\n",
150 | "RangeIndex: 3999 entries, 0 to 3998\n",
151 | "Data columns (total 6 columns):\n",
152 | "customer_id 3999 non-null int64\n",
153 | "address 3999 non-null object\n",
154 | "postcode 3999 non-null int64\n",
155 | "state 3999 non-null object\n",
156 | "country 3999 non-null object\n",
157 | "property_valuation 3999 non-null int64\n",
158 | "dtypes: int64(3), object(3)\n",
159 | "memory usage: 187.5+ KB\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "# Information of columns and data-types of Customer Address Data.\n",
165 | "\n",
166 | "cust_address.info()"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "The data types of the columns look fine. Let's check the data quality and apply data cleaning wherever applicable, so that the dataset is clean before we perform any analysis."
174 | ]
175 | },
176 | {
177 | "cell_type": "markdown",
178 | "metadata": {},
179 | "source": [
180 | "## Total Records"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 4,
186 | "metadata": {},
187 | "outputs": [
188 | {
189 | "name": "stdout",
190 | "output_type": "stream",
191 | "text": [
192 | "Total records (rows) in the dataset : 3999\n",
193 | "Total columns (features) in the dataset : 6\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "print(\"Total records (rows) in the dataset : {}\".format(cust_address.shape[0]))\n",
199 | "print(\"Total columns (features) in the dataset : {}\".format(cust_address.shape[1]))"
200 | ]
201 | },
202 | {
203 | "cell_type": "markdown",
204 | "metadata": {},
205 | "source": [
206 | "## Numeric Columns and Non-Numeric Columns"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 7,
212 | "metadata": {},
213 | "outputs": [
214 | {
215 | "name": "stdout",
216 | "output_type": "stream",
217 | "text": [
218 | "The numeric columns are : ['customer_id' 'postcode' 'property_valuation']\n",
219 | "The non-numeric columns are : ['address' 'state' 'country']\n"
220 | ]
221 | }
222 | ],
223 | "source": [
224 | "# select numeric columns\n",
225 | "df_numeric = cust_address.select_dtypes(include=[np.number])\n",
226 | "numeric_cols = df_numeric.columns.values\n",
227 | "print(\"The numeric columns are : {}\".format(numeric_cols))\n",
228 | "\n",
229 | "\n",
230 | "# select non-numeric columns\n",
231 | "df_non_numeric = cust_address.select_dtypes(exclude=[np.number])\n",
232 | "non_numeric_cols = df_non_numeric.columns.values\n",
233 | "print(\"The non-numeric columns are : {}\".format(non_numeric_cols))"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "## 1. Missing Values Check"
241 | ]
242 | },
243 | {
244 | "cell_type": "markdown",
245 | "metadata": {},
246 | "source": [
247 | "Checking for the presence of any missing values in the dataset. If missing values are present for a particular feature, then depending upon the situation the feature may either be dropped (when a major share of its data is missing) or an appropriate value will be imputed in place of the missing values."
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": 7,
253 | "metadata": {},
254 | "outputs": [
255 | {
256 | "data": {
257 | "text/plain": [
258 | "customer_id 0\n",
259 | "address 0\n",
260 | "postcode 0\n",
261 | "state 0\n",
262 | "country 0\n",
263 | "property_valuation 0\n",
264 | "dtype: int64"
265 | ]
266 | },
267 | "execution_count": 7,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "# Total number of missing values\n",
274 | "cust_address.isnull().sum()"
275 | ]
276 | },
277 | {
278 | "cell_type": "markdown",
279 | "metadata": {},
280 | "source": [
281 | "In the dataset there are no missing values. "
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "## 2. Inconsistency Check in Data"
289 | ]
290 | },
291 | {
292 | "cell_type": "markdown",
293 | "metadata": {},
294 | "source": [
295 | "We will check whether inconsistent data or typos are present in the categorical columns. \n",
296 | "The columns to be checked are 'address', 'postcode', 'state' and 'country'."
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "### 2.1 State"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 8,
309 | "metadata": {},
310 | "outputs": [
311 | {
312 | "data": {
313 | "text/plain": [
314 | "NSW 2054\n",
315 | "VIC 939\n",
316 | "QLD 838\n",
317 | "New South Wales 86\n",
318 | "Victoria 82\n",
319 | "Name: state, dtype: int64"
320 | ]
321 | },
322 | "execution_count": 8,
323 | "metadata": {},
324 | "output_type": "execute_result"
325 | }
326 | ],
327 | "source": [
328 | "cust_address['state'].value_counts()"
329 | ]
330 | },
331 | {
332 | "cell_type": "markdown",
333 | "metadata": {},
334 | "source": [
335 | "The State column contains inconsistent data. New South Wales and Victoria each appear under two values: the full name and the short name. The state names should be standardised, so rows with state New South Wales will be changed to NSW and rows with state Victoria will be changed to VIC."
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 9,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": [
344 | "# Helper that maps spelled-out state names onto their short forms.\n",
345 | "\n",
346 | "def replace_state_names(state_name):\n",
347 | "    \"\"\"Return 'NSW' or 'VIC' for the full state names; return any other value unchanged.\"\"\"\n",
348 | "    # The short forms are treated as the standard spelling.\n",
349 | "    if state_name == 'New South Wales':\n",
350 | "        return 'NSW'\n",
351 | "    if state_name == 'Victoria':\n",
352 | "        return 'VIC'\n",
353 | "    # Anything else (e.g. a value that is already a short form) passes through as-is.\n",
354 | "    return state_name\n",
355 | "\n",
356 | "# Standardise the state column by running the helper over every value.\n",
357 | "cust_address['state'] = cust_address['state'].apply(replace_state_names)"
358 | ]
359 | },
360 | {
361 | "cell_type": "markdown",
362 | "metadata": {},
363 | "source": [
364 | "After applying the above function the state name is standardised and there is no inconsistency in the state column."
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": 10,
370 | "metadata": {},
371 | "outputs": [
372 | {
373 | "data": {
374 | "text/plain": [
375 | "NSW 2140\n",
376 | "VIC 1021\n",
377 | "QLD 838\n",
378 | "Name: state, dtype: int64"
379 | ]
380 | },
381 | "execution_count": 10,
382 | "metadata": {},
383 | "output_type": "execute_result"
384 | }
385 | ],
386 | "source": [
387 | "cust_address['state'].value_counts()"
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "### 2.2 Country"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 11,
400 | "metadata": {},
401 | "outputs": [
402 | {
403 | "data": {
404 | "text/plain": [
405 | "Australia 3999\n",
406 | "Name: country, dtype: int64"
407 | ]
408 | },
409 | "execution_count": 11,
410 | "metadata": {},
411 | "output_type": "execute_result"
412 | }
413 | ],
414 | "source": [
415 | "cust_address['country'].value_counts()"
416 | ]
417 | },
418 | {
419 | "cell_type": "markdown",
420 | "metadata": {},
421 | "source": [
422 | "There is no inconsistency of data in the Country column."
423 | ]
424 | },
425 | {
426 | "cell_type": "markdown",
427 | "metadata": {},
428 | "source": [
429 | "### 2.3 Postcode"
430 | ]
431 | },
432 | {
433 | "cell_type": "markdown",
434 | "metadata": {},
435 | "source": [
436 | "The Postcode column looks perfect. There is no inconsistency / typo in the data."
437 | ]
438 | },
439 | {
440 | "cell_type": "code",
441 | "execution_count": 12,
442 | "metadata": {},
443 | "outputs": [
444 | {
445 | "data": {
446 | "text/html": [
447 | "\n",
448 | "\n",
461 | "
\n",
462 | " \n",
463 | " \n",
464 | " \n",
465 | " address \n",
466 | " postcode \n",
467 | " state \n",
468 | " country \n",
469 | " \n",
470 | " \n",
471 | " \n",
472 | " \n",
473 | " 0 \n",
474 | " 060 Morning Avenue \n",
475 | " 2016 \n",
476 | " NSW \n",
477 | " Australia \n",
478 | " \n",
479 | " \n",
480 | " 1 \n",
481 | " 6 Meadow Vale Court \n",
482 | " 2153 \n",
483 | " NSW \n",
484 | " Australia \n",
485 | " \n",
486 | " \n",
487 | " 2 \n",
488 | " 0 Holy Cross Court \n",
489 | " 4211 \n",
490 | " QLD \n",
491 | " Australia \n",
492 | " \n",
493 | " \n",
494 | " 3 \n",
495 | " 17979 Del Mar Point \n",
496 | " 2448 \n",
497 | " NSW \n",
498 | " Australia \n",
499 | " \n",
500 | " \n",
501 | " 4 \n",
502 | " 9 Oakridge Court \n",
503 | " 3216 \n",
504 | " VIC \n",
505 | " Australia \n",
506 | " \n",
507 | " \n",
508 | " 5 \n",
509 | " 4 Delaware Trail \n",
510 | " 2210 \n",
511 | " NSW \n",
512 | " Australia \n",
513 | " \n",
514 | " \n",
515 | " 6 \n",
516 | " 49 Londonderry Lane \n",
517 | " 2650 \n",
518 | " NSW \n",
519 | " Australia \n",
520 | " \n",
521 | " \n",
522 | " 7 \n",
523 | " 97736 7th Trail \n",
524 | " 2023 \n",
525 | " NSW \n",
526 | " Australia \n",
527 | " \n",
528 | " \n",
529 | " 8 \n",
530 | " 93405 Ludington Park \n",
531 | " 3044 \n",
532 | " VIC \n",
533 | " Australia \n",
534 | " \n",
535 | " \n",
536 | " 9 \n",
537 | " 44339 Golden Leaf Alley \n",
538 | " 4557 \n",
539 | " QLD \n",
540 | " Australia \n",
541 | " \n",
542 | " \n",
543 | " 10 \n",
544 | " 2 Sutherland Street \n",
545 | " 3799 \n",
546 | " VIC \n",
547 | " Australia \n",
548 | " \n",
549 | " \n",
550 | " 11 \n",
551 | " 9 Mcbride Trail \n",
552 | " 2760 \n",
553 | " NSW \n",
554 | " Australia \n",
555 | " \n",
556 | " \n",
557 | " 12 \n",
558 | " 9861 New Castle Avenue \n",
559 | " 2428 \n",
560 | " NSW \n",
561 | " Australia \n",
562 | " \n",
563 | " \n",
564 | " 13 \n",
565 | " 52 Moland Street \n",
566 | " 3331 \n",
567 | " VIC \n",
568 | " Australia \n",
569 | " \n",
570 | " \n",
571 | " 14 \n",
572 | " 82391 Kensington Lane \n",
573 | " 3058 \n",
574 | " VIC \n",
575 | " Australia \n",
576 | " \n",
577 | " \n",
578 | " 15 \n",
579 | " 092 2nd Alley \n",
580 | " 2135 \n",
581 | " NSW \n",
582 | " Australia \n",
583 | " \n",
584 | " \n",
585 | " 16 \n",
586 | " 59 Spaight Circle \n",
587 | " 2233 \n",
588 | " NSW \n",
589 | " Australia \n",
590 | " \n",
591 | " \n",
592 | " 17 \n",
593 | " 032 Bartelt Crossing \n",
594 | " 2444 \n",
595 | " NSW \n",
596 | " Australia \n",
597 | " \n",
598 | " \n",
599 | " 18 \n",
600 | " 18 Jenna Center \n",
601 | " 2650 \n",
602 | " NSW \n",
603 | " Australia \n",
604 | " \n",
605 | " \n",
606 | " 19 \n",
607 | " 3 Cordelia Plaza \n",
608 | " 2153 \n",
609 | " NSW \n",
610 | " Australia \n",
611 | " \n",
612 | " \n",
613 | " 20 \n",
614 | " 28 5th Center \n",
615 | " 4413 \n",
616 | " QLD \n",
617 | " Australia \n",
618 | " \n",
619 | " \n",
620 | " 21 \n",
621 | " 52 Carey Alley \n",
622 | " 4740 \n",
623 | " QLD \n",
624 | " Australia \n",
625 | " \n",
626 | " \n",
627 | " 22 \n",
628 | " 96 Texas Plaza \n",
629 | " 3218 \n",
630 | " VIC \n",
631 | " Australia \n",
632 | " \n",
633 | " \n",
634 | " 23 \n",
635 | " 48 Eagan Avenue \n",
636 | " 4868 \n",
637 | " QLD \n",
638 | " Australia \n",
639 | " \n",
640 | " \n",
641 | " 24 \n",
642 | " 9 Buell Park \n",
643 | " 4116 \n",
644 | " QLD \n",
645 | " Australia \n",
646 | " \n",
647 | " \n",
648 | " 25 \n",
649 | " 02663 Buell Parkway \n",
650 | " 2519 \n",
651 | " NSW \n",
652 | " Australia \n",
653 | " \n",
654 | " \n",
655 | " 26 \n",
656 | " 2294 Pleasure Place \n",
657 | " 2135 \n",
658 | " NSW \n",
659 | " Australia \n",
660 | " \n",
661 | " \n",
662 | " 27 \n",
663 | " 2951 Petterle Place \n",
664 | " 2756 \n",
665 | " NSW \n",
666 | " Australia \n",
667 | " \n",
668 | " \n",
669 | " 28 \n",
670 | " 63 Lukken Drive \n",
671 | " 2170 \n",
672 | " NSW \n",
673 | " Australia \n",
674 | " \n",
675 | " \n",
676 | " 29 \n",
677 | " 833 Luster Way \n",
678 | " 4005 \n",
679 | " QLD \n",
680 | " Australia \n",
681 | " \n",
682 | " \n",
683 | " ... \n",
684 | " ... \n",
685 | " ... \n",
686 | " ... \n",
687 | " ... \n",
688 | " \n",
689 | " \n",
690 | " 3969 \n",
691 | " 81609 Vernon Terrace \n",
692 | " 3934 \n",
693 | " VIC \n",
694 | " Australia \n",
695 | " \n",
696 | " \n",
697 | " 3970 \n",
698 | " 37 Hintze Court \n",
699 | " 2168 \n",
700 | " NSW \n",
701 | " Australia \n",
702 | " \n",
703 | " \n",
704 | " 3971 \n",
705 | " 07 Morning Court \n",
706 | " 3805 \n",
707 | " VIC \n",
708 | " Australia \n",
709 | " \n",
710 | " \n",
711 | " 3972 \n",
712 | " 327 Loeprich Street \n",
713 | " 3187 \n",
714 | " VIC \n",
715 | " Australia \n",
716 | " \n",
717 | " \n",
718 | " 3973 \n",
719 | " 5106 Northridge Drive \n",
720 | " 2560 \n",
721 | " NSW \n",
722 | " Australia \n",
723 | " \n",
724 | " \n",
725 | " 3974 \n",
726 | " 9754 High Crossing Terrace \n",
727 | " 2148 \n",
728 | " NSW \n",
729 | " Australia \n",
730 | " \n",
731 | " \n",
732 | " 3975 \n",
733 | " 7 Meadow Vale Court \n",
734 | " 2114 \n",
735 | " NSW \n",
736 | " Australia \n",
737 | " \n",
738 | " \n",
739 | " 3976 \n",
740 | " 80260 Morning Road \n",
741 | " 3178 \n",
742 | " VIC \n",
743 | " Australia \n",
744 | " \n",
745 | " \n",
746 | " 3977 \n",
747 | " 38017 Briar Crest Drive \n",
748 | " 2165 \n",
749 | " NSW \n",
750 | " Australia \n",
751 | " \n",
752 | " \n",
753 | " 3978 \n",
754 | " 60 Morningstar Center \n",
755 | " 2126 \n",
756 | " NSW \n",
757 | " Australia \n",
758 | " \n",
759 | " \n",
760 | " 3979 \n",
761 | " 218 Stuart Junction \n",
762 | " 2223 \n",
763 | " NSW \n",
764 | " Australia \n",
765 | " \n",
766 | " \n",
767 | " 3980 \n",
768 | " 9 Butterfield Lane \n",
769 | " 4077 \n",
770 | " QLD \n",
771 | " Australia \n",
772 | " \n",
773 | " \n",
774 | " 3981 \n",
775 | " 614 Burning Wood Way \n",
776 | " 2148 \n",
777 | " NSW \n",
778 | " Australia \n",
779 | " \n",
780 | " \n",
781 | " 3982 \n",
782 | " 9 Grover Point \n",
783 | " 4218 \n",
784 | " QLD \n",
785 | " Australia \n",
786 | " \n",
787 | " \n",
788 | " 3983 \n",
789 | " 565 Bunting Park \n",
790 | " 2076 \n",
791 | " NSW \n",
792 | " Australia \n",
793 | " \n",
794 | " \n",
795 | " 3984 \n",
796 | " 9461 Saint Paul Trail \n",
797 | " 2428 \n",
798 | " NSW \n",
799 | " Australia \n",
800 | " \n",
801 | " \n",
802 | " 3985 \n",
803 | " 5204 Delaware Pass \n",
804 | " 2560 \n",
805 | " NSW \n",
806 | " Australia \n",
807 | " \n",
808 | " \n",
809 | " 3986 \n",
810 | " 24 Scott Pass \n",
811 | " 4300 \n",
812 | " QLD \n",
813 | " Australia \n",
814 | " \n",
815 | " \n",
816 | " 3987 \n",
817 | " 8 Randy Parkway \n",
818 | " 2209 \n",
819 | " NSW \n",
820 | " Australia \n",
821 | " \n",
822 | " \n",
823 | " 3988 \n",
824 | " 681 Elmside Place \n",
825 | " 3750 \n",
826 | " VIC \n",
827 | " Australia \n",
828 | " \n",
829 | " \n",
830 | " 3989 \n",
831 | " 2918 Summer Ridge Hill \n",
832 | " 3030 \n",
833 | " VIC \n",
834 | " Australia \n",
835 | " \n",
836 | " \n",
837 | " 3990 \n",
838 | " 613 Erie Lane \n",
839 | " 2088 \n",
840 | " NSW \n",
841 | " Australia \n",
842 | " \n",
843 | " \n",
844 | " 3991 \n",
845 | " 0 Transport Center \n",
846 | " 3977 \n",
847 | " VIC \n",
848 | " Australia \n",
849 | " \n",
850 | " \n",
851 | " 3992 \n",
852 | " 4 Dovetail Crossing \n",
853 | " 2350 \n",
854 | " NSW \n",
855 | " Australia \n",
856 | " \n",
857 | " \n",
858 | " 3993 \n",
859 | " 736 Roxbury Junction \n",
860 | " 2540 \n",
861 | " NSW \n",
862 | " Australia \n",
863 | " \n",
864 | " \n",
865 | " 3994 \n",
866 | " 1482 Hauk Trail \n",
867 | " 3064 \n",
868 | " VIC \n",
869 | " Australia \n",
870 | " \n",
871 | " \n",
872 | " 3995 \n",
873 | " 57042 Village Green Point \n",
874 | " 4511 \n",
875 | " QLD \n",
876 | " Australia \n",
877 | " \n",
878 | " \n",
879 | " 3996 \n",
880 | " 87 Crescent Oaks Alley \n",
881 | " 2756 \n",
882 | " NSW \n",
883 | " Australia \n",
884 | " \n",
885 | " \n",
886 | " 3997 \n",
887 | " 8194 Lien Street \n",
888 | " 4032 \n",
889 | " QLD \n",
890 | " Australia \n",
891 | " \n",
892 | " \n",
893 | " 3998 \n",
894 | " 320 Acker Drive \n",
895 | " 2251 \n",
896 | " NSW \n",
897 | " Australia \n",
898 | " \n",
899 | " \n",
900 | "
\n",
901 | "
3999 rows × 4 columns
\n",
902 | "
"
903 | ],
904 | "text/plain": [
905 | " address postcode state country\n",
906 | "0 060 Morning Avenue 2016 NSW Australia\n",
907 | "1 6 Meadow Vale Court 2153 NSW Australia\n",
908 | "2 0 Holy Cross Court 4211 QLD Australia\n",
909 | "3 17979 Del Mar Point 2448 NSW Australia\n",
910 | "4 9 Oakridge Court 3216 VIC Australia\n",
911 | "5 4 Delaware Trail 2210 NSW Australia\n",
912 | "6 49 Londonderry Lane 2650 NSW Australia\n",
913 | "7 97736 7th Trail 2023 NSW Australia\n",
914 | "8 93405 Ludington Park 3044 VIC Australia\n",
915 | "9 44339 Golden Leaf Alley 4557 QLD Australia\n",
916 | "10 2 Sutherland Street 3799 VIC Australia\n",
917 | "11 9 Mcbride Trail 2760 NSW Australia\n",
918 | "12 9861 New Castle Avenue 2428 NSW Australia\n",
919 | "13 52 Moland Street 3331 VIC Australia\n",
920 | "14 82391 Kensington Lane 3058 VIC Australia\n",
921 | "15 092 2nd Alley 2135 NSW Australia\n",
922 | "16 59 Spaight Circle 2233 NSW Australia\n",
923 | "17 032 Bartelt Crossing 2444 NSW Australia\n",
924 | "18 18 Jenna Center 2650 NSW Australia\n",
925 | "19 3 Cordelia Plaza 2153 NSW Australia\n",
926 | "20 28 5th Center 4413 QLD Australia\n",
927 | "21 52 Carey Alley 4740 QLD Australia\n",
928 | "22 96 Texas Plaza 3218 VIC Australia\n",
929 | "23 48 Eagan Avenue 4868 QLD Australia\n",
930 | "24 9 Buell Park 4116 QLD Australia\n",
931 | "25 02663 Buell Parkway 2519 NSW Australia\n",
932 | "26 2294 Pleasure Place 2135 NSW Australia\n",
933 | "27 2951 Petterle Place 2756 NSW Australia\n",
934 | "28 63 Lukken Drive 2170 NSW Australia\n",
935 | "29 833 Luster Way 4005 QLD Australia\n",
936 | "... ... ... ... ...\n",
937 | "3969 81609 Vernon Terrace 3934 VIC Australia\n",
938 | "3970 37 Hintze Court 2168 NSW Australia\n",
939 | "3971 07 Morning Court 3805 VIC Australia\n",
940 | "3972 327 Loeprich Street 3187 VIC Australia\n",
941 | "3973 5106 Northridge Drive 2560 NSW Australia\n",
942 | "3974 9754 High Crossing Terrace 2148 NSW Australia\n",
943 | "3975 7 Meadow Vale Court 2114 NSW Australia\n",
944 | "3976 80260 Morning Road 3178 VIC Australia\n",
945 | "3977 38017 Briar Crest Drive 2165 NSW Australia\n",
946 | "3978 60 Morningstar Center 2126 NSW Australia\n",
947 | "3979 218 Stuart Junction 2223 NSW Australia\n",
948 | "3980 9 Butterfield Lane 4077 QLD Australia\n",
949 | "3981 614 Burning Wood Way 2148 NSW Australia\n",
950 | "3982 9 Grover Point 4218 QLD Australia\n",
951 | "3983 565 Bunting Park 2076 NSW Australia\n",
952 | "3984 9461 Saint Paul Trail 2428 NSW Australia\n",
953 | "3985 5204 Delaware Pass 2560 NSW Australia\n",
954 | "3986 24 Scott Pass 4300 QLD Australia\n",
955 | "3987 8 Randy Parkway 2209 NSW Australia\n",
956 | "3988 681 Elmside Place 3750 VIC Australia\n",
957 | "3989 2918 Summer Ridge Hill 3030 VIC Australia\n",
958 | "3990 613 Erie Lane 2088 NSW Australia\n",
959 | "3991 0 Transport Center 3977 VIC Australia\n",
960 | "3992 4 Dovetail Crossing 2350 NSW Australia\n",
961 | "3993 736 Roxbury Junction 2540 NSW Australia\n",
962 | "3994 1482 Hauk Trail 3064 VIC Australia\n",
963 | "3995 57042 Village Green Point 4511 QLD Australia\n",
964 | "3996 87 Crescent Oaks Alley 2756 NSW Australia\n",
965 | "3997 8194 Lien Street 4032 QLD Australia\n",
966 | "3998 320 Acker Drive 2251 NSW Australia\n",
967 | "\n",
968 | "[3999 rows x 4 columns]"
969 | ]
970 | },
971 | "execution_count": 12,
972 | "metadata": {},
973 | "output_type": "execute_result"
974 | }
975 | ],
976 | "source": [
977 | "cust_address[['address','postcode', 'state' , 'country']].drop_duplicates()"
978 | ]
979 | },
980 | {
981 | "cell_type": "markdown",
982 | "metadata": {},
983 | "source": [
984 | "## 3. Duplication Checks"
985 | ]
986 | },
987 | {
988 | "cell_type": "markdown",
989 | "metadata": {},
990 | "source": [
991 | "We need to ensure that there are no duplicate records in the dataset, since duplicates degrade data quality and can lead to errors in the analysis. If there are duplicate rows of data then we need to drop such records. To check for duplicate records we first remove the primary key column of the dataset and then apply the drop_duplicates() function provided by pandas."
992 | ]
993 | },
994 | {
995 | "cell_type": "code",
996 | "execution_count": 13,
997 | "metadata": {},
998 | "outputs": [
999 | {
1000 | "name": "stdout",
1001 | "output_type": "stream",
1002 | "text": [
1003 | "Number of records after removing customer_id (pk), duplicates : 3999\n",
1004 | "Number of records in original dataset : 3999\n"
1005 | ]
1006 | }
1007 | ],
1008 | "source": [
1009 | "# Leave out the primary key (customer_id) so rows are compared on the remaining columns only; result is a temporary dataframe.\n",
1010 | "cust_address_dedupped = cust_address.drop(columns='customer_id').drop_duplicates()\n",
1011 | "\n",
1012 | "print(\"Number of records after removing customer_id (pk), duplicates : {}\".format(len(cust_address_dedupped)))\n",
1013 | "print(\"Number of records in original dataset : {}\".format(len(cust_address)))"
1014 | ]
1015 | },
1016 | {
1017 | "cell_type": "markdown",
1018 | "metadata": {},
1019 | "source": [
1020 | "Since both numbers are the same, there are no duplicate records in the dataset."
1021 | ]
1022 | },
1023 | {
1024 | "cell_type": "markdown",
1025 | "metadata": {},
1026 | "source": [
1027 | "## 4. Exporting the Cleaned Customer Address Data Set to csv"
1028 | ]
1029 | },
1030 | {
1031 | "cell_type": "markdown",
1032 | "metadata": {},
1033 | "source": [
1034 | "Currently the Customer Address dataset is clean. Hence we can export the data to a csv to continue our data analysis of Customer Segments by joining it to other tables."
1035 | ]
1036 | },
1037 | {
1038 | "cell_type": "code",
1039 | "execution_count": 14,
1040 | "metadata": {},
1041 | "outputs": [],
1042 | "source": [
1043 | "cust_address.to_csv('CustomerAddress_Cleaned.csv', index=False)"
1044 | ]
1045 | },
1046 | {
1047 | "cell_type": "markdown",
1048 | "metadata": {},
1049 | "source": [
1050 | "## 5. Checking for Master-Detail Record Counts"
1051 | ]
1052 | },
1053 | {
1054 | "cell_type": "markdown",
1055 | "metadata": {},
1056 | "source": [
1057 | "Comparing against the Master Table (CustomerDemographic_Cleaned.csv), which contains the entire Customer Data, to find the Customer IDs which are getting dropped from the Customer Address Dataset. \n",
1058 | "Basically, these are the Customers who have an address but are not yet a part of the Demographics dataset. "
1059 | ]
1060 | },
1061 | {
1062 | "cell_type": "code",
1063 | "execution_count": 38,
1064 | "metadata": {},
1065 | "outputs": [],
1066 | "source": [
1067 | "cust_demo_detail = pd.read_csv('CustomerDemographic_Cleaned.csv')"
1068 | ]
1069 | },
1070 | {
1071 | "cell_type": "code",
1072 | "execution_count": 39,
1073 | "metadata": {},
1074 | "outputs": [
1075 | {
1076 | "data": {
1077 | "text/html": [
1078 | "\n",
1079 | "\n",
1092 | "
\n",
1093 | " \n",
1094 | " \n",
1095 | " \n",
1096 | " customer_id \n",
1097 | " first_name \n",
1098 | " last_name \n",
1099 | " gender \n",
1100 | " past_3_years_bike_related_purchases \n",
1101 | " DOB \n",
1102 | " job_title \n",
1103 | " job_industry_category \n",
1104 | " wealth_segment \n",
1105 | " deceased_indicator \n",
1106 | " owns_car \n",
1107 | " tenure \n",
1108 | " Age \n",
1109 | " \n",
1110 | " \n",
1111 | " \n",
1112 | " \n",
1113 | " 0 \n",
1114 | " 1 \n",
1115 | " Laraine \n",
1116 | " Medendorp \n",
1117 | " Female \n",
1118 | " 93 \n",
1119 | " 1953-10-12 \n",
1120 | " Executive Secretary \n",
1121 | " Health \n",
1122 | " Mass Customer \n",
1123 | " N \n",
1124 | " Yes \n",
1125 | " 11.0 \n",
1126 | " 67 \n",
1127 | " \n",
1128 | " \n",
1129 | " 1 \n",
1130 | " 2 \n",
1131 | " Eli \n",
1132 | " Bockman \n",
1133 | " Male \n",
1134 | " 81 \n",
1135 | " 1980-12-16 \n",
1136 | " Administrative Officer \n",
1137 | " Financial Services \n",
1138 | " Mass Customer \n",
1139 | " N \n",
1140 | " Yes \n",
1141 | " 16.0 \n",
1142 | " 40 \n",
1143 | " \n",
1144 | " \n",
1145 | " 2 \n",
1146 | " 3 \n",
1147 | " Arlin \n",
1148 | " Dearle \n",
1149 | " Male \n",
1150 | " 61 \n",
1151 | " 1954-01-20 \n",
1152 | " Recruiting Manager \n",
1153 | " Property \n",
1154 | " Mass Customer \n",
1155 | " N \n",
1156 | " Yes \n",
1157 | " 15.0 \n",
1158 | " 67 \n",
1159 | " \n",
1160 | " \n",
1161 | " 3 \n",
1162 | " 4 \n",
1163 | " Talbot \n",
1164 | " None \n",
1165 | " Male \n",
1166 | " 33 \n",
1167 | " 1961-10-03 \n",
1168 | " Missing \n",
1169 | " IT \n",
1170 | " Mass Customer \n",
1171 | " N \n",
1172 | " No \n",
1173 | " 7.0 \n",
1174 | " 59 \n",
1175 | " \n",
1176 | " \n",
1177 | " 4 \n",
1178 | " 5 \n",
1179 | " Sheila-kathryn \n",
1180 | " Calton \n",
1181 | " Female \n",
1182 | " 56 \n",
1183 | " 1977-05-13 \n",
1184 | " Senior Editor \n",
1185 | " Missing \n",
1186 | " Affluent Customer \n",
1187 | " N \n",
1188 | " Yes \n",
1189 | " 8.0 \n",
1190 | " 43 \n",
1191 | " \n",
1192 | " \n",
1193 | "
\n",
1194 | "
"
1195 | ],
1196 | "text/plain": [
1197 | " customer_id first_name last_name gender \\\n",
1198 | "0 1 Laraine Medendorp Female \n",
1199 | "1 2 Eli Bockman Male \n",
1200 | "2 3 Arlin Dearle Male \n",
1201 | "3 4 Talbot None Male \n",
1202 | "4 5 Sheila-kathryn Calton Female \n",
1203 | "\n",
1204 | " past_3_years_bike_related_purchases DOB job_title \\\n",
1205 | "0 93 1953-10-12 Executive Secretary \n",
1206 | "1 81 1980-12-16 Administrative Officer \n",
1207 | "2 61 1954-01-20 Recruiting Manager \n",
1208 | "3 33 1961-10-03 Missing \n",
1209 | "4 56 1977-05-13 Senior Editor \n",
1210 | "\n",
1211 | " job_industry_category wealth_segment deceased_indicator owns_car \\\n",
1212 | "0 Health Mass Customer N Yes \n",
1213 | "1 Financial Services Mass Customer N Yes \n",
1214 | "2 Property Mass Customer N Yes \n",
1215 | "3 IT Mass Customer N No \n",
1216 | "4 Missing Affluent Customer N Yes \n",
1217 | "\n",
1218 | " tenure Age \n",
1219 | "0 11.0 67 \n",
1220 | "1 16.0 40 \n",
1221 | "2 15.0 67 \n",
1222 | "3 7.0 59 \n",
1223 | "4 8.0 43 "
1224 | ]
1225 | },
1226 | "execution_count": 39,
1227 | "metadata": {},
1228 | "output_type": "execute_result"
1229 | }
1230 | ],
1231 | "source": [
1232 | "cust_demo_detail.head()"
1233 | ]
1234 | },
1235 | {
1236 | "cell_type": "code",
1237 | "execution_count": 43,
1238 | "metadata": {},
1239 | "outputs": [
1240 | {
1241 | "name": "stdout",
1242 | "output_type": "stream",
1243 | "text": [
1244 | "Total Records in Customer_Demographic_Table : 3912\n",
1245 | "Total Records in Customer_Address_Table : 3999\n",
1246 | "In Demographic Table 87 records are getting dropped due to data cleaning process in Demographic Table\n"
1247 | ]
1248 | }
1249 | ],
1250 | "source": [
1251 | "print(\"Total Records in Customer_Demographic_Table : {}\".format(len(cust_demo_detail)))\n",
1252 | "print(\"Total Records in Customer_Address_Table : {}\".format(len(cust_address)))\n",
1253 | "print('In Demographic Table {} records are getting dropped due to data cleaning process in Demographic Table'\n",
1254 | "      .format(len(cust_address) - len(cust_demo_detail)))"
1255 | ]
1256 | },
1257 | {
1258 | "cell_type": "markdown",
1259 | "metadata": {},
1260 | "source": [
1261 | "#### Customer IDs in Address table getting dropped :"
1262 | ]
1263 | },
1264 | {
1265 | "cell_type": "code",
1266 | "execution_count": 53,
1267 | "metadata": {},
1268 | "outputs": [
1269 | {
1270 | "data": {
1271 | "text/html": [
1272 | "\n",
1273 | "\n",
1286 | "
\n",
1287 | " \n",
1288 | " \n",
1289 | " \n",
1290 | " customer_id \n",
1291 | " address \n",
1292 | " postcode \n",
1293 | " state \n",
1294 | " country \n",
1295 | " property_valuation \n",
1296 | " first_name \n",
1297 | " last_name \n",
1298 | " gender \n",
1299 | " past_3_years_bike_related_purchases \n",
1300 | " DOB \n",
1301 | " job_title \n",
1302 | " job_industry_category \n",
1303 | " wealth_segment \n",
1304 | " deceased_indicator \n",
1305 | " owns_car \n",
1306 | " tenure \n",
1307 | " Age \n",
1308 | " \n",
1309 | " \n",
1310 | " \n",
1311 | " \n",
1312 | " 0 \n",
1313 | " 1 \n",
1314 | " 060 Morning Avenue \n",
1315 | " 2016.0 \n",
1316 | " NSW \n",
1317 | " Australia \n",
1318 | " 10.0 \n",
1319 | " Laraine \n",
1320 | " Medendorp \n",
1321 | " Female \n",
1322 | " 93.0 \n",
1323 | " 1953-10-12 \n",
1324 | " Executive Secretary \n",
1325 | " Health \n",
1326 | " Mass Customer \n",
1327 | " N \n",
1328 | " Yes \n",
1329 | " 11.0 \n",
1330 | " 67.0 \n",
1331 | " \n",
1332 | " \n",
1333 | " 1 \n",
1334 | " 2 \n",
1335 | " 6 Meadow Vale Court \n",
1336 | " 2153.0 \n",
1337 | " NSW \n",
1338 | " Australia \n",
1339 | " 10.0 \n",
1340 | " Eli \n",
1341 | " Bockman \n",
1342 | " Male \n",
1343 | " 81.0 \n",
1344 | " 1980-12-16 \n",
1345 | " Administrative Officer \n",
1346 | " Financial Services \n",
1347 | " Mass Customer \n",
1348 | " N \n",
1349 | " Yes \n",
1350 | " 16.0 \n",
1351 | " 40.0 \n",
1352 | " \n",
1353 | " \n",
1354 | " 2 \n",
1355 | " 4 \n",
1356 | " 0 Holy Cross Court \n",
1357 | " 4211.0 \n",
1358 | " QLD \n",
1359 | " Australia \n",
1360 | " 9.0 \n",
1361 | " Talbot \n",
1362 | " None \n",
1363 | " Male \n",
1364 | " 33.0 \n",
1365 | " 1961-10-03 \n",
1366 | " Missing \n",
1367 | " IT \n",
1368 | " Mass Customer \n",
1369 | " N \n",
1370 | " No \n",
1371 | " 7.0 \n",
1372 | " 59.0 \n",
1373 | " \n",
1374 | " \n",
1375 | " 3 \n",
1376 | " 5 \n",
1377 | " 17979 Del Mar Point \n",
1378 | " 2448.0 \n",
1379 | " NSW \n",
1380 | " Australia \n",
1381 | " 4.0 \n",
1382 | " Sheila-kathryn \n",
1383 | " Calton \n",
1384 | " Female \n",
1385 | " 56.0 \n",
1386 | " 1977-05-13 \n",
1387 | " Senior Editor \n",
1388 | " Missing \n",
1389 | " Affluent Customer \n",
1390 | " N \n",
1391 | " Yes \n",
1392 | " 8.0 \n",
1393 | " 43.0 \n",
1394 | " \n",
1395 | " \n",
1396 | " 4 \n",
1397 | " 6 \n",
1398 | " 9 Oakridge Court \n",
1399 | " 3216.0 \n",
1400 | " VIC \n",
1401 | " Australia \n",
1402 | " 9.0 \n",
1403 | " Curr \n",
1404 | " Duckhouse \n",
1405 | " Male \n",
1406 | " 35.0 \n",
1407 | " 1966-09-16 \n",
1408 | " Missing \n",
1409 | " Retail \n",
1410 | " High Net Worth \n",
1411 | " N \n",
1412 | " Yes \n",
1413 | " 13.0 \n",
1414 | " 54.0 \n",
1415 | " \n",
1416 | " \n",
1417 | "
\n",
1418 | "
"
1419 | ],
1420 | "text/plain": [
1421 | " customer_id address postcode state country \\\n",
1422 | "0 1 060 Morning Avenue 2016.0 NSW Australia \n",
1423 | "1 2 6 Meadow Vale Court 2153.0 NSW Australia \n",
1424 | "2 4 0 Holy Cross Court 4211.0 QLD Australia \n",
1425 | "3 5 17979 Del Mar Point 2448.0 NSW Australia \n",
1426 | "4 6 9 Oakridge Court 3216.0 VIC Australia \n",
1427 | "\n",
1428 | " property_valuation first_name last_name gender \\\n",
1429 | "0 10.0 Laraine Medendorp Female \n",
1430 | "1 10.0 Eli Bockman Male \n",
1431 | "2 9.0 Talbot None Male \n",
1432 | "3 4.0 Sheila-kathryn Calton Female \n",
1433 | "4 9.0 Curr Duckhouse Male \n",
1434 | "\n",
1435 | " past_3_years_bike_related_purchases DOB job_title \\\n",
1436 | "0 93.0 1953-10-12 Executive Secretary \n",
1437 | "1 81.0 1980-12-16 Administrative Officer \n",
1438 | "2 33.0 1961-10-03 Missing \n",
1439 | "3 56.0 1977-05-13 Senior Editor \n",
1440 | "4 35.0 1966-09-16 Missing \n",
1441 | "\n",
1442 | " job_industry_category wealth_segment deceased_indicator owns_car \\\n",
1443 | "0 Health Mass Customer N Yes \n",
1444 | "1 Financial Services Mass Customer N Yes \n",
1445 | "2 IT Mass Customer N No \n",
1446 | "3 Missing Affluent Customer N Yes \n",
1447 | "4 Retail High Net Worth N Yes \n",
1448 | "\n",
1449 | " tenure Age \n",
1450 | "0 11.0 67.0 \n",
1451 | "1 16.0 40.0 \n",
1452 | "2 7.0 59.0 \n",
1453 | "3 8.0 43.0 \n",
1454 | "4 13.0 54.0 "
1455 | ]
1456 | },
1457 | "execution_count": 53,
1458 | "metadata": {},
1459 | "output_type": "execute_result"
1460 | }
1461 | ],
1462 | "source": [
1463 | "cust_drop = cust_address.merge(cust_demo_detail , left_on = 'customer_id', right_on='customer_id'\n",
1464 | " , how='outer')\n",
1465 | "cust_drop.head()"
1466 | ]
1467 | }
1468 | ],
1469 | "metadata": {
1470 | "kernelspec": {
1471 | "display_name": "Python 3",
1472 | "language": "python",
1473 | "name": "python3"
1474 | },
1475 | "language_info": {
1476 | "codemirror_mode": {
1477 | "name": "ipython",
1478 | "version": 3
1479 | },
1480 | "file_extension": ".py",
1481 | "mimetype": "text/x-python",
1482 | "name": "python",
1483 | "nbconvert_exporter": "python",
1484 | "pygments_lexer": "ipython3",
1485 | "version": "3.7.3"
1486 | }
1487 | },
1488 | "nbformat": 4,
1489 | "nbformat_minor": 2
1490 | }
1491 |
--------------------------------------------------------------------------------
/DQA and Data Cleaning Transactions.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 7,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import numpy as np\n",
11 | "import matplotlib.pyplot as plt\n",
12 | "import seaborn as sns\n",
13 | "%matplotlib inline\n",
14 | "\n",
15 | "from datetime import datetime, date\n",
16 | "plt.style.use('ggplot')"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 8,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "# Loading the Transactions Data from the excel file\n",
26 | "\n",
27 | "trans = pd.read_excel('Raw_data.xlsx' , sheet_name='Transactions')"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 9,
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/html": [
38 | "\n",
39 | "\n",
52 | "
\n",
53 | " \n",
54 | " \n",
55 | " \n",
56 | " transaction_id \n",
57 | " product_id \n",
58 | " customer_id \n",
59 | " transaction_date \n",
60 | " online_order \n",
61 | " order_status \n",
62 | " brand \n",
63 | " product_line \n",
64 | " product_class \n",
65 | " product_size \n",
66 | " list_price \n",
67 | " standard_cost \n",
68 | " product_first_sold_date \n",
69 | " \n",
70 | " \n",
71 | " \n",
72 | " \n",
73 | " 0 \n",
74 | " 1 \n",
75 | " 2 \n",
76 | " 2950 \n",
77 | " 2017-02-25 \n",
78 | " 0.0 \n",
79 | " Approved \n",
80 | " Solex \n",
81 | " Standard \n",
82 | " medium \n",
83 | " medium \n",
84 | " 71.49 \n",
85 | " 53.62 \n",
86 | " 41245.0 \n",
87 | " \n",
88 | " \n",
89 | " 1 \n",
90 | " 2 \n",
91 | " 3 \n",
92 | " 3120 \n",
93 | " 2017-05-21 \n",
94 | " 1.0 \n",
95 | " Approved \n",
96 | " Trek Bicycles \n",
97 | " Standard \n",
98 | " medium \n",
99 | " large \n",
100 | " 2091.47 \n",
101 | " 388.92 \n",
102 | " 41701.0 \n",
103 | " \n",
104 | " \n",
105 | " 2 \n",
106 | " 3 \n",
107 | " 37 \n",
108 | " 402 \n",
109 | " 2017-10-16 \n",
110 | " 0.0 \n",
111 | " Approved \n",
112 | " OHM Cycles \n",
113 | " Standard \n",
114 | " low \n",
115 | " medium \n",
116 | " 1793.43 \n",
117 | " 248.82 \n",
118 | " 36361.0 \n",
119 | " \n",
120 | " \n",
121 | " 3 \n",
122 | " 4 \n",
123 | " 88 \n",
124 | " 3135 \n",
125 | " 2017-08-31 \n",
126 | " 0.0 \n",
127 | " Approved \n",
128 | " Norco Bicycles \n",
129 | " Standard \n",
130 | " medium \n",
131 | " medium \n",
132 | " 1198.46 \n",
133 | " 381.10 \n",
134 | " 36145.0 \n",
135 | " \n",
136 | " \n",
137 | " 4 \n",
138 | " 5 \n",
139 | " 78 \n",
140 | " 787 \n",
141 | " 2017-10-01 \n",
142 | " 1.0 \n",
143 | " Approved \n",
144 | " Giant Bicycles \n",
145 | " Standard \n",
146 | " medium \n",
147 | " large \n",
148 | " 1765.30 \n",
149 | " 709.48 \n",
150 | " 42226.0 \n",
151 | " \n",
152 | " \n",
153 | "
\n",
154 | "
"
155 | ],
156 | "text/plain": [
157 | " transaction_id product_id customer_id transaction_date online_order \\\n",
158 | "0 1 2 2950 2017-02-25 0.0 \n",
159 | "1 2 3 3120 2017-05-21 1.0 \n",
160 | "2 3 37 402 2017-10-16 0.0 \n",
161 | "3 4 88 3135 2017-08-31 0.0 \n",
162 | "4 5 78 787 2017-10-01 1.0 \n",
163 | "\n",
164 | " order_status brand product_line product_class product_size \\\n",
165 | "0 Approved Solex Standard medium medium \n",
166 | "1 Approved Trek Bicycles Standard medium large \n",
167 | "2 Approved OHM Cycles Standard low medium \n",
168 | "3 Approved Norco Bicycles Standard medium medium \n",
169 | "4 Approved Giant Bicycles Standard medium large \n",
170 | "\n",
171 | " list_price standard_cost product_first_sold_date \n",
172 | "0 71.49 53.62 41245.0 \n",
173 | "1 2091.47 388.92 41701.0 \n",
174 | "2 1793.43 248.82 36361.0 \n",
175 | "3 1198.46 381.10 36145.0 \n",
176 | "4 1765.30 709.48 42226.0 "
177 | ]
178 | },
179 | "execution_count": 9,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "# Checking first 5 records from Transactions Data\n",
186 | "\n",
187 | "trans.head(5)"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 10,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "\n",
200 | "RangeIndex: 20000 entries, 0 to 19999\n",
201 | "Data columns (total 13 columns):\n",
202 | "transaction_id 20000 non-null int64\n",
203 | "product_id 20000 non-null int64\n",
204 | "customer_id 20000 non-null int64\n",
205 | "transaction_date 20000 non-null datetime64[ns]\n",
206 | "online_order 19640 non-null float64\n",
207 | "order_status 20000 non-null object\n",
208 | "brand 19803 non-null object\n",
209 | "product_line 19803 non-null object\n",
210 | "product_class 19803 non-null object\n",
211 | "product_size 19803 non-null object\n",
212 | "list_price 20000 non-null float64\n",
213 | "standard_cost 19803 non-null float64\n",
214 | "product_first_sold_date 19803 non-null float64\n",
215 | "dtypes: datetime64[ns](1), float64(4), int64(3), object(5)\n",
216 | "memory usage: 2.0+ MB\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "# Information of columns and data-types of Transactions Data.\n",
222 | "\n",
223 | "trans.info()"
224 | ]
225 | },
226 | {
227 | "cell_type": "markdown",
228 | "metadata": {},
229 | "source": [
230 | "The datatype of product_first_sold_date column is not in datetime format. The data type of this column must be changed from float64 to datetime format."
231 | ]
232 | },
233 | {
234 | "cell_type": "markdown",
235 | "metadata": {},
236 | "source": [
237 | "## Total Records"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": 11,
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "name": "stdout",
247 | "output_type": "stream",
248 | "text": [
249 | "Total records (rows) in the dataset : 20000\n",
250 | "Total columns (features) in the dataset : 13\n"
251 | ]
252 | }
253 | ],
254 | "source": [
255 | "print(\"Total records (rows) in the dataset : {}\".format(trans.shape[0]))\n",
256 | "print(\"Total columns (features) in the dataset : {}\".format(trans.shape[1]))"
257 | ]
258 | },
259 | {
260 | "cell_type": "markdown",
261 | "metadata": {},
262 | "source": [
263 | "## Numeric Columns and Non-Numeric Columns"
264 | ]
265 | },
266 | {
267 | "cell_type": "code",
268 | "execution_count": 12,
269 | "metadata": {},
270 | "outputs": [
271 | {
272 | "name": "stdout",
273 | "output_type": "stream",
274 | "text": [
275 | "The numeric columns are :\n",
276 | "['transaction_id' 'product_id' 'customer_id' 'online_order' 'list_price'\n",
277 | " 'standard_cost' 'product_first_sold_date']\n",
278 | "The non-numeric columns are :\n",
279 | "['transaction_date' 'order_status' 'brand' 'product_line' 'product_class'\n",
280 | " 'product_size']\n"
281 | ]
282 | }
283 | ],
284 | "source": [
285 | "# select numeric columns\n",
286 | "df_numeric = trans.select_dtypes(include=[np.number])\n",
287 | "numeric_cols = df_numeric.columns.values\n",
288 | "print(\"The numeric columns are :\")\n",
289 | "print(numeric_cols)\n",
290 | "\n",
291 | "\n",
292 | "# select non-numeric columns\n",
293 | "df_non_numeric = trans.select_dtypes(exclude=[np.number])\n",
294 | "non_numeric_cols = df_non_numeric.columns.values\n",
295 | "print(\"The non-numeric columns are :\")\n",
296 | "print(non_numeric_cols)"
297 | ]
298 | },
299 | {
300 | "cell_type": "markdown",
301 | "metadata": {},
302 | "source": [
303 | "## 1. Missing Values Check"
304 | ]
305 | },
306 | {
307 | "cell_type": "markdown",
308 | "metadata": {},
309 | "source": [
310 | "Checking for the presence of any missing values in the dataset. If missing values are present for a particular feature, then depending upon the situation the feature may either be dropped (in cases where a major amount of data is missing) or an appropriate value will be imputed in the feature column with missing values."
311 | ]
312 | },
313 | {
314 | "cell_type": "code",
315 | "execution_count": 13,
316 | "metadata": {},
317 | "outputs": [
318 | {
319 | "data": {
320 | "text/plain": [
321 | "transaction_id 0\n",
322 | "product_id 0\n",
323 | "customer_id 0\n",
324 | "transaction_date 0\n",
325 | "online_order 360\n",
326 | "order_status 0\n",
327 | "brand 197\n",
328 | "product_line 197\n",
329 | "product_class 197\n",
330 | "product_size 197\n",
331 | "list_price 0\n",
332 | "standard_cost 197\n",
333 | "product_first_sold_date 197\n",
334 | "dtype: int64"
335 | ]
336 | },
337 | "execution_count": 13,
338 | "metadata": {},
339 | "output_type": "execute_result"
340 | }
341 | ],
342 | "source": [
343 | "# Total number of missing values\n",
344 | "\n",
345 | "trans.isnull().sum()"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 14,
351 | "metadata": {},
352 | "outputs": [
353 | {
354 | "data": {
355 | "text/plain": [
356 | "transaction_id 0.000\n",
357 | "product_id 0.000\n",
358 | "customer_id 0.000\n",
359 | "transaction_date 0.000\n",
360 | "online_order 1.800\n",
361 | "order_status 0.000\n",
362 | "brand 0.985\n",
363 | "product_line 0.985\n",
364 | "product_class 0.985\n",
365 | "product_size 0.985\n",
366 | "list_price 0.000\n",
367 | "standard_cost 0.985\n",
368 | "product_first_sold_date 0.985\n",
369 | "dtype: float64"
370 | ]
371 | },
372 | "execution_count": 14,
373 | "metadata": {},
374 | "output_type": "execute_result"
375 | }
376 | ],
377 | "source": [
378 | "# Percentage of missing values\n",
379 | "\n",
380 | "trans.isnull().mean()*100"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {},
386 | "source": [
387 | "Here it is observed that the columns online_order, brand, product_line, product_class, product_size, standard_cost and product_first_sold_date have missing values."
388 | ]
389 | },
390 | {
391 | "cell_type": "markdown",
392 | "metadata": {},
393 | "source": [
394 | "### 1.1 Online Order"
395 | ]
396 | },
397 | {
398 | "cell_type": "markdown",
399 | "metadata": {},
400 | "source": [
401 | "Since only 1.8% of the records have online_order data missing, we can perform mode imputation for this categorical column."
402 | ]
403 | },
404 | {
405 | "cell_type": "code",
406 | "execution_count": 15,
407 | "metadata": {},
408 | "outputs": [
409 | {
410 | "data": {
411 | "text/html": [
412 | "\n",
413 | "\n",
426 | "
\n",
427 | " \n",
428 | " \n",
429 | " \n",
430 | " transaction_id \n",
431 | " product_id \n",
432 | " customer_id \n",
433 | " transaction_date \n",
434 | " online_order \n",
435 | " order_status \n",
436 | " brand \n",
437 | " product_line \n",
438 | " product_class \n",
439 | " product_size \n",
440 | " list_price \n",
441 | " standard_cost \n",
442 | " product_first_sold_date \n",
443 | " \n",
444 | " \n",
445 | " \n",
446 | " \n",
447 | " 97 \n",
448 | " 98 \n",
449 | " 49 \n",
450 | " 333 \n",
451 | " 2017-06-23 \n",
452 | " NaN \n",
453 | " Approved \n",
454 | " Trek Bicycles \n",
455 | " Road \n",
456 | " medium \n",
457 | " medium \n",
458 | " 533.51 \n",
459 | " 400.13 \n",
460 | " 37823.0 \n",
461 | " \n",
462 | " \n",
463 | " 166 \n",
464 | " 167 \n",
465 | " 90 \n",
466 | " 3177 \n",
467 | " 2017-04-26 \n",
468 | " NaN \n",
469 | " Approved \n",
470 | " Norco Bicycles \n",
471 | " Standard \n",
472 | " low \n",
473 | " medium \n",
474 | " 363.01 \n",
475 | " 290.41 \n",
476 | " 38482.0 \n",
477 | " \n",
478 | " \n",
479 | " 169 \n",
480 | " 170 \n",
481 | " 6 \n",
482 | " 404 \n",
483 | " 2017-10-16 \n",
484 | " NaN \n",
485 | " Approved \n",
486 | " OHM Cycles \n",
487 | " Standard \n",
488 | " high \n",
489 | " medium \n",
490 | " 227.88 \n",
491 | " 136.73 \n",
492 | " 37838.0 \n",
493 | " \n",
494 | " \n",
495 | " 250 \n",
496 | " 251 \n",
497 | " 63 \n",
498 | " 1967 \n",
499 | " 2017-04-11 \n",
500 | " NaN \n",
501 | " Approved \n",
502 | " Solex \n",
503 | " Standard \n",
504 | " medium \n",
505 | " medium \n",
506 | " 1483.20 \n",
507 | " 99.59 \n",
508 | " 42145.0 \n",
509 | " \n",
510 | " \n",
511 | " 300 \n",
512 | " 301 \n",
513 | " 78 \n",
514 | " 2530 \n",
515 | " 2017-03-24 \n",
516 | " NaN \n",
517 | " Approved \n",
518 | " Giant Bicycles \n",
519 | " Standard \n",
520 | " medium \n",
521 | " large \n",
522 | " 1765.30 \n",
523 | " 709.48 \n",
524 | " 35455.0 \n",
525 | " \n",
526 | " \n",
527 | " 336 \n",
528 | " 337 \n",
529 | " 82 \n",
530 | " 1615 \n",
531 | " 2017-10-30 \n",
532 | " NaN \n",
533 | " Approved \n",
534 | " Norco Bicycles \n",
535 | " Standard \n",
536 | " high \n",
537 | " medium \n",
538 | " 1148.64 \n",
539 | " 689.18 \n",
540 | " 41533.0 \n",
541 | " \n",
542 | " \n",
543 | " 342 \n",
544 | " 343 \n",
545 | " 61 \n",
546 | " 1478 \n",
547 | " 2017-03-11 \n",
548 | " NaN \n",
549 | " Approved \n",
550 | " Norco Bicycles \n",
551 | " Standard \n",
552 | " medium \n",
553 | " small \n",
554 | " 586.45 \n",
555 | " 521.94 \n",
556 | " 33429.0 \n",
557 | " \n",
558 | " \n",
559 | " 398 \n",
560 | " 399 \n",
561 | " 83 \n",
562 | " 1306 \n",
563 | " 2017-12-18 \n",
564 | " NaN \n",
565 | " Approved \n",
566 | " Solex \n",
567 | " Touring \n",
568 | " medium \n",
569 | " large \n",
570 | " 2083.94 \n",
571 | " 675.03 \n",
572 | " 38206.0 \n",
573 | " \n",
574 | " \n",
575 | " 476 \n",
576 | " 477 \n",
577 | " 73 \n",
578 | " 367 \n",
579 | " 2017-02-12 \n",
580 | " NaN \n",
581 | " Approved \n",
582 | " Solex \n",
583 | " Standard \n",
584 | " medium \n",
585 | " medium \n",
586 | " 1945.43 \n",
587 | " 333.18 \n",
588 | " 38859.0 \n",
589 | " \n",
590 | " \n",
591 | " 529 \n",
592 | " 530 \n",
593 | " 62 \n",
594 | " 1202 \n",
595 | " 2017-03-31 \n",
596 | " NaN \n",
597 | " Approved \n",
598 | " Solex \n",
599 | " Standard \n",
600 | " medium \n",
601 | " medium \n",
602 | " 478.16 \n",
603 | " 298.72 \n",
604 | " 34143.0 \n",
605 | " \n",
606 | " \n",
607 | " 558 \n",
608 | " 559 \n",
609 | " 20 \n",
610 | " 3104 \n",
611 | " 2017-03-21 \n",
612 | " NaN \n",
613 | " Approved \n",
614 | " Trek Bicycles \n",
615 | " Standard \n",
616 | " medium \n",
617 | " small \n",
618 | " 1775.81 \n",
619 | " 1580.47 \n",
620 | " 33455.0 \n",
621 | " \n",
622 | " \n",
623 | " 576 \n",
624 | " 577 \n",
625 | " 1 \n",
626 | " 149 \n",
627 | " 2017-05-04 \n",
628 | " NaN \n",
629 | " Approved \n",
630 | " Giant Bicycles \n",
631 | " Standard \n",
632 | " medium \n",
633 | " medium \n",
634 | " 1403.50 \n",
635 | " 954.82 \n",
636 | " 40649.0 \n",
637 | " \n",
638 | " \n",
639 | " 760 \n",
640 | " 761 \n",
641 | " 4 \n",
642 | " 2428 \n",
643 | " 2017-12-20 \n",
644 | " NaN \n",
645 | " Approved \n",
646 | " Giant Bicycles \n",
647 | " Standard \n",
648 | " high \n",
649 | " medium \n",
650 | " 1129.13 \n",
651 | " 677.48 \n",
652 | " 33549.0 \n",
653 | " \n",
654 | " \n",
655 | " 769 \n",
656 | " 770 \n",
657 | " 4 \n",
658 | " 2414 \n",
659 | " 2017-09-10 \n",
660 | " NaN \n",
661 | " Approved \n",
662 | " Giant Bicycles \n",
663 | " Standard \n",
664 | " high \n",
665 | " medium \n",
666 | " 1129.13 \n",
667 | " 677.48 \n",
668 | " 38258.0 \n",
669 | " \n",
670 | " \n",
671 | " 843 \n",
672 | " 844 \n",
673 | " 40 \n",
674 | " 3412 \n",
675 | " 2017-07-09 \n",
676 | " NaN \n",
677 | " Approved \n",
678 | " OHM Cycles \n",
679 | " Standard \n",
680 | " high \n",
681 | " medium \n",
682 | " 1458.17 \n",
683 | " 874.90 \n",
684 | " 36498.0 \n",
685 | " \n",
686 | " \n",
687 | " 914 \n",
688 | " 915 \n",
689 | " 39 \n",
690 | " 2756 \n",
691 | " 2017-03-16 \n",
692 | " NaN \n",
693 | " Approved \n",
694 | " Giant Bicycles \n",
695 | " Standard \n",
696 | " medium \n",
697 | " large \n",
698 | " 1812.75 \n",
699 | " 582.48 \n",
700 | " 40336.0 \n",
701 | " \n",
702 | " \n",
703 | " 917 \n",
704 | " 918 \n",
705 | " 35 \n",
706 | " 2281 \n",
707 | " 2017-04-30 \n",
708 | " NaN \n",
709 | " Approved \n",
710 | " Giant Bicycles \n",
711 | " Standard \n",
712 | " medium \n",
713 | " medium \n",
714 | " 1403.50 \n",
715 | " 954.82 \n",
716 | " 42688.0 \n",
717 | " \n",
718 | " \n",
719 | " 1060 \n",
720 | " 1061 \n",
721 | " 63 \n",
722 | " 1342 \n",
723 | " 2017-07-14 \n",
724 | " NaN \n",
725 | " Approved \n",
726 | " Solex \n",
727 | " Standard \n",
728 | " medium \n",
729 | " medium \n",
730 | " 1483.20 \n",
731 | " 99.59 \n",
732 | " 34996.0 \n",
733 | " \n",
734 | " \n",
735 | " 1071 \n",
736 | " 1072 \n",
737 | " 36 \n",
738 | " 1505 \n",
739 | " 2017-11-09 \n",
740 | " NaN \n",
741 | " Approved \n",
742 | " Solex \n",
743 | " Standard \n",
744 | " low \n",
745 | " medium \n",
746 | " 945.04 \n",
747 | " 507.58 \n",
748 | " 40784.0 \n",
749 | " \n",
750 | " \n",
751 | " 1159 \n",
752 | " 1160 \n",
753 | " 73 \n",
754 | " 1722 \n",
755 | " 2017-12-15 \n",
756 | " NaN \n",
757 | " Approved \n",
758 | " Solex \n",
759 | " Standard \n",
760 | " medium \n",
761 | " medium \n",
762 | " 1945.43 \n",
763 | " 333.18 \n",
764 | " 37499.0 \n",
765 | " \n",
766 | " \n",
767 | " 1192 \n",
768 | " 1193 \n",
769 | " 71 \n",
770 | " 79 \n",
771 | " 2017-11-17 \n",
772 | " NaN \n",
773 | " Approved \n",
774 | " Solex \n",
775 | " Standard \n",
776 | " high \n",
777 | " large \n",
778 | " 1842.92 \n",
779 | " 1105.75 \n",
780 | " 34996.0 \n",
781 | " \n",
782 | " \n",
783 | " 1200 \n",
784 | " 1201 \n",
785 | " 84 \n",
786 | " 3241 \n",
787 | " 2017-05-25 \n",
788 | " NaN \n",
789 | " Approved \n",
790 | " Trek Bicycles \n",
791 | " Road \n",
792 | " medium \n",
793 | " medium \n",
794 | " 290.62 \n",
795 | " 215.14 \n",
796 | " 38206.0 \n",
797 | " \n",
798 | " \n",
799 | " 1354 \n",
800 | " 1355 \n",
801 | " 92 \n",
802 | " 1864 \n",
803 | " 2017-02-28 \n",
804 | " NaN \n",
805 | " Approved \n",
806 | " WeareA2B \n",
807 | " Standard \n",
808 | " medium \n",
809 | " small \n",
810 | " 1415.01 \n",
811 | " 1259.36 \n",
812 | " 37626.0 \n",
813 | " \n",
814 | " \n",
815 | " 1361 \n",
816 | " 1362 \n",
817 | " 32 \n",
818 | " 1395 \n",
819 | " 2017-01-27 \n",
820 | " NaN \n",
821 | " Approved \n",
822 | " Giant Bicycles \n",
823 | " Standard \n",
824 | " high \n",
825 | " medium \n",
826 | " 1179.00 \n",
827 | " 707.40 \n",
828 | " 38482.0 \n",
829 | " \n",
830 | " \n",
831 | " 1371 \n",
832 | " 1372 \n",
833 | " 17 \n",
834 | " 3254 \n",
835 | " 2017-04-19 \n",
836 | " NaN \n",
837 | " Approved \n",
838 | " Solex \n",
839 | " Standard \n",
840 | " high \n",
841 | " medium \n",
842 | " 1024.66 \n",
843 | " 614.80 \n",
844 | " 35378.0 \n",
845 | " \n",
846 | " \n",
847 | " 1381 \n",
848 | " 1382 \n",
849 | " 96 \n",
850 | " 1393 \n",
851 | " 2017-12-29 \n",
852 | " NaN \n",
853 | " Approved \n",
854 | " WeareA2B \n",
855 | " Road \n",
856 | " low \n",
857 | " small \n",
858 | " 1172.78 \n",
859 | " 1043.77 \n",
860 | " 37539.0 \n",
861 | " \n",
862 | " \n",
863 | " 1523 \n",
864 | " 1524 \n",
865 | " 31 \n",
866 | " 2151 \n",
867 | " 2017-05-05 \n",
868 | " NaN \n",
869 | " Approved \n",
870 | " Giant Bicycles \n",
871 | " Standard \n",
872 | " medium \n",
873 | " medium \n",
874 | " 230.91 \n",
875 | " 173.18 \n",
876 | " 39031.0 \n",
877 | " \n",
878 | " \n",
879 | " 1552 \n",
880 | " 1553 \n",
881 | " 62 \n",
882 | " 842 \n",
883 | " 2017-10-23 \n",
884 | " NaN \n",
885 | " Approved \n",
886 | " Solex \n",
887 | " Standard \n",
888 | " high \n",
889 | " medium \n",
890 | " 1024.66 \n",
891 | " 614.80 \n",
892 | " 35378.0 \n",
893 | " \n",
894 | " \n",
895 | " 1555 \n",
896 | " 1556 \n",
897 | " 73 \n",
898 | " 2776 \n",
899 | " 2017-10-30 \n",
900 | " NaN \n",
901 | " Approved \n",
902 | " Solex \n",
903 | " Standard \n",
904 | " medium \n",
905 | " medium \n",
906 | " 1945.43 \n",
907 | " 333.18 \n",
908 | " 37499.0 \n",
909 | " \n",
910 | " \n",
911 | " 1621 \n",
912 | " 1622 \n",
913 | " 0 \n",
914 | " 736 \n",
915 | " 2017-09-07 \n",
916 | " NaN \n",
917 | " Approved \n",
918 | " WeareA2B \n",
919 | " Standard \n",
920 | " medium \n",
921 | " small \n",
922 | " 175.89 \n",
923 | " 131.92 \n",
924 | " 37668.0 \n",
925 | " \n",
926 | " \n",
927 | " ... \n",
928 | " ... \n",
929 | " ... \n",
930 | " ... \n",
931 | " ... \n",
932 | " ... \n",
933 | " ... \n",
934 | " ... \n",
935 | " ... \n",
936 | " ... \n",
937 | " ... \n",
938 | " ... \n",
939 | " ... \n",
940 | " ... \n",
941 | " \n",
942 | " \n",
943 | " 18068 \n",
944 | " 18069 \n",
945 | " 26 \n",
946 | " 788 \n",
947 | " 2017-07-14 \n",
948 | " NaN \n",
949 | " Approved \n",
950 | " WeareA2B \n",
951 | " Standard \n",
952 | " medium \n",
953 | " medium \n",
954 | " 1992.93 \n",
955 | " 762.63 \n",
956 | " 34115.0 \n",
957 | " \n",
958 | " \n",
959 | " 18085 \n",
960 | " 18086 \n",
961 | " 48 \n",
962 | " 2813 \n",
963 | " 2017-04-07 \n",
964 | " NaN \n",
965 | " Approved \n",
966 | " WeareA2B \n",
967 | " Standard \n",
968 | " medium \n",
969 | " medium \n",
970 | " 1762.96 \n",
971 | " 950.52 \n",
972 | " 41064.0 \n",
973 | " \n",
974 | " \n",
975 | " 18335 \n",
976 | " 18336 \n",
977 | " 95 \n",
978 | " 2958 \n",
979 | " 2017-06-04 \n",
980 | " NaN \n",
981 | " Approved \n",
982 | " Giant Bicycles \n",
983 | " Standard \n",
984 | " medium \n",
985 | " large \n",
986 | " 569.56 \n",
987 | " 528.43 \n",
988 | " 37874.0 \n",
989 | " \n",
990 | " \n",
991 | " 18471 \n",
992 | " 18472 \n",
993 | " 81 \n",
994 | " 176 \n",
995 | " 2017-03-06 \n",
996 | " NaN \n",
997 | " Approved \n",
998 | " Norco Bicycles \n",
999 | " Standard \n",
1000 | " medium \n",
1001 | " small \n",
1002 | " 586.45 \n",
1003 | " 521.94 \n",
1004 | " 33429.0 \n",
1005 | " \n",
1006 | " \n",
1007 | " 18501 \n",
1008 | " 18502 \n",
1009 | " 19 \n",
1010 | " 2195 \n",
1011 | " 2017-11-02 \n",
1012 | " NaN \n",
1013 | " Approved \n",
1014 | " OHM Cycles \n",
1015 | " Road \n",
1016 | " high \n",
1017 | " large \n",
1018 | " 12.01 \n",
1019 | " 7.21 \n",
1020 | " 39880.0 \n",
1021 | " \n",
1022 | " \n",
1023 | " 18606 \n",
1024 | " 18607 \n",
1025 | " 43 \n",
1026 | " 2138 \n",
1027 | " 2017-12-01 \n",
1028 | " NaN \n",
1029 | " Approved \n",
1030 | " Norco Bicycles \n",
1031 | " Standard \n",
1032 | " medium \n",
1033 | " medium \n",
1034 | " 1555.58 \n",
1035 | " 818.01 \n",
1036 | " 37873.0 \n",
1037 | " \n",
1038 | " \n",
1039 | " 18624 \n",
1040 | " 18625 \n",
1041 | " 85 \n",
1042 | " 1133 \n",
1043 | " 2017-06-17 \n",
1044 | " NaN \n",
1045 | " Approved \n",
1046 | " WeareA2B \n",
1047 | " Standard \n",
1048 | " medium \n",
1049 | " medium \n",
1050 | " 752.64 \n",
1051 | " 205.36 \n",
1052 | " 42218.0 \n",
1053 | " \n",
1054 | " \n",
1055 | " 18689 \n",
1056 | " 18690 \n",
1057 | " 23 \n",
1058 | " 2987 \n",
1059 | " 2017-12-13 \n",
1060 | " NaN \n",
1061 | " Approved \n",
1062 | " Norco Bicycles \n",
1063 | " Standard \n",
1064 | " medium \n",
1065 | " medium \n",
1066 | " 1198.46 \n",
1067 | " 381.10 \n",
1068 | " 41701.0 \n",
1069 | " \n",
1070 | " \n",
1071 | " 18711 \n",
1072 | " 18712 \n",
1073 | " 78 \n",
1074 | " 665 \n",
1075 | " 2017-09-22 \n",
1076 | " NaN \n",
1077 | " Approved \n",
1078 | " Giant Bicycles \n",
1079 | " Standard \n",
1080 | " medium \n",
1081 | " large \n",
1082 | " 1765.30 \n",
1083 | " 709.48 \n",
1084 | " 38193.0 \n",
1085 | " \n",
1086 | " \n",
1087 | " 18722 \n",
1088 | " 18723 \n",
1089 | " 65 \n",
1090 | " 1597 \n",
1091 | " 2017-11-21 \n",
1092 | " NaN \n",
1093 | " Approved \n",
1094 | " WeareA2B \n",
1095 | " Standard \n",
1096 | " medium \n",
1097 | " medium \n",
1098 | " 1807.45 \n",
1099 | " 778.69 \n",
1100 | " 42145.0 \n",
1101 | " \n",
1102 | " \n",
1103 | " 18751 \n",
1104 | " 18752 \n",
1105 | " 22 \n",
1106 | " 2200 \n",
1107 | " 2017-10-12 \n",
1108 | " NaN \n",
1109 | " Approved \n",
1110 | " WeareA2B \n",
1111 | " Standard \n",
1112 | " medium \n",
1113 | " medium \n",
1114 | " 60.34 \n",
1115 | " 45.26 \n",
1116 | " 33455.0 \n",
1117 | " \n",
1118 | " \n",
1119 | " 18754 \n",
1120 | " 18755 \n",
1121 | " 55 \n",
1122 | " 403 \n",
1123 | " 2017-01-30 \n",
1124 | " NaN \n",
1125 | " Approved \n",
1126 | " Trek Bicycles \n",
1127 | " Road \n",
1128 | " medium \n",
1129 | " large \n",
1130 | " 1894.19 \n",
1131 | " 598.76 \n",
1132 | " 35707.0 \n",
1133 | " \n",
1134 | " \n",
1135 | " 18821 \n",
1136 | " 18822 \n",
1137 | " 30 \n",
1138 | " 3464 \n",
1139 | " 2017-12-13 \n",
1140 | " NaN \n",
1141 | " Approved \n",
1142 | " Solex \n",
1143 | " Standard \n",
1144 | " high \n",
1145 | " medium \n",
1146 | " 748.17 \n",
1147 | " 448.90 \n",
1148 | " 37337.0 \n",
1149 | " \n",
1150 | " \n",
1151 | " 18846 \n",
1152 | " 18847 \n",
1153 | " 57 \n",
1154 | " 50 \n",
1155 | " 2017-09-20 \n",
1156 | " NaN \n",
1157 | " Approved \n",
1158 | " WeareA2B \n",
1159 | " Touring \n",
1160 | " medium \n",
1161 | " large \n",
1162 | " 1890.39 \n",
1163 | " 260.14 \n",
1164 | " 34170.0 \n",
1165 | " \n",
1166 | " \n",
1167 | " 18982 \n",
1168 | " 18983 \n",
1169 | " 21 \n",
1170 | " 743 \n",
1171 | " 2017-01-03 \n",
1172 | " NaN \n",
1173 | " Approved \n",
1174 | " Solex \n",
1175 | " Standard \n",
1176 | " medium \n",
1177 | " large \n",
1178 | " 1071.23 \n",
1179 | " 380.74 \n",
1180 | " 34244.0 \n",
1181 | " \n",
1182 | " \n",
1183 | " 19024 \n",
1184 | " 19025 \n",
1185 | " 36 \n",
1186 | " 1937 \n",
1187 | " 2017-08-04 \n",
1188 | " NaN \n",
1189 | " Approved \n",
1190 | " Solex \n",
1191 | " Standard \n",
1192 | " low \n",
1193 | " medium \n",
1194 | " 945.04 \n",
1195 | " 507.58 \n",
1196 | " 35052.0 \n",
1197 | " \n",
1198 | " \n",
1199 | " 19082 \n",
1200 | " 19083 \n",
1201 | " 90 \n",
1202 | " 1204 \n",
1203 | " 2017-12-06 \n",
1204 | " NaN \n",
1205 | " Approved \n",
1206 | " Norco Bicycles \n",
1207 | " Standard \n",
1208 | " low \n",
1209 | " medium \n",
1210 | " 363.01 \n",
1211 | " 290.41 \n",
1212 | " 36367.0 \n",
1213 | " \n",
1214 | " \n",
1215 | " 19100 \n",
1216 | " 19101 \n",
1217 | " 8 \n",
1218 | " 2713 \n",
1219 | " 2017-10-21 \n",
1220 | " NaN \n",
1221 | " Approved \n",
1222 | " Solex \n",
1223 | " Road \n",
1224 | " medium \n",
1225 | " small \n",
1226 | " 1703.52 \n",
1227 | " 1516.13 \n",
1228 | " 38216.0 \n",
1229 | " \n",
1230 | " \n",
1231 | " 19271 \n",
1232 | " 19272 \n",
1233 | " 91 \n",
1234 | " 3017 \n",
1235 | " 2017-07-07 \n",
1236 | " NaN \n",
1237 | " Approved \n",
1238 | " Solex \n",
1239 | " Standard \n",
1240 | " medium \n",
1241 | " medium \n",
1242 | " 100.35 \n",
1243 | " 75.26 \n",
1244 | " 36367.0 \n",
1245 | " \n",
1246 | " \n",
1247 | " 19282 \n",
1248 | " 19283 \n",
1249 | " 55 \n",
1250 | " 1364 \n",
1251 | " 2017-08-02 \n",
1252 | " NaN \n",
1253 | " Approved \n",
1254 | " Trek Bicycles \n",
1255 | " Road \n",
1256 | " medium \n",
1257 | " large \n",
1258 | " 1894.19 \n",
1259 | " 598.76 \n",
1260 | " 37823.0 \n",
1261 | " \n",
1262 | " \n",
1263 | " 19303 \n",
1264 | " 19304 \n",
1265 | " 35 \n",
1266 | " 1776 \n",
1267 | " 2017-08-22 \n",
1268 | " NaN \n",
1269 | " Approved \n",
1270 | " Trek Bicycles \n",
1271 | " Standard \n",
1272 | " low \n",
1273 | " medium \n",
1274 | " 1057.51 \n",
1275 | " 154.40 \n",
1276 | " 34527.0 \n",
1277 | " \n",
1278 | " \n",
1279 | " 19368 \n",
1280 | " 19369 \n",
1281 | " 66 \n",
1282 | " 3223 \n",
1283 | " 2017-12-16 \n",
1284 | " NaN \n",
1285 | " Approved \n",
1286 | " Giant Bicycles \n",
1287 | " Road \n",
1288 | " low \n",
1289 | " small \n",
1290 | " 590.26 \n",
1291 | " 525.33 \n",
1292 | " 38647.0 \n",
1293 | " \n",
1294 | " \n",
1295 | " 19369 \n",
1296 | " 19370 \n",
1297 | " 91 \n",
1298 | " 1107 \n",
1299 | " 2017-09-02 \n",
1300 | " NaN \n",
1301 | " Approved \n",
1302 | " WeareA2B \n",
1303 | " Standard \n",
1304 | " low \n",
1305 | " medium \n",
1306 | " 642.31 \n",
1307 | " 513.85 \n",
1308 | " 41922.0 \n",
1309 | " \n",
1310 | " \n",
1311 | " 19433 \n",
1312 | " 19434 \n",
1313 | " 86 \n",
1314 | " 1557 \n",
1315 | " 2017-03-10 \n",
1316 | " NaN \n",
1317 | " Approved \n",
1318 | " OHM Cycles \n",
1319 | " Standard \n",
1320 | " medium \n",
1321 | " medium \n",
1322 | " 235.63 \n",
1323 | " 125.07 \n",
1324 | " 38206.0 \n",
1325 | " \n",
1326 | " \n",
1327 | " 19495 \n",
1328 | " 19496 \n",
1329 | " 2 \n",
1330 | " 3028 \n",
1331 | " 2017-11-14 \n",
1332 | " NaN \n",
1333 | " Approved \n",
1334 | " Giant Bicycles \n",
1335 | " Road \n",
1336 | " low \n",
1337 | " small \n",
1338 | " 590.26 \n",
1339 | " 525.33 \n",
1340 | " 42710.0 \n",
1341 | " \n",
1342 | " \n",
1343 | " 19514 \n",
1344 | " 19515 \n",
1345 | " 51 \n",
1346 | " 690 \n",
1347 | " 2017-01-22 \n",
1348 | " NaN \n",
1349 | " Approved \n",
1350 | " OHM Cycles \n",
1351 | " Standard \n",
1352 | " high \n",
1353 | " medium \n",
1354 | " 2005.66 \n",
1355 | " 1203.40 \n",
1356 | " 37823.0 \n",
1357 | " \n",
1358 | " \n",
1359 | " 19573 \n",
1360 | " 19574 \n",
1361 | " 18 \n",
1362 | " 1735 \n",
1363 | " 2017-01-15 \n",
1364 | " NaN \n",
1365 | " Approved \n",
1366 | " Solex \n",
1367 | " Standard \n",
1368 | " medium \n",
1369 | " medium \n",
1370 | " 575.27 \n",
1371 | " 431.45 \n",
1372 | " 41345.0 \n",
1373 | " \n",
1374 | " \n",
1375 | " 19580 \n",
1376 | " 19581 \n",
1377 | " 49 \n",
1378 | " 1933 \n",
1379 | " 2017-10-12 \n",
1380 | " NaN \n",
1381 | " Approved \n",
1382 | " Trek Bicycles \n",
1383 | " Road \n",
1384 | " medium \n",
1385 | " medium \n",
1386 | " 533.51 \n",
1387 | " 400.13 \n",
1388 | " 41064.0 \n",
1389 | " \n",
1390 | " \n",
1391 | " 19635 \n",
1392 | " 19636 \n",
1393 | " 98 \n",
1394 | " 1389 \n",
1395 | " 2017-07-26 \n",
1396 | " NaN \n",
1397 | " Approved \n",
1398 | " Trek Bicycles \n",
1399 | " Standard \n",
1400 | " high \n",
1401 | " medium \n",
1402 | " 358.39 \n",
1403 | " 215.03 \n",
1404 | " 38002.0 \n",
1405 | " \n",
1406 | " \n",
1407 | " 19843 \n",
1408 | " 19844 \n",
1409 | " 3 \n",
1410 | " 8 \n",
1411 | " 2017-01-28 \n",
1412 | " NaN \n",
1413 | " Approved \n",
1414 | " Trek Bicycles \n",
1415 | " Standard \n",
1416 | " medium \n",
1417 | " large \n",
1418 | " 2091.47 \n",
1419 | " 388.92 \n",
1420 | " 37823.0 \n",
1421 | " \n",
1422 | " \n",
1423 | "
\n",
1424 | "
360 rows × 13 columns
\n",
1425 | "
"
1426 | ],
1427 | "text/plain": [
1428 | " transaction_id product_id customer_id transaction_date online_order \\\n",
1429 | "97 98 49 333 2017-06-23 NaN \n",
1430 | "166 167 90 3177 2017-04-26 NaN \n",
1431 | "169 170 6 404 2017-10-16 NaN \n",
1432 | "250 251 63 1967 2017-04-11 NaN \n",
1433 | "300 301 78 2530 2017-03-24 NaN \n",
1434 | "336 337 82 1615 2017-10-30 NaN \n",
1435 | "342 343 61 1478 2017-03-11 NaN \n",
1436 | "398 399 83 1306 2017-12-18 NaN \n",
1437 | "476 477 73 367 2017-02-12 NaN \n",
1438 | "529 530 62 1202 2017-03-31 NaN \n",
1439 | "558 559 20 3104 2017-03-21 NaN \n",
1440 | "576 577 1 149 2017-05-04 NaN \n",
1441 | "760 761 4 2428 2017-12-20 NaN \n",
1442 | "769 770 4 2414 2017-09-10 NaN \n",
1443 | "843 844 40 3412 2017-07-09 NaN \n",
1444 | "914 915 39 2756 2017-03-16 NaN \n",
1445 | "917 918 35 2281 2017-04-30 NaN \n",
1446 | "1060 1061 63 1342 2017-07-14 NaN \n",
1447 | "1071 1072 36 1505 2017-11-09 NaN \n",
1448 | "1159 1160 73 1722 2017-12-15 NaN \n",
1449 | "1192 1193 71 79 2017-11-17 NaN \n",
1450 | "1200 1201 84 3241 2017-05-25 NaN \n",
1451 | "1354 1355 92 1864 2017-02-28 NaN \n",
1452 | "1361 1362 32 1395 2017-01-27 NaN \n",
1453 | "1371 1372 17 3254 2017-04-19 NaN \n",
1454 | "1381 1382 96 1393 2017-12-29 NaN \n",
1455 | "1523 1524 31 2151 2017-05-05 NaN \n",
1456 | "1552 1553 62 842 2017-10-23 NaN \n",
1457 | "1555 1556 73 2776 2017-10-30 NaN \n",
1458 | "1621 1622 0 736 2017-09-07 NaN \n",
1459 | "... ... ... ... ... ... \n",
1460 | "18068 18069 26 788 2017-07-14 NaN \n",
1461 | "18085 18086 48 2813 2017-04-07 NaN \n",
1462 | "18335 18336 95 2958 2017-06-04 NaN \n",
1463 | "18471 18472 81 176 2017-03-06 NaN \n",
1464 | "18501 18502 19 2195 2017-11-02 NaN \n",
1465 | "18606 18607 43 2138 2017-12-01 NaN \n",
1466 | "18624 18625 85 1133 2017-06-17 NaN \n",
1467 | "18689 18690 23 2987 2017-12-13 NaN \n",
1468 | "18711 18712 78 665 2017-09-22 NaN \n",
1469 | "18722 18723 65 1597 2017-11-21 NaN \n",
1470 | "18751 18752 22 2200 2017-10-12 NaN \n",
1471 | "18754 18755 55 403 2017-01-30 NaN \n",
1472 | "18821 18822 30 3464 2017-12-13 NaN \n",
1473 | "18846 18847 57 50 2017-09-20 NaN \n",
1474 | "18982 18983 21 743 2017-01-03 NaN \n",
1475 | "19024 19025 36 1937 2017-08-04 NaN \n",
1476 | "19082 19083 90 1204 2017-12-06 NaN \n",
1477 | "19100 19101 8 2713 2017-10-21 NaN \n",
1478 | "19271 19272 91 3017 2017-07-07 NaN \n",
1479 | "19282 19283 55 1364 2017-08-02 NaN \n",
1480 | "19303 19304 35 1776 2017-08-22 NaN \n",
1481 | "19368 19369 66 3223 2017-12-16 NaN \n",
1482 | "19369 19370 91 1107 2017-09-02 NaN \n",
1483 | "19433 19434 86 1557 2017-03-10 NaN \n",
1484 | "19495 19496 2 3028 2017-11-14 NaN \n",
1485 | "19514 19515 51 690 2017-01-22 NaN \n",
1486 | "19573 19574 18 1735 2017-01-15 NaN \n",
1487 | "19580 19581 49 1933 2017-10-12 NaN \n",
1488 | "19635 19636 98 1389 2017-07-26 NaN \n",
1489 | "19843 19844 3 8 2017-01-28 NaN \n",
1490 | "\n",
1491 | " order_status brand product_line product_class product_size \\\n",
1492 | "97 Approved Trek Bicycles Road medium medium \n",
1493 | "166 Approved Norco Bicycles Standard low medium \n",
1494 | "169 Approved OHM Cycles Standard high medium \n",
1495 | "250 Approved Solex Standard medium medium \n",
1496 | "300 Approved Giant Bicycles Standard medium large \n",
1497 | "336 Approved Norco Bicycles Standard high medium \n",
1498 | "342 Approved Norco Bicycles Standard medium small \n",
1499 | "398 Approved Solex Touring medium large \n",
1500 | "476 Approved Solex Standard medium medium \n",
1501 | "529 Approved Solex Standard medium medium \n",
1502 | "558 Approved Trek Bicycles Standard medium small \n",
1503 | "576 Approved Giant Bicycles Standard medium medium \n",
1504 | "760 Approved Giant Bicycles Standard high medium \n",
1505 | "769 Approved Giant Bicycles Standard high medium \n",
1506 | "843 Approved OHM Cycles Standard high medium \n",
1507 | "914 Approved Giant Bicycles Standard medium large \n",
1508 | "917 Approved Giant Bicycles Standard medium medium \n",
1509 | "1060 Approved Solex Standard medium medium \n",
1510 | "1071 Approved Solex Standard low medium \n",
1511 | "1159 Approved Solex Standard medium medium \n",
1512 | "1192 Approved Solex Standard high large \n",
1513 | "1200 Approved Trek Bicycles Road medium medium \n",
1514 | "1354 Approved WeareA2B Standard medium small \n",
1515 | "1361 Approved Giant Bicycles Standard high medium \n",
1516 | "1371 Approved Solex Standard high medium \n",
1517 | "1381 Approved WeareA2B Road low small \n",
1518 | "1523 Approved Giant Bicycles Standard medium medium \n",
1519 | "1552 Approved Solex Standard high medium \n",
1520 | "1555 Approved Solex Standard medium medium \n",
1521 | "1621 Approved WeareA2B Standard medium small \n",
1522 | "... ... ... ... ... ... \n",
1523 | "18068 Approved WeareA2B Standard medium medium \n",
1524 | "18085 Approved WeareA2B Standard medium medium \n",
1525 | "18335 Approved Giant Bicycles Standard medium large \n",
1526 | "18471 Approved Norco Bicycles Standard medium small \n",
1527 | "18501 Approved OHM Cycles Road high large \n",
1528 | "18606 Approved Norco Bicycles Standard medium medium \n",
1529 | "18624 Approved WeareA2B Standard medium medium \n",
1530 | "18689 Approved Norco Bicycles Standard medium medium \n",
1531 | "18711 Approved Giant Bicycles Standard medium large \n",
1532 | "18722 Approved WeareA2B Standard medium medium \n",
1533 | "18751 Approved WeareA2B Standard medium medium \n",
1534 | "18754 Approved Trek Bicycles Road medium large \n",
1535 | "18821 Approved Solex Standard high medium \n",
1536 | "18846 Approved WeareA2B Touring medium large \n",
1537 | "18982 Approved Solex Standard medium large \n",
1538 | "19024 Approved Solex Standard low medium \n",
1539 | "19082 Approved Norco Bicycles Standard low medium \n",
1540 | "19100 Approved Solex Road medium small \n",
1541 | "19271 Approved Solex Standard medium medium \n",
1542 | "19282 Approved Trek Bicycles Road medium large \n",
1543 | "19303 Approved Trek Bicycles Standard low medium \n",
1544 | "19368 Approved Giant Bicycles Road low small \n",
1545 | "19369 Approved WeareA2B Standard low medium \n",
1546 | "19433 Approved OHM Cycles Standard medium medium \n",
1547 | "19495 Approved Giant Bicycles Road low small \n",
1548 | "19514 Approved OHM Cycles Standard high medium \n",
1549 | "19573 Approved Solex Standard medium medium \n",
1550 | "19580 Approved Trek Bicycles Road medium medium \n",
1551 | "19635 Approved Trek Bicycles Standard high medium \n",
1552 | "19843 Approved Trek Bicycles Standard medium large \n",
1553 | "\n",
1554 | " list_price standard_cost product_first_sold_date \n",
1555 | "97 533.51 400.13 37823.0 \n",
1556 | "166 363.01 290.41 38482.0 \n",
1557 | "169 227.88 136.73 37838.0 \n",
1558 | "250 1483.20 99.59 42145.0 \n",
1559 | "300 1765.30 709.48 35455.0 \n",
1560 | "336 1148.64 689.18 41533.0 \n",
1561 | "342 586.45 521.94 33429.0 \n",
1562 | "398 2083.94 675.03 38206.0 \n",
1563 | "476 1945.43 333.18 38859.0 \n",
1564 | "529 478.16 298.72 34143.0 \n",
1565 | "558 1775.81 1580.47 33455.0 \n",
1566 | "576 1403.50 954.82 40649.0 \n",
1567 | "760 1129.13 677.48 33549.0 \n",
1568 | "769 1129.13 677.48 38258.0 \n",
1569 | "843 1458.17 874.90 36498.0 \n",
1570 | "914 1812.75 582.48 40336.0 \n",
1571 | "917 1403.50 954.82 42688.0 \n",
1572 | "1060 1483.20 99.59 34996.0 \n",
1573 | "1071 945.04 507.58 40784.0 \n",
1574 | "1159 1945.43 333.18 37499.0 \n",
1575 | "1192 1842.92 1105.75 34996.0 \n",
1576 | "1200 290.62 215.14 38206.0 \n",
1577 | "1354 1415.01 1259.36 37626.0 \n",
1578 | "1361 1179.00 707.40 38482.0 \n",
1579 | "1371 1024.66 614.80 35378.0 \n",
1580 | "1381 1172.78 1043.77 37539.0 \n",
1581 | "1523 230.91 173.18 39031.0 \n",
1582 | "1552 1024.66 614.80 35378.0 \n",
1583 | "1555 1945.43 333.18 37499.0 \n",
1584 | "1621 175.89 131.92 37668.0 \n",
1585 | "... ... ... ... \n",
1586 | "18068 1992.93 762.63 34115.0 \n",
1587 | "18085 1762.96 950.52 41064.0 \n",
1588 | "18335 569.56 528.43 37874.0 \n",
1589 | "18471 586.45 521.94 33429.0 \n",
1590 | "18501 12.01 7.21 39880.0 \n",
1591 | "18606 1555.58 818.01 37873.0 \n",
1592 | "18624 752.64 205.36 42218.0 \n",
1593 | "18689 1198.46 381.10 41701.0 \n",
1594 | "18711 1765.30 709.48 38193.0 \n",
1595 | "18722 1807.45 778.69 42145.0 \n",
1596 | "18751 60.34 45.26 33455.0 \n",
1597 | "18754 1894.19 598.76 35707.0 \n",
1598 | "18821 748.17 448.90 37337.0 \n",
1599 | "18846 1890.39 260.14 34170.0 \n",
1600 | "18982 1071.23 380.74 34244.0 \n",
1601 | "19024 945.04 507.58 35052.0 \n",
1602 | "19082 363.01 290.41 36367.0 \n",
1603 | "19100 1703.52 1516.13 38216.0 \n",
1604 | "19271 100.35 75.26 36367.0 \n",
1605 | "19282 1894.19 598.76 37823.0 \n",
1606 | "19303 1057.51 154.40 34527.0 \n",
1607 | "19368 590.26 525.33 38647.0 \n",
1608 | "19369 642.31 513.85 41922.0 \n",
1609 | "19433 235.63 125.07 38206.0 \n",
1610 | "19495 590.26 525.33 42710.0 \n",
1611 | "19514 2005.66 1203.40 37823.0 \n",
1612 | "19573 575.27 431.45 41345.0 \n",
1613 | "19580 533.51 400.13 41064.0 \n",
1614 | "19635 358.39 215.03 38002.0 \n",
1615 | "19843 2091.47 388.92 37823.0 \n",
1616 | "\n",
1617 | "[360 rows x 13 columns]"
1618 | ]
1619 | },
1620 | "execution_count": 15,
1621 | "metadata": {},
1622 | "output_type": "execute_result"
1623 | }
1624 | ],
1625 | "source": [
1626 | "trans[trans['online_order'].isnull()]"
1627 | ]
1628 | },
1629 | {
1630 | "cell_type": "code",
1631 | "execution_count": 16,
1632 | "metadata": {},
1633 | "outputs": [
1634 | {
1635 | "data": {
1636 | "text/plain": [
1637 | "0 1.0\n",
1638 | "dtype: float64"
1639 | ]
1640 | },
1641 | "execution_count": 16,
1642 | "metadata": {},
1643 | "output_type": "execute_result"
1644 | }
1645 | ],
1646 | "source": [
1647 | "most_freq_online_mode = trans['online_order'].mode()\n",
1648 | "most_freq_online_mode"
1649 | ]
1650 | },
1651 | {
1652 | "cell_type": "markdown",
1653 | "metadata": {},
1654 | "source": [
1655 | "Since 1 is the most frequent value of online_order, we will perform mode imputation on this categorical column."
1656 | ]
1657 | },
1658 | {
1659 | "cell_type": "code",
1660 | "execution_count": 17,
1661 | "metadata": {},
1662 | "outputs": [],
1663 | "source": [
1664 | "trans['online_order'].fillna(1, inplace=True)"
1665 | ]
1666 | },
1667 | {
1668 | "cell_type": "code",
1669 | "execution_count": 18,
1670 | "metadata": {},
1671 | "outputs": [
1672 | {
1673 | "data": {
1674 | "text/plain": [
1675 | "0"
1676 | ]
1677 | },
1678 | "execution_count": 18,
1679 | "metadata": {},
1680 | "output_type": "execute_result"
1681 | }
1682 | ],
1683 | "source": [
1684 | "trans['online_order'].isnull().sum()"
1685 | ]
1686 | },
1687 | {
1688 | "cell_type": "markdown",
1689 | "metadata": {},
1690 | "source": [
1691 | "There are now no missing values in the online_order column."
1692 | ]
1693 | },
1694 | {
1695 | "cell_type": "markdown",
1696 | "metadata": {},
1697 | "source": [
1698 | "## 1.2 Brand, Product Line, Product Class, Product Size, Standard Cost, Product First Sold Date"
1699 | ]
1700 | },
1701 | {
1702 | "cell_type": "markdown",
1703 | "metadata": {},
1704 | "source": [
1705 | "It is observed that when brand is null, all the other columns with missing values, viz. 'product_line', 'product_class', 'product_size', 'standard_cost' and 'product_first_sold_date', are also null. These null records comprise about 1% of the dataset. Hence we can drop these records/rows."
1706 | ]
1707 | },
1708 | {
1709 | "cell_type": "code",
1710 | "execution_count": 19,
1711 | "metadata": {},
1712 | "outputs": [
1713 | {
1714 | "data": {
1715 | "text/html": [
1716 | "\n",
1717 | "\n",
1730 | "
\n",
1731 | " \n",
1732 | " \n",
1733 | " \n",
1734 | " brand \n",
1735 | " product_line \n",
1736 | " product_class \n",
1737 | " product_size \n",
1738 | " standard_cost \n",
1739 | " product_first_sold_date \n",
1740 | " \n",
1741 | " \n",
1742 | " \n",
1743 | " \n",
1744 | " 136 \n",
1745 | " NaN \n",
1746 | " NaN \n",
1747 | " NaN \n",
1748 | " NaN \n",
1749 | " NaN \n",
1750 | " NaN \n",
1751 | " \n",
1752 | " \n",
1753 | "
\n",
1754 | "
"
1755 | ],
1756 | "text/plain": [
1757 | " brand product_line product_class product_size standard_cost \\\n",
1758 | "136 NaN NaN NaN NaN NaN \n",
1759 | "\n",
1760 | " product_first_sold_date \n",
1761 | "136 NaN "
1762 | ]
1763 | },
1764 | "execution_count": 19,
1765 | "metadata": {},
1766 | "output_type": "execute_result"
1767 | }
1768 | ],
1769 | "source": [
1770 | "trans[trans['brand'].isnull()][['brand', 'product_line', 'product_class', 'product_size', \n",
1771 | " 'standard_cost', 'product_first_sold_date']].drop_duplicates()"
1772 | ]
1773 | },
1774 | {
1775 | "cell_type": "code",
1776 | "execution_count": 20,
1777 | "metadata": {},
1778 | "outputs": [
1779 | {
1780 | "data": {
1781 | "text/plain": [
1782 | "197"
1783 | ]
1784 | },
1785 | "execution_count": 20,
1786 | "metadata": {},
1787 | "output_type": "execute_result"
1788 | }
1789 | ],
1790 | "source": [
1791 | "trans[trans['brand'].isnull()][['brand', 'product_line', 'product_class', 'product_size', \n",
1792 | " 'standard_cost', 'product_first_sold_date']].shape[0]"
1793 | ]
1794 | },
1795 | {
1796 | "cell_type": "code",
1797 | "execution_count": 21,
1798 | "metadata": {},
1799 | "outputs": [
1800 | {
1801 | "data": {
1802 | "text/plain": [
1803 | "Int64Index([ 136, 159, 366, 406, 676, 780, 1003, 1130, 1196,\n",
1804 | " 1282,\n",
1805 | " ...\n",
1806 | " 19045, 19132, 19176, 19196, 19205, 19340, 19383, 19793, 19859,\n",
1807 | " 19871],\n",
1808 | " dtype='int64', length=197)"
1809 | ]
1810 | },
1811 | "execution_count": 21,
1812 | "metadata": {},
1813 | "output_type": "execute_result"
1814 | }
1815 | ],
1816 | "source": [
1817 | "records_to_drop = trans[trans['brand'].isnull()][['brand', 'product_line', 'product_class', 'product_size', \n",
1818 | " 'standard_cost', 'product_first_sold_date']].index\n",
1819 | "records_to_drop"
1820 | ]
1821 | },
1822 | {
1823 | "cell_type": "code",
1824 | "execution_count": 22,
1825 | "metadata": {},
1826 | "outputs": [],
1827 | "source": [
1828 | "trans.drop(index=records_to_drop, axis=0, inplace=True)"
1829 | ]
1830 | },
1831 | {
1832 | "cell_type": "markdown",
1833 | "metadata": {},
1834 | "source": [
1835 | "Finally, there are no missing values in the transaction dataset."
1836 | ]
1837 | },
1838 | {
1839 | "cell_type": "code",
1840 | "execution_count": 23,
1841 | "metadata": {},
1842 | "outputs": [
1843 | {
1844 | "data": {
1845 | "text/plain": [
1846 | "transaction_id 0\n",
1847 | "product_id 0\n",
1848 | "customer_id 0\n",
1849 | "transaction_date 0\n",
1850 | "online_order 0\n",
1851 | "order_status 0\n",
1852 | "brand 0\n",
1853 | "product_line 0\n",
1854 | "product_class 0\n",
1855 | "product_size 0\n",
1856 | "list_price 0\n",
1857 | "standard_cost 0\n",
1858 | "product_first_sold_date 0\n",
1859 | "dtype: int64"
1860 | ]
1861 | },
1862 | "execution_count": 23,
1863 | "metadata": {},
1864 | "output_type": "execute_result"
1865 | }
1866 | ],
1867 | "source": [
1868 | "trans.isnull().sum()"
1869 | ]
1870 | },
1871 | {
1872 | "cell_type": "code",
1873 | "execution_count": 24,
1874 | "metadata": {},
1875 | "outputs": [
1876 | {
1877 | "name": "stdout",
1878 | "output_type": "stream",
1879 | "text": [
1880 | "Total records after removing Missing Values: 19803\n"
1881 | ]
1882 | }
1883 | ],
1884 | "source": [
1885 | "print(\"Total records after removing Missing Values: {}\".format(trans.shape[0]))"
1886 | ]
1887 | },
1888 | {
1889 | "cell_type": "markdown",
1890 | "metadata": {},
1891 | "source": [
1892 | "## 2. Creating a new feature \"Profit\""
1893 | ]
1894 | },
1895 | {
1896 | "cell_type": "markdown",
1897 | "metadata": {},
1898 | "source": [
1899 | "The Profit column will be the difference between the list price and the standard cost of a product."
1900 | ]
1901 | },
1902 | {
1903 | "cell_type": "code",
1904 | "execution_count": 25,
1905 | "metadata": {},
1906 | "outputs": [],
1907 | "source": [
1908 | "trans['Profit'] = trans['list_price']-trans['standard_cost']"
1909 | ]
1910 | },
1911 | {
1912 | "cell_type": "code",
1913 | "execution_count": 26,
1914 | "metadata": {},
1915 | "outputs": [
1916 | {
1917 | "data": {
1918 | "text/plain": [
1919 | ""
1920 | ]
1921 | },
1922 | "execution_count": 26,
1923 | "metadata": {},
1924 | "output_type": "execute_result"
1925 | },
1926 | {
1927 | "data": {
1928 | "image/png": "\n",
1929 | "text/plain": [
1930 | ""
1931 | ]
1932 | },
1933 | "metadata": {},
1934 | "output_type": "display_data"
1935 | }
1936 | ],
1937 | "source": [
1938 | "# Distribution of the Profit Column\n",
1939 | "\n",
1940 | "plt.figure(figsize=(20,8))\n",
1941 | "sns.distplot(trans['Profit'])"
1942 | ]
1943 | },
1944 | {
1945 | "cell_type": "markdown",
1946 | "metadata": {},
1947 | "source": [
1948 | "## 3. Inconsistency Check in Data"
1949 | ]
1950 | },
1951 | {
1952 | "cell_type": "markdown",
1953 | "metadata": {},
1954 | "source": [
1955 | "We will check whether inconsistent data or typographical errors are present in the categorical columns. \n",
1956 | "The columns to be checked are 'online_order', 'order_status', 'product_line', 'product_class', 'product_size' and 'brand'."
1957 | ]
1958 | },
1959 | {
1960 | "cell_type": "markdown",
1961 | "metadata": {},
1962 | "source": [
1963 | "### 3.1 Online Order"
1964 | ]
1965 | },
1966 | {
1967 | "cell_type": "markdown",
1968 | "metadata": {},
1969 | "source": [
1970 | "There is no inconsistent data in online_order column."
1971 | ]
1972 | },
1973 | {
1974 | "cell_type": "code",
1975 | "execution_count": 26,
1976 | "metadata": {},
1977 | "outputs": [
1978 | {
1979 | "data": {
1980 | "text/plain": [
1981 | "1.0 10097\n",
1982 | "0.0 9706\n",
1983 | "Name: online_order, dtype: int64"
1984 | ]
1985 | },
1986 | "execution_count": 26,
1987 | "metadata": {},
1988 | "output_type": "execute_result"
1989 | }
1990 | ],
1991 | "source": [
1992 | "trans['online_order'].value_counts()"
1993 | ]
1994 | },
1995 | {
1996 | "cell_type": "markdown",
1997 | "metadata": {},
1998 | "source": [
1999 | "### 3.2 Order Status"
2000 | ]
2001 | },
2002 | {
2003 | "cell_type": "markdown",
2004 | "metadata": {},
2005 | "source": [
2006 | "There is no inconsistent data in order_status column."
2007 | ]
2008 | },
2009 | {
2010 | "cell_type": "code",
2011 | "execution_count": 27,
2012 | "metadata": {},
2013 | "outputs": [
2014 | {
2015 | "data": {
2016 | "text/plain": [
2017 | "Approved 19625\n",
2018 | "Cancelled 178\n",
2019 | "Name: order_status, dtype: int64"
2020 | ]
2021 | },
2022 | "execution_count": 27,
2023 | "metadata": {},
2024 | "output_type": "execute_result"
2025 | }
2026 | ],
2027 | "source": [
2028 | "trans['order_status'].value_counts()"
2029 | ]
2030 | },
2031 | {
2032 | "cell_type": "code",
2033 | "execution_count": 28,
2034 | "metadata": {},
2035 | "outputs": [
2036 | {
2037 | "data": {
2038 | "text/html": [
2039 | "\n",
2040 | "\n",
2053 | "
\n",
2054 | " \n",
2055 | " \n",
2056 | " \n",
2057 | " order_status \n",
2058 | " online_order \n",
2059 | " \n",
2060 | " \n",
2061 | " \n",
2062 | " \n",
2063 | " 0 \n",
2064 | " Approved \n",
2065 | " 0.0 \n",
2066 | " \n",
2067 | " \n",
2068 | " 1 \n",
2069 | " Approved \n",
2070 | " 1.0 \n",
2071 | " \n",
2072 | " \n",
2073 | " 42 \n",
2074 | " Cancelled \n",
2075 | " 0.0 \n",
2076 | " \n",
2077 | " \n",
2078 | " 254 \n",
2079 | " Cancelled \n",
2080 | " 1.0 \n",
2081 | " \n",
2082 | " \n",
2083 | "
\n",
2084 | "
"
2085 | ],
2086 | "text/plain": [
2087 | " order_status online_order\n",
2088 | "0 Approved 0.0\n",
2089 | "1 Approved 1.0\n",
2090 | "42 Cancelled 0.0\n",
2091 | "254 Cancelled 1.0"
2092 | ]
2093 | },
2094 | "execution_count": 28,
2095 | "metadata": {},
2096 | "output_type": "execute_result"
2097 | }
2098 | ],
2099 | "source": [
2100 | "trans[['order_status', 'online_order']].drop_duplicates()"
2101 | ]
2102 | },
2103 | {
2104 | "cell_type": "markdown",
2105 | "metadata": {},
2106 | "source": [
2107 | "### 3.3 Product Line"
2108 | ]
2109 | },
2110 | {
2111 | "cell_type": "markdown",
2112 | "metadata": {},
2113 | "source": [
2114 | "There is no inconsistent data in product_line column."
2115 | ]
2116 | },
2117 | {
2118 | "cell_type": "code",
2119 | "execution_count": 29,
2120 | "metadata": {},
2121 | "outputs": [
2122 | {
2123 | "data": {
2124 | "text/plain": [
2125 | "Standard 14176\n",
2126 | "Road 3970\n",
2127 | "Touring 1234\n",
2128 | "Mountain 423\n",
2129 | "Name: product_line, dtype: int64"
2130 | ]
2131 | },
2132 | "execution_count": 29,
2133 | "metadata": {},
2134 | "output_type": "execute_result"
2135 | }
2136 | ],
2137 | "source": [
2138 | "trans['product_line'].value_counts()"
2139 | ]
2140 | },
2141 | {
2142 | "cell_type": "markdown",
2143 | "metadata": {},
2144 | "source": [
2145 | "### 3.4 Product Class"
2146 | ]
2147 | },
2148 | {
2149 | "cell_type": "markdown",
2150 | "metadata": {},
2151 | "source": [
2152 | "There is no inconsistent data in product_class column."
2153 | ]
2154 | },
2155 | {
2156 | "cell_type": "code",
2157 | "execution_count": 30,
2158 | "metadata": {},
2159 | "outputs": [
2160 | {
2161 | "data": {
2162 | "text/plain": [
2163 | "medium 13826\n",
2164 | "high 3013\n",
2165 | "low 2964\n",
2166 | "Name: product_class, dtype: int64"
2167 | ]
2168 | },
2169 | "execution_count": 30,
2170 | "metadata": {},
2171 | "output_type": "execute_result"
2172 | }
2173 | ],
2174 | "source": [
2175 | "trans['product_class'].value_counts()"
2176 | ]
2177 | },
2178 | {
2179 | "cell_type": "markdown",
2180 | "metadata": {},
2181 | "source": [
2182 | "### 3.5 Product Size"
2183 | ]
2184 | },
2185 | {
2186 | "cell_type": "markdown",
2187 | "metadata": {},
2188 | "source": [
2189 | "There is no inconsistent data in product_size column."
2190 | ]
2191 | },
2192 | {
2193 | "cell_type": "code",
2194 | "execution_count": 31,
2195 | "metadata": {},
2196 | "outputs": [
2197 | {
2198 | "data": {
2199 | "text/plain": [
2200 | "medium 12990\n",
2201 | "large 3976\n",
2202 | "small 2837\n",
2203 | "Name: product_size, dtype: int64"
2204 | ]
2205 | },
2206 | "execution_count": 31,
2207 | "metadata": {},
2208 | "output_type": "execute_result"
2209 | }
2210 | ],
2211 | "source": [
2212 | "trans['product_size'].value_counts()"
2213 | ]
2214 | },
2215 | {
2216 | "cell_type": "markdown",
2217 | "metadata": {},
2218 | "source": [
2219 | "### 3.6 Brand"
2220 | ]
2221 | },
2222 | {
2223 | "cell_type": "markdown",
2224 | "metadata": {},
2225 | "source": [
2226 | "There is no inconsistent data in brand column."
2227 | ]
2228 | },
2229 | {
2230 | "cell_type": "code",
2231 | "execution_count": 32,
2232 | "metadata": {},
2233 | "outputs": [
2234 | {
2235 | "data": {
2236 | "text/plain": [
2237 | "Solex 4253\n",
2238 | "Giant Bicycles 3312\n",
2239 | "WeareA2B 3295\n",
2240 | "OHM Cycles 3043\n",
2241 | "Trek Bicycles 2990\n",
2242 | "Norco Bicycles 2910\n",
2243 | "Name: brand, dtype: int64"
2244 | ]
2245 | },
2246 | "execution_count": 32,
2247 | "metadata": {},
2248 | "output_type": "execute_result"
2249 | }
2250 | ],
2251 | "source": [
2252 | "trans['brand'].value_counts()"
2253 | ]
2254 | },
2255 | {
2256 | "cell_type": "markdown",
2257 | "metadata": {},
2258 | "source": [
2259 | "## 4. Duplication Checks"
2260 | ]
2261 | },
2262 | {
2263 | "cell_type": "markdown",
2264 | "metadata": {},
2265 | "source": [
2266 | "We need to ensure that there is no duplication of records in the dataset, as duplicates may lead to errors in data analysis due to poor data quality. If there are duplicate rows of data, we need to drop such records. To check for duplicate records, we first remove the primary key column of the dataset and then apply the drop_duplicates() function provided by pandas."
2267 | ]
2268 | },
2269 | {
2270 | "cell_type": "code",
2271 | "execution_count": 33,
2272 | "metadata": {},
2273 | "outputs": [
2274 | {
2275 | "name": "stdout",
2276 | "output_type": "stream",
2277 | "text": [
2278 | "Number of records after removing customer_id (pk), duplicates : 19803\n",
2279 | "Number of records in original dataset : 19803\n"
2280 | ]
2281 | }
2282 | ],
2283 | "source": [
2284 | "trans_dedupped = trans.drop('transaction_id', axis=1).drop_duplicates()\n",
2285 | "\n",
2286 | "print(\"Number of records after removing customer_id (pk), duplicates : {}\".format(trans_dedupped.shape[0]))\n",
2287 | "print(\"Number of records in original dataset : {}\".format(trans.shape[0]))"
2288 | ]
2289 | },
2290 | {
2291 | "cell_type": "markdown",
2292 | "metadata": {},
2293 | "source": [
2294 | "Since both numbers are the same, there are no duplicate records in the dataset."
2295 | ]
2296 | },
2297 | {
2298 | "cell_type": "markdown",
2299 | "metadata": {},
2300 | "source": [
2301 | "## 5. Exporting the Cleaned Transactions Data Set to csv"
2302 | ]
2303 | },
2304 | {
2305 | "cell_type": "markdown",
2306 | "metadata": {},
2307 | "source": [
2308 | "Currently the Transactions dataset is clean. Hence we can export the data to a csv to continue our data analysis of Customer Segments by joining it to other tables."
2309 | ]
2310 | },
2311 | {
2312 | "cell_type": "code",
2313 | "execution_count": 38,
2314 | "metadata": {},
2315 | "outputs": [],
2316 | "source": [
2317 | "trans.to_csv('Transactions_Cleaned.csv', index=False)"
2318 | ]
2319 | }
2320 | ],
2321 | "metadata": {
2322 | "kernelspec": {
2323 | "display_name": "Python 3",
2324 | "language": "python",
2325 | "name": "python3"
2326 | },
2327 | "language_info": {
2328 | "codemirror_mode": {
2329 | "name": "ipython",
2330 | "version": 3
2331 | },
2332 | "file_extension": ".py",
2333 | "mimetype": "text/x-python",
2334 | "name": "python",
2335 | "nbconvert_exporter": "python",
2336 | "pygments_lexer": "ipython3",
2337 | "version": "3.7.3"
2338 | }
2339 | },
2340 | "nbformat": 4,
2341 | "nbformat_minor": 2
2342 | }
2343 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Analytics Customer Segmentation
2 |
3 | ## Goal of the project
4 | The purpose of this project is to conduct a Customer Segmentation Analysis for an automobile bike company. Customer segmentation is performed by developing an RFM model. RFM (Recency, Frequency, Monetary) analysis is a behavior-based approach to grouping customers into segments on the basis of their previous purchase transactions. In this analysis the customers were divided into 11 segments. The analysis will help in determining which customer segments should be targeted in order to enhance sales revenue for the company. A Sales Dashboard for Customer Segmentation is developed using Tableau, and the data quality assessment and analysis are done using Python.
5 |
6 |
7 | ## Tableau Dashboard
8 | The Sales Dashboard for Customer Segmentation can be found [here](https://public.tableau.com/profile/abhishek.chowdhury#!/vizhome/CustomerSegmentationDashboard_16175595616510/RFMDashboard).
9 |
10 |
11 | If the Jupyter Notebooks fail to load on GitHub, they can be viewed in nbviewer. Click on the respective hyperlinks to view:
12 | - [RFM Analysis.ipynb](https://nbviewer.jupyter.org/github/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/blob/master/RFM%20Analysis.ipynb)
13 | - [DQA and Data Cleaning CustomerDemographic.ipynb](https://nbviewer.jupyter.org/github/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/blob/master/DQA%20and%20Data%20Cleaning%20CustomerDemographic.ipynb)
14 | - [DQA and Data Cleaning NewCustomerList.ipynb](https://nbviewer.jupyter.org/github/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/blob/master/DQA%20and%20Data%20Cleaning%20NewCustomerList.ipynb)
15 | - [DQA and Data Cleaning Transactions.ipynb](https://nbviewer.jupyter.org/github/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/blob/master/DQA%20and%20Data%20Cleaning%20Transactions.ipynb)
16 | - [DQA and Data Cleaning Customer Address.ipynb](https://nbviewer.jupyter.org/github/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/blob/master/DQA%20and%20Data%20Cleaning%20Customer%20Address.ipynb)
17 |
18 |
19 | ## Analysis Approach
20 | ### 1. Data Quality Assessment and Data Cleaning
21 | The first step towards generating useful insights from the data was data preparation, quality assessment and data cleaning. After the cleaning process, exploratory data analysis on the dataset and identification of customer purchasing behaviours can be performed to generate insights.
22 |
23 | In the data cleaning step, the data quality of the following datasets was first assessed. The following data quality issues were observed, and the necessary steps to mitigate them were followed:
24 | - CustomerDemographics.xlsx :
25 | - 1 irrelevant column was present and was dropped from the dataset.
26 | - There were 5 columns where missing values were present. For such columns, based on the volume of the missing values, either the records were dropped or appropriate values were imputed in place of the missing values.
27 | - For gender column there was no standardisation of data. Based on the values available the column data was standardised to remove data inconsistency.
28 | - The Date of Birth column was transformed to create the new feature columns 'Age' and 'Age Group' to check for discrepancies in the age distribution. An outlier was observed and the record was removed.
29 | - Checked whether there are duplicate records present in the dataset. In this dataset there were no duplicate records.
30 | - NewCustomerList.xlsx :
31 | - 5 Irrelevent column was present and such columns were dropped from the dataset.
32 | - There were 4 columns were Missing values were present. For such columns based on the volumne of the missing values either the records were dropped or appropiate values were imputed at places of missing values
33 | - The Date of Birth column was transformed to create new feature columns 'Age' and 'Age Group' to check for discrepancies in the age distribution.
34 | - There was no data inconsistency.
35 | - Checked whether there are duplicate records present in the dataset. In this dataset there were no duplicate records.
36 | - Transaction_data.xlsx :
37 | - The product_first_sold_date column is not in datetime format. The data type of this column was changed from int64 to datetime format.
38 | - There were 7 columns where missing values were present. For such columns, based on the volume of the missing values, either the records were dropped or appropriate values were imputed in place of the missing values.
39 | - A new feature column 'Profit' was created which is basically the difference between list price and standard price.
40 | - There was no data inconsistency.
41 | - Checked whether there are duplicate records present in the dataset. In this dataset there were no duplicate records.
42 | - CustomerAddress.xlsx :
43 | - For states column there was no standardisation of data. Based on the values available the column data was standardised to remove data inconsistency.
44 | - There were certain customer IDs from the Customer Demographics table which were getting dropped in the Address table.
45 |
46 | ### 2. Exploratory Data Analysis on Customer Segments
47 | After the data cleaning process, exploratory analysis on the dataset is performed and the following insights are obtained :
48 | - New vs Old Customers Age Distribution
49 | - Most New Customers are aged between 40-49; likewise, most Old Customers are aged between 40-49.
50 | - For both types of customers, the lowest number of customers is found in the under-20 and above-80 age groups.
51 | - The automobile company is popular among New Customers among the age groups 20-29 and 40-49.
52 | - A steep drop in customers is observed in the 30-39 age group among the New Customers
53 |
54 |
55 | Old Customers by Age Distribution
56 | New Customers by Age Distribution
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 | - Bike purchases over last 3 years by Gender
65 | - Most bike purchases over the last 3 years were made by Female customers. Approximately 51% of the bike purchases were made by Female customers compared to 49% made by Male customers.
66 | - The Female purchases are 10,000 more than that of Male purchases (numerically).
67 |
68 |
69 | - New vs Old Customers Job Industry Distribution
70 | - Most New customers are from the Manufacturing and Financial Services sector (approx 20% of the New Customers).
71 | - The lowest number of customers is from the Agriculture and Telecom sectors (approx 3%).
72 | - Similar trend is observed among Old Customers as well.
73 |
74 |
75 | Old Customers by Job Industry
76 | New Customers by Job Industry
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 | - Wealth Segmentation by Age Category
85 | - Across all age categories the largest number of customers are from 'Mass Customer' Segment
86 | - The next category comes from the 'High Net Worth' customers.
87 | - In the age group 40-49, the Affluent segment outperforms the High Net Worth segment in terms of number of customers.
88 |
89 |
90 | Old Customers Wealth by Age Group
91 | New Customers Wealth by Age Group
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 | - Cars owned by States
100 | - New South Wales has the largest number of people who do not own a car.
101 | - In Victoria the proportion is quite even.
102 | - In Queensland the number of people owning a car is greater than the number who do not have a car.
103 |
104 |
105 |
106 | ### 3. RFM Analysis and Customer Segmentation
107 | In this stage of analysis the customer segmentation was done by developing an RFM Model. The RFM (Recency, Frequency, Monetary) analysis is a behavior-based approach grouping customers into segments. It groups the customers on the basis of their previous purchase transactions.
108 |
109 | In this analysis the customer segment was divided into 11 groups. The groups being :
110 | - Platinum Customers
111 | - Very Loyal Customers
112 | - Recent Customers
113 | - Potential Customers
114 | - Lost Customers
115 | - Losing Customers
116 | - Late Bloomer
117 | - High Risk Customers
118 | - Evasive Customers
119 | - Becoming Loyal
120 | - Almost lost Customers
121 |
122 | As of the current state of the Automobile business the current distribution of customers segments is depicted below:
123 |
124 |
125 | ### 4. RFM Analysis: Scatter Plots
126 | #### Recency vs Monetary :
127 | The visualization shows that recent customers have purchased more products and generated relatively more revenue than the customers who visited a while ago.
128 |
129 |
130 | #### Frequency vs Monetary :
131 | The visualization shows that customers belonging to the Platinum / Very Loyal / Becoming Loyal Customer Segments have a greater purchase frequency and generate greater monetary value for the business.
132 |
133 |
134 | ## Datasets Used
135 | The datasets used include:
136 | - __Raw_data.xlsx__: This excel file dataset included the following sheets of data:
137 | - __Transactions_data.xlsx__: This dataset included the transactions data of the customers across all the different states in Australia.
138 | - __NewCustomerList.xlsx__: This dataset included the new customers who visited the automobile bike company recently.
139 | - __CustomerDemographic.xlsx__: This dataset included entire details of the Customer Demographics.
140 | - __CustomerAddress.xlsx__: This dataset included the address of the Customers.
141 |
142 |
143 | ## Tools and Technologies used
144 | The tools used in this project include:
145 | - __Python__ - This was needed to conduct Data Quality Assessment and also for Data Cleaning processes. The Python libraries pandas, matplotlib and seaborn made it possible to perform exploratory data analysis of the datasets and to gain useful insights from the data.
146 | - __Tableau__ - This Business Intelligence tool was required to explore data and create charts, graphs, visualizations to come up with a Sales Dashboard for Customer Segmentation for the automobile bike company. The Tableau Sales Dashboard can be found [here](https://public.tableau.com/profile/abhishek.chowdhury#!/vizhome/CustomerSegmentationDashboard_16175595616510/RFMDashboard)
147 |
148 |
149 | ## Built With
150 | - Python 3.8.2, Tableau
151 |
152 | ## Authors
153 | - Abhishek Chowdhury - [Github Profile](https://github.com/AbhishekGit-hash)
154 |
--------------------------------------------------------------------------------
/Raw_data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/Raw_data.xlsx
--------------------------------------------------------------------------------
/data visualization/Car Owners by State.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/Car Owners by State.PNG
--------------------------------------------------------------------------------
/data visualization/Customer Segment Distribution.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/Customer Segment Distribution.PNG
--------------------------------------------------------------------------------
/data visualization/Female vs Male Bike Purchases.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/Female vs Male Bike Purchases.PNG
--------------------------------------------------------------------------------
/data visualization/Frequency vs Monetary.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/Frequency vs Monetary.PNG
--------------------------------------------------------------------------------
/data visualization/New Customer Wealth Segment.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/New Customer Wealth Segment.PNG
--------------------------------------------------------------------------------
/data visualization/New Customers Age Distribution.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/New Customers Age Distribution.PNG
--------------------------------------------------------------------------------
/data visualization/New Customers Job Industry.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/New Customers Job Industry.PNG
--------------------------------------------------------------------------------
/data visualization/Old Customers Age Distribution.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/Old Customers Age Distribution.PNG
--------------------------------------------------------------------------------
/data visualization/Old Customers Job Industry.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/Old Customers Job Industry.PNG
--------------------------------------------------------------------------------
/data visualization/Old Customers Wealth Segment.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/Old Customers Wealth Segment.PNG
--------------------------------------------------------------------------------
/data visualization/Recency vs Monetary.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/Recency vs Monetary.PNG
--------------------------------------------------------------------------------
/data visualization/Sales Dashboard.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AbhishekGit-hash/Data-Analytics-Customer-Segmentation/782075ecfd063596629d57ec8a6897ab0fec657a/data visualization/Sales Dashboard.gif
--------------------------------------------------------------------------------