├── ch.1 ├── R │ └── SettingUpREnvironment.R └── python │ └── Setting Up Python Environment.ipynb ├── ch.10 ├── R │ └── CustomerSegmentation.R └── python │ └── CustomerSegmentation.ipynb ├── ch.11 ├── R │ └── CustomerRetention.R └── python │ └── CustomerRetention.ipynb ├── ch.12 ├── R │ └── ABTesting.R └── python │ └── ABTesting.ipynb ├── ch.2 ├── R │ └── ConversionRate.R └── python │ └── ConversionRate.ipynb ├── ch.3 ├── R │ └── RegressionAnalysis.R └── python │ └── RegressionAnalysis.ipynb ├── ch.4 ├── R │ └── FromEngagementToConversions.R └── python │ └── From Engagement to Conversions.ipynb ├── ch.5 ├── R │ └── ProductAnalytics.R └── python │ └── Product Analytics.ipynb ├── ch.6 ├── R │ └── ProductRecommendation.R └── python │ └── ProductRecommendation.ipynb ├── ch.7 ├── R │ └── CustomerBehaviors.R └── python │ └── CustomerBehaviors.ipynb ├── ch.8 ├── R │ └── PredictingEngagement.R └── python │ └── PredictingEngagement.ipynb └── ch.9 ├── R └── CustomerLifetimeValue.R └── python └── CustomerLifetimeValue.ipynb /ch.1/R/SettingUpREnvironment.R: -------------------------------------------------------------------------------- 1 | # Test Data 2 | data <- data.frame( 3 | "X"=c(0, 0.25, 0.5, 1), 4 | "Y"=c(0, 0.5, 0.5, 1), 5 | "output"=c(0, 0, 1, 1) 6 | ) 7 | 8 | # Train Logistic Regression 9 | logit.fit <- glm( 10 | output ~ X + Y, 11 | data = data, 12 | family = binomial 13 | ) 14 | 15 | # Show Fitted Results 16 | summary(logit.fit) 17 | 18 | # Predict Class Probabilities 19 | logit.probs <- predict( 20 | logit.fit, 21 | newdata=data, 22 | type="response" 23 | ) 24 | 25 | # Predict Classes 26 | logit.pred <- ifelse(logit.probs > 0.5, 1, 0) 27 | logit.pred 28 | 29 | 30 | # Plotting Library 31 | library(ggplot2) 32 | 33 | # Simple Scatterplot 34 | ggplot(data, aes(x=X, y=Y, color=output)) + 35 | geom_point(size=3, shape=19) + 36 | ggtitle('Actual') + 37 | theme(plot.title = element_text(hjust = 0.5)) 38 | 39 | ggplot(data, aes(x=X, y=Y, color=logit.pred)) + 40 | 
geom_point(size=3, shape=19) + 41 | ggtitle('Predicted') + 42 | theme(plot.title = element_text(hjust = 0.5)) 43 | -------------------------------------------------------------------------------- /ch.1/python/Setting Up Python Environment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "from sklearn.linear_model import LogisticRegression" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "input_data = np.array([\n", 20 | " [0, 0],\n", 21 | " [0.25, 0.25],\n", 22 | " [0.5, 0.5],\n", 23 | " [1, 1],\n", 24 | "])" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "output_data = [\n", 34 | " 0,\n", 35 | " 0,\n", 36 | " 1,\n", 37 | " 1\n", 38 | "]" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 4, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "logit_model = LogisticRegression()" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 5, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/plain": [ 58 | "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n", 59 | " intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n", 60 | " penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", 61 | " verbose=0, warm_start=False)" 62 | ] 63 | }, 64 | "execution_count": 5, 65 | "metadata": {}, 66 | "output_type": "execute_result" 67 | } 68 | ], 69 | "source": [ 70 | "logit_model.fit(input_data, output_data)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/plain": [ 81 | 
"array([[0.43001235, 0.43001235]])" 82 | ] 83 | }, 84 | "execution_count": 6, 85 | "metadata": {}, 86 | "output_type": "execute_result" 87 | } 88 | ], 89 | "source": [ 90 | "logit_model.coef_" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 7, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "array([-0.18498028])" 102 | ] 103 | }, 104 | "execution_count": 7, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "logit_model.intercept_" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 8, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "predicted_output = logit_model.predict(input_data)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 9, 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "array([0, 1, 1, 1])" 131 | ] 132 | }, 133 | "execution_count": 9, 134 | "metadata": {}, 135 | "output_type": "execute_result" 136 | } 137 | ], 138 | "source": [ 139 | "predicted_output" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 10, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "%matplotlib inline" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 11, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "import matplotlib.pyplot as plt" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 12, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFaVJREFUeJzt3X+Q3XV97/Hni/A7i6iNrl4TCfViNVKn6BbwamVTqAZ6J7nToQ5UUbxqbDVXb6320nqHOvTeGWtBW0aqpFOuFitrqlPNYCy2ylbaGgciFQUaGyNoBEQEaZcf8iPv+8c5fl02m+wmu99zcnafj5kzOd/v97Pf7/ud3eS138/3nO9JVSFJEsAh/S5AknTwMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQTpIJBlNsqvfdWhxMxSkSZKMJ7kvyRGzGLsySSU5tBe1Sb1gKEhdSVYCvwQUsLavxUh9YihIP/VaYCvwEeB1P1mZ5KgklyS5Pcn9Sf4xyVHAl7pDfpRkIslLkrwnyccmfe0TziaSvD7JrUn+I8nOJG/uXXvSzDztlX7qtcD7ga8AW5MMV9X3gYuBFwD/BbgLOAXYDbwc+Dbw5Kp6DCDJK2c4xt3AfwV2dr/+c0mur6qvttCPtN88U5CAJC8DjgM2VdU24FvAbyQ5BPjvwNur6ntV9XhV/XNV/fhAjlNVn62qb1XHPwCfpzNlJR0UDAWp43XA56vqnu7yx7vrlgFH0gmJOUtyZpKtSe5N8iPgrO4xpIOC00da9LrXB14FLElyV3f1EcCTgWcCDwPPAb425Uunu8XwA8DRk5afMek4RwCfojNN9ZmqejTJp4HMRx/SfPBMQYL/BjwOrAJ+oft4PnAdnf/ArwDen+Q/JVnSvaB8BPADOtcWfnbSvv4FeHmSZyc5Fvi9SdsOpxM2PwAeS3Im8Ip2W5P2j6EgdaaJ/l9Vfaeq7vrJA/gg8GrgAuDrwPXAvcAfAYdU1YPA/wX+KcmPkpxaVX8HfAK4CdgGXP2Tg1TVfwBvAzYB9wG/AWzuVZPSbMQP2ZEk/YRnCpKkhqEgSWoYCpKkhqEgSWoM3PsUli1bVitXrpzzfh544AGWLl0694IGhP0uXIupV7DfA7Vt27Z7quppM40buFBYuXIlN9xww5z3Mz4+zujo6NwLGhD2u3Atpl7Bfg9UkttnM87pI0lSw1CQJDUMBUlSw1CQJDUMBUlSw1CQJDUMBUlSw1CQpINNFVx3Hbz1rfDd78LWrT07dGuhkOSKJHcn+cZetifJpUl2JLkpyYvaqkWSBsrb3gZnngkf+hDcfTecfjr8/u/35NBtnil8BFizj+1nAid0H+uBD7VYiyQNhq9+Fa64Ah54oHPGAPDgg/AnfwLbt7d++NZCoaq+ROdTqvZmHfCX1bEVeHKSZ7ZVjyQNhKuvhocf3nP97t3w2c+2fvhWP3ktyUrg6qo6cZptVwPvrap/7C5/AfhfVbXHjY2SrKdzNsHw8PCLx8bG5lzbxMQEQ0NDc97PoLDfhWsx9QqLoN/vfx++973mLGFi+XKGdu2CQw6B5cvhaTPe025aq1ev3lZVIzON6+cN8TLNumkTqqo2AhsBRkZGaj5uDuVNtRa2xdTvYuoVFkG/t98Oz38+PPQQAOMXX8zoO98JRx0Ft90GT396q4fv56uPdgErJi0vB+7oUy2SdHA47ji4/HI48kgYGuqcIRx1FFx5ZeuBAP09U9gMbEgyBpwC3F9Vd/axHkk6OJx3Hvzqr8LnPtcJhDvvhGOP7cmhWwuFJFcBo8CyJLuAPwAOA6iqDwNbgLOAHcCDwOvbqkWSBs5TnwqvfjWMj/csEKDFUKiqc2fYXsBb2zq+JGn/+Y5mSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVL
DUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVKj1VBIsibJ9iQ7klwwzfZnJ7k2yY1JbkpyVpv1SJL2rbVQSLIEuAw4E1gFnJtk1ZRh/xvYVFUnAecAf9ZWPZKkmbV5pnAysKOqdlbVI8AYsG7KmAKe1H1+LHBHi/VIkmaQqmpnx8nZwJqqemN3+TzglKraMGnMM4HPA08BlgJnVNW2afa1HlgPMDw8/OKxsbE51zcxMcHQ0NCc9zMo7HfhWky9gv0eqNWrV2+rqpGZxh065yPtXaZZNzWBzgU+UlWXJHkJcGWSE6tq9xO+qGojsBFgZGSkRkdH51zc+Pg487GfQWG/C9di6hXst21tTh/tAlZMWl7OntNDbwA2AVTVl4EjgWUt1iRJ2oc2Q+F64IQkxyc5nM6F5M1TxnwHOB0gyfPphMIPWqxJkrQPrYVCVT0GbACuAW6l8yqjm5NclGRtd9jvAG9K8jXgKuD8ausihyRpRm1eU6CqtgBbpqy7cNLzW4CXtlmDJGn2fEezJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGoaCJKlhKEiSGq2GQpI1SbYn2ZHkgr2MeVWSW5LcnOTjbdYjSdq3Q9vacZIlwGXArwC7gOuTbK6qWyaNOQH4PeClVXVfkqe3VY8kaWZtnimcDOyoqp1V9QgwBqybMuZNwGVVdR9AVd3dYj2SpBmkqtrZcXI2sKaq3thdPg84pao2TBrzaeCbwEuBJcB7qupvp9nXemA9wPDw8IvHxsbmXN/ExARDQ0Nz3s+gsN+FazH1CvZ7oFavXr2tqkZmGtfa9BGQadZNTaBDgROAUWA5cF2SE6vqR0/4oqqNwEaAkZGRGh0dnXNx4+PjzMd+BoX9LlyLqVew37a1OX20C1gxaXk5cMc0Yz5TVY9W1beB7XRCQpLUB22GwvXACUmOT3I4cA6wecqYTwOrAZIsA54L7GyxJknSPrQWClX1GLABuAa4FdhUVTcnuSjJ2u6wa4AfJrkFuBZ4V1X9sK2aJEn71uY1BapqC7BlyroLJz0v4B3dhySpz3xHsySpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSp0eo7mqVFYfduGB+Hb34TXvACeNnLINPdJFg6+BkK0lzcey+cdhrcdhs8/jgsWQLPex588YtwzDH9rk7ab3udPkqyJcnK3pUiDaC3vhW2b4eJCXjooc6fX/86/O7v9rsy6YDs65rCR4DPJ3l3ksN6VI80OKrgU5+CRx994vof/xj+6q/6U5M0R3udPqqqTUk+C1wI3JDkSmD3pO3v70F90sGrqnM9YTpTg0IaEDO9+uhR4AHgCOCYKQ9pcTvkEBgd7fw52ZIlcNZZfSlJmqu9nikkWQO8n86npb2oqh7sWVXSoLj8cjj1VHjwwc5j6dLOBeY//dN+VyYdkH29+ujdwK9X1c29KkYaOM95DuzYAR/7WOcC80knwatfDUND/a5MOiD7uqbwS70sRBpYxx7beRWStAD4jmZJUsNQkCQ1DAVJUsNQkCQ1DAVJUsNQkCQ1DAVJUsNQkCQ1DAVJUsNQkCQ1DAVJUqPVUEiyJsn2JDuSXLCPcWcnqSQjbdYjSdq31kIhyRLgMuBMYBVwbpJV04w7Bngb8JW2apEkzU6bZwonAzuqamdVPQKMAeumGfeHwPuAh1usRZI0C/v6PIW5ehbw3UnLu4BTJg9IchKwoqquTvLOve0oyXpgPcDw8DDj4+NzLm5iYmJe9jMo7HfhWky9gv22rc1QyDTrqtmYHAJ8ADh/ph1V1UZgI8DIyEiNjo7Oubjx8XHmYz+Dwn4XrsXUK9hv29q
cPtoFrJi0vBy4Y9LyMcCJwHiS24BTgc1ebJak/mkzFK4HTkhyfJLDgXPofN4zAFV1f1Utq6qVVbUS2AqsraobWqxJkrQPrYVCVT0GbACuAW4FNlXVzUkuSrK2reNKkg5cm9cUqKotwJYp6y7cy9jRNmuRJM3MdzRLkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqtfkazBttdd8HWrTA8DKeeCkm/K5LUNkNBe6iCCy6ASy+Fww+H3bvhGc+Av/97OO64flcnqU1OH2kPf/M3cNll8PDD8O//DhMTsHMnrFvX78oktc1Q0B4uvRQeeOCJ63bvhm9+E/7t3/pTk6TeMBS0h/vvn379oYd2zhwkLVyGgvbwa78GRx655/olS+CFL+x9PZJ6x1DQHt7+dlixAo4+urO8ZEnn+caNcNhh/a1NUrt89ZH28KQnwY03wkc/Clu2dALiLW+Bn//5flcmqW2Ggqa1dGknCN7yln5XIqmXnD6SJDUMBUlSo9VQSLImyfYkO5JcMM32dyS5JclNSb6QxPfLSlIftRYKSZYAlwFnAquAc5OsmjLsRmCkql4IfBJ4X1v1SJJm1uaZwsnAjqraWVWPAGPAE26UUFXXVtWD3cWtwPIW65EkzSBV1c6Ok7OBNVX1xu7yecApVbVhL+M/CNxVVf9nmm3rgfUAw8PDLx4bG5tzfRMTEwwNDc15P4PCfheuxdQr2O+BWr169baqGplpXJsvSZ3uRsvTJlCS1wAjwGnTba+qjcBGgJGRkRodHZ1zcePj48zHfgaF/S5ci6lXsN+2tRkKu4AVk5aXA3dMHZTkDODdwGlV9eMW65EkzaDNawrXAyckOT7J4cA5wObJA5KcBFwOrK2qu1usRZI0C62FQlU9BmwArgFuBTZV1c1JLkqytjvsj4Eh4K+T/EuSzXvZnSSpB1q9zUVVbQG2TFl34aTnZ7R5fEnS/vEdzZKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoYCpKkhqEgSWoc2u8Ceu3xx+FrX4OHHoLdu+EQY1GSGq3+l5hkTZLtSXYkuWCa7Uck+UR3+1eSrGyznuuug2c9C047Df71X+HZz4YbbmjziJI0WFoLhSRLgMuAM4FVwLlJVk0Z9gbgvqr6z8AHgD9qq5577oGzzoLvfx8mJjpnCd/7HpxxRmdZktTumcLJwI6q2llVjwBjwLopY9YBH+0+/yRwepK0UcxVV3WmjqZ6/HH41KfaOKIkDZ5UVTs7Ts4G1lTVG7vL5wGnVNWGSWO+0R2zq7v8re6Ye6bsaz2wHmB4ePjFY2Nj+13PHXfAnXf+dHn58gl27Roi6UwpDQ/v9y4HysTEBENDQ/0uo2cWU7+LqVew3wO1evXqbVU1MtO4Ni80T/cb/9QEms0YqmojsBFgZGSkRkdH97uYL3wB3vzmn04VXXzxOO985yhLl8K118Iv/uJ+73KgjI+PcyB/b4NqMfW7mHoF+21bm9NHu4AVk5aXA3fsbUySQ4FjgXvbKOaXfxlOPRWOPvqn65YuhVe+cuEHgiTNVpuhcD1wQpLjkxwOnANsnjJmM/C67vOzgS9WS/NZCWzZApdc0gmHoSH44Adh06Y2jiZJg6m1UKiqx4ANwDXArcCmqro5yUVJ1naH/QXwM0l2AO8A9njZ6nw67DD4zd+EL38Zfu7n4PzzYcmSNo8oSYOl1TevVdUWYMuUdRdOev4w8Ott1iBJmj3fzytJahgKkqSGoSBJahgKkqSGoSBJahgKkqSGoSBJarR2Q7y2JPkBcPs87GoZcM+MoxYO+124FlO
vYL8H6riqetpMgwYuFOZLkhtmc8fAhcJ+F67F1CvYb9ucPpIkNQwFSVJjMYfCxn4X0GP2u3Atpl7Bflu1aK8pSJL2tJjPFCRJUxgKkqTGgg+FJGuSbE+yI8keH+KT5Igkn+hu/0qSlb2vcn7Motd3JLklyU1JvpDkuH7UOV9m6nfSuLOTVJKBfhnjbPpN8qru9/jmJB/vdY3zaRY/z89Ocm2SG7s/02f1o875kOSKJHcn+cZetifJpd2/i5uSvKi1YqpqwT6AJcC3gJ8FDge+BqyaMuYtwIe7z88BPtHvulvsdTVwdPf5bw1qr7PttzvuGOBLwFZgpN91t/z9PQG4EXhKd/np/a675X43Ar/Vfb4KuK3fdc+h35cDLwK+sZftZwGfAwKcCnylrVoW+pnCycCOqtpZVY8AY8C6KWPWAR/tPv8kcHqS9LDG+TJjr1V1bVU92F3cCizvcY3zaTbfW4A/BN4HPNzL4lowm37fBFxWVfcBVNXdPa5xPs2m3wKe1H1+LHBHD+ubV1X1JeDefQxZB/xldWwFnpzkmW3UstBD4VnAdyct7+qum3ZMdT5X+n7gZ3pS3fyaTa+TvYHObx6DasZ+k5wErKiqq3tZWEtm8/19LvDcJP+UZGuSNT2rbv7Npt/3AK9JsovOx/7+j96U1hf7++/7gLX6Gc0Hgel+45/6GtzZjBkEs+4jyWuAEeC0Vitq1z77TXII8AHg/F4V1LLZfH8PpTOFNErnLPC6JCdW1Y9arq0Ns+n3XOAjVXVJkpcAV3b73d1+eT3Xs/+nFvqZwi5gxaTl5ex5itmMSXIondPQfZ3GHaxm0ytJzgDeDaytqh/3qLY2zNTvMcCJwHiS2+jMw24e4IvNs/1Z/kxVPVpV3wa20wmJQTSbft8AbAKoqi8DR9K5edxCNKt/3/NhoYfC9cAJSY5PcjidC8mbp4zZDLyu+/xs4IvVvbIzYGbstTudcjmdQBjk+WaYod+qur+qllXVyqpaSecaytqquqE/5c7ZbH6WP03nxQQkWUZnOmlnT6ucP7Pp9zvA6QBJnk8nFH7Q0yp7ZzPw2u6rkE4F7q+qO9s40IKePqqqx5JsAK6h82qGK6rq5iQXATdU1WbgL+icdu6gc4ZwTv8qPnCz7PWPgSHgr7vX0r9TVWv7VvQczLLfBWOW/V4DvCLJLcDjwLuq6of9q/rAzbLf3wH+PMlv05lKOX9Af6EjyVV0pv2Wda+R/AFwGEBVfZjONZOzgB3Ag8DrW6tlQP8OJUktWOjTR5Kk/WAoSJIahoIkqWEoSJIahoIkqWEoSHOQZEWSbyd5anf5Kd3lgb4DrRYvQ0Gag6r6LvAh4L3dVe8FNlbV7f2rSjpwvk9BmqMkhwHbgCvo3Kn0pO6dPaWBs6Df0Sz1QlU9muRdwN8CrzAQNMicPpLmx5nAnXRuwicNLENBmqMkvwD8Cp07sf52Wx9+IvWCoSDNQfdT+j4E/M+q+g6dmw5e3N+qpANnKEhz8yY6d5v9u+7ynwHPSzLIH2CkRcxXH0mSGp4pSJIahoIkqWEoSJIahoIkqWEoSJIahoIkqWEoSJIa/x8pW7deGORxzQAAAABJRU5ErkJggg==\n", 168 | "text/plain": [ 169 | "
" 170 | ] 171 | }, 172 | "metadata": {}, 173 | "output_type": "display_data" 174 | } 175 | ], 176 | "source": [ 177 | "plt.scatter(\n", 178 | " x=input_data[:,0], \n", 179 | " y=input_data[:,1], \n", 180 | " color=[('red' if x == 1 else 'blue') for x in output_data]\n", 181 | ")\n", 182 | "plt.xlabel('X')\n", 183 | "plt.ylabel('Y')\n", 184 | "plt.title('Actual')\n", 185 | "plt.grid()\n", 186 | "plt.show()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 13, 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "data": { 196 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYUAAAEWCAYAAACJ0YulAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAFnlJREFUeJzt3X+UX3V95/Hni4SIZoBqo1PXBIKKVUo9C4xAj+4yKVRDXJM9PeiBVZQeIW01a1uLLqsuumi7LsVqOVIlXVkRqzHq1qYYi7+YSrvGQyKLFWhsjCCBCCJIO/yGvPeP79frMJnMDJm5M3xnno9z5uR77/3kc9/v+fWae+/3e7+pKiRJAjhgtguQJD15GAqSpIahIElqGAqSpIahIElqGAqSpIahIO2HJMuTVJKF3eUvJXnDDOz3PUk+2fZ+NH8ZCprTktyc5IEkw0nuSPK/k/RN936q6tSqunyS9Zwy3fuXpouhoPngVVXVBxwLvAR418iN6fBnQcJQ0DxSVbcBXwKOTjKU5I+S/ANwP/DcJIcm+ViS3UluS/K+JAsAkixIclGSu5LsBF45cu7ufGePWD4nyU1J/jXJjUmOTXIFcBjwN90jl7d3x56Y5P8m+WmS65MMjpjniCR/153nK8CSlj9NmucMBc0bSZYBq4DruqvOBNYCBwO3AJcDjwLPB44BXg787Bf9OcB/6K4fAE4bZz+vBt4DvB44BFgN/KSqzgR+SPfIpaouTPIc4IvA+4BnAOcCn0/yzO50nwK20QmD9wKtX7fQ/GYoaD74QpKfAn8P/B3wx931H6+qG6rqUTq/kE8Ffr+q7quqO4EPAqd3x74G+FBV3VpVdwP/Y5z9nQ1cWFXXVseOqrplH2NfB2yuqs1VtaeqvgJsBVYlOYzO6a7/VlUPVdU3gL/Z78+CNAkLZ7sAaQb8x6r66sgVSQBuHbHqcOBAYHd3G3T+aPrZmH8zavy+fskDLAO+P8naDgdeneRVI9YdCFzd3ec9VXXfqP0um+Tc0hNmKGg+G3mL4FuBh4Al3SOH0Xbz+F/Gh40z763A8yaxz5+NvaKqzhk9MMnhwNOTLB4RDIeNMYc0bTx9JAFVtRv4MvCBJIckOSDJ85Kc1B2yEXhLkqVJng6cN850/ws4N8lx3Wc2Pb/7Cx7gDuC5I8Z+EnhVkld0L2YflGQwydLuKaetwH9PsijJy4BXIbXIUJB+7vXAIuBG4B7gc8Czu9v+ArgKuB74NvB/9jVJVX0W+CM6F4n/FfgCnWsW0LkW8a7uM43OrapbgTXAO4Af0zlyeBs//9n8T8AJwN3Au4FPTEej0r7EN9mRJP2MRwqSpIahIElqGAqSpIahIElq9NzrFJYsWVLLly+f8jz33XcfixcvnnpBPcJ+56751CvY7/7a
tm3bXVX1zInG9VwoLF++nK1bt055nqGhIQYHB6deUI+w37lrPvUK9ru/koz3KvyGp48kSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZKebKrgmmvgzW+GW2+FLVtmbNethUKSy5LcmeS7+9ieJBcn2ZHkO0mObasWSeopb3kLnHoqfOQjcOedcPLJ8I53zMiu2zxS+DiwcpztpwJHdj/WAh9psRZJ6g3f/jZcdhncd1/niAHg/vvhQx+C7dtb331roVBV36DzblH7sgb4RHVsAX4hybPHGS9Jc9+VV8KDD+69fs8e+OIXW999q++8lmQ5cGVVHT3GtiuB91fV33eXvwb8l6ra68ZGSdbSOZqgv7//uA0bNky5tuHhYfr6+qY8T6+w37lrPvUK86DfO+6A225rjhKGly6lb9cuOOAAWLoUnjnhPe3GtGLFim1VNTDRuNm8IV7GWDdmQlXVemA9wMDAQE3HzaG8qdbcNp/6nU+9wjzo95Zb4EUvggceAGDooosYPPdceOpT4eab4VnPanX3s/nso13AshHLS4HbZ6kWSXpyOPxwuPRSOOgg6OvrHCE89alwxRWtBwLM7pHCJmBdkg3ACcC9VbV7FuuRpCeHM8+EV74SvvSlTiDs3g2HHjoju24tFJJ8GhgEliTZBbwbOBCgqj4KbAZWATuA+4HfaqsWSeo5z3gGvPa1MDQ0Y4EALYZCVZ0xwfYC3tzW/iVJT5yvaJYkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNVoNhSQrk2xPsiPJeWNsPyzJ1UmuS/KdJKvarEeSNL7WQiHJAuAS4FTgKOCMJEeNGvYuYGNVHQOcDvx5W/VIkibW5pHC8cCOqtpZVQ8DG4A1o8YUcEj38aHA7S3WI0maQKqqnYmT04CVVXV2d/lM4ISqWjdizLOBLwNPBxYDp1TVtjHmWgusBejv7z9uw4YNU65veHiYvr6+Kc/TK+x37ppPvYL97q8VK1Zsq6qBicYtnPKe9i1jrBudQGcAH6+qDyT5NeCKJEdX1Z7H/aeq9cB6gIGBgRocHJxycUNDQ0zHPL3Cfueu+dQr2G/b2jx9tAtYNmJ5KXufHnojsBGgqr4JHAQsabEmSdI42gyFa4EjkxyRZBGdC8mbRo35IXAyQJIX0QmFH7dYkyRpHK2FQlU9CqwDrgJuovMsoxuSXJBkdXfYHwLnJLke+DRwVrV1kUOSNKE2rylQVZuBzaPWnT/i8Y3AS9usQZI0eb6iWZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSY1WQyHJyiTbk+xIct4+xrwmyY1JbkjyqTbrkSSNb2FbEydZAFwC/AawC7g2yaaqunHEmCOB/wq8tKruSfKstuqRJE2szSOF44EdVbWzqh4GNgBrRo05B7ikqu4BqKo7W6xHkjSBVFU7EyenASur6uzu8pnACVW1bsSYLwDfA14KLADeU1V/O8Zca4G1AP39/cdt2LBhyvUNDw/T19c35Xl6hf3OXfOpV7Df/bVixYptVTUw0bjWTh8BGWPd6ARaCBwJDAJLgWuSHF1VP33cf6paD6wHGBgYqMHBwSkXNzQ0xHTM0yvsd+6aT72C/batzdNHu4BlI5aXArePMeavq+qRqvoBsJ1OSEiSZkGboXAtcGSSI5IsAk4HNo0a8wVgBUCSJcALgJ0t1iRJGkdroVBVjwLrgKuAm4CNVXVD
kguSrO4Ouwr4SZIbgauBt1XVT9qqSZI0vjavKVBVm4HNo9adP+JxAW/tfkiSZpmvaJYkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNVp9RbM0L+zZA0ND8L3vwa/8CrzsZZCxbhIsPfkZCtJU3H03nHQS3HwzPPYYLFgAL3whfP3rcPDBs12d9ITt8/RRks1Jls9cKVIPevObYft2GB6GBx7o/PuP/whvf/tsVybtl/GuKXwc+HKSdyY5cIbqkXpHFXz+8/DII49f/9BD8Jd/OTs1SVO0z9NHVbUxyReB84GtSa4A9ozY/qczUJ/05FXVuZ4wltFBIfWIiZ599AhwH/AU4OBRH9L8dsABMDjY+XekBQtg1apZKUmaqn0eKSRZCfwpnXdLO7aq7p+xqqRecemlcOKJcP/9nY/FizsXmP/sz2a7Mmm/jPfso3cCr66qG2aqGKnnPO95sGMHfPKTnQvMxxwDr30t9PXNdmXSfhnvmsK/m8lCpJ516KGdZyFJc4CvaJYkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNQwFSVLDUJAkNVoNhSQrk2xPsiPJeeOMOy1JJRlosx5J0vhaC4UkC4BLgFOBo4Azkhw1xriDgbcA32qrFknS5LR5pHA8sKOqdlbVw8AGYM0Y494LXAg82GItkqRJGO/9FKbqOcCtI5Z3ASeMHJDkGGBZVV2Z5Nx9TZRkLbAWoL+/n6GhoSkXNzw8PC3z9Ar7nbvmU69gv21rMxQyxrpqNiYHAB8EzppooqpaD6wHGBgYqMHBwSkXNzQ0xHTM0yvsd+6aT72C/batzdNHu4BlI5aXArePWD4YOBoYSnIzcCKwyYvNkjR72gyFa4EjkxyRZBFwOp33ewagqu6tqiVVtbyqlgNbgNVVtbXFmiRJ42gtFKrqUWAdcBVwE7Cxqm5IckGS1W3tV5K0/9q8pkBVbQY2j1p3/j7GDrZZiyRpYr6iWZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUaPU9mtXjfvQj2LIF+vvhxBMhme2KJLXMUNDequC88+Dii2HRItizB37pl+CrX4XDD5/t6iS1yNNH2ttf/RVccgk8+CD8y7/A8DDs3Alr1sx2ZZJaZihobxdfDPfd9/h1e/bA974H//zPs1OTpBlhKGhv99479vqFCztHDpLmLENBe/vN34SDDtp7/YIF8OIXz3w9kmaMoaC9/d7vwbJl8LSndZYXLOg8Xr8eDjxwdmuT1CqffaS9HXIIXHcdXH45bN7cCYg3vQl+9VdnuzJJLTMUNLbFiztB8KY3zXYlkmaQp48kSQ1DQZLUaDUUkqxMsj3JjiTnjbH9rUluTPKdJF9L4stlJWkWtRYKSRYAlwCnAkcBZyQ5atSw64CBqnox8DngwrbqkSRNrM0jheOBHVW1s6oeBjYAj7tPQlVdXVX3dxe3AEtbrEeSNIFUVTsTJ6cBK6vq7O7ymcAJVbVuH+M/DPyoqt43xra1wFqA/v7+4zZs2DDl+oaHh+nr65vyPL3Cfueu+dQr2O/+WrFixbaqGphoXJtPSR3rPstjJlCS1wEDwEljba+q9cB6gIGBgRocHJxycUNDQ0zHPL3Cfueu+dQr2G/b2gyFXcCyEctLgdtHD0pyCvBO4KSqeqjFeiRJE2jzmsK1wJFJjkiyCDgd2DRyQJJjgEuB1VV1Z4u1SJImobVQqKpHgXXAVcBNwMaquiHJBUlWd4f9CdAHfDbJ/0uyaR/TSZJmQKu3uaiqzcDmUevOH/H4lDb3L0l6YnxFsySpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShI
khqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhqGgiSpYShIkhoLZ7uAmfbYY3D99fDAA7BnDxxgLEpSo9VfiUlWJtmeZEeS88bY/pQkn+lu/1aS5W3Wc8018JznwEknwT/9Exx2GGzd2uYeJam3tBYKSRYAlwCnAkcBZyQ5atSwNwL3VNXzgQ8C/7Oteu66C1atgjvugOHhzlHCbbfBKad0liVJ7R4pHA/sqKqdVfUwsAFYM2rMGuDy7uPPAScnSRvFfPrTnVNHoz32GHz+823sUZJ6T6qqnYmT04CVVXV2d/lM4ISqWjdizHe7Y3Z1l7/fHXPXqLnWAmsB+vv7j9uwYcMTruf222H37p8vL106zK5dfSSdU0r9/U94yp4yPDxMX1/fbJcxY+ZTv/OpV7Df/bVixYptVTUw0bg2LzSP9Rf/6ASazBiqaj2wHmBgYKAGBwefcDFf+xr89m///FTRRRcNce65gyxeDFdfDS95yROesqcMDQ2xP5+3XjWf+p1PvYL9tq3N00e7gGUjlpcCt+9rTJKFwKHA3W0U8+u/DieeCE972s/XLV4Mr3jF3A8ESZqsNkPhWuDIJEckWQScDmwaNWYT8Ibu49OAr1dL57MS2LwZPvCBTjj09cGHPwwbN7axN0nqTa2FQlU9CqwDrgJuAjZW1Q1JLkiyujvsY8AvJtkBvBXY62mr0+nAA+F3fge++U345V+Gs86CBQva3KMk9ZZWX7xWVZuBzaPWnT/i8YPAq9usQZI0eb6eV5LUMBQkSQ1DQZLUMBQkSQ1DQZLUMBQkSQ1DQZLUaO2GeG1J8mPglmmYaglw14Sj5g77nbvmU69gv/vr8Kp65kSDei4UpkuSrZO5Y+BcYb9z13zqFey3bZ4+kiQ1DAVJUmM+h8L62S5ghtnv3DWfegX7bdW8vaYgSdrbfD5SkCSNYihIkhpzPhSSrEyyPcmOJHu9iU+SpyT5THf7t5Isn/kqp8cken1rkhuTfCfJ15IcPht1TpeJ+h0x7rQklaSnn8Y4mX6TvKb7Nb4hyadmusbpNInv58OSXJ3kuu739KrZqHM6JLksyZ1JvruP7Ulycfdz8Z0kx7ZWTFXN2Q9gAfB94LnAIuB64KhRY94EfLT7+HTgM7Ndd4u9rgCe1n38u73a62T77Y47GPgGsAUYmO26W/76HglcBzy9u/ys2a675X7XA7/bfXwUcPNs1z2Ffv89cCzw3X1sXwV8CQhwIvCttmqZ60cKxwM7qmpnVT0MbADWjBqzBri8+/hzwMlJMoM1TpcJe62qq6vq/u7iFmDpDNc4nSbztQV4L3Ah8OBMFteCyfR7DnBJVd0DUFV3znCN02ky/RZwSPfxocDtM1jftKqqbwB3jzNkDfCJ6tgC/EKSZ7dRy1wPhecAt45Y3tVdN+aY6ryv9L3AL85IddNrMr2O9EY6f3n0qgn7TXIMsKyqrpzJwloyma/vC4AXJPmHJFuSrJyx6qbfZPp9D/C6JLvovO3vf56Z0mbFE/353m+tvkfzk8BYf/GPfg7uZMb0gkn3keR1wABwUqsVtWvcfpMcAHwQOGumCmrZZL6+C+mcQhqkcxR4TZKjq+qnLdfWhsn0ewbw8ar6QJJfA67o9run/fJm3Iz9nprrRwq7gGUjlpey9yFmMybJQjqHoeMdxj1ZTaZXkpwCvBNYXVUPzVBtbZio34OBo4GhJDfTOQ+7qYcvNk/2e/mvq+qRqvoBsJ1OSPSiyfT7RmAjQFV9EziIzs3j5qJJ/XxPh7keCtcCRyY5IskiOheSN40aswl4Q/fxacDXq3tlp8dM2Gv3dMqldAKhl883wwT9VtW9VbWkqpZX1XI611BWV9XW2Sl3yibzvfwFOk8mIMkSOqeTds5oldNnMv3+EDgZIMmL6ITCj2e0ypmzCXh991lIJwL3VtXuNnY0p08fVdWjSdYBV9F5NsNlVXVDkguArVW1CfgYncPOHXSOEE6fvYr33yR7
/ROgD/hs91r6D6tq9awVPQWT7HfOmGS/VwEvT3Ij8Bjwtqr6yexVvf8m2e8fAn+R5A/onEo5q0f/oCPJp+mc9lvSvUbybuBAgKr6KJ1rJquAHcD9wG+1VkuPfg4lSS2Y66ePJElPgKEgSWoYCpKkhqEgSWoYCpKkhqEgTUGSZUl+kOQZ3eWnd5d7+g60mr8MBWkKqupW4CPA+7ur3g+sr6pbZq8qaf/5OgVpipIcCGwDLqNzp9Jjunf2lHrOnH5FszQTquqRJG8D/hZ4uYGgXubpI2l6nArspnMTPqlnGQrSFCX5t8Bv0LkT6x+09eYn0kwwFKQp6L5L30eA36+qH9K56eBFs1uVtP8MBWlqzqFzt9mvdJf/HHhhkl5+AyPNYz77SJLU8EhBktQwFCRJDUNBktQwFCRJDUNBktQwFCRJDUNBktT4/3MH+lStpVvqAAAAAElFTkSuQmCC\n", 197 | "text/plain": [ 198 | "
" 199 | ] 200 | }, 201 | "metadata": {}, 202 | "output_type": "display_data" 203 | } 204 | ], 205 | "source": [ 206 | "plt.scatter(\n", 207 | " x=input_data[:,0], \n", 208 | " y=input_data[:,1], \n", 209 | " color=[('red' if x == 1 else 'blue') for x in predicted_output]\n", 210 | ")\n", 211 | "plt.xlabel('X')\n", 212 | "plt.ylabel('Y')\n", 213 | "plt.title('Predicted')\n", 214 | "plt.grid()\n", 215 | "plt.show()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [] 224 | } 225 | ], 226 | "metadata": { 227 | "kernelspec": { 228 | "display_name": "Python 3", 229 | "language": "python", 230 | "name": "python3" 231 | }, 232 | "language_info": { 233 | "codemirror_mode": { 234 | "name": "ipython", 235 | "version": 3 236 | }, 237 | "file_extension": ".py", 238 | "mimetype": "text/x-python", 239 | "name": "python", 240 | "nbconvert_exporter": "python", 241 | "pygments_lexer": "ipython3", 242 | "version": "3.6.5" 243 | } 244 | }, 245 | "nbformat": 4, 246 | "nbformat_minor": 2 247 | } 248 | -------------------------------------------------------------------------------- /ch.10/R/CustomerSegmentation.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(readxl) 3 | library(ggplot2) 4 | #### 1. Load Data #### 5 | df <- read_excel( 6 | path="~/Documents/data-science-for-marketing/ch.10/data/Online Retail.xlsx", 7 | sheet="Online Retail" 8 | ) 9 | 10 | #### 2. 
Data Clean-Up #### 11 | 12 | # ignore negative quantity 13 | dim(df) 14 | df <- df[which(df$Quantity > 0),] 15 | dim(df) 16 | 17 | # remove records with NA 18 | df <- na.omit(df) 19 | dim(df) 20 | 21 | # excluding incomplete month 22 | sprintf("Date Range: %s ~ %s", min(df$InvoiceDate), max(df$InvoiceDate)) 23 | dim(df) 24 | df <- df[which(df$InvoiceDate < '2011-12-01'),] 25 | dim(df) 26 | 27 | # total sales 28 | df$Sales <- df$Quantity * df$UnitPrice 29 | 30 | # per customer data 31 | customerDF <- df %>% 32 | group_by(CustomerID) %>% 33 | summarize(TotalSales=sum(Sales), OrderCount=length(unique(InvoiceDate))) %>% 34 | mutate(AvgOrderValue=TotalSales/OrderCount) 35 | 36 | rankDF <- customerDF %>% 37 | mutate(TotalSales=rank(TotalSales), OrderCount=rank(OrderCount, ties.method="first"), AvgOrderValue=rank(AvgOrderValue)) 38 | 39 | normalizedDF <- rankDF %>% 40 | mutate(TotalSales=scale(TotalSales), OrderCount=scale(OrderCount), AvgOrderValue=scale(AvgOrderValue)) 41 | 42 | # check for normalization - mean of 0 & std of 1 43 | summary(normalizedDF) 44 | sapply(normalizedDF, sd) 45 | 46 | #### 3. 
Customer Segmentation via K-Means Clustering #### 47 | 48 | cluster <- kmeans(normalizedDF[c("TotalSales", "OrderCount", "AvgOrderValue")], 4) 49 | 50 | # cluster centers 51 | cluster$centers 52 | # cluster labels 53 | normalizedDF$Cluster <- cluster$cluster 54 | 55 | normalizedDF %>% group_by(Cluster) %>% summarise(Count=n()) 56 | 57 | ggplot(normalizedDF, aes(x=AvgOrderValue, y=OrderCount, color=Cluster)) + 58 | geom_point() 59 | 60 | ggplot(normalizedDF, aes(x=TotalSales, y=OrderCount, color=Cluster)) + 61 | geom_point() 62 | 63 | ggplot(normalizedDF, aes(x=TotalSales, y=AvgOrderValue, color=Cluster)) + 64 | geom_point() 65 | 66 | 67 | # Selecting the best number of cluster 68 | library(cluster) 69 | 70 | for(n_cluster in 4:8){ 71 | cluster <- kmeans(normalizedDF[c("TotalSales", "OrderCount", "AvgOrderValue")], n_cluster) 72 | 73 | silhouetteScore <- mean( 74 | silhouette( 75 | cluster$cluster, 76 | dist(normalizedDF[c("TotalSales", "OrderCount", "AvgOrderValue")], method = "euclidean") 77 | )[,3] 78 | ) 79 | print(sprintf('Silhouette Score for %i Clusters: %0.4f', n_cluster, silhouetteScore)) 80 | } 81 | 82 | # Interpreting customer segments 83 | cluster <- kmeans(normalizedDF[c("TotalSales", "OrderCount", "AvgOrderValue")], 4) 84 | normalizedDF$Cluster <- cluster$cluster 85 | # count per cluster 86 | normalizedDF %>% group_by(Cluster) %>% summarise(Count=n()) 87 | # cluster centers 88 | cluster$centers 89 | 90 | # High value cluster summary 91 | summary(customerDF[which(normalizedDF$Cluster == 4),]) 92 | 93 | highValueCustomers <- unlist( 94 | customerDF[which(normalizedDF$Cluster == 4),'CustomerID'][,1], use.names = FALSE 95 | ) 96 | 97 | df[which(df$CustomerID %in% highValueCustomers),] %>% 98 | group_by(Description) %>% 99 | summarise(Count=n()) %>% 100 | arrange(desc(Count)) 101 | 102 | 103 | -------------------------------------------------------------------------------- /ch.11/R/CustomerRetention.R: 
-------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(tidyr) 3 | library(readxl) 4 | 5 | #### 1. Load Data #### 6 | df <- read_excel( 7 | path="~/Documents/data-science-for-marketing/ch.11/data/WA_Fn-UseC_-Telco-Customer-Churn.xlsx" 8 | ) 9 | 10 | #### 2. Data Analysis & Preparation #### 11 | df <- df %>% drop_na() 12 | 13 | apply(df, 2, function(x) length(unique(x))) 14 | 15 | ggplot(df %>% group_by(gender) %>% summarise(Count=n()), aes(x=gender, y=Count)) + 16 | geom_bar(width=0.5, stat="identity") + 17 | ggtitle('') + 18 | xlab("Gender") + 19 | ylab("Count") + 20 | theme(plot.title = element_text(hjust = 0.5)) 21 | 22 | ggplot(df %>% group_by(InternetService) %>% summarise(Count=n()), aes(x=InternetService, y=Count)) + 23 | geom_bar(width=0.5, stat="identity") + 24 | ggtitle('') + 25 | xlab("Internet Service") + 26 | ylab("Count") + 27 | theme(plot.title = element_text(hjust = 0.5)) 28 | 29 | ggplot(df %>% group_by(PaymentMethod) %>% summarise(Count=n()), aes(x=PaymentMethod, y=Count)) + 30 | geom_bar(width=0.5, stat="identity") + 31 | ggtitle('') + 32 | xlab("Payment Method") + 33 | ylab("Count") + 34 | theme(plot.title = element_text(hjust = 0.5)) 35 | 36 | # Binary & Continuous Vars 37 | sampleDF <- df %>% 38 | select(tenure, MonthlyCharges, TotalCharges, gender, Partner, Dependents, PhoneService, PaperlessBilling, Churn) %>% 39 | mutate( 40 | # transforming continuous vars 41 | tenure=(tenure - mean(tenure))/sd(tenure), 42 | MonthlyCharges=(log(MonthlyCharges) - mean(log(MonthlyCharges)))/sd(log(MonthlyCharges)), 43 | TotalCharges=(log(TotalCharges) - mean(log(TotalCharges)))/sd(log(TotalCharges)), 44 | # encoding binary categorical vars 45 | gender=gender %>% as.factor() %>% as.numeric() - 1, 46 | Partner=Partner %>% as.factor() %>% as.numeric() - 1, 47 | Dependents=Dependents %>% as.factor() %>% as.numeric() - 1, 48 | PhoneService=PhoneService %>% as.factor() %>% as.numeric() - 1, 49 | 
PaperlessBilling=PaperlessBilling %>% as.factor() %>% as.numeric() - 1, 50 | Churn=Churn %>% as.factor() %>% as.numeric() - 1 51 | ) 52 | 53 | summary(df[,c("tenure", "MonthlyCharges", "TotalCharges")]) 54 | apply(df[,c("tenure", "MonthlyCharges", "TotalCharges")], 2, sd) 55 | 56 | summary(sampleDF[,c("tenure", "MonthlyCharges", "TotalCharges")]) 57 | apply(sampleDF[,c("tenure", "MonthlyCharges", "TotalCharges")], 2, sd) 58 | 59 | # Dummy vars 60 | # install.packages('dummies') 61 | library(dummies) 62 | 63 | sampleDF <- cbind(sampleDF, dummy(df$MultipleLines, sep=".")) 64 | names(sampleDF) = gsub("sampleDF", "MultipleLines", names(sampleDF)) 65 | 66 | sampleDF <- cbind(sampleDF, dummy(df$InternetService, sep=".")) 67 | names(sampleDF) = gsub("sampleDF", "InternetService", names(sampleDF)) 68 | 69 | sampleDF <- cbind(sampleDF, dummy(df$OnlineSecurity, sep=".")) 70 | names(sampleDF) = gsub("sampleDF", "OnlineSecurity", names(sampleDF)) 71 | 72 | sampleDF <- cbind(sampleDF, dummy(df$OnlineBackup, sep=".")) 73 | names(sampleDF) = gsub("sampleDF", "OnlineBackup", names(sampleDF)) 74 | 75 | sampleDF <- cbind(sampleDF, dummy(df$DeviceProtection, sep=".")) 76 | names(sampleDF) = gsub("sampleDF", "DeviceProtection", names(sampleDF)) 77 | 78 | sampleDF <- cbind(sampleDF, dummy(df$TechSupport, sep=".")) 79 | names(sampleDF) = gsub("sampleDF", "TechSupport", names(sampleDF)) 80 | 81 | sampleDF <- cbind(sampleDF, dummy(df$StreamingTV, sep=".")) 82 | names(sampleDF) = gsub("sampleDF", "StreamingTV", names(sampleDF)) 83 | 84 | sampleDF <- cbind(sampleDF, dummy(df$StreamingMovies, sep=".")) 85 | names(sampleDF) = gsub("sampleDF", "StreamingMovies", names(sampleDF)) 86 | 87 | sampleDF <- cbind(sampleDF, dummy(df$Contract, sep=".")) 88 | names(sampleDF) = gsub("sampleDF", "Contract", names(sampleDF)) 89 | 90 | sampleDF <- cbind(sampleDF, dummy(df$PaymentMethod, sep=".")) 91 | names(sampleDF) = gsub("sampleDF", "PaymentMethod", names(sampleDF)) 92 | 93 | 94 | #### 3. 
Train & Test Set Split #### 95 | library(caTools) 96 | 97 | sample <- sample.split(sampleDF$Churn, SplitRatio = .7) 98 | 99 | train <- as.data.frame(subset(sampleDF, sample == TRUE)) 100 | test <- as.data.frame(subset(sampleDF, sample == FALSE)) 101 | 102 | trainX <- as.matrix(train[,names(train) != "Churn"]) 103 | trainY <- train$Churn 104 | testX <- as.matrix(test[,names(test) != "Churn"]) 105 | testY <- test$Churn 106 | 107 | 108 | #### 4. Artificial Neural Network (ANN) with Keras #### 109 | install.packages("devtools") 110 | devtools::install_github("rstudio/tensorflow") 111 | library(tensorflow) 112 | install_tensorflow() 113 | 114 | devtools::install_github("rstudio/keras") 115 | library(keras) 116 | install_keras() 117 | 118 | 119 | model <- keras_model_sequential() 120 | model %>% 121 | layer_dense(units = 16, kernel_initializer = "uniform", activation = 'relu', input_shape=ncol(sampleDF)-1) %>% 122 | layer_dense(units = 8, kernel_initializer = "uniform", activation = 'relu') %>% 123 | layer_dense(units = 1, kernel_initializer = "uniform", activation = 'sigmoid') %>% 124 | compile( 125 | optimizer = 'adam', 126 | loss = 'binary_crossentropy', 127 | metrics = c('accuracy') 128 | ) 129 | 130 | history <- model %>% fit( 131 | trainX, 132 | trainY, 133 | epochs = 50, batch_size = 100, 134 | validation_split = 0.2 135 | ) 136 | 137 | # Evaluating ANN model 138 | inSamplePreds <- as.double(model %>% predict_classes(trainX)) 139 | outSamplePreds <- as.double(model %>% predict_classes(testX)) 140 | 141 | # - Accuracy, Precision, and Recall 142 | inSampleAccuracy <- mean(trainY == inSamplePreds) 143 | outSampleAccuracy <- mean(testY == outSamplePreds) 144 | 145 | inSamplePrecision <- sum(inSamplePreds & trainY) / sum(inSamplePreds) 146 | outSamplePrecision <- sum(outSamplePreds & testY) / sum(outSamplePreds) 147 | 148 | inSampleRecall <- sum(inSamplePreds & trainY) / sum(trainY) 149 | outSampleRecall <- sum(outSamplePreds & testY) / sum(testY) 150 | 151 | 152 | 
# Report the in-sample (train) and out-of-sample (test) metrics computed above.
print(sprintf('In-Sample Accuracy: %0.4f', inSampleAccuracy))
print(sprintf('Out-Sample Accuracy: %0.4f', outSampleAccuracy))
print(sprintf('In-Sample Precision: %0.4f', inSamplePrecision))
print(sprintf('Out-Sample Precision: %0.4f', outSamplePrecision))
print(sprintf('In-Sample Recall: %0.4f', inSampleRecall))
print(sprintf('Out-Sample Recall: %0.4f', outSampleRecall))


# - ROC & AUC
library(ROCR)

# Raw model outputs for the test rows, flattened to a plain numeric vector.
outSamplePredProbs <- as.double(predict(model, testX))

pred <- prediction(outSamplePredProbs, testY)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, measure = "auc")@y.values[[1]]

# Base-graphics calls are issued one after another, not chained with `+`
# (that is ggplot2 syntax); chaining them here evaluated NULL + NULL and
# printed a stray integer(0).
plot(
  perf,
  main = sprintf('Model ROC Curve (AUC: %0.2f)', auc),
  col = 'darkorange',
  lwd = 2
)
grid()
# Diagonal reference line: the ROC of a random classifier.
abline(a = 0, b = 1, col = 'darkgray', lty = 3, lwd = 2)

-------------------------------------------------------------------------------- /ch.11/python/CustomerRetention.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%matplotlib inline" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import matplotlib.pyplot as plt\n", 19 | "import pandas as pd\n", 20 | "import numpy as np" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "# 1. 
Load Data" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 3, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "df = pd.read_excel('../data/WA_Fn-UseC_-Telco-Customer-Churn.xlsx')" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 4, 42 | "metadata": {}, 43 | "outputs": [ 44 | { 45 | "data": { 46 | "text/plain": [ 47 | "(7043, 21)" 48 | ] 49 | }, 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "df.shape" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/html": [ 67 | "
\n", 68 | "\n", 81 | "\n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 
235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | "
customerIDgenderSeniorCitizenPartnerDependentstenurePhoneServiceMultipleLinesInternetServiceOnlineSecurity...DeviceProtectionTechSupportStreamingTVStreamingMoviesContractPaperlessBillingPaymentMethodMonthlyChargesTotalChargesChurn
07590-VHVEGFemale0YesNo1NoNo phone serviceDSLNo...NoNoNoNoMonth-to-monthYesElectronic check29.8529.85No
15575-GNVDEMale0NoNo34YesNoDSLYes...YesNoNoNoOne yearNoMailed check56.951889.5No
23668-QPYBKMale0NoNo2YesNoDSLYes...NoNoNoNoMonth-to-monthYesMailed check53.85108.15Yes
37795-CFOCWMale0NoNo45NoNo phone serviceDSLYes...YesYesNoNoOne yearNoBank transfer (automatic)42.301840.75No
49237-HQITUFemale0NoNo2YesNoFiber opticNo...NoNoNoNoMonth-to-monthYesElectronic check70.70151.65Yes
59305-CDSKCFemale0NoNo8YesYesFiber opticNo...YesNoYesYesMonth-to-monthYesElectronic check99.65820.5Yes
61452-KIOVKMale0NoYes22YesYesFiber opticNo...NoNoYesNoMonth-to-monthYesCredit card (automatic)89.101949.4No
76713-OKOMCFemale0NoNo10NoNo phone serviceDSLYes...NoNoNoNoMonth-to-monthNoMailed check29.75301.9No
87892-POOKPFemale0YesNo28YesYesFiber opticNo...YesYesYesYesMonth-to-monthYesElectronic check104.803046.05Yes
96388-TABGUMale0NoYes62YesNoDSLYes...NoNoNoNoOne yearNoBank transfer (automatic)56.153487.95No
\n", 351 | "

10 rows × 21 columns

\n", 352 | "
" 353 | ], 354 | "text/plain": [ 355 | " customerID gender SeniorCitizen Partner Dependents tenure PhoneService \\\n", 356 | "0 7590-VHVEG Female 0 Yes No 1 No \n", 357 | "1 5575-GNVDE Male 0 No No 34 Yes \n", 358 | "2 3668-QPYBK Male 0 No No 2 Yes \n", 359 | "3 7795-CFOCW Male 0 No No 45 No \n", 360 | "4 9237-HQITU Female 0 No No 2 Yes \n", 361 | "5 9305-CDSKC Female 0 No No 8 Yes \n", 362 | "6 1452-KIOVK Male 0 No Yes 22 Yes \n", 363 | "7 6713-OKOMC Female 0 No No 10 No \n", 364 | "8 7892-POOKP Female 0 Yes No 28 Yes \n", 365 | "9 6388-TABGU Male 0 No Yes 62 Yes \n", 366 | "\n", 367 | " MultipleLines InternetService OnlineSecurity ... DeviceProtection \\\n", 368 | "0 No phone service DSL No ... No \n", 369 | "1 No DSL Yes ... Yes \n", 370 | "2 No DSL Yes ... No \n", 371 | "3 No phone service DSL Yes ... Yes \n", 372 | "4 No Fiber optic No ... No \n", 373 | "5 Yes Fiber optic No ... Yes \n", 374 | "6 Yes Fiber optic No ... No \n", 375 | "7 No phone service DSL Yes ... No \n", 376 | "8 Yes Fiber optic No ... Yes \n", 377 | "9 No DSL Yes ... 
No \n", 378 | "\n", 379 | " TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \\\n", 380 | "0 No No No Month-to-month Yes \n", 381 | "1 No No No One year No \n", 382 | "2 No No No Month-to-month Yes \n", 383 | "3 Yes No No One year No \n", 384 | "4 No No No Month-to-month Yes \n", 385 | "5 No Yes Yes Month-to-month Yes \n", 386 | "6 No Yes No Month-to-month Yes \n", 387 | "7 No No No Month-to-month No \n", 388 | "8 Yes Yes Yes Month-to-month Yes \n", 389 | "9 No No No One year No \n", 390 | "\n", 391 | " PaymentMethod MonthlyCharges TotalCharges Churn \n", 392 | "0 Electronic check 29.85 29.85 No \n", 393 | "1 Mailed check 56.95 1889.5 No \n", 394 | "2 Mailed check 53.85 108.15 Yes \n", 395 | "3 Bank transfer (automatic) 42.30 1840.75 No \n", 396 | "4 Electronic check 70.70 151.65 Yes \n", 397 | "5 Electronic check 99.65 820.5 Yes \n", 398 | "6 Credit card (automatic) 89.10 1949.4 No \n", 399 | "7 Mailed check 29.75 301.9 No \n", 400 | "8 Electronic check 104.80 3046.05 Yes \n", 401 | "9 Bank transfer (automatic) 56.15 3487.95 No \n", 402 | "\n", 403 | "[10 rows x 21 columns]" 404 | ] 405 | }, 406 | "execution_count": 5, 407 | "metadata": {}, 408 | "output_type": "execute_result" 409 | } 410 | ], 411 | "source": [ 412 | "df.head(10)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "markdown", 417 | "metadata": {}, 418 | "source": [ 419 | "# 2. 
Data Analysis & Preparation" 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "#### - Encoding target var: Churn" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 6, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "df['Churn'] = df['Churn'].apply(lambda x: 1 if x == 'Yes' else 0)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": 7, 441 | "metadata": {}, 442 | "outputs": [ 443 | { 444 | "data": { 445 | "text/plain": [ 446 | "0.2653698707936959" 447 | ] 448 | }, 449 | "execution_count": 7, 450 | "metadata": {}, 451 | "output_type": "execute_result" 452 | } 453 | ], 454 | "source": [ 455 | "df['Churn'].mean()" 456 | ] 457 | }, 458 | { 459 | "cell_type": "markdown", 460 | "metadata": {}, 461 | "source": [ 462 | "#### - TotalCharges" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 8, 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "df['TotalCharges'] = df['TotalCharges'].replace(' ', np.nan).astype(float)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": 9, 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "(7043, 21)" 483 | ] 484 | }, 485 | "execution_count": 9, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "df.shape" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 10, 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "data": { 501 | "text/plain": [ 502 | "(7032, 21)" 503 | ] 504 | }, 505 | "execution_count": 10, 506 | "metadata": {}, 507 | "output_type": "execute_result" 508 | } 509 | ], 510 | "source": [ 511 | "df.dropna().shape" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 11, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "df = df.dropna()" 521 | ] 522 | }, 523 | { 524 | "cell_type": 
"markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "#### - Continuous Vars" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 12, 533 | "metadata": {}, 534 | "outputs": [ 535 | { 536 | "data": { 537 | "text/html": [ 538 | "
\n", 539 | "\n", 552 | "\n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | "
tenureMonthlyChargesTotalCharges
count7032.0000007032.0000007032.000000
mean32.42178664.7982082283.300441
std24.54526030.0859742266.771362
min1.00000018.25000018.800000
25%9.00000035.587500401.450000
50%29.00000070.3500001397.475000
75%55.00000089.8625003794.737500
max72.000000118.7500008684.800000
\n", 612 | "
" 613 | ], 614 | "text/plain": [ 615 | " tenure MonthlyCharges TotalCharges\n", 616 | "count 7032.000000 7032.000000 7032.000000\n", 617 | "mean 32.421786 64.798208 2283.300441\n", 618 | "std 24.545260 30.085974 2266.771362\n", 619 | "min 1.000000 18.250000 18.800000\n", 620 | "25% 9.000000 35.587500 401.450000\n", 621 | "50% 29.000000 70.350000 1397.475000\n", 622 | "75% 55.000000 89.862500 3794.737500\n", 623 | "max 72.000000 118.750000 8684.800000" 624 | ] 625 | }, 626 | "execution_count": 12, 627 | "metadata": {}, 628 | "output_type": "execute_result" 629 | } 630 | ], 631 | "source": [ 632 | "df[['tenure', 'MonthlyCharges', 'TotalCharges']].describe()" 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": 13, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "df['MonthlyCharges'] = np.log(df['MonthlyCharges'])\n", 642 | "df['MonthlyCharges'] = (df['MonthlyCharges'] - df['MonthlyCharges'].mean())/df['MonthlyCharges'].std()\n", 643 | "\n", 644 | "df['TotalCharges'] = np.log(df['TotalCharges'])\n", 645 | "df['TotalCharges'] = (df['TotalCharges'] - df['TotalCharges'].mean())/df['TotalCharges'].std()\n", 646 | "\n", 647 | "df['tenure'] = (df['tenure'] - df['tenure'].mean())/df['tenure'].std()" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 14, 653 | "metadata": {}, 654 | "outputs": [ 655 | { 656 | "data": { 657 | "text/html": [ 658 | "
\n", 659 | "\n", 672 | "\n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | "
tenureMonthlyChargesTotalCharges
count7.032000e+037.032000e+037.032000e+03
mean-1.028756e-164.688495e-147.150708e-15
std1.000000e+001.000000e+001.000000e+00
min-1.280157e+00-1.882268e+00-2.579056e+00
25%-9.542285e-01-7.583727e-01-6.080585e-01
50%-1.394072e-013.885103e-011.950521e-01
75%9.198605e-018.004829e-018.382338e-01
max1.612459e+001.269576e+001.371323e+00
\n", 732 | "
" 733 | ], 734 | "text/plain": [ 735 | " tenure MonthlyCharges TotalCharges\n", 736 | "count 7.032000e+03 7.032000e+03 7.032000e+03\n", 737 | "mean -1.028756e-16 4.688495e-14 7.150708e-15\n", 738 | "std 1.000000e+00 1.000000e+00 1.000000e+00\n", 739 | "min -1.280157e+00 -1.882268e+00 -2.579056e+00\n", 740 | "25% -9.542285e-01 -7.583727e-01 -6.080585e-01\n", 741 | "50% -1.394072e-01 3.885103e-01 1.950521e-01\n", 742 | "75% 9.198605e-01 8.004829e-01 8.382338e-01\n", 743 | "max 1.612459e+00 1.269576e+00 1.371323e+00" 744 | ] 745 | }, 746 | "execution_count": 14, 747 | "metadata": {}, 748 | "output_type": "execute_result" 749 | } 750 | ], 751 | "source": [ 752 | "df[['tenure', 'MonthlyCharges', 'TotalCharges']].describe()" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": 15, 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "data": { 762 | "text/plain": [ 763 | "['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']" 764 | ] 765 | }, 766 | "execution_count": 15, 767 | "metadata": {}, 768 | "output_type": "execute_result" 769 | } 770 | ], 771 | "source": [ 772 | "continuous_vars = list(df.describe().columns)\n", 773 | "continuous_vars" 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "metadata": {}, 779 | "source": [ 780 | "#### - One-Hot Encoding" 781 | ] 782 | }, 783 | { 784 | "cell_type": "code", 785 | "execution_count": 16, 786 | "metadata": {}, 787 | "outputs": [ 788 | { 789 | "name": "stdout", 790 | "output_type": "stream", 791 | "text": [ 792 | "customerID 7032\n", 793 | "gender 2\n", 794 | "SeniorCitizen 2\n", 795 | "Partner 2\n", 796 | "Dependents 2\n", 797 | "tenure 72\n", 798 | "PhoneService 2\n", 799 | "MultipleLines 3\n", 800 | "InternetService 3\n", 801 | "OnlineSecurity 3\n", 802 | "OnlineBackup 3\n", 803 | "DeviceProtection 3\n", 804 | "TechSupport 3\n", 805 | "StreamingTV 3\n", 806 | "StreamingMovies 3\n", 807 | "Contract 3\n", 808 | "PaperlessBilling 2\n", 809 | "PaymentMethod 4\n", 810 | 
"MonthlyCharges 1584\n", 811 | "TotalCharges 6530\n", 812 | "Churn 2\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "for col in list(df.columns):\n", 818 | " print(col, df[col].nunique())" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 17, 824 | "metadata": { 825 | "scrolled": true 826 | }, 827 | "outputs": [ 828 | { 829 | "data": { 830 | "image/png": "\n", 831 | "text/plain": [ 832 | "
" 833 | ] 834 | }, 835 | "metadata": {}, 836 | "output_type": "display_data" 837 | }, 838 | { 839 | "data": { 840 | "image/png": "\n", 841 | "text/plain": [ 842 | "
" 843 | ] 844 | }, 845 | "metadata": {}, 846 | "output_type": "display_data" 847 | }, 848 | { 849 | "data": { 850 | "image/png": "\n", 851 | "text/plain": [ 852 | "
" 853 | ] 854 | }, 855 | "metadata": {}, 856 | "output_type": "display_data" 857 | } 858 | ], 859 | "source": [ 860 | "df.groupby('gender').count()['customerID'].plot(\n", 861 | " kind='bar', color='skyblue', grid=True, figsize=(8,6), title='Gender'\n", 862 | ")\n", 863 | "plt.show()\n", 864 | "\n", 865 | "df.groupby('InternetService').count()['customerID'].plot(\n", 866 | " kind='bar', color='skyblue', grid=True, figsize=(8,6), title='Internet Service'\n", 867 | ")\n", 868 | "plt.show()\n", 869 | "\n", 870 | "df.groupby('PaymentMethod').count()['customerID'].plot(\n", 871 | " kind='bar', color='skyblue', grid=True, figsize=(8,6), title='Payment Method'\n", 872 | ")\n", 873 | "plt.show()" 874 | ] 875 | }, 876 | { 877 | "cell_type": "code", 878 | "execution_count": 18, 879 | "metadata": {}, 880 | "outputs": [], 881 | "source": [ 882 | "dummy_cols = []\n", 883 | "\n", 884 | "sample_set = df[['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn']].copy(deep=True)\n", 885 | "\n", 886 | "for col in list(df.columns):\n", 887 | " if col not in ['tenure', 'MonthlyCharges', 'TotalCharges', 'Churn'] and df[col].nunique() < 5:\n", 888 | " dummy_vars = pd.get_dummies(df[col])\n", 889 | " dummy_vars.columns = [col+str(x) for x in dummy_vars.columns] \n", 890 | " sample_set = pd.concat([sample_set, dummy_vars], axis=1)" 891 | ] 892 | }, 893 | { 894 | "cell_type": "code", 895 | "execution_count": 19, 896 | "metadata": {}, 897 | "outputs": [ 898 | { 899 | "data": { 900 | "text/html": [ 901 | "
\n", 902 | "\n", 915 | "\n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " 
\n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | " \n", 1106 | " \n", 1107 | " \n", 1108 | " \n", 1109 | " \n", 1110 | " \n", 1111 | " \n", 1112 | " \n", 1113 | " \n", 1114 | " \n", 1115 | " \n", 1116 | " \n", 1117 | " \n", 1118 | " \n", 1119 | " \n", 1120 | " \n", 1121 | " \n", 1122 | " \n", 1123 | " \n", 1124 | " \n", 1125 | " \n", 1126 | " \n", 1127 | " \n", 1128 | " \n", 1129 | " \n", 1130 | " \n", 1131 | " \n", 1132 | " \n", 1133 | " \n", 1134 | " \n", 1135 | " \n", 1136 | " \n", 1137 | " \n", 1138 | " \n", 1139 | " \n", 1140 | " \n", 1141 | " \n", 1142 | " \n", 1143 | " \n", 1144 | " \n", 1145 | " \n", 1146 | " \n", 1147 | " \n", 1148 | " \n", 1149 | " \n", 1150 | " \n", 1151 | " \n", 1152 | " \n", 1153 | " \n", 1154 | " \n", 1155 | " \n", 1156 | " \n", 1157 | " \n", 1158 | " \n", 1159 | " \n", 1160 | " \n", 1161 | " \n", 1162 | " \n", 1163 | " \n", 1164 | " \n", 1165 | " \n", 1166 | " \n", 1167 | " \n", 1168 | " \n", 1169 | " \n", 1170 | " \n", 1171 | " \n", 1172 | " \n", 1173 | " \n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | "
tenureMonthlyChargesTotalChargesChurngenderFemalegenderMaleSeniorCitizen0SeniorCitizen1PartnerNoPartnerYes...StreamingMoviesYesContractMonth-to-monthContractOne yearContractTwo yearPaperlessBillingNoPaperlessBillingYesPaymentMethodBank transfer (automatic)PaymentMethodCredit card (automatic)PaymentMethodElectronic checkPaymentMethodMailed check
0-1.280157-1.054244-2.2813820101001...0100010010
10.0642980.0328960.3892690011010...0010100001
2-1.239416-0.061298-1.4525201011010...0100010001
30.512450-0.4675780.3724390011010...0010101000
4-1.2394160.396862-1.2348601101010...0100010010
5-0.9949700.974468-0.1478081101010...1100010010
6-0.4245950.7861420.4093630011010...0100010100
7-0.913487-1.059891-0.7915500101010...0100100001
8-0.1801481.0592690.6967331101001...1100010010
91.2050480.0090880.7839560011010...0010101000
\n", 1185 | "

10 rows × 47 columns

\n", 1186 | "
" 1187 | ], 1188 | "text/plain": [ 1189 | " tenure MonthlyCharges TotalCharges Churn genderFemale genderMale \\\n", 1190 | "0 -1.280157 -1.054244 -2.281382 0 1 0 \n", 1191 | "1 0.064298 0.032896 0.389269 0 0 1 \n", 1192 | "2 -1.239416 -0.061298 -1.452520 1 0 1 \n", 1193 | "3 0.512450 -0.467578 0.372439 0 0 1 \n", 1194 | "4 -1.239416 0.396862 -1.234860 1 1 0 \n", 1195 | "5 -0.994970 0.974468 -0.147808 1 1 0 \n", 1196 | "6 -0.424595 0.786142 0.409363 0 0 1 \n", 1197 | "7 -0.913487 -1.059891 -0.791550 0 1 0 \n", 1198 | "8 -0.180148 1.059269 0.696733 1 1 0 \n", 1199 | "9 1.205048 0.009088 0.783956 0 0 1 \n", 1200 | "\n", 1201 | " SeniorCitizen0 SeniorCitizen1 PartnerNo PartnerYes \\\n", 1202 | "0 1 0 0 1 \n", 1203 | "1 1 0 1 0 \n", 1204 | "2 1 0 1 0 \n", 1205 | "3 1 0 1 0 \n", 1206 | "4 1 0 1 0 \n", 1207 | "5 1 0 1 0 \n", 1208 | "6 1 0 1 0 \n", 1209 | "7 1 0 1 0 \n", 1210 | "8 1 0 0 1 \n", 1211 | "9 1 0 1 0 \n", 1212 | "\n", 1213 | " ... StreamingMoviesYes ContractMonth-to-month \\\n", 1214 | "0 ... 0 1 \n", 1215 | "1 ... 0 0 \n", 1216 | "2 ... 0 1 \n", 1217 | "3 ... 0 0 \n", 1218 | "4 ... 0 1 \n", 1219 | "5 ... 1 1 \n", 1220 | "6 ... 0 1 \n", 1221 | "7 ... 0 1 \n", 1222 | "8 ... 1 1 \n", 1223 | "9 ... 
0 0 \n", 1224 | "\n", 1225 | " ContractOne year ContractTwo year PaperlessBillingNo \\\n", 1226 | "0 0 0 0 \n", 1227 | "1 1 0 1 \n", 1228 | "2 0 0 0 \n", 1229 | "3 1 0 1 \n", 1230 | "4 0 0 0 \n", 1231 | "5 0 0 0 \n", 1232 | "6 0 0 0 \n", 1233 | "7 0 0 1 \n", 1234 | "8 0 0 0 \n", 1235 | "9 1 0 1 \n", 1236 | "\n", 1237 | " PaperlessBillingYes PaymentMethodBank transfer (automatic) \\\n", 1238 | "0 1 0 \n", 1239 | "1 0 0 \n", 1240 | "2 1 0 \n", 1241 | "3 0 1 \n", 1242 | "4 1 0 \n", 1243 | "5 1 0 \n", 1244 | "6 1 0 \n", 1245 | "7 0 0 \n", 1246 | "8 1 0 \n", 1247 | "9 0 1 \n", 1248 | "\n", 1249 | " PaymentMethodCredit card (automatic) PaymentMethodElectronic check \\\n", 1250 | "0 0 1 \n", 1251 | "1 0 0 \n", 1252 | "2 0 0 \n", 1253 | "3 0 0 \n", 1254 | "4 0 1 \n", 1255 | "5 0 1 \n", 1256 | "6 1 0 \n", 1257 | "7 0 0 \n", 1258 | "8 0 1 \n", 1259 | "9 0 0 \n", 1260 | "\n", 1261 | " PaymentMethodMailed check \n", 1262 | "0 0 \n", 1263 | "1 1 \n", 1264 | "2 1 \n", 1265 | "3 0 \n", 1266 | "4 0 \n", 1267 | "5 0 \n", 1268 | "6 0 \n", 1269 | "7 1 \n", 1270 | "8 0 \n", 1271 | "9 0 \n", 1272 | "\n", 1273 | "[10 rows x 47 columns]" 1274 | ] 1275 | }, 1276 | "execution_count": 19, 1277 | "metadata": {}, 1278 | "output_type": "execute_result" 1279 | } 1280 | ], 1281 | "source": [ 1282 | "sample_set.head(10)" 1283 | ] 1284 | }, 1285 | { 1286 | "cell_type": "code", 1287 | "execution_count": 20, 1288 | "metadata": {}, 1289 | "outputs": [ 1290 | { 1291 | "data": { 1292 | "text/plain": [ 1293 | "(7032, 47)" 1294 | ] 1295 | }, 1296 | "execution_count": 20, 1297 | "metadata": {}, 1298 | "output_type": "execute_result" 1299 | } 1300 | ], 1301 | "source": [ 1302 | "sample_set.shape" 1303 | ] 1304 | }, 1305 | { 1306 | "cell_type": "code", 1307 | "execution_count": 21, 1308 | "metadata": { 1309 | "scrolled": true 1310 | }, 1311 | "outputs": [ 1312 | { 1313 | "data": { 1314 | "text/plain": [ 1315 | "['tenure',\n", 1316 | " 'MonthlyCharges',\n", 1317 | " 'TotalCharges',\n", 1318 | " 'Churn',\n", 
1319 | " 'genderFemale',\n", 1320 | " 'genderMale',\n", 1321 | " 'SeniorCitizen0',\n", 1322 | " 'SeniorCitizen1',\n", 1323 | " 'PartnerNo',\n", 1324 | " 'PartnerYes',\n", 1325 | " 'DependentsNo',\n", 1326 | " 'DependentsYes',\n", 1327 | " 'PhoneServiceNo',\n", 1328 | " 'PhoneServiceYes',\n", 1329 | " 'MultipleLinesNo',\n", 1330 | " 'MultipleLinesNo phone service',\n", 1331 | " 'MultipleLinesYes',\n", 1332 | " 'InternetServiceDSL',\n", 1333 | " 'InternetServiceFiber optic',\n", 1334 | " 'InternetServiceNo',\n", 1335 | " 'OnlineSecurityNo',\n", 1336 | " 'OnlineSecurityNo internet service',\n", 1337 | " 'OnlineSecurityYes',\n", 1338 | " 'OnlineBackupNo',\n", 1339 | " 'OnlineBackupNo internet service',\n", 1340 | " 'OnlineBackupYes',\n", 1341 | " 'DeviceProtectionNo',\n", 1342 | " 'DeviceProtectionNo internet service',\n", 1343 | " 'DeviceProtectionYes',\n", 1344 | " 'TechSupportNo',\n", 1345 | " 'TechSupportNo internet service',\n", 1346 | " 'TechSupportYes',\n", 1347 | " 'StreamingTVNo',\n", 1348 | " 'StreamingTVNo internet service',\n", 1349 | " 'StreamingTVYes',\n", 1350 | " 'StreamingMoviesNo',\n", 1351 | " 'StreamingMoviesNo internet service',\n", 1352 | " 'StreamingMoviesYes',\n", 1353 | " 'ContractMonth-to-month',\n", 1354 | " 'ContractOne year',\n", 1355 | " 'ContractTwo year',\n", 1356 | " 'PaperlessBillingNo',\n", 1357 | " 'PaperlessBillingYes',\n", 1358 | " 'PaymentMethodBank transfer (automatic)',\n", 1359 | " 'PaymentMethodCredit card (automatic)',\n", 1360 | " 'PaymentMethodElectronic check',\n", 1361 | " 'PaymentMethodMailed check']" 1362 | ] 1363 | }, 1364 | "execution_count": 21, 1365 | "metadata": {}, 1366 | "output_type": "execute_result" 1367 | } 1368 | ], 1369 | "source": [ 1370 | "list(sample_set.columns)" 1371 | ] 1372 | }, 1373 | { 1374 | "cell_type": "markdown", 1375 | "metadata": {}, 1376 | "source": [ 1377 | "# 3. 
Train & Test Sets" 1378 | ] 1379 | }, 1380 | { 1381 | "cell_type": "code", 1382 | "execution_count": 22, 1383 | "metadata": {}, 1384 | "outputs": [], 1385 | "source": [ 1386 | "target_var = 'Churn'\n", 1387 | "features = [x for x in list(sample_set.columns) if x != target_var]" 1388 | ] 1389 | }, 1390 | { 1391 | "cell_type": "markdown", 1392 | "metadata": {}, 1393 | "source": [ 1394 | "# 4. Artificial Neural Network (ANN) with Keras" 1395 | ] 1396 | }, 1397 | { 1398 | "cell_type": "code", 1399 | "execution_count": 23, 1400 | "metadata": {}, 1401 | "outputs": [ 1402 | { 1403 | "name": "stderr", 1404 | "output_type": "stream", 1405 | "text": [ 1406 | "/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n", 1407 | " from ._conv import register_converters as _register_converters\n", 1408 | "Using TensorFlow backend.\n" 1409 | ] 1410 | } 1411 | ], 1412 | "source": [ 1413 | "from keras.models import Sequential\n", 1414 | "from keras.layers import Dense" 1415 | ] 1416 | }, 1417 | { 1418 | "cell_type": "markdown", 1419 | "metadata": {}, 1420 | "source": [ 1421 | "#### - Training a Neural Network Model" 1422 | ] 1423 | }, 1424 | { 1425 | "cell_type": "code", 1426 | "execution_count": 24, 1427 | "metadata": {}, 1428 | "outputs": [], 1429 | "source": [ 1430 | "model = Sequential()\n", 1431 | "model.add(Dense(16, input_dim=len(features), activation='relu'))\n", 1432 | "model.add(Dense(8, activation='relu'))\n", 1433 | "model.add(Dense(1, activation='sigmoid'))" 1434 | ] 1435 | }, 1436 | { 1437 | "cell_type": "code", 1438 | "execution_count": 25, 1439 | "metadata": {}, 1440 | "outputs": [], 1441 | "source": [ 1442 | "model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])" 1443 | ] 1444 | }, 1445 | { 1446 | "cell_type": "code", 1447 | "execution_count": 26, 1448 | 
"metadata": {}, 1449 | "outputs": [], 1450 | "source": [ 1451 | "from sklearn.model_selection import train_test_split" 1452 | ] 1453 | }, 1454 | { 1455 | "cell_type": "code", 1456 | "execution_count": 27, 1457 | "metadata": {}, 1458 | "outputs": [], 1459 | "source": [ 1460 | "X_train, X_test, y_train, y_test = train_test_split(\n", 1461 | " sample_set[features], \n", 1462 | " sample_set[target_var], \n", 1463 | " test_size=0.3\n", 1464 | ")" 1465 | ] 1466 | }, 1467 | { 1468 | "cell_type": "code", 1469 | "execution_count": 28, 1470 | "metadata": { 1471 | "scrolled": true 1472 | }, 1473 | "outputs": [ 1474 | { 1475 | "name": "stdout", 1476 | "output_type": "stream", 1477 | "text": [ 1478 | "Epoch 1/50\n", 1479 | "4922/4922 [==============================] - 0s 61us/step - loss: 0.5847 - acc: 0.7152\n", 1480 | "Epoch 2/50\n", 1481 | "4922/4922 [==============================] - 0s 14us/step - loss: 0.4703 - acc: 0.7678\n", 1482 | "Epoch 3/50\n", 1483 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.4310 - acc: 0.7944\n", 1484 | "Epoch 4/50\n", 1485 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4210 - acc: 0.7985\n", 1486 | "Epoch 5/50\n", 1487 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4172 - acc: 0.8023\n", 1488 | "Epoch 6/50\n", 1489 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.4146 - acc: 0.8027\n", 1490 | "Epoch 7/50\n", 1491 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4126 - acc: 0.8052\n", 1492 | "Epoch 8/50\n", 1493 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4114 - acc: 0.8054\n", 1494 | "Epoch 9/50\n", 1495 | "4922/4922 [==============================] - 0s 16us/step - loss: 0.4104 - acc: 0.8064\n", 1496 | "Epoch 10/50\n", 1497 | "4922/4922 [==============================] - 0s 15us/step - loss: 0.4092 - acc: 0.8048\n", 1498 | "Epoch 11/50\n", 1499 | "4922/4922 [==============================] - 0s 
12us/step - loss: 0.4089 - acc: 0.8070\n", 1500 | "Epoch 12/50\n", 1501 | "4922/4922 [==============================] - 0s 15us/step - loss: 0.4093 - acc: 0.8078\n", 1502 | "Epoch 13/50\n", 1503 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.4076 - acc: 0.8082\n", 1504 | "Epoch 14/50\n", 1505 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4068 - acc: 0.8090\n", 1506 | "Epoch 15/50\n", 1507 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.4074 - acc: 0.8054\n", 1508 | "Epoch 16/50\n", 1509 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.4073 - acc: 0.8084\n", 1510 | "Epoch 17/50\n", 1511 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.4057 - acc: 0.8082\n", 1512 | "Epoch 18/50\n", 1513 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.4058 - acc: 0.8104\n", 1514 | "Epoch 19/50\n", 1515 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.4046 - acc: 0.8098\n", 1516 | "Epoch 20/50\n", 1517 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4054 - acc: 0.8102\n", 1518 | "Epoch 21/50\n", 1519 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4040 - acc: 0.8121\n", 1520 | "Epoch 22/50\n", 1521 | "4922/4922 [==============================] - 0s 11us/step - loss: 0.4038 - acc: 0.8102\n", 1522 | "Epoch 23/50\n", 1523 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4033 - acc: 0.8094\n", 1524 | "Epoch 24/50\n", 1525 | "4922/4922 [==============================] - 0s 11us/step - loss: 0.4031 - acc: 0.8094\n", 1526 | "Epoch 25/50\n", 1527 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4026 - acc: 0.8117\n", 1528 | "Epoch 26/50\n", 1529 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4018 - acc: 0.8125\n", 1530 | "Epoch 27/50\n", 1531 | "4922/4922 [==============================] - 0s 14us/step - loss: 0.4030 - acc: 
0.8106\n", 1532 | "Epoch 28/50\n", 1533 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4023 - acc: 0.8098\n", 1534 | "Epoch 29/50\n", 1535 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4013 - acc: 0.8135\n", 1536 | "Epoch 30/50\n", 1537 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4004 - acc: 0.8129\n", 1538 | "Epoch 31/50\n", 1539 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4014 - acc: 0.8125\n", 1540 | "Epoch 32/50\n", 1541 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.4007 - acc: 0.8145\n", 1542 | "Epoch 33/50\n", 1543 | "4922/4922 [==============================] - 0s 10us/step - loss: 0.4000 - acc: 0.8108\n", 1544 | "Epoch 34/50\n", 1545 | "4922/4922 [==============================] - 0s 10us/step - loss: 0.4007 - acc: 0.8123\n", 1546 | "Epoch 35/50\n", 1547 | "4922/4922 [==============================] - 0s 11us/step - loss: 0.3991 - acc: 0.8121\n", 1548 | "Epoch 36/50\n", 1549 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3988 - acc: 0.8104\n", 1550 | "Epoch 37/50\n", 1551 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3982 - acc: 0.8119\n", 1552 | "Epoch 38/50\n", 1553 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3980 - acc: 0.8121\n", 1554 | "Epoch 39/50\n", 1555 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3976 - acc: 0.8143\n", 1556 | "Epoch 40/50\n", 1557 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3985 - acc: 0.8111\n", 1558 | "Epoch 41/50\n", 1559 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3968 - acc: 0.8125\n", 1560 | "Epoch 42/50\n", 1561 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.3972 - acc: 0.8145\n", 1562 | "Epoch 43/50\n", 1563 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3962 - acc: 0.8106\n", 1564 | "Epoch 
44/50\n", 1565 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.3963 - acc: 0.8119\n", 1566 | "Epoch 45/50\n", 1567 | "4922/4922 [==============================] - 0s 13us/step - loss: 0.3959 - acc: 0.8137\n", 1568 | "Epoch 46/50\n", 1569 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3950 - acc: 0.8121\n", 1570 | "Epoch 47/50\n", 1571 | "4922/4922 [==============================] - 0s 18us/step - loss: 0.3943 - acc: 0.8133\n", 1572 | "Epoch 48/50\n", 1573 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3944 - acc: 0.8125\n", 1574 | "Epoch 49/50\n", 1575 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3948 - acc: 0.8135\n", 1576 | "Epoch 50/50\n", 1577 | "4922/4922 [==============================] - 0s 12us/step - loss: 0.3937 - acc: 0.8139\n" 1578 | ] 1579 | }, 1580 | { 1581 | "data": { 1582 | "text/plain": [ 1583 | "" 1584 | ] 1585 | }, 1586 | "execution_count": 28, 1587 | "metadata": {}, 1588 | "output_type": "execute_result" 1589 | } 1590 | ], 1591 | "source": [ 1592 | "model.fit(X_train, y_train, epochs=50, batch_size=100)" 1593 | ] 1594 | }, 1595 | { 1596 | "cell_type": "markdown", 1597 | "metadata": {}, 1598 | "source": [ 1599 | "#### - Accuracy, Precision, Recall" 1600 | ] 1601 | }, 1602 | { 1603 | "cell_type": "code", 1604 | "execution_count": 29, 1605 | "metadata": {}, 1606 | "outputs": [], 1607 | "source": [ 1608 | "from sklearn.metrics import accuracy_score, precision_score, recall_score" 1609 | ] 1610 | }, 1611 | { 1612 | "cell_type": "code", 1613 | "execution_count": 30, 1614 | "metadata": {}, 1615 | "outputs": [], 1616 | "source": [ 1617 | "in_sample_preds = [round(x[0]) for x in model.predict(X_train)]\n", 1618 | "out_sample_preds = [round(x[0]) for x in model.predict(X_test)]" 1619 | ] 1620 | }, 1621 | { 1622 | "cell_type": "code", 1623 | "execution_count": 31, 1624 | "metadata": {}, 1625 | "outputs": [ 1626 | { 1627 | "name": "stdout", 1628 | "output_type": 
"stream", 1629 | "text": [ 1630 | "In-Sample Accuracy: 0.8143\n", 1631 | "Out-of-Sample Accuracy: 0.8028\n", 1632 | "\n", 1633 | "\n", 1634 | "In-Sample Precision: 0.6837\n", 1635 | "Out-of-Sample Precision: 0.6604\n", 1636 | "\n", 1637 | "\n", 1638 | "In-Sample Recall: 0.5684\n", 1639 | "Out-of-Sample Recall: 0.5099\n" 1640 | ] 1641 | } 1642 | ], 1643 | "source": [ 1644 | "print('In-Sample Accuracy: %0.4f' % accuracy_score(y_train, in_sample_preds))\n", 1645 | "print('Out-of-Sample Accuracy: %0.4f' % accuracy_score(y_test, out_sample_preds))\n", 1646 | "\n", 1647 | "print('\\n')\n", 1648 | "\n", 1649 | "print('In-Sample Precision: %0.4f' % precision_score(y_train, in_sample_preds))\n", 1650 | "print('Out-of-Sample Precision: %0.4f' % precision_score(y_test, out_sample_preds))\n", 1651 | "\n", 1652 | "print('\\n')\n", 1653 | "\n", 1654 | "print('In-Sample Recall: %0.4f' % recall_score(y_train, in_sample_preds))\n", 1655 | "print('Out-of-Sample Recall: %0.4f' % recall_score(y_test, out_sample_preds))" 1656 | ] 1657 | }, 1658 | { 1659 | "cell_type": "markdown", 1660 | "metadata": {}, 1661 | "source": [ 1662 | "#### - ROC & AUC" 1663 | ] 1664 | }, 1665 | { 1666 | "cell_type": "code", 1667 | "execution_count": 32, 1668 | "metadata": {}, 1669 | "outputs": [], 1670 | "source": [ 1671 | "from sklearn.metrics import roc_curve, auc" 1672 | ] 1673 | }, 1674 | { 1675 | "cell_type": "code", 1676 | "execution_count": 33, 1677 | "metadata": {}, 1678 | "outputs": [], 1679 | "source": [ 1680 | "in_sample_preds = [x[0] for x in model.predict(X_train)]\n", 1681 | "out_sample_preds = [x[0] for x in model.predict(X_test)]" 1682 | ] 1683 | }, 1684 | { 1685 | "cell_type": "code", 1686 | "execution_count": 34, 1687 | "metadata": {}, 1688 | "outputs": [], 1689 | "source": [ 1690 | "in_sample_fpr, in_sample_tpr, in_sample_thresholds = roc_curve(y_train, in_sample_preds)\n", 1691 | "out_sample_fpr, out_sample_tpr, out_sample_thresholds = roc_curve(y_test, out_sample_preds)" 1692 | ] 1693 | 
}, 1694 | { 1695 | "cell_type": "code", 1696 | "execution_count": 35, 1697 | "metadata": {}, 1698 | "outputs": [ 1699 | { 1700 | "name": "stdout", 1701 | "output_type": "stream", 1702 | "text": [ 1703 | "In-Sample AUC: 0.8659\n", 1704 | "Out-Sample AUC: 0.8466\n" 1705 | ] 1706 | } 1707 | ], 1708 | "source": [ 1709 | "in_sample_roc_auc = auc(in_sample_fpr, in_sample_tpr)\n", 1710 | "out_sample_roc_auc = auc(out_sample_fpr, out_sample_tpr)\n", 1711 | "\n", 1712 | "print('In-Sample AUC: %0.4f' % in_sample_roc_auc)\n", 1713 | "print('Out-Sample AUC: %0.4f' % out_sample_roc_auc)" 1714 | ] 1715 | }, 1716 | { 1717 | "cell_type": "code", 1718 | "execution_count": 36, 1719 | "metadata": {}, 1720 | "outputs": [ 1721 | { 1722 | "data": { 1723 | "image/png": "\n", 1724 | "text/plain": [ 1725 | "
" 1726 | ] 1727 | }, 1728 | "metadata": {}, 1729 | "output_type": "display_data" 1730 | } 1731 | ], 1732 | "source": [ 1733 | "plt.figure(figsize=(10,7))\n", 1734 | "\n", 1735 | "plt.plot(\n", 1736 | " out_sample_fpr, out_sample_tpr, color='darkorange', label='Out-Sample ROC curve (area = %0.4f)' % out_sample_roc_auc\n", 1737 | ")\n", 1738 | "plt.plot(\n", 1739 | " in_sample_fpr, in_sample_tpr, color='navy', label='In-Sample ROC curve (area = %0.4f)' % in_sample_roc_auc\n", 1740 | ")\n", 1741 | "plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')\n", 1742 | "plt.grid()\n", 1743 | "plt.xlim([0.0, 1.0])\n", 1744 | "plt.ylim([0.0, 1.05])\n", 1745 | "plt.xlabel('False Positive Rate')\n", 1746 | "plt.ylabel('True Positive Rate')\n", 1747 | "plt.title('ROC Curve')\n", 1748 | "plt.legend(loc=\"lower right\")\n", 1749 | "\n", 1750 | "plt.show()" 1751 | ] 1752 | }, 1753 | { 1754 | "cell_type": "code", 1755 | "execution_count": null, 1756 | "metadata": {}, 1757 | "outputs": [], 1758 | "source": [] 1759 | } 1760 | ], 1761 | "metadata": { 1762 | "kernelspec": { 1763 | "display_name": "Python 3", 1764 | "language": "python", 1765 | "name": "python3" 1766 | }, 1767 | "language_info": { 1768 | "codemirror_mode": { 1769 | "name": "ipython", 1770 | "version": 3 1771 | }, 1772 | "file_extension": ".py", 1773 | "mimetype": "text/x-python", 1774 | "name": "python", 1775 | "nbconvert_exporter": "python", 1776 | "pygments_lexer": "ipython3", 1777 | "version": "3.6.5" 1778 | } 1779 | }, 1780 | "nbformat": 4, 1781 | "nbformat_minor": 2 1782 | } 1783 | -------------------------------------------------------------------------------- /ch.12/R/ABTesting.R: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(readxl) 3 | library(ggplot2) 4 | 5 | #### 1. Load Data #### 6 | df <- read_excel( 7 | path="~/Documents/data-science-for-marketing/ch.12/data/WA_Fn-UseC_-Marketing-Campaign-Eff-UseC_-FastF.xlsx" 8 | ) 9 | 10 | #### 2. 
#### Data Analysis ####

# - total sales
# Distribution summary of the sales column over every row of df.
summary(df$SalesInThousands)

# Total sales per promotion group.
salesPerPromo <- df %>%
  group_by(Promotion) %>%
  summarise(Sales = sum(SalesInThousands))

salesPerPromo

# Pie chart: one filled bar (position_fill) wrapped into polar coordinates,
# each slice labelled with its sales total.
ggplot(salesPerPromo, aes(x = "", y = Sales, fill = Promotion)) +
  geom_bar(width = 1, stat = "identity", position = position_fill()) +
  geom_text(
    aes(x = 1.25, label = Sales),
    position = position_fill(vjust = 0.5),
    color = 'white'
  ) +
  coord_polar("y") +
  ggtitle('sales distribution across different promotions')

# - market size
# Row count per market size, over all promotions.
df %>%
  group_by(MarketSize) %>%
  summarise(Count = n())

# Row count per (promotion, market size) pair.
marketSizePerPromo <- df %>%
  group_by(Promotion, MarketSize) %>%
  summarise(Count = n())

marketSizePerPromo

# Same counts drawn twice: side-by-side bars, then stacked bars.
ggplot(marketSizePerPromo, aes(x = Promotion, y = Count, fill = MarketSize)) +
  geom_bar(width = 0.5, stat = "identity", position = "dodge") +
  ylab("Count") +
  xlab("Promotion") +
  ggtitle("breakdowns of market sizes across different promotions") +
  theme(plot.title = element_text(hjust = 0.5))

ggplot(marketSizePerPromo, aes(x = Promotion, y = Count, fill = MarketSize)) +
  geom_bar(width = 0.5, stat = "identity", position = "stack") +
  ylab("Count") +
  xlab("Promotion") +
  ggtitle("breakdowns of market sizes across different promotions") +
  theme(plot.title = element_text(hjust = 0.5))


# - store age
summary(df$AgeOfStore)

# Row count per store age, over all promotions.
overallAge <- df %>%
  group_by(AgeOfStore) %>%
  summarise(Count = n())

overallAge

ggplot(overallAge, aes(x = AgeOfStore, y = Count)) +
  geom_bar(width = 0.5, stat = "identity") +
  ylab("Count") +
  xlab("Store Age") +
  ggtitle("overall distributions of age of store") +
  theme(plot.title = element_text(hjust = 0.5))

# Row count per (promotion, store age) pair.
AgePerPromo <- df %>%
  group_by(Promotion, AgeOfStore) %>%
  summarise(Count = n())

AgePerPromo

ggplot(AgePerPromo, aes(x = AgeOfStore, y = Count, fill = Promotion)) +
  geom_bar(width = 0.5, stat = "identity", position = "dodge2") +
  ylab("Count") +
  xlab("Store Age") +
  ggtitle("distributions of age of store") +
  theme(plot.title = element_text(hjust = 0.5))

# Store-age summary computed separately for each promotion group.
tapply(df$AgeOfStore, df$Promotion, summary)


# - week number
df %>%
  group_by(Week) %>%
  summarise(Count = n())

# Row count per (week, promotion) pair.
weekPerPromo <- df %>%
  group_by(Week, Promotion) %>%
  summarise(Count = n())

weekPerPromo

# One pie per week (facet_wrap), sliced by promotion.
ggplot(weekPerPromo, aes(x = "", y = Count, fill = Promotion)) +
  geom_bar(width = 1, stat = "identity", position = position_fill()) +
  geom_text(
    aes(x = 1.25, label = Count),
    position = position_fill(vjust = 0.5),
    color = 'white'
  ) +
  coord_polar("y") +
  facet_wrap(~Week) +
  ggtitle('distribution across different weeks')


#### 3. Statistical Significance ####

# Promotion 1 vs. 2
promo_1 <- df[which(df$Promotion == 1), ]$SalesInThousands
promo_2 <- df[which(df$Promotion == 2), ]$SalesInThousands

mean_1 <- mean(promo_1)
mean_2 <- mean(promo_2)
std_1 <- sd(promo_1)
std_2 <- sd(promo_2)
n_1 <- length(promo_1)
n_2 <- length(promo_2)

df_1_2 <- n_1 + n_2 - 2

# t statistic: difference in means over the unpooled standard error.
t_val <- (mean_1 - mean_2) / sqrt(std_1^2 / n_1 + std_2^2 / n_2)

# Two-sided p-value. abs() keeps the result in [0, 1] whichever group has the
# larger mean; without it a negative t_val gives 2 * (upper tail) > 1.
# NOTE(review): the standard error above is the unequal-variance form while
# df_1_2 is the pooled n_1 + n_2 - 2, so this will not exactly equal the
# t.test() output below (which defaults to var.equal = FALSE).
p_val <- 2 * pt(abs(t_val), df_1_2, lower.tail = FALSE)

# - using t.test
t.test(
  promo_1,
  promo_2
)

# Promotion 1 vs.
# 3  (continues the "Promotion 1 vs." heading on the preceding line)
promo_1 <- df[which(df$Promotion == 1), ]$SalesInThousands
promo_3 <- df[which(df$Promotion == 3), ]$SalesInThousands

mean_1 <- mean(promo_1)
mean_3 <- mean(promo_3)
std_1 <- sd(promo_1)
std_3 <- sd(promo_3)
n_1 <- length(promo_1)
n_3 <- length(promo_3)
df_1_3 <- n_1 + n_3 - 2

# t statistic: difference in means over the unpooled standard error.
t_val <- (mean_1 - mean_3) / sqrt(std_1^2 / n_1 + std_3^2 / n_3)

# Two-sided p-value; abs() keeps it in [0, 1] when mean_1 < mean_3.
# NOTE(review): pooled degrees of freedom with an unpooled standard error, so
# expect a small difference from the t.test() result below.
p_val <- 2 * pt(abs(t_val), df_1_3, lower.tail = FALSE)

# - using t.test
t.test(
  promo_1,
  promo_3
)




--------------------------------------------------------------------------------
/ch.2/R/ConversionRate.R:
--------------------------------------------------------------------------------
library(dplyr) # Data Wrangling and Manipulation
library(ggplot2)

# Semicolon-separated file with a header row.
conversionsDF <- read.csv(
  file="~/Documents/data-science-for-marketing/ch.2/data/bank-additional-full.csv",
  header=TRUE,
  sep=";"
)

# Shape of conversionsDF
dim(conversionsDF)
# Quick look at conversionsDF
head(conversionsDF)

# Encode conversions as 0s and 1s
# factor() orders its levels alphabetically, so the first level becomes 0 and
# the second 1. The explicit factor() keeps this working when read.csv()
# returns character columns (the default since R 4.0.0); as.integer() applied
# to the raw strings would give NA.
conversionsDF$conversion <- as.integer(factor(conversionsDF$y)) - 1L
tail(conversionsDF)

#### 1. Aggregate Conversion Rate ####
sprintf("total conversions: %i out of %i", sum(conversionsDF$conversion), nrow(conversionsDF))
sprintf("conversion rate: %0.2f%%", sum(conversionsDF$conversion)/nrow(conversionsDF)*100.0)

#### 2.
#### Conversion Rates by Number of Contacts ####
# Per contact count: rows, conversions, and conversion rate in percent.
conversionsByNumContact <- conversionsDF %>%
  group_by(NumContact = campaign) %>%
  summarise(TotalCount = n(), NumConversions = sum(conversion)) %>%
  mutate(ConversionRate = NumConversions / TotalCount * 100.0)

head(conversionsByNumContact, 10)

# line chart
# Only the first ten rows are plotted; the y axis is fixed to 0-15.
ggplot(data = head(conversionsByNumContact, 10), aes(x = NumContact, y = ConversionRate)) +
  geom_line() +
  ggtitle('Conversion Rates by Number of Contacts') +
  xlab("Number of Contacts") +
  ylab("Conversion Rate (%)") +
  ylim(c(0, 15)) +
  theme(plot.title = element_text(hjust = 0.5))


#### 3. Conversion Rates by Age ####

# a. by age
conversionsByAge <- conversionsDF %>%
  group_by(Age = age) %>%
  summarise(TotalCount = n(), NumConversions = sum(conversion)) %>%
  mutate(ConversionRate = NumConversions / TotalCount * 100.0)

head(conversionsByAge)

# line chart
ggplot(data = conversionsByAge, aes(x = Age, y = ConversionRate)) +
  geom_line() +
  ggtitle('Conversion Rates by Age') +
  xlab("Age") +
  ylab("Conversion Rate (%)") +
  theme(plot.title = element_text(hjust = 0.5))

# b. by age groups
# cut() with breaks 20, 30, ..., 70 yields the intervals (20,30] ... (60,70];
# ages outside that range fall into an NA group.
conversionsByAgeGroup <- conversionsDF %>%
  group_by(AgeGroup = cut(age, breaks = seq(20, 70, by = 10))) %>%
  summarise(TotalCount = n(), NumConversions = sum(conversion)) %>%
  mutate(ConversionRate = NumConversions / TotalCount * 100.0)

conversionsByAgeGroup$AgeGroup <- as.character(conversionsByAgeGroup$AgeGroup)
# Relabel the NA bucket by value rather than by row position, so the label
# cannot land on the wrong row if the number of buckets changes.
# NOTE(review): the NA bucket also collects ages of 20 and below, not only
# ages above 70 - confirm "70+" is the intended label.
conversionsByAgeGroup$AgeGroup[is.na(conversionsByAgeGroup$AgeGroup)] <- "70+"

# bar chart
ggplot(conversionsByAgeGroup, aes(x = AgeGroup, y = ConversionRate)) +
  geom_bar(width = 0.5, stat = "identity") +
  ggtitle('Conversion Rates by Age Groups') +
  xlab("Age") +
  ylab("Conversion Rate (%)") +
  theme(plot.title = element_text(hjust = 0.5))

#### 4. Conversions vs. Non-Conversions ####

# 4.1.
# Marital Status
# Row count for every (marital status, conversion flag) combination.
conversionsByMaritalStatus <- summarise(
  group_by(conversionsDF, Marital = marital, Conversion = conversion),
  Count = n()
)

conversionsByMaritalStatus

# pie chart
# A filled column in polar coordinates, one panel per conversion flag.
ggplot(conversionsByMaritalStatus, aes(x = "", y = Count, fill = Marital)) +
  geom_col(width = 1, position = position_fill()) +
  geom_text(aes(x = 1.25, label = Count), position = position_fill(vjust = 0.5)) +
  coord_polar("y") +
  facet_wrap(~Conversion) +
  labs(title = 'Marital Status (0: Non Conversions, 1: Conversions)') +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    plot.title = element_text(hjust = 0.5),
    legend.position = 'bottom'
  )

# 4.2. Education
# Row count for every (education level, conversion flag) combination.
conversionsByEducation <- summarise(
  group_by(conversionsDF, Education = education, Conversion = conversion),
  Count = n()
)

conversionsByEducation

# pie chart
ggplot(conversionsByEducation, aes(x = "", y = Count, fill = Education)) +
  geom_col(width = 1, position = position_fill()) +
  geom_text(aes(x = 1.25, label = Count), position = position_fill(vjust = 0.5)) +
  coord_polar("y") +
  facet_wrap(~Conversion) +
  labs(title = 'Education (0: Non Conversions, 1: Conversions)') +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    plot.title = element_text(hjust = 0.5),
    legend.position = 'bottom'
  )

# 4.3. Last Contact Duration
# Overwrites the column in place, dividing by 3600 (the plot labels the result
# as hours). Re-running this line divides again.
conversionsDF$duration <- conversionsDF$duration / 3600

ggplot(conversionsDF, aes(x = "", y = duration)) +
  geom_boxplot() +
  facet_wrap(~conversion) +
  labs(
    title = "Conversion vs. Non-Conversions: Last Contact Duration",
    x = "0: Non-Conversion, 1: Conversion",
    y = "duration (in hours)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

#### 5.
#### Conversions by Age Groups & Marital Status ####
# Count and conversions per (age group, marital status). After summarise() the
# data is still grouped by AgeGroup, so sum(Count) is the age-group total and
# ConversionRate is each marital status' share of conversions within its age
# group. Multiplied by 100 so the values match the "(%)" axis label below and
# the percentage scale used in the earlier sections.
conversionsByAgeMarital <- conversionsDF %>%
  group_by(AgeGroup = cut(age, breaks = seq(20, 70, by = 10)), Marital = marital) %>%
  summarise(Count = n(), NumConversions = sum(conversion)) %>%
  mutate(TotalCount = sum(Count)) %>%
  mutate(ConversionRate = NumConversions / TotalCount * 100.0)

conversionsByAgeMarital$AgeGroup <- as.character(conversionsByAgeMarital$AgeGroup)
# Ages outside the cut() breaks come back as NA and are relabelled here.
# NOTE(review): that includes ages of 20 and below as well as ages above 70.
conversionsByAgeMarital$AgeGroup[is.na(conversionsByAgeMarital$AgeGroup)] <- "70+"

# bar chart
ggplot(conversionsByAgeMarital, aes(x = AgeGroup, y = ConversionRate, fill = Marital)) +
  geom_bar(width = 0.5, stat = "identity", position = "dodge") +
  ylab("Conversion Rate (%)") +
  xlab("Age") +
  ggtitle("Conversion Rates by Age and Marital Status") +
  theme(plot.title = element_text(hjust = 0.5))

# stacked bar chart
ggplot(conversionsByAgeMarital, aes(x = AgeGroup, y = ConversionRate, fill = Marital)) +
  geom_bar(width = 0.5, stat = "identity", position = "stack") +
  ylab("Conversion Rate (%)") +
  xlab("Age") +
  ggtitle("Conversion Rates by Age and Marital Status") +
  theme(plot.title = element_text(hjust = 0.5))

--------------------------------------------------------------------------------
/ch.3/R/RegressionAnalysis.R:
--------------------------------------------------------------------------------
library(dplyr)
library(ggplot2)

# Load data
# Comma-separated file with a header row.
df <- read.csv(
  file="~/Documents/data-science-for-marketing/ch.3/data/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv",
  header=TRUE,
  sep=","
)

#### 1.
#### Engagement Rate ####

# Encode Response as 0s and 1s
# factor() orders its levels alphabetically, so the first level becomes 0 and
# the second 1. The explicit factor() keeps this working when read.csv()
# returns character columns (the default since R 4.0.0); as.integer() applied
# to the raw strings would give NA.
df$Engaged <- as.integer(factor(df$Response)) - 1L

# Row count per Engaged value and its share of all rows, in percent.
engagementRate <- df %>%
  group_by(Engaged) %>%
  summarise(Count = n()) %>%
  mutate(Percentage = Count / nrow(df) * 100.0)

engagementRate

# Transpose
# t() turns the summary into a matrix; the Engaged values become the column
# names and the now-redundant first row (Engaged itself) is dropped.
transposed <- t(engagementRate)

colnames(transposed) <- engagementRate$Engaged
transposed <- transposed[-1, ]
transposed

#### 2. Renewal Offer Type ####
# Row count per (Engaged, renewal offer type) pair.
renewalOfferType <- df %>%
  group_by(Engaged, Type = Renew.Offer.Type) %>%
  summarise(Count = n())

renewalOfferType

# pie chart
# A filled bar in polar coordinates, one panel per Engaged value.
# Title spelling corrected from 'Renwal'.
ggplot(renewalOfferType, aes(x = "", y = Count, fill = Type)) +
  geom_bar(width = 1, stat = "identity", position = position_fill()) +
  geom_text(aes(x = 1.25, label = Count), position = position_fill(vjust = 0.5)) +
  coord_polar("y") +
  facet_wrap(~Engaged) +
  ggtitle('Renewal Offer Type (0: Not Engaged, 1: Engaged)') +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    plot.title = element_text(hjust = 0.5),
    legend.position = 'bottom'
  )

#### 3. Sales Channel ####
# Row count per (Engaged, sales channel) pair.
salesChannel <- df %>%
  group_by(Engaged, Channel = Sales.Channel) %>%
  summarise(Count = n())

salesChannel

# pie chart
ggplot(salesChannel, aes(x = "", y = Count, fill = Channel)) +
  geom_bar(width = 1, stat = "identity", position = position_fill()) +
  geom_text(aes(x = 1.25, label = Count), position = position_fill(vjust = 0.5)) +
  coord_polar("y") +
  facet_wrap(~Engaged) +
  ggtitle('Sales Channel (0: Not Engaged, 1: Engaged)') +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    plot.title = element_text(hjust = 0.5),
    legend.position = 'bottom'
  )

#### 4.
#### Total Claim Amount ####
# Boxplot of Total.Claim.Amount, one panel per Engaged value.
# Title spelling corrected from "Engaed" in all three plots of this section.
ggplot(df, aes(x = "", y = Total.Claim.Amount)) +
  geom_boxplot() +
  facet_wrap(~Engaged) +
  ylab("Total Claim Amount") +
  xlab("0: Not Engaged, 1: Engaged") +
  ggtitle("Engaged vs. Not Engaged: Total Claim Amount") +
  theme(plot.title = element_text(hjust = 0.5))

# without outliers
# Outlier points are hidden and the view is zoomed to the 10th-90th percentile
# range. coord_cartesian() only changes the visible window; setting limits on
# the y scale instead would discard rows outside the range before the boxplot
# statistics are computed, so the boxes would no longer describe the full data.
ggplot(df, aes(x = "", y = Total.Claim.Amount)) +
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(ylim = unname(quantile(df$Total.Claim.Amount, c(0.1, 0.9)))) +
  facet_wrap(~Engaged) +
  ylab("Total Claim Amount") +
  xlab("0: Not Engaged, 1: Engaged") +
  ggtitle("Engaged vs. Not Engaged: Total Claim Amount") +
  theme(plot.title = element_text(hjust = 0.5))

#### 5. Income ####

# boxplot
ggplot(df, aes(x = "", y = Income)) +
  geom_boxplot() +
  facet_wrap(~Engaged) +
  ylab("Income") +
  xlab("0: Not Engaged, 1: Engaged") +
  ggtitle("Engaged vs. Not Engaged: Income") +
  theme(plot.title = element_text(hjust = 0.5))

# summary statistics
# Minimum, quartiles and maximum of Income for each Engaged value.
incomeDescription <- df %>%
  group_by(Engaged) %>%
  summarise(
    Min = min(Income), Q1 = quantile(Income, 0.25),
    Median = median(Income), Q3 = quantile(Income, 0.75),
    Max = max(Income)
  )

incomeDescription


#### 6. Regression Analysis ####
# summary statistics per column
summary(df)
# get data types of each column
sapply(df, class)

## 6.1. Continuous Variables ##

# get numeric columns
# Engaged is numeric, so it is carried along and serves as the response below.
continuousDF <- select_if(df, is.numeric)
colnames(continuousDF)

# Fit regression model with continuous variables
# Logistic regression of Engaged on every other column of continuousDF.
logit.fit <- glm(Engaged ~ ., data = continuousDF, family = binomial)
summary(logit.fit)


## 6.2. Categorical Variables ##

# a.
# Education
# Fit regression model with Education factor variables
logit.fit <- glm(Engaged ~ factor(Education), data = df, family = binomial)
summary(logit.fit)

# b. Education + Gender
# Fit regression model with Education & Gender variables
logit.fit <- glm(Engaged ~ factor(Education) + factor(Gender), data = df, family = binomial)
summary(logit.fit)


## 6.3. Continuous & Categorical Variables ##

# Add the two categorical columns to the numeric-only frame as factors.
continuousDF$Gender <- factor(df$Gender)
continuousDF$Education <- factor(df$Education)

# Fit regression model with every column of continuousDF: the numeric
# predictors plus the Gender and Education factors added above.
logit.fit <- glm(Engaged ~ ., data = continuousDF, family = binomial)
summary(logit.fit)

--------------------------------------------------------------------------------
/ch.4/R/FromEngagementToConversions.R:
--------------------------------------------------------------------------------
library(dplyr)
library(ggplot2)

# install.packages('rattle')
library(rattle)

# install.packages('rpart')
library(rpart) # used to build decision tree

# install.packages('rpart.plot')
library(rpart.plot)

#### 1. Load Data ####
# Semicolon-separated file with a header row.
df <- read.csv(
  file="~/Documents/data-science-for-marketing/ch.4/data/bank-full.csv",
  header=TRUE,
  sep=";"
)

# Encode conversions as 0s and 1s
# factor() orders its levels alphabetically, so the first level becomes 0 and
# the second 1. The explicit factor() keeps this working when read.csv()
# returns character columns (the default since R 4.0.0); as.integer() applied
# to the raw strings would give NA.
df$conversion <- as.integer(factor(df$y)) - 1L

head(df)

#### 2. Data Analysis ####

# column names
colnames(df)

#### 2.1. Conversion Rate ####
sprintf("total conversions: %i out of %i", sum(df$conversion), nrow(df))
sprintf("conversion rate: %0.2f%%", sum(df$conversion)/nrow(df)*100.0)

#### 2.2.
#### Conversion Rates by Marital Status ####
# Per marital status: row count, conversions, and conversion rate in percent.
conversionsByMarital <- df %>%
  group_by(Marital = marital) %>%
  summarise(
    Count = n(),
    NumConversions = sum(conversion),
    ConversionRate = NumConversions / Count * 100.0
  )

conversionsByMarital

ggplot(conversionsByMarital, aes(x = Marital, y = ConversionRate)) +
  geom_col(width = 0.5) +
  labs(
    title = 'Conversion Rates by Marital Status',
    x = "Marital Status",
    y = "Conversion Rate (%)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

#### 2.2. Conversion Rates by Job ####
# Same aggregation, keyed by job.
conversionsByJob <- df %>%
  group_by(Job = job) %>%
  summarise(
    Count = n(),
    NumConversions = sum(conversion),
    ConversionRate = NumConversions / Count * 100.0
  )

conversionsByJob

# Horizontal bars (coord_flip), one per job.
ggplot(conversionsByJob, aes(x = Job, y = ConversionRate)) +
  geom_col(width = 0.5) +
  coord_flip() +
  labs(
    title = 'Conversion Rates by Job',
    x = "Job",
    y = "Conversion Rate (%)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

#### 2.3. Default Rates by Conversions ####
# Row count for every (default flag, conversion flag) combination.
defaultByConversion <- summarise(
  group_by(df, Default = default, Conversion = conversion),
  Count = n()
)

defaultByConversion

# Pie chart per conversion flag: a filled column in polar coordinates.
ggplot(defaultByConversion, aes(x = "", y = Count, fill = Default)) +
  geom_col(width = 1, position = position_fill()) +
  geom_text(aes(x = 1.25, label = Count), position = position_fill(vjust = 0.5)) +
  coord_polar("y") +
  facet_wrap(~Conversion) +
  labs(title = 'Default (0: Non Conversions, 1: Conversions)') +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    plot.title = element_text(hjust = 0.5),
    legend.position = 'bottom'
  )


#### 2.4.
#### Bank Balance by Conversions ####
# Boxplot of balance, one panel per conversion flag.
ggplot(df, aes(x = "", y = balance)) +
  geom_boxplot() +
  facet_wrap(~conversion) +
  ylab("balance") +
  xlab("0: Non-Conversion, 1: Conversion") +
  ggtitle("Conversion vs. Non-Conversions: Balance") +
  theme(plot.title = element_text(hjust = 0.5))

# Same plot with outlier points hidden and the view zoomed to -2000..5000.
# coord_cartesian() only changes the visible window; limits set on the y scale
# would discard rows outside the range before the boxplot statistics are
# computed.
ggplot(df, aes(x = "", y = balance)) +
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(ylim = c(-2000, 5000)) +
  facet_wrap(~conversion) +
  ylab("balance") +
  xlab("0: Non-Conversion, 1: Conversion") +
  ggtitle("Conversion vs. Non-Conversions: Balance") +
  theme(plot.title = element_text(hjust = 0.5))

#### 2.5. Conversions by Number of Contacts ####
# Per contact count: row count, conversions, and conversion rate in percent.
conversionsByNumContacts <- df %>%
  group_by(Campaign = campaign) %>%
  summarise(Count = n(), NumConversions = sum(conversion)) %>%
  mutate(ConversionRate = NumConversions / Count * 100.0)

conversionsByNumContacts

ggplot(conversionsByNumContacts, aes(x = Campaign, y = ConversionRate)) +
  geom_bar(width = 0.5, stat = "identity") +
  ggtitle('Conversion Rates by Number of Contacts') +
  xlab("Number of Contacts") +
  ylab("Conversion Rate (%)") +
  theme(plot.title = element_text(hjust = 0.5))

#### 3. Encoding Categorical Variables ####
# Number of distinct values in each column.
rapply(df, function(x) length(unique(x)))

#### 3.1. encoding 'month' ####
# unique values
unique(df$month)
# convert to numbers
# month.abb is R's built-in vector of English month abbreviations; tolower()
# is vectorised, so no lapply() is needed and the result is a plain character
# vector rather than a list.
# Presumably df$month holds lower-case three-letter abbreviations - check
# against the unique() output above.
months <- tolower(month.abb)
months
# test
match(unique(df$month), months)

# encode
# Replaces each month name with its position in `months`; values that are not
# found become NA.
df$month <- match(df$month, months)
# check
df %>%
  group_by(month) %>%
  summarise(Count = n())

#### 3.2. encoding job, housing, marital ####
df$job <- factor(df$job)
df$housing <- factor(df$housing)
df$marital <- factor(df$marital)


#### 4.
#### Fitting Decision Trees ####

# grow tree
# Classification tree for `conversion`, limited to depth 4 with a complexity
# parameter of 0.0001.
fit <- rpart(
  conversion ~ age + balance + campaign + previous + housing + job + marital,
  method="class",
  data=df,
  control=rpart.control(maxdepth=4, cp=0.0001)
)

# plot tree
fancyRpartPlot(fit)

--------------------------------------------------------------------------------
/ch.5/R/ProductAnalytics.R:
--------------------------------------------------------------------------------
library(dplyr)
library(ggplot2)

# install.packages("readxl")
library(readxl)
# install.packages("lubridate")
library(lubridate)


#### 1. Load Data ####
df <- read_excel(
  path="~/Documents/research/data-science-marketing/ch.5/data/Online Retail.xlsx",
  sheet="Online Retail"
)

#### 2. Product Analytics ####

#### - Quantity Distribution ####
summary(df$Quantity)

# Boxplot with outlier points hidden and the view zoomed to -15..25.
# coord_cartesian() only changes the visible window; ylim() would discard rows
# outside the range before the boxplot statistics are computed, so the box
# would no longer agree with the summary() printed above.
ggplot(df, aes(x = "", y = Quantity)) +
  geom_boxplot(outlier.shape = NA) +
  coord_cartesian(ylim = c(-15, 25)) +
  ylab("order quantity") +
  xlab("") +
  ggtitle("Quantity Distribution") +
  theme(plot.title = element_text(hjust = 0.5))

# filter out orders with negative quantity (cancel orders)
# Shape after vs. before the filter, then apply it. which() also drops rows
# whose Quantity is NA.
dim(df[which(df$Quantity > 0), ])
dim(df)

df <- df[which(df$Quantity > 0), ]
dim(df)

#### 2.1.
#### Time-series Number of Orders ####
# Distinct invoices per calendar month (InvoiceDate floored to the month).
timeSeriesNumInvoices <- df %>%
  group_by(InvoiceDate = floor_date(InvoiceDate, "month")) %>%
  summarise(NumOrders = n_distinct(InvoiceNo))

ggplot(timeSeriesNumInvoices, aes(x = InvoiceDate, y = NumOrders)) +
  geom_line() +
  ylim(c(0, max(timeSeriesNumInvoices$NumOrders) + 1000)) +
  ylab("number of orders") +
  xlab("date") +
  ggtitle("Number of Orders over Time") +
  theme(plot.title = element_text(hjust = 0.5))

# Rows dated 2011-12-01 or later are inspected and then dropped.
# The comparison is done Date-to-Date. Comparing a date-time column directly
# with a Date mixes two classes whose comparison methods differ, so R falls
# back to comparing the raw numbers (seconds vs. days) with a warning.
# NOTE(review): as.Date() on a date-time uses UTC by default - confirm
# InvoiceDate carries no other time zone.
cutoffDate <- as.Date("2011-12-01")
invoiceDay <- as.Date(df$InvoiceDate)

summary(df[which(invoiceDay >= cutoffDate), "InvoiceDate"])

dim(df[which(invoiceDay < cutoffDate), ])
dim(df)

df <- df[which(invoiceDay < cutoffDate), ]
dim(df)

# Recompute and redraw the monthly order counts on the filtered data.
timeSeriesNumInvoices <- df %>%
  group_by(InvoiceDate = floor_date(InvoiceDate, "month")) %>%
  summarise(NumOrders = n_distinct(InvoiceNo))

ggplot(timeSeriesNumInvoices, aes(x = InvoiceDate, y = NumOrders)) +
  geom_line() +
  ylim(c(0, max(timeSeriesNumInvoices$NumOrders) + 100)) +
  ylab("number of orders") +
  xlab("date") +
  ggtitle("Number of Orders over Time") +
  theme(plot.title = element_text(hjust = 0.5))


#### 2.2. Time-series Revenue ####
# Line-item revenue.
df$Sales <- df$Quantity * df$UnitPrice

# Total revenue per calendar month.
timeSeriesRevenue <- df %>%
  group_by(InvoiceDate = floor_date(InvoiceDate, "month")) %>%
  summarise(Sales = sum(Sales))

ggplot(timeSeriesRevenue, aes(x = InvoiceDate, y = Sales)) +
  geom_line() +
  ylim(c(0, max(timeSeriesRevenue$Sales) + 10000)) +
  ylab("sales") +
  xlab("date") +
  ggtitle("Revenue over Time") +
  theme(plot.title = element_text(hjust = 0.5))

#### 2.3.
# Revenue from Repeat Customers ####

# Repeat Customers
# One row per invoice: its customer and its total sales.
invoiceCustomerDF <- df %>%
  group_by(InvoiceNo, InvoiceDate) %>%
  summarise(CustomerID = max(CustomerID), Sales = sum(Sales))

# Per month and customer: number of invoices and total sales.
timeSeriesCustomerDF <- invoiceCustomerDF %>%
  group_by(InvoiceDate = floor_date(InvoiceDate, "month"), CustomerID) %>%
  summarise(Count = n_distinct(InvoiceNo), Sales = sum(Sales))

# Customers with more than one invoice in a month; na.omit() removes rows
# containing NA values.
repeatCustomers <- na.omit(
  timeSeriesCustomerDF[which(timeSeriesCustomerDF$Count > 1), ]
)

# Per month: number of repeat customers and the sales they account for.
timeSeriesRepeatCustomers <- repeatCustomers %>%
  group_by(InvoiceDate) %>%
  summarise(Count = n_distinct(CustomerID), Sales = sum(Sales))

# Unique Customers
timeSeriesUniqCustomers <- df %>%
  group_by(InvoiceDate = floor_date(InvoiceDate, "month")) %>%
  summarise(Count = n_distinct(CustomerID))

# Attach the monthly revenue and customer totals by month key instead of by
# row position, so a month missing from one of the tables cannot silently
# misalign (or recycle) the values.
timeSeriesRepeatCustomers <- timeSeriesRepeatCustomers %>%
  left_join(
    select(timeSeriesRevenue, InvoiceDate, TotalSales = Sales),
    by = "InvoiceDate"
  ) %>%
  mutate(Perc = Sales / TotalSales * 100.0) %>%
  select(-TotalSales) %>%
  left_join(
    select(timeSeriesUniqCustomers, InvoiceDate, Total = Count),
    by = "InvoiceDate"
  )

# Lines: all customers (navy) and repeat customers (orange).
# Bars: % of revenue from repeat customers, scaled by 20 to share the primary
# axis; the secondary axis divides by 20 to show the percentage again.
ggplot(timeSeriesRepeatCustomers) +
  geom_line(aes(x = InvoiceDate, y = Total), stat = "identity", color = "navy") +
  geom_line(aes(x = InvoiceDate, y = Count), stat = "identity", color = "orange") +
  geom_bar(aes(x = InvoiceDate, y = Perc * 20), stat = "identity", fill = "gray", alpha = 0.5) +
  scale_y_continuous(sec.axis = sec_axis(~ . / 20, name = "Percentage (%)")) +
  ggtitle("Number of Unique vs. Repeat & Revenue from Repeat Customers") +
  theme(plot.title = element_text(hjust = 0.5))

#### 2.4.
# Popular Items Over Time ####

# Quantity sold per month and stock code.
popularItems <- df %>%
  group_by(InvoiceDate = floor_date(InvoiceDate, "month"), StockCode) %>%
  summarise(Quantity = sum(Quantity))

# Five best-selling items of November 2011.
# Both sides are converted to Date before comparing: a direct comparison of a
# date-time column with a Date is not reliable in base R.
top5Items <- popularItems[
  which(as.Date(popularItems$InvoiceDate) == as.Date("2011-11-01")),
] %>%
  arrange(desc(Quantity)) %>%
  head(5)

# Monthly history of those five items.
timeSeriesTop5 <- popularItems[
  which(popularItems$StockCode %in% top5Items$StockCode),
]

ggplot(timeSeriesTop5, aes(x = InvoiceDate, y = Quantity, color = StockCode)) +
  geom_line() +
  ylab("number of purchases") +
  xlab("date") +
  ggtitle("Top 5 Popular Items over Time") +
  theme(plot.title = element_text(hjust = 0.5))


# --------------------------------------------------------------------------------
# /ch.6/R/ProductRecommendation.R:
# --------------------------------------------------------------------------------
library(dplyr)
library(readxl)

#### 1. Load Data ####
df <- read_excel(
  path = "~/Documents/research/data-science-marketing/ch.6/data/Online Retail.xlsx",
  sheet = "Online Retail"
)

# ignore cancel orders
df <- df[which(df$Quantity > 0), ]

#### 2. Data Preparation ####

## 2.1. Handle NaNs in CustomerID field

# there are 133,361 records with no CustomerID
sum(is.na(df$CustomerID))
# sneak peek at records with no CustomerID
head(df[which(is.na(df$CustomerID)), ])

# current DataFrame shape
dim(df)

# remove records with NA
df <- na.omit(df)
dim(df)

## 2.2. Customer-Item Matrix
# install.packages("reshape2")
library(reshape2)

# One row per customer, one column per stock code. The aggregation is spelled
# out (number of matching rows) instead of relying on dcast's implicit
# default, so empty cells are always 0 and no "defaulting to length" message
# is emitted.
customerItemMatrix <- dcast(
  df, CustomerID ~ StockCode,
  value.var = "Quantity",
  fun.aggregate = length
)

# 0-1 encode
encode_fn <- function(x) {
  as.integer(x > 0)
}
# The function is passed directly; the funs() wrapper is deprecated in dplyr.
customerItemMatrix <- customerItemMatrix %>%
  mutate_at(vars(-CustomerID), encode_fn)

#### 3.
# Collaborative Filtering ###
# install.packages("coop")
library(coop)

## 3.1. User-based Collaborative Filtering

# User-to-User Similarity Matrix
# Drop the CustomerID column and transpose so that customers become the
# columns handed to cosine().
userToUserSimMatrix <- cosine(
  t(as.matrix(customerItemMatrix[, 2:ncol(customerItemMatrix)]))
)
colnames(userToUserSimMatrix) <- customerItemMatrix$CustomerID

# Making Recommendations
# IDs at the 11 highest similarity scores in the "12350" column.
similarityRankTo12350 <- order(userToUserSimMatrix[, "12350"], decreasing = TRUE)
top10SimilarCustomersTo12350 <- customerItemMatrix$CustomerID[
  similarityRankTo12350[seq_len(11)]
]

# Column names with a non-zero entry in customer 12350's row.
itemsBoughtByA <- subset(customerItemMatrix, CustomerID == "12350")
itemsBoughtByA <- colnames(customerItemMatrix)[which(itemsBoughtByA != 0)]

# Same for customer 17935.
itemsBoughtByB <- subset(customerItemMatrix, CustomerID == "17935")
itemsBoughtByB <- colnames(customerItemMatrix)[which(itemsBoughtByB != 0)]

# Items A has that B does not.
itemsToRecommendToB <- setdiff(itemsBoughtByA, itemsBoughtByB)
itemsToRecommendToB

# Look up descriptions, then put them in the same order as the item list.
itemsToRecommendToBDescriptions <- df %>%
  filter(StockCode %in% itemsToRecommendToB) %>%
  select(StockCode, Description) %>%
  distinct()
itemsToRecommendToBDescriptions <- itemsToRecommendToBDescriptions[
  match(itemsToRecommendToB, itemsToRecommendToBDescriptions$StockCode),
]


## 3.2.
## Item-based Collaborative Filtering

# Item-to-Item Similarity Matrix
itemToItemSimMatrix <- cosine(
  as.matrix(
    # excluding CustomerID column
    customerItemMatrix[, 2:dim(customerItemMatrix)[2]]
  )
)

# Making Recommendations
# Names at the 11 highest similarity scores in the "23166" column.
top10SimilarItemsTo23166 <- colnames(itemToItemSimMatrix)[
  order(itemToItemSimMatrix[, "23166"], decreasing = TRUE)[1:11]
]
top10SimilarItemsTo23166

# Look up descriptions, then put them in the same order as the item list.
top10SimilarItemDescriptions <- unique(
  df[
    which(df$StockCode %in% top10SimilarItemsTo23166),
    c("StockCode", "Description")
  ]
)
top10SimilarItemDescriptions <- top10SimilarItemDescriptions[
  match(top10SimilarItemsTo23166, top10SimilarItemDescriptions$StockCode),
]


# --------------------------------------------------------------------------------
# /ch.7/R/CustomerBehaviors.R:
# --------------------------------------------------------------------------------
library(dplyr)
library(ggplot2)

#### 1. Load Data ####
df <- read.csv(
  file = "~/Documents/data-science-for-marketing/ch.7/data/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv",
  header = TRUE
)

# Encode engaged customers as 0s and 1s
# as.factor() makes this work whether read.csv() returned Response as a
# factor (default before R 4.0) or as character (default since R 4.0);
# as.integer() on a character column would produce NA instead of level codes.
df$Engaged <- as.integer(as.factor(df$Response)) - 1

#### 2.
# Analytics on Engaged Customers ####

## - Overall Engagement Rates ##
# Share of all customers falling into each Response value.
engagementRate <- df %>%
  group_by(Response) %>%
  summarise(Count = n()) %>%
  mutate(EngagementRate = Count / nrow(df) * 100.0)

ggplot(engagementRate, aes(x = Response, y = EngagementRate)) +
  geom_col(width = 0.5) +
  labs(title = "Engagement Rate", x = "Engaged", y = "Percentage (%)") +
  theme(plot.title = element_text(hjust = 0.5))


## - Engagement Rates by Offer Type ##
engagementRateByOfferType <- df %>%
  group_by(Renew.Offer.Type) %>%
  summarise(Count = n(), NumEngaged = sum(Engaged)) %>%
  mutate(EngagementRate = NumEngaged / Count * 100.0)

ggplot(engagementRateByOfferType, aes(x = Renew.Offer.Type, y = EngagementRate)) +
  geom_col(width = 0.5) +
  labs(
    title = "Engagement Rates by Offer Type",
    x = "Offer Type",
    y = "Engagement Rate (%)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))


## - Offer Type & Vehicle Class ##
# Engaged customers per (offer type, vehicle class), expressed as a percentage
# of the customer count of the whole offer type.
offerTypeCounts <- select(engagementRateByOfferType, Renew.Offer.Type, Count)

engagementRateByOfferTypeVehicleClass <- df %>%
  group_by(Renew.Offer.Type, Vehicle.Class) %>%
  summarise(NumEngaged = sum(Engaged)) %>%
  left_join(offerTypeCounts, by = "Renew.Offer.Type") %>%
  mutate(EngagementRate = NumEngaged / Count * 100.0)

ggplot(
  engagementRateByOfferTypeVehicleClass,
  aes(x = Renew.Offer.Type, y = EngagementRate, fill = Vehicle.Class)
) +
  geom_col(width = 0.5, position = "dodge") +
  labs(
    title = "Engagement Rates by Offer Type & Vehicle Class",
    x = "Offer Type",
    y = "Engagement Rate (%)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))


## - Engagement Rates by Sales Channel ##
engagementRateBySalesChannel <- df %>%
  group_by(Sales.Channel) %>%
  summarise(Count = n(), NumEngaged = sum(Engaged)) %>%
  mutate(EngagementRate = NumEngaged / Count * 100.0)

ggplot(engagementRateBySalesChannel, aes(x = Sales.Channel, y = EngagementRate)) +
  geom_col(width = 0.5) +
  labs(
    title = "Engagement Rates by Sales Channel",
    x = "Sales Channel",
    y = "Engagement Rate (%)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))


## - Sales Channel & Vehicle Size ##
# Engaged customers per (sales channel, vehicle size), expressed as a
# percentage of the customer count of the whole sales channel.
salesChannelCounts <- select(engagementRateBySalesChannel, Sales.Channel, Count)

engagementRateBySalesChannelVehicleSize <- df %>%
  group_by(Sales.Channel, Vehicle.Size) %>%
  summarise(NumEngaged = sum(Engaged)) %>%
  left_join(salesChannelCounts, by = "Sales.Channel") %>%
  mutate(EngagementRate = NumEngaged / Count * 100.0)

ggplot(
  engagementRateBySalesChannelVehicleSize,
  aes(x = Sales.Channel, y = EngagementRate, fill = Vehicle.Size)
) +
  geom_col(width = 0.5, position = "dodge") +
  labs(
    title = "Engagement Rates by Sales Channel & Vehicle Size",
    x = "Sales Channel",
    y = "Engagement Rate (%)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))


## - Engagement Rates by Months Since Policy Inception ##
engagementRateByPolicyAge <- df %>%
  group_by(Months.Since.Policy.Inception) %>%
  summarise(Count = n(), NumEngaged = sum(Engaged)) %>%
  mutate(EngagementRate = NumEngaged / Count * 100.0)

ggplot(
  engagementRateByPolicyAge,
  aes(x = Months.Since.Policy.Inception, y = EngagementRate)
) +
  geom_line() +
  labs(
    title = "Engagement Rates by Months Since Policy Inception",
    x = "Months Since Policy Inception",
    y = "Engagement Rate (%)"
  ) +
  theme(plot.title = element_text(hjust = 0.5))


#### 3.
# Customer Segmentation by CLV & Months Since Inception ####
summary(df$Customer.Lifetime.Value)
summary(df$Months.Since.Policy.Inception)

# Label values above the median as "High", the rest as "Low".
# Both helpers are vectorised: the median is computed once per call instead of
# once per row, and they still accept a single value.
clv_encode_fn <- function(x) {
  ifelse(x > median(df$Customer.Lifetime.Value), "High", "Low")
}
df$CLV.Segment <- clv_encode_fn(df$Customer.Lifetime.Value)

policy_age_encode_fn <- function(x) {
  ifelse(x > median(df$Months.Since.Policy.Inception), "High", "Low")
}
df$Policy.Age.Segment <- policy_age_encode_fn(df$Months.Since.Policy.Inception)

# Scatter plot of the four segments, one colour per segment:
# red = High CLV / High age, orange = High CLV / Low age,
# green = Low CLV / Low age, blue = Low CLV / High age.
ggplot(
  df[which(df$CLV.Segment == "High" & df$Policy.Age.Segment == "High"), ],
  aes(x = Months.Since.Policy.Inception, y = log(Customer.Lifetime.Value))
) +
  geom_point(color = "red") +
  geom_point(
    data = df[which(df$CLV.Segment == "High" & df$Policy.Age.Segment == "Low"), ],
    color = "orange"
  ) +
  geom_point(
    data = df[which(df$CLV.Segment == "Low" & df$Policy.Age.Segment == "Low"), ],
    color = "green"
  ) +
  geom_point(
    data = df[which(df$CLV.Segment == "Low" & df$Policy.Age.Segment == "High"), ],
    color = "blue"
  ) +
  ggtitle("Segments by CLV and Policy Age") +
  xlab("Months Since Policy Inception") +
  ylab("CLV (in log scale)") +
  theme(plot.title = element_text(hjust = 0.5))

# Engagement rate within each of the four segments.
engagementRateBySegment <- df %>%
  group_by(CLV.Segment, Policy.Age.Segment) %>%
  summarise(Count = n(), NumEngaged = sum(Engaged)) %>%
  mutate(EngagementRate = NumEngaged / Count * 100.0)

ggplot(engagementRateBySegment, aes(x = CLV.Segment, y = EngagementRate, fill = Policy.Age.Segment)) +
  geom_bar(width = 0.5, stat = "identity", position = "dodge") +
  ggtitle("Engagement Rates by Customer Segments") +
  ylab("Engagement Rate (%)") +
  theme(plot.title = element_text(hjust = 0.5))

# --------------------------------------------------------------------------------
# /ch.8/R/PredictingEngagement.R:
# --------------------------------------------------------------------------------
library(dplyr)
library(ggplot2)

#### 1. Load Data ####
df <- read.csv(
  file = "~/Documents/data-science-for-marketing/ch.8/data/WA_Fn-UseC_-Marketing-Customer-Value-Analysis.csv",
  header = TRUE
)

#### 2. Variable Encoding ####

## 2.1. Response Variable: Response
# as.factor() makes this work whether read.csv() returned Response as a
# factor (default before R 4.0) or as character (default since R 4.0);
# as.integer() on a character column would produce NA instead of level codes.
df$Engaged <- as.integer(as.factor(df$Response)) - 1
mean(df$Engaged)

## 2.2. Categorical Features
categoricalVars <- c(
  "Sales.Channel", "Vehicle.Size", "Vehicle.Class", "Policy", "Policy.Type",
  "EmploymentStatus", "Marital.Status", "Education", "Coverage", "Gender"
)

# Expand the categorical columns into indicator columns (no intercept column).
encodedDF <- model.matrix(~ . - 1, df[categoricalVars])

## 2.3. Continuous Features
continuousFeatures <- c(
  "Customer.Lifetime.Value", "Income", "Monthly.Premium.Auto",
  "Months.Since.Last.Claim", "Months.Since.Policy.Inception",
  "Number.of.Open.Complaints", "Number.of.Policies", "Total.Claim.Amount"
)

encodedDF <- cbind(encodedDF, df[continuousFeatures])


#### 3. Training & Testing ####

# install.packages('caTools')
library(caTools)

# Logical mask: TRUE marks the training rows (70%), FALSE the test rows.
# NOTE(review): no seed is set, so the split differs between runs.
sample <- sample.split(df$Customer, SplitRatio = .7)

trainX <- as.matrix(subset(encodedDF, sample == TRUE))
trainY <- as.double(as.matrix(subset(df$Engaged, sample == TRUE)))

testX <- as.matrix(subset(encodedDF, sample == FALSE))
testY <- as.double(as.matrix(subset(df$Engaged, sample == FALSE)))

## 3.1. Building Random Forest Model

# - Training
# install.packages('randomForest')
library(randomForest)

# The response is passed as a factor so that randomForest fits a classifier.
rfModel <- randomForest(x = trainX, y = factor(trainY), ntree = 200, maxnodes = 24)

# - Individual Tree Predictions
getTree(rfModel, 1)
predict(rfModel, trainX, predict.all = TRUE)$individual

# - Feature Importances
importance(rfModel)

## 3.2.
## Evaluating Models

# predict() returns a factor; its integer codes are 1/2, so subtracting 1
# maps them back to the 0/1 labels.
inSamplePreds <- as.double(predict(rfModel, trainX)) - 1
outSamplePreds <- as.double(predict(rfModel, testX)) - 1

# - Accuracy, Precision, and Recall
inSampleAccuracy <- mean(trainY == inSamplePreds)
outSampleAccuracy <- mean(testY == outSamplePreds)
print(sprintf('In-Sample Accuracy: %0.4f', inSampleAccuracy))
print(sprintf('Out-Sample Accuracy: %0.4f', outSampleAccuracy))

# Precision: true positives / predicted positives.
inSamplePrecision <- sum(inSamplePreds & trainY) / sum(inSamplePreds)
outSamplePrecision <- sum(outSamplePreds & testY) / sum(outSamplePreds)
print(sprintf('In-Sample Precision: %0.4f', inSamplePrecision))
print(sprintf('Out-Sample Precision: %0.4f', outSamplePrecision))

# Recall: true positives / actual positives.
inSampleRecall <- sum(inSamplePreds & trainY) / sum(trainY)
outSampleRecall <- sum(outSamplePreds & testY) / sum(testY)
print(sprintf('In-Sample Recall: %0.4f', inSampleRecall))
print(sprintf('Out-Sample Recall: %0.4f', outSampleRecall))

# - ROC & AUC
# install.packages('ROCR')
library(ROCR)

# Predicted probability of the second class (column 2 of the prob matrix).
inSamplePredProbs <- as.double(predict(rfModel, trainX, type = 'prob')[, 2])
outSamplePredProbs <- as.double(predict(rfModel, testX, type = 'prob')[, 2])

pred <- prediction(outSamplePredProbs, testY)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
auc <- performance(pred, measure = 'auc')@y.values[[1]]

# Base graphics are drawn by successive calls, not composed with `+`, so the
# grid and the diagonal reference line are added as separate statements.
plot(
  perf,
  main = sprintf('Random Forest Model ROC Curve (AUC: %0.2f)', auc),
  col = 'darkorange',
  lwd = 2
)
grid()
abline(a = 0, b = 1, col = 'darkgray', lty = 3, lwd = 2)



# --------------------------------------------------------------------------------
# /ch.9/R/CustomerLifetimeValue.R:
# --------------------------------------------------------------------------------
library(dplyr)
library(readxl)
library(ggplot2)

#### 1.
# Load Data ####
df <- read_excel(
  path = "~/Documents/data-science-for-marketing/ch.9/data/Online Retail.xlsx",
  sheet = "Online Retail"
)

#### 2. Data Clean-Up ####

# ignore negative quantity
dim(df)
df <- df[which(df$Quantity > 0), ]
dim(df)

# remove records with NA
df <- na.omit(df)
dim(df)

# excluding incomplete month
sprintf("Date Range: %s ~ %s", min(df$InvoiceDate), max(df$InvoiceDate))
dim(df)
df <- df[which(df$InvoiceDate < '2011-12-01'), ]
dim(df)

# total sales
df$Sales <- df$Quantity * df$UnitPrice

# per order data
ordersDF <- df %>%
  group_by(CustomerID, InvoiceNo) %>%
  summarize(Sales = sum(Sales), InvoiceDate = max(InvoiceDate))

#### 3. Data Analysis ####

# order amount & frequency summary
# The time between first and last order is measured explicitly in days.
# A plain `max - min` lets R choose the difftime unit per customer (seconds,
# minutes, hours or days), so short gaps would be counted in the wrong unit.
summaryDF <- ordersDF %>%
  group_by(CustomerID) %>%
  summarize(
    SalesMin = min(Sales), SalesMax = max(Sales), SalesSum = sum(Sales),
    SalesAvg = mean(Sales), SalesCount = n(),
    InvoiceDateMin = min(InvoiceDate), InvoiceDateMax = max(InvoiceDate),
    PurchaseDuration = as.double(
      floor(difftime(max(InvoiceDate), min(InvoiceDate), units = "days"))
    ),
    PurchaseFrequency = as.double(
      floor(difftime(max(InvoiceDate), min(InvoiceDate), units = "days"))
    ) / n()
  )

# customers with repeat purchases
dim(summaryDF)
summaryDF <- summaryDF[which(summaryDF$PurchaseDuration > 0), ]
dim(summaryDF)

# Number of customers per order count.
salesCount <- summaryDF %>%
  group_by(SalesCount) %>%
  summarize(Count = n())

# Only the first 19 rows of the table are plotted.
ggplot(salesCount[1:19, ], aes(x = SalesCount, y = Count)) +
  geom_bar(width = 0.5, stat = "identity") +
  ggtitle('') +
  xlab("Sales Count") +
  ylab("Count") +
  theme(plot.title = element_text(hjust = 0.5))

summary(summaryDF$SalesCount)
summary(summaryDF$SalesAvg)

hist(
  summaryDF$PurchaseFrequency,
  breaks = 20,
  xlab = 'avg. number of days between purchases',
  ylab = 'count',
  main = ''
)

summary(summaryDF$PurchaseDuration)
summary(summaryDF$PurchaseFrequency)


#### 4. Predicting 3-Month CLV ####

## 4.1. Data Prep ##

# group data into every 3 months
library(lubridate)

ordersDF$Quarter <- as.character(round_date(ordersDF$InvoiceDate, '3 months'))

dataDF <- ordersDF %>%
  group_by(CustomerID, Quarter) %>%
  summarize(SalesSum = sum(Sales), SalesAvg = mean(Sales), SalesCount = n())

# Relabel the rounded dates: Q1 is the latest period, Q5 the earliest.
dataDF$Quarter[dataDF$Quarter == "2012-01-01"] <- "Q1"
dataDF$Quarter[dataDF$Quarter == "2011-10-01"] <- "Q2"
dataDF$Quarter[dataDF$Quarter == "2011-07-01"] <- "Q3"
dataDF$Quarter[dataDF$Quarter == "2011-04-01"] <- "Q4"
dataDF$Quarter[dataDF$Quarter == "2011-01-01"] <- "Q5"

# building sample set
# install.packages('reshape2')
library(reshape2)

# Features come from every period except Q1; one column per period.
# Column names are derived from the names dcast produced (prefix + period)
# rather than hard-coded, so they stay correct if a period is absent.
salesSumFeaturesDF <- dcast(
  dataDF[which(dataDF$Quarter != "Q1"), ],
  CustomerID ~ Quarter,
  value.var = "SalesSum"
)
colnames(salesSumFeaturesDF)[-1] <- paste0(
  "SalesSum.", colnames(salesSumFeaturesDF)[-1]
)

salesAvgFeaturesDF <- dcast(
  dataDF[which(dataDF$Quarter != "Q1"), ],
  CustomerID ~ Quarter,
  value.var = "SalesAvg"
)
colnames(salesAvgFeaturesDF)[-1] <- paste0(
  "SalesAvg.", colnames(salesAvgFeaturesDF)[-1]
)

salesCountFeaturesDF <- dcast(
  dataDF[which(dataDF$Quarter != "Q1"), ],
  CustomerID ~ Quarter,
  value.var = "SalesCount"
)
colnames(salesCountFeaturesDF)[-1] <- paste0(
  "SalesCount.", colnames(salesCountFeaturesDF)[-1]
)

featuresDF <- merge(
  merge(salesSumFeaturesDF, salesAvgFeaturesDF, by = "CustomerID"),
  salesCountFeaturesDF, by = "CustomerID"
)
# Periods without purchases become 0.
featuresDF[is.na(featuresDF)] <- 0

# Response: sales in the Q1 period.
responseDF <- dataDF[which(dataDF$Quarter == "Q1"), ] %>%
  select(CustomerID, SalesSum)
colnames(responseDF) <- c("CustomerID", "CLV_3_Month")

# Left join keeps customers with no Q1 purchases; their response becomes 0.
sampleDF <- merge(featuresDF, responseDF, by = "CustomerID", all.x = TRUE)
sampleDF[is.na(sampleDF)] <- 0

summary(sampleDF$CLV_3_Month)

## 4.2. Regression Models ##

# train/test set split
library(caTools)

# Logical mask: TRUE marks the training rows (80%), FALSE the test rows.
sample <- sample.split(sampleDF$CustomerID, SplitRatio = .8)

# The first column (CustomerID) is dropped so it is not used as a predictor.
train <- as.data.frame(subset(sampleDF, sample == TRUE))[, -1]
test <- as.data.frame(subset(sampleDF, sample == FALSE))[, -1]

# Linear Regression model
regFit <- lm(CLV_3_Month ~ ., data = train)

summary(regFit)

## 4.3. Evaluation ##
train_preds <- predict(regFit, train)
test_preds <- predict(regFit, test)

# R-squared
# install.packages('miscTools')
library(miscTools)

inSampleR2 <- rSquared(train$CLV_3_Month, resid = train$CLV_3_Month - train_preds)
outOfSampleR2 <- rSquared(test$CLV_3_Month, resid = test$CLV_3_Month - test_preds)

sprintf('In-Sample R-Squared: %0.4f', inSampleR2)
sprintf('Out-of-Sample R-Squared: %0.4f', outOfSampleR2)

# Median Absolute Error
inSampleMAE <- median(abs(train$CLV_3_Month - train_preds))
outOfSampleMAE <- median(abs(test$CLV_3_Month - test_preds))

sprintf('In-Sample MAE: %0.4f', inSampleMAE)
sprintf('Out-of-Sample MAE: %0.4f', outOfSampleMAE)

# Actual vs. Predicted Scatter Plot
plot(
  train$CLV_3_Month,
  train_preds,
  xlab = 'actual',
  ylab = 'predicted',
  main = 'In-Sample Actual vs. Predicted'
)
abline(a = 0, b = 1)

plot(
  test$CLV_3_Month,
  test_preds,
  xlab = 'actual',
  ylab = 'predicted',
  main = 'Out-of-Sample Actual vs. Predicted'
)
abline(a = 0, b = 1)




# --------------------------------------------------------------------------------