├── .DS_Store
├── artifacts
│   ├── predictions_df.pkl
│   ├── xgb_clf_model.pkl
│   ├── xgb_reg_model.pkl
│   ├── imp_spend_prob_df.pkl
│   └── imp_spend_amount_df.pkl
├── .vscode
│   └── settings.json
├── app_commands.txt
├── app_plot.py
├── environment.yml
├── app.py
└── lab_59_customer_ltv.py

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/lab_59_cust_lifetime_py/HEAD/.DS_Store
--------------------------------------------------------------------------------
/artifacts/predictions_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/lab_59_cust_lifetime_py/HEAD/artifacts/predictions_df.pkl
--------------------------------------------------------------------------------
/artifacts/xgb_clf_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/lab_59_cust_lifetime_py/HEAD/artifacts/xgb_clf_model.pkl
--------------------------------------------------------------------------------
/artifacts/xgb_reg_model.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/lab_59_cust_lifetime_py/HEAD/artifacts/xgb_reg_model.pkl
--------------------------------------------------------------------------------
/artifacts/imp_spend_prob_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/lab_59_cust_lifetime_py/HEAD/artifacts/imp_spend_prob_df.pkl
--------------------------------------------------------------------------------
/artifacts/imp_spend_amount_df.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/business-science/lab_59_cust_lifetime_py/HEAD/artifacts/imp_spend_amount_df.pkl
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
{
    "python.pythonPath": "/Users/mdancho/opt/anaconda3/envs/lab_59_cust_lifetime_py/bin/python",
    "jupyter.notebookFileRoot": "${workspaceFolder}",
    "jupyter.interactiveWindow.textEditor.executeSelection": true
}
--------------------------------------------------------------------------------
/app_commands.txt:
--------------------------------------------------------------------------------
# TERMINAL COMMANDS
# Make sure to `cd` to your lab_59_cust_lifetime_py directory

conda info --envs
conda activate lab_59_cust_lifetime_py
python app.py

# Run this app with `python app.py` and
# visit http://127.0.0.1:8050/ in your web browser.
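
# Added note (not in the original file): if the conda env doesn't exist yet,
# create it first -- this command comes from the comments in environment.yml:
# conda env create -f environment.yml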
--------------------------------------------------------------------------------
/app_plot.py:
--------------------------------------------------------------------------------
import plotly.express as px

import pandas as pd

# Load the predictions produced by lab_59_customer_ltv.py
predictions_df = pd.read_pickle('artifacts/predictions_df.pkl')

# Actual minus predicted 90-day spend (negative = model over-predicted)
df = predictions_df \
    .assign(
        spend_actual_vs_pred = lambda x: x['spend_90_total'] - x['pred_spend']
    )

# Assign the figure and show it explicitly so the script also works when run
# outside an interactive window (the original built the figure but discarded it)
fig = px.scatter(
    data_frame=df,
    x = 'frequency',
    y = 'pred_prob',
    color = 'spend_actual_vs_pred',
    color_continuous_midpoint=0,
    opacity=0.5,
    color_continuous_scale='IceFire',
    # trendline='lowess',
    # trendline_color_override='black'
) \
    .update_layout(
        {
            'plot_bgcolor': 'white'
        }
    )

fig.show()
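
# Added alternative (sketch, not in the original script): write the
# interactive plot to a standalone HTML file instead of opening a window.
# The output filename is hypothetical.
# fig.write_html("spend_actual_vs_pred.html")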
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
# In terminal, run: conda env create -f environment.yml
# To update, run: conda env update -f environment.yml
name: lab_59_cust_lifetime_py
channels:
  - anaconda
  - conda-forge
  - defaults
dependencies:
  - python=3.7.1
  - pip
  - pip:
    # Core Data
    - numpy==1.20.2
    - pandas==1.2.2
    - plydata==0.4.3

    # Visualization
    - matplotlib==3.3.4
    - plotnine==0.7.1
    - mizani==0.7.2
    - plotly==4.14.3

    # Modeling & Machine Learning
    - statsmodels
    - scikit-learn==0.23.2
    - xgboost==0.90
    - sklearn-pandas==2.0.4
    - scikit-misc==0.1.3

    # API
    - fastapi==0.63.0
    - uvicorn==0.13.4

    # Database
    - sqlalchemy==1.4.7

    # Jupyter
    - jupyterlab==3.0.13
    - jupyterlab-server==2.4.0
    - jupyter-packaging==0.7.12
    - jupyter-server==1.6.1 # Solves ImportError: cannot import name 'get_version_info' from 'jupyter_packaging'
    - ipywidgets==7.6.3
    - ipympl==0.7.0
    - jupytext
    - papermill==2.3.3
    - nbconvert==5.6.1

    # Apps
    - streamlit==0.80.0
    - dash==1.20.0
    - dash_bootstrap_components==0.12.2

--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
# LL PRO BONUS: PYTHON DASH APPLICATION ----
# BUSINESS SCIENCE LEARNING LABS ----
# LAB 59: CUSTOMER LIFETIME VALUE | PYTHON DASH ----
# ----

# LIBRARIES

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State

import dash_bootstrap_components as dbc

import plotly.express as px

import pandas as pd
import numpy as np

import pathlib

# APP SETUP
external_stylesheets = [dbc.themes.CYBORG]
app = dash.Dash(
    __name__,
    external_stylesheets=external_stylesheets
)

PLOT_BACKGROUND = 'rgba(0,0,0,0)'
PLOT_FONT_COLOR = 'white'
LOGO = "https://www.business-science.io/img/business-science-logo.png"

# PATHS
BASE_PATH = pathlib.Path(__file__).parent.resolve()
ART_PATH = BASE_PATH.joinpath("artifacts").resolve()

# DATA
predictions_df = pd.read_pickle(ART_PATH.joinpath("predictions_df.pkl"))

df = predictions_df \
    .assign(
        spend_actual_vs_pred = lambda x: x['spend_90_total'] - x['pred_spend']
    )
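
# Added note (not in the original app): spend_actual_vs_pred < 0 means the
# model predicted more 90-day spend than the customer actually made. These
# under-spenders are the customers the slider below is designed to surface,
# e.g. df['spend_actual_vs_pred'].lt(0).mean() gives their share.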

# LAYOUT

# Slider Marks
x = np.linspace(df['spend_actual_vs_pred'].min(), df['spend_actual_vs_pred'].max(), 10, dtype=int)
x = x.round(0)

navbar = dbc.Navbar(
    [
        html.A(
            # Use row and col to control vertical alignment of logo / brand
            dbc.Row(
                [
                    dbc.Col(html.Img(src=LOGO, height="30px")),
                    dbc.Col(dbc.NavbarBrand("Customer Spend Prediction", className="ml-2")),
                ],
                align="center",
                no_gutters=True,
            ),
            href="https://www.business-science.io/",
        ),
        dbc.NavbarToggler(id="navbar-toggler", n_clicks=0),
        dbc.Collapse(
            id="navbar-collapse", navbar=True, is_open=False
        ),
    ],
    color="dark",
    dark=True,
)

app.layout = html.Div(
    children = [
        navbar,
        dbc.Row(
            [
                dbc.Col(
                    [

                        html.H3("Welcome to the Customer Analytics Dashboard"),
                        html.Div(
                            id="intro",
                            children="Explore customers by predicted spend versus actual spend during the 90-day evaluation period.",
                        ),
                        html.Br(),
                        html.Hr(),
                        html.H5("Spend Actual vs Predicted"),
                        html.P("Segment customers that were predicted to spend but didn't. Then re-engage them with targeted emails."),
                        dcc.Slider(
                            id = 'spend-slider',
                            value = df['spend_actual_vs_pred'].max(),
                            max = df['spend_actual_vs_pred'].max(),
                            min = df['spend_actual_vs_pred'].min(),
                            marks = {i: '$'+str(i) for i in range(x[0], x[-1]) if i % 300 == 0}
                        ),
                        html.Br(),
                        html.Button("Download Segmentation", id="btn"), dcc.Download(id="download")
                    ],
                    width = 3,
                    style={'margin':'10px'}
                ),
                dbc.Col(
                    dcc.Graph(id='graph-slider'),
                    width = 8
                )
            ]
        )
    ]
)

# CALLBACKS
@app.callback(
    Output('graph-slider', 'figure'),
    Input('spend-slider', 'value'))
def update_figure(spend_delta_max):

    df_filtered = df[df['spend_actual_vs_pred'] <= spend_delta_max]

    fig = px.scatter(
        data_frame=df_filtered,
        x = 'frequency',
        y = 'pred_prob',
        color = 'spend_actual_vs_pred',
        color_continuous_midpoint=0,
        opacity=0.5,
        color_continuous_scale='IceFire',
        hover_name='customer_id',
        hover_data=['spend_90_total', 'pred_spend'],
    ) \
        .update_layout(
            {
                'plot_bgcolor': PLOT_BACKGROUND,
                'paper_bgcolor': PLOT_BACKGROUND,
                'font_color': PLOT_FONT_COLOR,
                'height': 700
            }
        ) \
        .update_traces(
            marker = dict(size = 12)
        )

    return fig

# Download Button
@app.callback(
    Output("download", "data"),
    Input("btn", "n_clicks"),
    State('spend-slider', 'value'),
    prevent_initial_call=True,
)
def func(n_clicks, spend_delta_max):

    df_filtered = df[df['spend_actual_vs_pred'] <= spend_delta_max]

    return dcc.send_data_frame(df_filtered.to_csv, "customer_segmentation.csv")

# Navbar
@app.callback(
    Output("navbar-collapse", "is_open"),
    [Input("navbar-toggler", "n_clicks")],
    [State("navbar-collapse", "is_open")],
)
def toggle_navbar_collapse(n, is_open):
    if n:
        return not is_open
    return is_open

if __name__ == '__main__':
    app.run_server(debug=True)
--------------------------------------------------------------------------------
/lab_59_customer_ltv.py:
--------------------------------------------------------------------------------
# BUSINESS SCIENCE LEARNING LABS ----
# LAB 59: CUSTOMER LIFETIME VALUE ----
# CUSTOMER LIFETIME VALUE WITH MACHINE LEARNING ----
# **** ----

# CONDA ENV USED: lab_59_cust_lifetime_py

# LIBRARIES ----
import pandas as pd
import numpy as np
import joblib

import plydata.cat_tools as cat
import plotnine as pn

from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import GridSearchCV

pn.options.dpi = 300


# 1.0 DATA PREPARATION ----

cdnow_raw_df = pd.read_csv(
    "data/CDNOW_master.txt",
    sep   = r"\s+",
    names = ["customer_id", "date", "quantity", "price"]
)

cdnow_raw_df.info()

cdnow_df = cdnow_raw_df \
    .assign(
        date = lambda x: x['date'].astype(str)
    ) \
    .assign(
        date = lambda x: pd.to_datetime(x['date'])
    ) \
    .dropna()

cdnow_df.info()

# 2.0 COHORT ANALYSIS ----
# - Includes only customers that joined (made their first purchase) during
#   the same initial acquisition window

# Get Range of Initial Purchases ----
cdnow_first_purchase_tbl = cdnow_df \
    .sort_values(['customer_id', 'date']) \
    .groupby('customer_id') \
    .first()

cdnow_first_purchase_tbl

cdnow_first_purchase_tbl['date'].min()

cdnow_first_purchase_tbl['date'].max()

# Visualize: All purchases within cohort

cdnow_df \
    .reset_index() \
    .set_index('date') \
    [['price']] \
    .resample(
        rule = "MS"
    ) \
    .sum() \
    .plot()

# Visualize: Individual Customer Purchases

ids = cdnow_df['customer_id'].unique()
ids_selected = ids[0:10]

cdnow_cust_id_subset_df = cdnow_df \
    [cdnow_df['customer_id'].isin(ids_selected)] \
    .groupby(['customer_id', 'date']) \
    .sum() \
    .reset_index()

pn.ggplot(
    pn.aes('date', 'price', group = 'customer_id'),
    data = cdnow_cust_id_subset_df
) \
    + pn.geom_line() \
    + pn.geom_point() \
    + pn.facet_wrap('customer_id') \
    + pn.scale_x_date(
        date_breaks = "1 year",
        date_labels = "%Y"
    )
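
# Added sanity check (sketch, not in the original lab): confirm this is a
# single acquisition cohort by counting first purchases per month -- they
# should all fall within a narrow initial window.
cdnow_first_purchase_tbl['date'].dt.to_period('M').value_counts().sort_index()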

# 3.0 MACHINE LEARNING ----
# Frame the problem:
# - What will customers spend in the next 90 days? (Regression)
# - What is the probability that a customer makes a purchase in the next
#   90 days? (Classification)


# 3.1 TIME SPLITTING (STAGE 1) ----

n_days = 90
max_date = cdnow_df['date'].max()
cutoff = max_date - pd.to_timedelta(n_days, unit = "d")

temporal_in_df = cdnow_df \
    [cdnow_df['date'] <= cutoff]

temporal_out_df = cdnow_df \
    [cdnow_df['date'] > cutoff]


# 3.2 FEATURE ENGINEERING (RFM) ----
# - Most challenging part
# - 2-Stage Process
# - Need to frame the problem
# - Need to think about what features to include

# Make Targets from the out-of-sample (last 90 days) data ----

targets_df = temporal_out_df \
    .drop('quantity', axis=1) \
    .groupby('customer_id') \
    .sum() \
    .rename({'price': 'spend_90_total'}, axis = 1) \
    .assign(spend_90_flag = 1)

# Make Recency (Date) Features from the in-sample data ----
# - Recency is measured in negative days relative to the cutoff
#   (0 = purchased on the last in-sample day)

max_date = temporal_in_df['date'].max()

recency_features_df = temporal_in_df \
    [['customer_id', 'date']] \
    .groupby('customer_id') \
    .apply(
        lambda x: (x['date'].max() - max_date) / pd.to_timedelta(1, "day")
    ) \
    .to_frame() \
    .set_axis(["recency"], axis=1)

recency_features_df

# Make Frequency (Count) Features from the in-sample data ----

frequency_features_df = temporal_in_df \
    [['customer_id', 'date']] \
    .groupby('customer_id') \
    .count() \
    .set_axis(['frequency'], axis=1)

frequency_features_df

# Make Price (Monetary) Features from the in-sample data ----

price_features_df = temporal_in_df \
    .groupby('customer_id') \
    .aggregate(
        {
            'price': ["sum", "mean"]
        }
    ) \
    .set_axis(['price_sum', 'price_mean'], axis = 1)

price_features_df

# 3.3 COMBINE FEATURES ----

features_df = pd.concat(
    [recency_features_df, frequency_features_df, price_features_df], axis = 1
) \
    .merge(
        targets_df,
        left_index = True,
        right_index = True,
        how = "left"
    ) \
    .fillna(0)

# 4.0 MACHINE LEARNING -----

from xgboost import XGBClassifier, XGBRegressor

from sklearn.model_selection import GridSearchCV

X = features_df[['recency', 'frequency', 'price_sum', 'price_mean']]

# 4.1 NEXT 90-DAY SPEND PREDICTION ----

y_spend = features_df['spend_90_total']

xgb_reg_spec = XGBRegressor(
    objective="reg:squarederror",
    random_state=123
)

xgb_reg_model = GridSearchCV(
    estimator=xgb_reg_spec,
    param_grid=dict(
        learning_rate = [0.01, 0.1, 0.3, 0.5]
    ),
    scoring = 'neg_mean_absolute_error',
    refit = True,
    cv = 5
)

xgb_reg_model.fit(X, y_spend)

xgb_reg_model.best_score_

xgb_reg_model.best_params_

xgb_reg_model.best_estimator_

predictions_reg = xgb_reg_model.predict(X)
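
# Added note (sketch, not in the original lab): with
# scoring='neg_mean_absolute_error', best_score_ is a *negative* MAE, so a
# value of e.g. -10.5 means roughly $10.50 average absolute error under
# 5-fold CV. Note also that predictions_reg is scored on the training
# features X, so it is an in-sample prediction.
print(f"CV MAE (dollars): {-xgb_reg_model.best_score_:.2f}")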

# 4.2 NEXT 90-DAY SPEND PROBABILITY ----

y_prob = features_df['spend_90_flag']

xgb_clf_spec = XGBClassifier(
    objective = "binary:logistic",
    random_state = 123
)

xgb_clf_model = GridSearchCV(
    estimator=xgb_clf_spec,
    param_grid=dict(
        learning_rate = [0.01, 0.1, 0.3, 0.5]
    ),
    scoring = 'roc_auc',
    refit = True,
    cv = 5
)

xgb_clf_model.fit(X, y_prob)

xgb_clf_model.best_score_

xgb_clf_model.best_params_

xgb_clf_model.best_estimator_

predictions_clf = xgb_clf_model.predict_proba(X)

# 4.3 FEATURE IMPORTANCE (GLOBAL) ----

# Importance | Spend Amount Model
imp_spend_amount_dict = xgb_reg_model \
    .best_estimator_ \
    .get_booster() \
    .get_score(importance_type = 'gain')

imp_spend_amount_df = pd.DataFrame(
    data = {
        'feature': list(imp_spend_amount_dict.keys()),
        'value': list(imp_spend_amount_dict.values())
    }
) \
    .assign(
        feature = lambda x: cat.cat_reorder(x['feature'], x['value'])
    )

pn.ggplot(
    pn.aes('feature', 'value'),
    data = imp_spend_amount_df
) \
    + pn.geom_col() \
    + pn.coord_flip()

# Importance | Spend Probability Model
imp_spend_prob_dict = xgb_clf_model \
    .best_estimator_ \
    .get_booster() \
    .get_score(importance_type = 'gain')

imp_spend_prob_df = pd.DataFrame(
    data = {
        'feature': list(imp_spend_prob_dict.keys()),
        'value': list(imp_spend_prob_dict.values())
    }
) \
    .assign(
        feature = lambda x: cat.cat_reorder(x['feature'], x['value'])
    )

pn.ggplot(
    pn.aes('feature', 'value'),
    data = imp_spend_prob_df
) \
    + pn.geom_col() \
    + pn.coord_flip()

# 5.0 SAVE WORK ----

# Save Predictions
predictions_df = pd.concat(
    [
        pd.DataFrame(predictions_reg).set_axis(['pred_spend'], axis=1),
        pd.DataFrame(predictions_clf)[[1]].set_axis(['pred_prob'], axis=1),
        features_df.reset_index()
    ],
    axis=1
)

predictions_df

predictions_df.to_pickle("artifacts/predictions_df.pkl")

pd.read_pickle('artifacts/predictions_df.pkl')

# Save Importance
imp_spend_amount_df.to_pickle("artifacts/imp_spend_amount_df.pkl")
imp_spend_prob_df.to_pickle("artifacts/imp_spend_prob_df.pkl")

pd.read_pickle("artifacts/imp_spend_amount_df.pkl")

# Save Models
joblib.dump(xgb_reg_model, 'artifacts/xgb_reg_model.pkl')
joblib.dump(xgb_clf_model, 'artifacts/xgb_clf_model.pkl')

model = joblib.load('artifacts/xgb_reg_model.pkl')
model.predict(X)


# 6.0 HOW CAN WE USE THIS INFORMATION ----

# 6.1 Which customers have the highest spend probability in the next 90 days?
# - Target for new products similar to what they have purchased in the past

predictions_df \
    .sort_values('pred_prob', ascending=False)

# 6.2 Which customers have recently purchased but are unlikely to buy?
# - Incentivize actions to increase probability
# - Provide discounts, encourage referring a friend, nurture by letting them
#   know what's coming

# Combine both conditions into one boolean mask (chaining two masks built
# from the unfiltered frame raises a pandas reindexing warning)
predictions_df \
    [
        (predictions_df['recency'] > -90) & (predictions_df['pred_prob'] < 0.20)
    ] \
    .sort_values('pred_prob', ascending=False)


# 6.3 Missed opportunities: Big spenders that could be unlocked ----
# - Send bundle offers encouraging volume purchases
# - Focus on missed opportunities

predictions_df \
    [
        predictions_df['spend_90_total'] == 0.0
    ] \
    .sort_values('pred_spend', ascending=False)
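
# Added example (sketch, not in the original lab): combine the two model
# outputs into a simple 90-day expected-value score -- purchase probability
# times predicted spend -- to rank customers by near-term value.
predictions_df \
    .assign(
        expected_spend_90 = lambda x: x['pred_prob'] * x['pred_spend']
    ) \
    .sort_values('expected_spend_90', ascending=False)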

# 7.0 NEXT STEPS ----
# - It's exciting what you can do with machine learning. It's very powerful,
#   but you have to put in the work.

# - Learning Data Wrangling, Modeling, and Visualization (101)
# - Model Improvement (Coming Soon):
#   - Algorithms (201-P)
#   - AutoML (201-P)
#   - Hyperparameter Tuning (201-P)
# - Forecasting: When will customers purchase? (TBD)
# - Web Applications, APIs & Production (202-P)
--------------------------------------------------------------------------------
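
# APPENDIX (added; not part of the original repo). The environment pins
# fastapi and uvicorn, and "APIs & Production" appears under Next Steps, so
# here is a minimal serving sketch for the saved regression model. The file
# name (api.py) and endpoint path are hypothetical.

import joblib
import pandas as pd
from fastapi import FastAPI

app = FastAPI()
model = joblib.load("artifacts/xgb_reg_model.pkl")

@app.post("/predict_spend")
def predict_spend(recency: float, frequency: int, price_sum: float, price_mean: float):
    # One-row frame with the same feature names/order used in training
    X = pd.DataFrame(
        [[recency, frequency, price_sum, price_mean]],
        columns=["recency", "frequency", "price_sum", "price_mean"]
    )
    return {"pred_spend": float(model.predict(X)[0])}

# Run with: uvicorn api:app --reload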