├── support ├── __init__.py ├── parameters.py ├── experiments │ ├── __init__.py │ ├── experiment_1.py │ └── experiment_2.py ├── datasets.py ├── evaluation.py └── model.py ├── notebooks ├── utils │ ├── __init__.py │ └── code.py ├── 3.0-experiment-2-feature-engineering.ipynb ├── 4.0-model-evaluation.ipynb └── 3.1-experiment-2-model.ipynb ├── docs ├── 1-CoNVO.docx ├── 2-Problem Model.xlsx └── 3-Vision, Arguments, and Results.docx ├── requirements.txt ├── models └── experiment-1-model.pkl ├── Makefile ├── .gitignore ├── data └── bank-additional │ ├── .Rhistory │ └── bank-additional-names.txt └── Readme.md /support/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /notebooks/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /support/parameters.py: -------------------------------------------------------------------------------- 1 | P_TARGETED = .066 2 | AVG_REVENUE = 1083 3 | AVG_COST = -8 -------------------------------------------------------------------------------- /docs/1-CoNVO.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calvdee/end-to-end-lead-scoring/HEAD/docs/1-CoNVO.docx -------------------------------------------------------------------------------- /docs/2-Problem Model.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/calvdee/end-to-end-lead-scoring/HEAD/docs/2-Problem Model.xlsx -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.15.2 2 | pandas==0.23.4 3 | matplotlib==2.2.2 4 | seaborn==0.9.0 5 | 
from sklearn.metrics import roc_auc_score, make_scorer


def get_scorer():
    """Build the scorer used to compare models during cross-validation.

    Returns
    -------
    scorer : callable
        A scikit-learn scorer with the ``scorer(estimator, X, y)`` signature
        that reports the area under the ROC curve.
    """
    # ROC AUC is a ranking metric: it must be computed from continuous scores
    # (decision_function / predict_proba). Without needs_threshold=True the
    # scorer calls predict() and measures AUC on hard 0/1 labels instead.
    # NOTE(review): needs_threshold matches the pinned scikit-learn 0.20 API;
    # revisit if the pin is raised to a release that replaces this argument.
    return make_scorer(roc_auc_score, needs_threshold=True)
def get_data(data_path, sep=','):
    """Load a bank-marketing CSV and split it into features and target.

    Parameters
    ----------
    data_path : str or path-like
        Location of the CSV file. It must contain a ``y`` column holding the
        ``'yes'`` / ``'no'`` outcome labels.

    sep : string, optional, default: ','
        Field delimiter forwarded to ``pandas.read_csv``. The default keeps
        the previous behaviour; pass ``';'`` to read files that use a
        semicolon as the delimiter.

    Returns
    -------
    X : pandas.DataFrame
        Every column of the file except ``y``.

    y : pandas.Series
        1 where the original label is ``'yes'``, 0 otherwise.
    """
    import pandas as pd

    data = pd.read_csv(data_path, sep=sep)

    X = data.drop('y', axis=1)
    # Vectorized comparison instead of a per-row lambda; anything that is
    # not exactly 'yes' maps to 0, as before.
    y = (data['y'] == 'yes').astype('int64')

    return X, y
39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # celery beat schedule file 92 | celerybeat-schedule 93 | 94 | # SageMath parsed files 95 | *.sage.py 96 | 97 | # Environments 98 | .env 99 | .venv 100 | env/ 101 | venv/ 102 | ENV/ 103 | env.bak/ 104 | venv.bak/ 105 | 106 | # Spyder project settings 107 | .spyderproject 108 | .spyproject 109 | 110 | # Rope project settings 111 | .ropeproject 112 | 113 | # mkdocs documentation 114 | /site 115 | 116 | # mypy 117 | .mypy_cache/ 118 | .dmypy.json 119 | dmypy.json 120 | 121 | # Pyre type checker 122 | .pyre/ 123 | -------------------------------------------------------------------------------- /support/experiments/experiment_1.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, RobustScaler 3 | from sklearn.compose import ColumnTransformer 4 | from sklearn.pipeline import Pipeline, FeatureUnion 5 | 6 | NUMERIC_FEATURES = [ 7 | 'age', 8 | 'campaign', 9 | 'previous', 10 | 'emp.var.rate', 11 | 'cons.price.idx', 12 | 'cons.conf.idx', 13 | 'euribor3m', 14 | 'nr.employed' 15 | ] 16 | 17 | CATEGORICAL_FEATURES = [ 18 | 'job', 19 | 'marital', 20 | 'education', 21 | 
CATEGORICAL_FEATURES = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'poutcome',
]


def get_categorical_pipeline():
    """Return a pipeline that selects the categorical columns and one-hot encodes them.

    Categories that were not seen while fitting are ignored at transform
    time (``handle_unknown='ignore'``) instead of raising.
    """
    # Select only the categorical columns, leaving their values untouched
    cat_ct = ColumnTransformer([('categoricals', 'passthrough', CATEGORICAL_FEATURES)])

    return Pipeline([
        ('cat_ct', cat_ct),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ])


def get_numeric_pipeline():
    """Return a pipeline that selects the numeric columns and scales them with RobustScaler."""
    # Select only the numeric columns, leaving their values untouched
    num_ct = ColumnTransformer([('numerics', 'passthrough', NUMERIC_FEATURES)])

    # The first step keeps its historical name ('num_union') so that any
    # parameter grid addressing it by name continues to work.
    return Pipeline([
        ('num_union', num_ct),
        ('scaler', RobustScaler()),
    ])


def get_pipeline():
    """Return the full feature pipeline: encoded categoricals next to scaled numerics."""
    ft_union = FeatureUnion([
        ('cat_pipeline', get_categorical_pipeline()),
        ('num_pipeline', get_numeric_pipeline()),
    ])

    return Pipeline([
        ('ft_union', ft_union),
    ])


def baseline_model_predictions(X, y, n_targeted):
    """Return the outcomes of the leads a simple rule-based baseline would target.

    The baseline targets every lead whose previous campaign was a success
    and fills the remaining slots with a reproducible random draw
    (``random_state=1``) from the other leads.

    Parameters
    ----------
    X : pandas.DataFrame
        Lead features; must contain a ``poutcome`` column.

    y : pandas.Series
        Outcomes, indexed like ``X``.

    n_targeted : int
        Number of leads the baseline is allowed to target.

    Returns
    -------
    baseline_ys : pandas.Series
        The entries of ``y`` belonging to the targeted leads.
    """
    # Get all of the instances where the previous campaign was a success
    success = X[X.poutcome == 'success']

    if len(success) > n_targeted:
        # More previous successes than budget: previously this asked
        # sample() for a negative number of rows and raised. Target a
        # reproducible subset of the successes instead.
        baseline_targets = success.sample(n=n_targeted, random_state=1)
    else:
        # Calculate how many more instances we need
        n_rest = n_targeted - len(success)

        # Randomly choose from the remaining instances
        rest = X[~X.index.isin(success.index)].sample(n=n_rest, random_state=1)

        # Combine the targeted and random groups
        baseline_targets = pd.concat([success, rest], axis=0)

    return y.loc[baseline_targets.index]
import numpy as np
from .model import build_tuned_model


def evaluate_model(features, target, name, model, param_grid, scorer, pipeline=None, cv_folds=5):
    """Tune a model with a grid search and report its best cross-validated score.

    Parameters
    ----------
    features : array-like or DataFrame
        Training inputs.

    target : array-like
        Training targets.

    name : string
        Name of the model, used for logging and in the returned tuple.

    model : estimator
        The untuned estimator to optimize.

    param_grid : dict
        Hyperparameter grid to search.

    scorer : callable
        Scorer used to rank the candidates.

    pipeline : object, optional
        Preprocessing pipeline to apply before the model.

    cv_folds : int, optional, default: 5
        Number of cross-validation folds.

    Returns
    -------
    (tuned_model, name, test_mean, test_std) : tuple
        The tuned model, its name, and the mean / standard deviation of the
        test score of the best-ranked candidate.
    """
    # cv_folds used to be accepted but never forwarded, so the search always
    # ran with build_tuned_model's own default.
    tuned_model = build_tuned_model(
        name, model, features, target, param_grid, scorer,
        cv_folds=cv_folds, pipeline=pipeline)
    results = tuned_model.results
    # Several candidates can tie for rank 1; the first one is reported
    best_result = results.query('rank_test_score == 1')
    test_mean = best_result['mean_test_score'].values[0]
    test_std = best_result['std_test_score'].values[0]
    return (tuned_model, name, test_mean, test_std)


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5), scoring=None):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use scikit-learn's default number of folds
            (3 in the pinned 0.20 release),
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default -1).

    train_sizes : array-like, optional
        Training-set sizes forwarded to ``learning_curve``; the default is
        five evenly spaced fractions from 0.1 to 1.0.

    scoring : string or callable, optional
        Scoring forwarded to ``learning_curve``.

    Returns
    -------
    plt : module
        The ``matplotlib.pyplot`` module, with the figure drawn on it.
    """
    import matplotlib.pyplot as plt
    from sklearn.model_selection import learning_curve

    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, scoring=scoring)
    # Aggregate across folds: one mean/std per training-set size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    # Shaded bands show one standard deviation around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt
import numpy as np


NUMERIC_FEATURES = [
    'age',
    'campaign',
    'previous',
    'emp.var.rate',
    'cons.price.idx',
    'cons.conf.idx',
    'euribor3m',
    'nr.employed',
    'campaign_to_previous',
]

CATEGORICAL_FEATURES = [
    'job',
    'marital',
    'education',
    'default',
    'housing',
    'loan',
    'contact',
    'month',
    'day_of_week',
    'poutcome',
]

NEW_CATEGORICAL_FEATURES = [
    'pcontacted_last_campaign',
    'pcampaign',
    'previous',
    'campaign_gte10',
]


def ft_pcontacted_last_campaign(X):
    """Flag rows whose value is not 999, as an (n, 1) boolean column.

    Applied to ``pdays``, where 999 marks a client that was not previously
    contacted.
    """
    pcontacted = X != 999
    return pcontacted.values.reshape(-1, 1)


def ft_pcampaign(X):
    """Flag rows whose value is not ``'nonexistent'``, as an (n, 1) boolean column.

    Applied to ``poutcome``, i.e. True when a previous campaign outcome exists.
    """
    pcampaign = X != 'nonexistent'
    return pcampaign.values.reshape(-1, 1)


def ft_previous(X):
    """Cast the values to strings so they are treated as categories; returns an (n, 1) column."""
    previous = X.astype(str)
    return previous.values.reshape(-1, 1)


def ft_campaign_gte10(X):
    """Flag rows with a value of 10 or more, as an (n, 1) boolean column."""
    campaign_gte10 = X >= 10
    return campaign_gte10.values.reshape(-1, 1)


def ft_campaign_to_previous(X):
    """Return ``campaign / previous`` as an (n, 1) column, using 0 where ``previous`` is 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the ``campaign`` and ``previous`` columns.
    """
    campaign = X['campaign']
    previous = X['previous']
    has_previous = previous != 0

    # Vectorized replacement for the former per-row lambda: mask the zero
    # denominators before dividing, then put 0 back in those rows.
    ratio = campaign / previous.where(has_previous)
    campaign_to_previous = ratio.where(has_previous, 0)
    return campaign_to_previous.values.reshape(-1, 1)


def get_categorical_ct():
    """Return the ColumnTransformer that assembles all categorical inputs.

    The output holds the raw categorical columns followed by the four
    engineered flags derived from ``pdays``, ``poutcome``, ``previous`` and
    ``campaign``.
    """
    # Function-scope imports, as elsewhere in this project, so the ft_*
    # helpers above can be imported without scikit-learn.
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import FunctionTransformer

    # Create the transformers for categorical features
    add_pcontacted_last_campaign = FunctionTransformer(ft_pcontacted_last_campaign, validate=False)
    add_pcampaign = FunctionTransformer(ft_pcampaign, validate=False)
    add_previous = FunctionTransformer(ft_previous, validate=False)
    add_campaign_gte10 = FunctionTransformer(ft_campaign_gte10, validate=False)

    # A plain string column selector hands each helper a 1-D Series
    cat_features = [
        ('categoricals', 'passthrough', CATEGORICAL_FEATURES),
        ('pcontacted_last_campaign', add_pcontacted_last_campaign, 'pdays'),
        ('pcampaign', add_pcampaign, 'poutcome'),
        ('previous', add_previous, 'previous'),
        ('campaign_gte10', add_campaign_gte10, 'campaign'),
    ]

    return ColumnTransformer(cat_features)


def get_categorical_pipeline():
    """Return the pipeline that builds the categorical inputs and one-hot encodes them."""
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder

    return Pipeline([
        ('cat_ct', get_categorical_ct()),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ])


def get_numeric_pipeline():
    """Return a ColumnTransformer that log-transforms and bins ``age`` and ``campaign``.

    Each column gets its own clone of the same log -> KBinsDiscretizer
    pipeline so the bin edges are learned independently.
    """
    from sklearn.base import clone
    from sklearn.compose import ColumnTransformer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer

    binning_pipeline = Pipeline([
        ('log', FunctionTransformer(np.log, validate=True)),
        ('kbins', KBinsDiscretizer()),
    ])

    age_campaign_ct = ColumnTransformer([
        ('age_pipeline', clone(binning_pipeline), ['age']),
        ('campaign_pipeline', clone(binning_pipeline), ['campaign']),
    ])

    return age_campaign_ct


def get_pipeline():
    """Return the experiment-2 feature pipeline.

    Only the categorical branch is part of the union; the binning pipeline
    from ``get_numeric_pipeline`` is not wired in.
    """
    from sklearn.pipeline import FeatureUnion, Pipeline

    # The numeric pipeline used to be built here and then discarded; it is
    # no longer constructed since it never reached the union.
    ft_union = FeatureUnion([
        ('cat_pipeline', get_categorical_pipeline()),
    ])

    return Pipeline([
        ('ft_union', ft_union),
    ])
?wireframe 2 | librar(lattice) 3 | library(lattice) 4 | ?wireframe 5 | wireframe(volcano, shade = TRUE,# 6 | aspect = c(61/87, 0.4),# 7 | light.source = c(10,0,10)) 8 | library(rminer) 9 | help(package=rminer) 10 | ls() 11 | class(x) 12 | x 13 | class(y) 14 | y 15 | x=1 16 | y="hello" 17 | y 18 | x=c(1.1,2.3,-1,4,2e-2) 19 | x 20 | x=c(1,2,3,4,5) 21 | x 22 | x=1:10 23 | x 24 | x=10:1 25 | x 26 | x=rnorm(18,25,1) 27 | x 28 | x=round(rnorm(18,25,1)) 29 | x 30 | x=round(rnorm(18,35,10)) 31 | x 32 | x=c(18,x) 33 | x 34 | x=c(98,x) 35 | x 36 | summary(x) 37 | lenght(x) 38 | length(x) 39 | x 40 | x[3] 41 | x[3]=30 42 | x 43 | z[]=1 44 | z=vector(length=5); print(z) 45 | x 46 | z 47 | z[]=1; print(z) 48 | z 49 | z[c(1,3,5)]=2 50 | z 51 | sum(z) 52 | min(z) 53 | max(z) 54 | sort(z) 55 | plot(z,type="b",lwd=2,col="blue") 56 | plot(z,type="b",lwd=20,col="blue") 57 | a=rep(3,10) 58 | a 59 | rep(10,1000) 60 | rep(10,5,3) 61 | ?rep 62 | rep(10,5,times=3) 63 | rep(x=10,5,times=3) 64 | rep(x=10,times=3) 65 | rep(x=10,5,each=3) 66 | rep(c(1,2,3),5,each=3) 67 | rep(c(1,2,3),5) 68 | c=seq(2,20,2) 69 | seq(2,20,2) 70 | seq(1,20,2) 71 | seq(1,20,1) 72 | seq(1,20,-1) 73 | seq(20,1,-1) 74 | seq(20,1,lenght=3) 75 | seq(20,1,length=3) 76 | x 77 | x^2+1 78 | m=matrix(ncol=3,nrow=2) 79 | m 80 | m[,]=0 81 | m 82 | m[1,] 83 | m[1,]=1:3 84 | m 85 | m[2,]=4:6 86 | m 87 | m[,1]=c(-1,-1) 88 | m 89 | m[,3]=c(10,10) 90 | m 91 | class(m) 92 | sum(m) 93 | summary(m) 94 | print(m) 95 | plot(m) 96 | m=matrix(ncol=10,nrow=4); m[,]=0! 
97 | m=matrix(ncol=10,nrow=4); m[,]=0 98 | m 99 | m[c(1,3),seq(1,10,2)]=1 100 | m 101 | seq(1,10,2) 102 | m 103 | m[1,] 104 | m[3,] 105 | m[c(1,3),] 106 | m[,1] 107 | m 108 | m[,3] 109 | m[,c(1,3)] 110 | m[c(1,3),seq(1,10,2)] 111 | m[c(1,3),seq(1,10,2)]=3 112 | m 113 | seq(1,10,2) 114 | ?seq 115 | y=factor(c("a","a","a","b","b","c")) 116 | y 117 | y+1 118 | summary(y) 119 | plot(y) 120 | l=list(a="ola",b=1:3) 121 | l 122 | l=list(a="ola",b=1:3,f=y) 123 | l 124 | l$b 125 | l$a 126 | l$f 127 | m=matrix(ncol=3,nrow=3); m[,]=1 128 | m 129 | d=data.frame(m); names(d)=c("dia","mes","ano")! 130 | d=data.frame(m); names(d)=c("dia","mes","ano") 131 | class(d) 132 | class(m) 133 | d 134 | d[1,] 135 | d[,1] 136 | d$dia 137 | d$mes=factor(c("Jan","Feb","Apr")) 138 | d$ano=c(1999,1999,2000); print(d) 139 | d 140 | plot(d$mes) 141 | plot(d$ano) 142 | plot(d$ano,type="b") 143 | d 144 | summary(d) 145 | d[2,1]=2 146 | d 147 | d[2,]=c(31,"Jan",2009) 148 | d 149 | edit(d) 150 | names(d) 151 | names(d)=c("Dia","Mes","Ano") 152 | names(d) 153 | d 154 | d2=rbind(d,c(21,"Feb",2008)) 155 | d 156 | d2 157 | d 158 | d2 159 | d2=rbind(d2,d2) 160 | d2 161 | sample(1:10,3) 162 | c(sample(1:50,5),sample(1:11,2)) 163 | for(i in 1:1000) c(sample(1:50,5),sample(1:11,2)) 164 | for(i in 1:1000) print(c(sample(1:50,5),sample(1:11,2))) 165 | c(sort(sample(1:50,5)),sort(sample(1:11,2))) 166 | cat("numeros:",c(sort(sample(1:50,5)),"estrelas:",sort(sample(1:11,2)))) 167 | set.seed(1) 168 | cat("numeros:",c(sort(sample(1:50,5)),"estrelas:",sort(sample(1:11,2)))) 169 | set.seed(1) 170 | cat("numeros:",c(sort(sample(1:50,5)),"estrelas:",sort(sample(1:11,2)))) 171 | x=rnorm(100,10) 172 | y=rnorm(100,8) 173 | x 174 | y 175 | t.test(x,y) 176 | cor(x,y) 177 | d2 178 | d2[d2$Mes==Jan] 179 | d2[,d2$Mes==Jan] 180 | I=whic(d2$Mes==Jan) 181 | I=which(d2$Mes==Jan) 182 | I=which(d2$Mes=="Jan") 183 | I 184 | d2[I,] 185 | library(foreign) 186 | help(package=foreign) 187 | q("no") 188 | 2509+2013+724 189 | 2507+2006+604 
190 | 2507+2006+588 191 | 2507+1961+588 192 | 2507+1950+574 193 | 2507+1896+574 194 | 2507+1863+574 195 | setwd("/Users/pcortez/R/rminer") 196 | source("popular2.R") 197 | RStudio_CRAN_data_folder <- download_RStudio_CRAN_data(START = '2013-04-02', END = '2013-04-05') 198 | if(packageVersion("installr") %in% c("0.8","0.9","0.9.2")) install.packages('installr') 199 | require(installr) 200 | RStudio_CRAN_data_folder <- download_RStudio_CRAN_data(START = '2013-04-02', END = '2013-04-05') 201 | my_RStudio_CRAN_data <- read_RStudio_CRAN_data(RStudio_CRAN_data_folder) 202 | summary(my_RStudio_CRAN_data) 203 | NROW(my_RStudio_CRAN_data) 204 | which(my_RStudio_CRAN_data$package=="rminer") 205 | cat("rminer installs:",length(which(my_RStudio_CRAN_data$package=="rminer")),"\n") 206 | 1+1+1+1+2+1 207 | 160-7 208 | 6+2+1+1+1+1+1 209 | 141-13 210 | 49-23 211 | 45-6 212 | 37-2 213 | 33-5 214 | 29-7 215 | 28-2 216 | 27-2 217 | 22-4 218 | 18-12 219 | 17-12 220 | 15-3 221 | 15-2 222 | 14-6 223 | q("n") 224 | q() 225 | mean(3,4,3.5) 226 | mean(3,4.5,3.5) 227 | mean(c(3,4,3.5)) 228 | mean(c(3,4.5,4)) 229 | 70+90+45 230 | mean(c(3,4.5,4)) 231 | 4.3/6 232 | setwd("/Users/pcortez/DOUTORAMENTO/SergioMoro/DSS-article/UCI/bank-additional") 233 | d=read.table("bank-additional.csv",header=TRUE,sep=";") 234 | nrow(d) 235 | ncol(d) 236 | setwd("../bank") 237 | d=read.table("bank-full.csv",header=TRUE,sep=";") 238 | ncol(d) 239 | nrow(d) 240 | d=read.table("bank.csv",header=TRUE,sep=";") 241 | nrow(d) 242 | ncol(d) 243 | table(d$y) 244 | summary(d) 245 | setwd("../bank-additional/") 246 | d=read.table("bank-additional-full.csv",header=TRUE,sep=";") 247 | summary(d) 248 | table(d$emp.var.rate) 249 | d=read.table("bank-additional-full.csv",header=TRUE,sep=";") 250 | summary(d) 251 | q("n") 252 | q("no") 253 | -------------------------------------------------------------------------------- /data/bank-additional/bank-additional-names.txt: 
-------------------------------------------------------------------------------- 1 | Citation Request: 2 | This dataset is publicly available for research. The details are described in [Moro et al., 2014]. 3 | Please include this citation if you plan to use this database: 4 | 5 | [Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, In press, http://dx.doi.org/10.1016/j.dss.2014.03.001 6 | 7 | Available at: [pdf] http://dx.doi.org/10.1016/j.dss.2014.03.001 8 | [bib] http://www3.dsi.uminho.pt/pcortez/bib/2014-dss.txt 9 | 10 | 1. Title: Bank Marketing (with social/economic context) 11 | 12 | 2. Sources 13 | Created by: Sérgio Moro (ISCTE-IUL), Paulo Cortez (Univ. Minho) and Paulo Rita (ISCTE-IUL) @ 2014 14 | 15 | 3. Past Usage: 16 | 17 | The full dataset (bank-additional-full.csv) was described and analyzed in: 18 | 19 | S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems (2014), doi:10.1016/j.dss.2014.03.001. 20 | 21 | 4. Relevant Information: 22 | 23 | This dataset is based on "Bank Marketing" UCI dataset (please check the description at: http://archive.ics.uci.edu/ml/datasets/Bank+Marketing). 24 | The data is enriched by the addition of five new social and economic features/attributes (national wide indicators from a ~10M population country), published by the Banco de Portugal and publicly available at: https://www.bportugal.pt/estatisticasweb. 25 | This dataset is almost identical to the one used in [Moro et al., 2014] (it does not include all attributes due to privacy concerns). 26 | Using the rminer package and R tool (http://cran.r-project.org/web/packages/rminer/), we found that the addition of the five new social and economic attributes (made available here) lead to substantial improvement in the prediction of a success, even when the duration of the call is not included. 
Note: the file can be read in R using: d=read.table("bank-additional-full.csv",header=TRUE,sep=";") 27 | 28 | The zip file includes two datasets: 29 | 1) bank-additional-full.csv with all examples, ordered by date (from May 2008 to November 2010). 30 | 2) bank-additional.csv with 10% of the examples (4119), randomly selected from bank-additional-full.csv. 31 | The smallest dataset is provided to test more computationally demanding machine learning algorithms (e.g., SVM). 32 | 33 | The binary classification goal is to predict if the client will subscribe a bank term deposit (variable y). 34 | 35 | 5. Number of Instances: 41188 for bank-additional-full.csv 36 | 37 | 6. Number of Attributes: 20 + output attribute. 38 | 39 | 7. Attribute information: 40 | 41 | For more information, read [Moro et al., 2014]. 42 | 43 | Input variables: 44 | # bank client data: 45 | 1 - age (numeric) 46 | 2 - job : type of job (categorical: "admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown") 47 | 3 - marital : marital status (categorical: "divorced","married","single","unknown"; note: "divorced" means divorced or widowed) 48 | 4 - education (categorical: "basic.4y","basic.6y","basic.9y","high.school","illiterate","professional.course","university.degree","unknown") 49 | 5 - default: has credit in default? (categorical: "no","yes","unknown") 50 | 6 - housing: has housing loan? (categorical: "no","yes","unknown") 51 | 7 - loan: has personal loan? (categorical: "no","yes","unknown") 52 | # related with the last contact of the current campaign: 53 | 8 - contact: contact communication type (categorical: "cellular","telephone") 54 | 9 - month: last contact month of year (categorical: "jan", "feb", "mar", ..., "nov", "dec") 55 | 10 - day_of_week: last contact day of the week (categorical: "mon","tue","wed","thu","fri") 56 | 11 - duration: last contact duration, in seconds (numeric). 
Important note: this attribute highly affects the output target (e.g., if duration=0 then y="no"). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model. 57 | # other attributes: 58 | 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact) 59 | 60 | 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted) 61 | 62 | 14 - previous: number of contacts performed before this campaign and for this client (numeric) 63 | 64 | 15 - poutcome: outcome of the previous marketing campaign (categorical: "failure","nonexistent","success") 65 | 66 | # social and economic context attributes 67 | 16 - emp.var.rate: employment variation rate - quarterly indicator (numeric) 68 | 17 - cons.price.idx: consumer price index - monthly indicator (numeric) 69 | 18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric) 70 | 19 - euribor3m: euribor 3 month rate - daily indicator (numeric) 71 | 20 - nr.employed: number of employees - quarterly indicator (numeric) 72 | 73 | Output variable (desired target): 74 | 21 - y - has the client subscribed a term deposit? (binary: "yes","no") 75 | 76 | 8. Missing Attribute Values: There are several missing values in some categorical attributes, all coded with the "unknown" label. These missing values can be treated as a possible class label or using deletion or imputation techniques. 
def build_tuned_model(name, base_model, X_train, y_train, hparams, scorer=None, cv_folds=5, pipeline=None):
    """Tune ``base_model`` with a grid search, log progress, and return the tuned model.

    Parameters
    ----------
    name : string
        Name of the model; used in log output and as the pipeline step name.

    base_model : estimator
        The untuned estimator.

    X_train, y_train : array-like
        Training inputs and targets.

    hparams : dict
        Hyperparameter grid to search.

    scorer : callable, optional
        Scorer used to rank candidates.

    cv_folds : int, optional, default: 5
        Number of cross-validation folds.

    pipeline : object, optional
        Preprocessing pipeline to apply before the model.

    Returns
    -------
    model : TunedModel
        The tuned model; its ``results`` attribute holds the search results.
    """
    from time import time

    start = time()
    print('==> Starting {}-fold cross validation for {} model, {} examples'.format(cv_folds, name, len(X_train)))
    model = TunedModel(hparams, name=name, model=base_model, pipeline=pipeline)
    model.train(X_train, y_train, scorer, cv_folds)
    elapsed = time() - start
    print("==> Elapsed seconds: {:.3f}".format(elapsed))

    print('Best {} model: {}'.format(model.name, model.model))
    best_score = model.results['mean_test_score'].max()
    print('Best {} score: {:.3f}'.format(model.name, best_score))

    return model


# ============================================================================================================
# Model
# ============================================================================================================
class Model(object):
    """A named estimator with an optional preprocessing pipeline.

    Parameters
    ----------
    name : string
        The name of the model

    model : estimator
        The estimator to fit

    pipeline : object, optional
        A pipeline to apply to the data before fitting the model
    """

    def __init__(self, name, model, pipeline=None):
        self.name = name
        self.model = model
        self.pipeline = pipeline

    def train(self, X, y):
        """ Fits the model and builds the full pipeline """
        # Function-scope import, like the joblib / cross_val_score imports below
        from sklearn.pipeline import make_pipeline

        if self.pipeline is None:
            X_transformed = X
            self.model_pipeline = make_pipeline(self.model)
        else:
            X_transformed = self.pipeline.fit_transform(X)
            self.model_pipeline = make_pipeline(self.pipeline, self.model)

        # The pipeline and the model are fitted separately; model_pipeline
        # wraps the same (now fitted) objects for prediction.
        self.model.fit(X_transformed, y)

        return self

    def predict(self, X):
        """ Predicts with the full pipeline built by ``train``

        TODO: Make sure the model was fitted
        """
        preds = self.model_pipeline.predict(X)

        return preds

    def get_model_pipeline(self):
        """ Useful for cross validation to refit the pipeline on every round

        Returns an unfitted copy of the preprocessing pipeline with the model
        appended as a final step named after the model.
        """
        from sklearn.base import clone
        from sklearn.pipeline import Pipeline

        if self.pipeline is None:
            # clone(None) would raise; without preprocessing the model is
            # the only step.
            return Pipeline([(self.name, self.model)])

        full_pipeline = clone(self.pipeline)
        full_pipeline.steps.append((self.name, self.model))
        return full_pipeline

    def score(self, X, y, scorer):
        """ Scores the model using the scorer

        ``scorer`` is called as ``scorer(estimator, X, y)``.

        Postconditions:
            - the predictions for ``X`` are stored in ``self.predictions``
        """
        if self.pipeline is None:
            estimator = self.model
        else:
            estimator = self.model_pipeline

        # Previously written to an undefined name (`model.predictions`),
        # which raised NameError whenever no pipeline was configured.
        self.predictions = estimator.predict(X)

        return scorer(estimator, X, y)

    def save(self, file_path):
        """ Serializes this object to ``file_path`` with joblib """
        from joblib import dump
        dump(self, file_path)

    @staticmethod
    def load(file_path):
        """ Loads an object previously written by ``save`` """
        from joblib import load
        # NOTE(review): joblib.load unpickles arbitrary objects; only load
        # files from trusted sources.
        model = load(file_path)
        return model

    def score_cv(self, X, y, scorer, k=5):
        """ Scores the model with k-fold cross validation

        Returns
        -------
        (mean, std) : tuple
            Mean and standard deviation of the per-fold scores.
        """
        from sklearn.model_selection import cross_val_score

        # With a pipeline, cross-validate the whole (unfitted) pipeline so
        # preprocessing is refit on every fold. The old code scored the
        # already-fitted pipeline once and never cross-validated.
        if self.pipeline is None:
            estimator = self.model
        else:
            estimator = self.get_model_pipeline()

        scores = cross_val_score(estimator, X, y, scoring=scorer, cv=k, n_jobs=-1)

        return (scores.mean(), scores.std())
============================================================================================================ 113 | # TunedModel 114 | # ============================================================================================================ 115 | class TunedModel(Model): 116 | """ A class used to optimize the hyperparameters for a machine learning algorithm 117 | 118 | Parameters 119 | ---------- 120 | name : string 121 | The name of a model 122 | 123 | param_grid : dict 124 | A dict of (parameter, values) pairs to optimize 125 | 126 | pipeline : object 127 | A pipeline to apply to the data before fitting the model 128 | """ 129 | 130 | def __init__(self, param_grid, **kwargs): 131 | Model.__init__(self, **kwargs) 132 | self.param_grid = param_grid 133 | 134 | def train(self, X, y, scorer, cv_folds=5): 135 | """ Tunes a model using the parameter grid that this class was initialized with. 136 | 137 | Parameters 138 | ---------- 139 | X : array-like, matrix 140 | Input data 141 | 142 | y : array-like 143 | Targets for input data 144 | 145 | cv_folds : int, optional, default: 5 146 | The number of cross-validation folds to use in the optimization process. 
147 | """ 148 | if not self.pipeline: 149 | # Setup 150 | grid_search = GridSearchCV( 151 | self.model, 152 | self.param_grid, 153 | cv=cv_folds, 154 | scoring=scorer, 155 | return_train_score=True, 156 | n_jobs=-1) 157 | 158 | # Run it 159 | grid_search.fit(X, y) 160 | 161 | # Save the model 162 | self.model = grid_search.best_estimator_ 163 | else: 164 | # Setup 165 | grid_search = GridSearchCV( 166 | self.get_model_pipeline(), 167 | self.param_grid, 168 | cv=cv_folds, 169 | scoring=scorer, 170 | return_train_score=True, 171 | n_jobs=-1) 172 | 173 | # Run it 174 | grid_search.fit(X, y) 175 | 176 | # Save the model and pipeline 177 | self.model = grid_search.best_estimator_.steps[-1][1] 178 | self.pipeline = Pipeline(grid_search.best_estimator_.steps[:-1]) 179 | 180 | self.results = pd.DataFrame(grid_search.cv_results_) -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # End-to-End Applied Data Science: Lead Scoring 2 | 3 | ## Motivation 4 | The purpose of this project is to provide an end-to-end example of working an enterprise-grade data science problem, from business understanding to model evaluation, by applying two different frameworks: 5 | 6 | * [CoNVO](https://www.oreilly.com/library/view/thinking-with-data/9781491949757/ch01.html) to scope the problem and formulate a data science solution 7 | * [CRISP-DM](https://en.wikipedia.org/wiki/Cross-industry_standard_process_for_data_mining) to iterate on a model-based solution 8 | 9 | Since data and machine learning engineers are typically responsible for deployment, maintenance, and optimization of infrastructure and models, this project omits the Deployment phase of CRISP-DM. 
10 | 11 | This project is not without flaws - the argument could be stronger, costs and benefits could be more accurate, and the resulting model could perform better, but its level of detail is reflective of the effort required to create a reproducible data science solution that is (1) well aligned with stakeholder expectations and (2) poised to deliver business value. We could continue to iterate and improve our results ad infinitum in the pursuit of the perfect solution but in the real world, we are constrained by budgets and so by time. It's clear that even if we had unlimited budget to refine the analysis and model, our early and fast efforts will likely be satisfactory enough to deploy a data solution, allowing us to move on and pursue a different project. From both business and engineering perspectives, it's generally best to start with the [simplest thing that could possibly work](http://www.agilenutshell.com/simplest_thing), especially if it's relatively cheap to build. 12 | 13 | ## Overview 14 | This project uses the Bank Marketing Dataset published to the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/bank+marketing) to demonstrate the scoping and implementation of a real-world data science project. The dataset's description on the UCI Machine Learning website states: 15 | 16 | > The classification goal is to predict if the client will subscribe a term deposit (variable y). 17 | 18 | With a hypothetical business context and feedback from stakeholders regarding various aspects of a marketing problem, we can arrive at a more complete and operational description of our problem, based on the [CoNVO](https://www.oreilly.com/library/view/thinking-with-data/9781491949757/ch01.html) framework: 19 | 20 | ### Context 21 | This Portuguese bank offers term deposits as a financial product offering to its existing customers as a means to provide capital for loan products.
It is currently running a telemarketing campaign in which existing customers are pitched on the term deposit product during inbound customer service and outbound sales calls. The decision maker for this campaign is the director of marketing. 22 | 23 | ### Needs 24 | A tactical change in product strategy as a result of changing market conditions (lowered interest rates) requires additional budget resources to jumpstart a new campaign for a different financial product – mortgages. The bank figures that if it can improve its conversion rate with its current campaign, then it will be in a better position to spend the remaining budget more efficiently, leading to higher revenues and more funds available for the mortgage campaign. 25 | 26 | The director of marketing is also interested in quantifying the effect of this modelling effort to ensure that data science efforts are making an impact and, if successful, making it easier to get buy-in to allocate resources to data-driven efforts in the future. 27 | 28 | ### Vision 29 | We propose to create a lead scoring model that will rank customers according to a model score that indicates how likely customers are to invest in a term deposit. By focussing on the customers who are most likely to convert, the bank will earn more per marketing dollar spent on customer specialists who are pitching to customers. To this end, we will deliver a ranked list of contacts in an Excel spreadsheet that the director of marketing can hand off to her management team for operationalization. 30 | 31 | Quantifying the impact of the campaign empirically will require a controlled experiment that is designed before, and executed when, the campaign is deployed. Two weeks prior to model deployment, we’ll deliver a report outlining the experimental design and two weeks following the conclusion of the campaign, we’ll deliver a report summarizing the results.
32 | 33 | ### Outcome 34 | If the bank observes a significant increase in ROI as a result of the development and deployment of the lead scoring model, the engineering team will be tasked with integrating the model into a CRM system, making the deployment of future models faster and cheaper. 35 | 36 | ## Key Insights 37 | 38 | * The conversion rate of the previous marketing campaign was 25%. 39 | 40 | * The conversion rate of the current marketing campaign is 11%. 41 | 42 | * A simple "domain-driven" model that chooses customers to target based on whether they converted in the last marketing campaign and chooses remaining customers randomly improved the current conversion rate by 250% - bringing the conversion rate of targeted customers to 39%. 43 | 44 | * A Naive Bayes model using only categorical features from the original training set improved the baseline conversion rate by 354% - bringing the conversion rate of targeted customers to 51%. 45 | 46 | * With the Naive Bayes model, targeting 6.6% of the population identifies ~30% of the respondents. 47 | 48 | ## Project Structure 49 | 50 | ### data 51 | 52 | The data retrieved from the UCI Machine Learning Repository as well as training and test sets. 53 | 54 | * `bank-additional/` 55 | * [Original dataset and supporting documentation](https://archive.ics.uci.edu/ml/datasets/bank+marketing) retrieved from the UCI Machine Learning Repository on December 1, 2018. 56 | 57 | * `train.csv` 58 | * The dataset used for exploratory analysis and to train machine learning models 59 | 60 | * `test.csv` 61 | * The dataset used to evaluate the optimal model 62 | 63 | ### docs 64 | 65 | Project documentation including the project context and scope, financial models, and an articulation of the solution. All of the documentation can also be found in [this Google Drive folder](https://drive.google.com/drive/folders/1CALQVFCMiGfsMvMqmVnLJII_EXaI5Obn?usp=sharing).
66 | 67 | * `1-CoNVO.docx` ([view online in Google Docs](https://docs.google.com/document/d/1LkOGwPcyZN6hDQ-KPCfb6p9m6VegMKE3FnxdYroSYyU/edit?usp=sharing)) 68 | * A document describing the scope of the project - its Context, Needs, Vision and Outcome 69 | 70 | * `2-Problem Model.xlsx` ([view online in Google Sheets](https://drive.google.com/open?id=1rDIC9r9fduUbbMj7KvcDUYNujHfReoDB)) 71 | * A spreadsheet model of the problem dynamics including costs, benefits, campaign conversion rates, and what-if scenarios 72 | 73 | * `3-Vision, Arguments, and Results.docx` ([view online in Google Docs](https://drive.google.com/open?id=1pO5jlEivQn1SbJtorNDzHBStk1m3i0d2)) 74 | * A document articulating the refinement of the problem, establishing definitions, values, and an operational measure of success / effectiveness 75 | 76 | ### models 77 | 78 | Serialized model objects with optimized hyperparameters. 79 | 80 | * `experiment-1-model.pkl` 81 | * The scikit-learn implementation of Naive Bayes which was the optimal model of the project 82 | 83 | ### notebooks 84 | 85 | Exploratory analysis and experiment code.
86 | 87 | * `1.0-exploratory-analysis.ipynb` 88 | * Exploratory analysis of existing marketing data 89 | 90 | * `2.0-experiment-1-baseline-vs-ml.ipynb` 91 | * The first experiment designed to evaluate the difference in performance between the baseline domain-driven model and a set of machine learning models 92 | 93 | * `3.0-experiment-2-feature-engineering.ipynb` 94 | * Prototyping code for new features derived from existing variables included in the marketing dataset 95 | 96 | * `3.1-experiment-2-model.ipynb` 97 | * The second experiment designed to evaluate the effectiveness of derived features with the best performing model from experiment #1 98 | 99 | * `4.0-model-evaluation.ipynb` 100 | * Evaluation of the best performing model on the held-out test set 101 | 102 | ### support 103 | 104 | A supporting library that encapsulates ML model objects and provides tools for evaluation and hyperparameter tuning. Most of this code will be moved to a standalone library, except for: 105 | 106 | * `experiments/` 107 | * Code modules for experiment code that includes pipelines and derived features 108 | 109 | * `parameters.py` 110 | * Values for e.g.
costs, benefits, and proportion of targeted customers from the Problem Model workbook 111 | 112 | ## Dependencies 113 | 114 | * numpy==1.15.2 115 | * pandas==0.23.4 116 | * matplotlib==2.2.2 117 | * seaborn==0.9.0 118 | * scikit-learn==0.20.0 119 | 120 | ## License 121 | The MIT License (MIT) 122 | 123 | Copyright (c) 2018 Calvin De Lima 124 | 125 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 126 | 127 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 128 | 129 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /notebooks/3.0-experiment-2-feature-engineering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Overview\n", 8 | "\n", 9 | "The purpose of this notebook is to improve the performance of the baseline model by creating derived features for the Naive Bayes classifier. 
Since the BernoulliNB classifier can only handle categorical attributes, we'll derive categorical features from numeric ones and bin some of the informative attributes using feature selection to identify the best binning strategy.\n", 10 | "\n", 11 | "### Existing Features\n", 12 | "\n", 13 | "#### Categorical Attributes\n", 14 | "* Most categorical attributes are informative of the outcome of the marketing campaign except for `marital` and `day_of_week`\n", 15 | "* Several attributes contain missing values but there are not any a-priori reasons to treat the missing values differently from regular values. \n", 16 | "* An additional indicator that encodes whether a customer was contacted prior to the current campaign may improve predictive performance as `poutcome` values of 'success' and 'failure' have higher response rates than the 'nonexistent' value\n", 17 | "\n", 18 | "#### Numeric Attributes\n", 19 | "* `age` outliers include people over the age of ~62\n", 20 | " * Across all ages, customers who were contacted more than ten times were not likely to respond to the campaign\n", 21 | " * Younger and older customers tend to convert more than middle-aged customers\n", 22 | "* `pdays` uses a very large number (999) to encode missing values.
Non-missing instances are seven times more likely to convert than missing instances\n", 23 | "* `previous` values are finite - can be converted to a categorical variable / binned\n", 24 | "* Socioeconomic distributions are sparse\n", 25 | "* Strong correlation between\n", 26 | " * `emp.var.rate` and `cons.price.idx`\n", 27 | " * `emp.var.rate` and `euribor3m`\n", 28 | " * `emp.var.rate` and `nr.employed`\n", 29 | " * `nr.employed` and `euribor3m`\n", 30 | "\n", 31 | "\n", 32 | "### New Features\n", 33 | "* Create an indicator of whether a customer was previously contacted as part of a previous campaign (`pdays`)\n", 34 | "* Create an indicator of whether a customer was part of a previous campaign (`poutcome`)\n", 35 | "* Transform `previous` to a categorical variable\n", 36 | "* Apply binning to:\n", 37 | " * log(`age`)\n", 38 | " * `campaign`" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 89, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, KBinsDiscretizer\n", 48 | "from sklearn.compose import ColumnTransformer\n", 49 | "from sklearn.pipeline import Pipeline, FeatureUnion\n", 50 | "from sklearn.base import clone\n", 51 | "from utils import code\n", 52 | "\n", 53 | "import pandas as pd\n", 54 | "import numpy as np\n", 55 | "\n", 56 | "%matplotlib inline" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 2, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/html": [ 67 | "
\n", 68 | "\n", 81 | "\n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | "
agejobmaritaleducationdefaulthousingloancontactmonthday_of_week...campaignpdayspreviouspoutcomeemp.var.ratecons.price.idxcons.conf.idxeuribor3mnr.employedy
025blue-collarsinglehigh.schoolnoyesnocellularjulmon...29990nonexistent1.493.918-42.74.9625228.1no
140admin.marriedhigh.schoolnononotelephonejunthu...19990nonexistent1.494.465-41.84.9585228.1no
251technicianmarrieduniversity.degreenoyesnocellularaugwed...59990nonexistent1.493.444-36.14.9645228.1yes
337blue-collarmarriedhigh.schoolunknownyesnocellularjultue...19990nonexistent1.493.918-42.74.9615228.1no
454housemaidmarrieduniversity.degreeunknownyesnocellularaugthu...19990nonexistent1.493.444-36.14.9635228.1no
\n", 231 | "

5 rows × 21 columns

\n", 232 | "
" 233 | ], 234 | "text/plain": [ 235 | " age job marital education default housing loan \\\n", 236 | "0 25 blue-collar single high.school no yes no \n", 237 | "1 40 admin. married high.school no no no \n", 238 | "2 51 technician married university.degree no yes no \n", 239 | "3 37 blue-collar married high.school unknown yes no \n", 240 | "4 54 housemaid married university.degree unknown yes no \n", 241 | "\n", 242 | " contact month day_of_week ... campaign pdays previous poutcome \\\n", 243 | "0 cellular jul mon ... 2 999 0 nonexistent \n", 244 | "1 telephone jun thu ... 1 999 0 nonexistent \n", 245 | "2 cellular aug wed ... 5 999 0 nonexistent \n", 246 | "3 cellular jul tue ... 1 999 0 nonexistent \n", 247 | "4 cellular aug thu ... 1 999 0 nonexistent \n", 248 | "\n", 249 | " emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y \n", 250 | "0 1.4 93.918 -42.7 4.962 5228.1 no \n", 251 | "1 1.4 94.465 -41.8 4.958 5228.1 no \n", 252 | "2 1.4 93.444 -36.1 4.964 5228.1 yes \n", 253 | "3 1.4 93.918 -42.7 4.961 5228.1 no \n", 254 | "4 1.4 93.444 -36.1 4.963 5228.1 no \n", 255 | "\n", 256 | "[5 rows x 21 columns]" 257 | ] 258 | }, 259 | "execution_count": 2, 260 | "metadata": {}, 261 | "output_type": "execute_result" 262 | } 263 | ], 264 | "source": [ 265 | "bank = pd.read_csv('../data/train.csv')\n", 266 | "bank.head()" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 3, 272 | "metadata": {}, 273 | "outputs": [], 274 | "source": [ 275 | "X = bank.drop('y', axis=1)\n", 276 | "y = bank.y" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "## Computations" 284 | ] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "metadata": {}, 289 | "source": [ 290 | "### Was the customer contacted about the last campaign?" 
291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 7, 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "def ft_pcontacted_last_campaign(X):\n", 300 | " pcontacted = ~(X == 999)\n", 301 | " return pcontacted.values.reshape(-1,1)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 8, 307 | "metadata": {}, 308 | "outputs": [ 309 | { 310 | "data": { 311 | "text/plain": [ 312 | "True" 313 | ] 314 | }, 315 | "execution_count": 8, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "add_pcontacted_last_campaign = FunctionTransformer(ft_pcontacted_last_campaign, validate=False)\n", 322 | "add_pcontacted_last_campaign_ct = ColumnTransformer([('add_pcontacted', add_pcontacted_last_campaign, 'pdays')])\n", 323 | "\n", 324 | "pcontacted_last_campaign = add_pcontacted_last_campaign_ct.fit_transform(X)\n", 325 | "\n", 326 | "# Make sure the resulting values are correct\n", 327 | "all(X.loc[np.where(pcontacted_last_campaign.ravel() == True)[0], ['pdays']] == '999')" 328 | ] 329 | }, 330 | { 331 | "cell_type": "markdown", 332 | "metadata": {}, 333 | "source": [ 334 | "### Was the customer part of the last campaign?" 
335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 9, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "def ft_pcampaign(X):\n", 344 | " pcampaign = ~(X == 'nonexistent')\n", 345 | " return pcampaign.values.reshape(-1,1)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 10, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "text/plain": [ 356 | "True" 357 | ] 358 | }, 359 | "execution_count": 10, 360 | "metadata": {}, 361 | "output_type": "execute_result" 362 | } 363 | ], 364 | "source": [ 365 | "add_pcampaign = FunctionTransformer(ft_pcampaign, validate=False)\n", 366 | "add_pcampaign_ct = ColumnTransformer([('add_poutcome', add_pcampaign, 'poutcome')])\n", 367 | "\n", 368 | "pcampaign = add_pcampaign_ct.fit_transform(X)\n", 369 | "\n", 370 | "# Make sure the resulting values are correct\n", 371 | "all(X.loc[np.where(pcampaign.ravel() == True)[0], ['poutcome']] != 'nonexistent')" 372 | ] 373 | }, 374 | { 375 | "cell_type": "markdown", 376 | "metadata": {}, 377 | "source": [ 378 | "### Was the customer contacted more than ten times?" 
379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 11, 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "def ft_campaign_gte10(X):\n", 388 | " campaign_gte10 = X >= 10\n", 389 | " return campaign_gte10.values.reshape(-1,1)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 12, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "True" 401 | ] 402 | }, 403 | "execution_count": 12, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "add_campaign_gte10 = FunctionTransformer(ft_campaign_gte10, validate=False)\n", 410 | "add_campaign_gte10_ct = ColumnTransformer([('campaign_gte10', add_campaign_gte10, 'campaign')])\n", 411 | "\n", 412 | "campaign_gte10 = add_campaign_gte10_ct.fit_transform(X)\n", 413 | "\n", 414 | "# Make sure the resulting values are correct\n", 415 | "all(X.loc[np.where(campaign_gte10.ravel() == True)[0], ['campaign']] >= 10)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": {}, 421 | "source": [ 422 | "### Transform previous to categorical" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": 13, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "def ft_previous(X):\n", 432 | " previous = X.astype(str)\n", 433 | " return previous.values.reshape(-1,1)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 14, 439 | "metadata": {}, 440 | "outputs": [ 441 | { 442 | "data": { 443 | "text/plain": [ 444 | "True" 445 | ] 446 | }, 447 | "execution_count": 14, 448 | "metadata": {}, 449 | "output_type": "execute_result" 450 | } 451 | ], 452 | "source": [ 453 | "add_previous = FunctionTransformer(ft_previous, validate=False)\n", 454 | "add_previous_ct = ColumnTransformer([('add_previous', add_previous, 'previous')])\n", 455 | "\n", 456 | "previous = add_previous_ct.fit_transform(X)\n", 457 | "\n", 458 | "# Make 
sure the resulting values are correct\n", 459 | "all(previous.astype(int).ravel() == X.previous)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "markdown", 464 | "metadata": {}, 465 | "source": [ 466 | "### Apply binning as a hyperparameter" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "Apply a log transform to `age` to transform the distribution" 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": 30, 479 | "metadata": {}, 480 | "outputs": [ 481 | { 482 | "data": { 483 | "image/png": "\n", 484 | "text/plain": [ 485 | "
" 486 | ] 487 | }, 488 | "metadata": {}, 489 | "output_type": "display_data" 490 | } 491 | ], 492 | "source": [ 493 | "np.log(X[['age', 'campaign']]).hist(figsize=(10,4));" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 69, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "data": { 503 | "text/plain": [ 504 | "<32950x10 sparse matrix of type ''\n", 505 | "\twith 65900 stored elements in Compressed Sparse Row format>" 506 | ] 507 | }, 508 | "execution_count": 69, 509 | "metadata": {}, 510 | "output_type": "execute_result" 511 | } 512 | ], 513 | "source": [ 514 | "binning_pipeline = Pipeline([\n", 515 | " ('log', FunctionTransformer(np.log, validate=True)),\n", 516 | " ('kbins', KBinsDiscretizer())\n", 517 | "])\n", 518 | "\n", 519 | "age_campaign_ct = ColumnTransformer([\n", 520 | " ('age_pipeline', clone(binning_pipeline), ['age']),\n", 521 | " ('campaign_pipeline', clone(binning_pipeline), ['campaign'])\n", 522 | "])\n", 523 | "age_campaign_ct.fit_transform(X)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "markdown", 528 | "metadata": {}, 529 | "source": [ 530 | "Apply a log transform to `campaign` to transform the distribution" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "## Pipeline" 538 | ] 539 | }, 540 | { 541 | "cell_type": "markdown", 542 | "metadata": {}, 543 | "source": [ 544 | "* Apply column transformers (categorical)\n", 545 | "* Apply column transformers (numeric)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": 70, 551 | "metadata": {}, 552 | "outputs": [ 553 | { 554 | "data": { 555 | "text/plain": [ 556 | "Pipeline(memory=None,\n", 557 | " steps=[('cat_ct', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,\n", 558 | " transformer_weights=None,\n", 559 | " transformers=[('pcontacted_last_campaign', FunctionTransformer(accept_sparse=False, check_inverse=True,\n", 560 | " func=, handle_unknown='ignore',\n", 562 | 
" n_values=None, sparse=True))])" 563 | ] 564 | }, 565 | "execution_count": 70, 566 | "metadata": {}, 567 | "output_type": "execute_result" 568 | } 569 | ], 570 | "source": [ 571 | "# Create the transformers for categorical features\n", 572 | "new_cat_features = [\n", 573 | " ('pcontacted_last_campaign', add_pcontacted_last_campaign, 'pdays'),\n", 574 | " ('poutcome', add_pcampaign, 'poutcome'),\n", 575 | " ('previous', add_previous, 'previous'),\n", 576 | " ('campaign_gte10', add_campaign_gte10, 'campaign')\n", 577 | "]\n", 578 | "cat_ct = ColumnTransformer(new_cat_features)\n", 579 | "\n", 580 | "# Create the pipeline to transform categorical features\n", 581 | "cat_pipeline = Pipeline([\n", 582 | " ('cat_ct', cat_ct),\n", 583 | " ('ohe', OneHotEncoder(handle_unknown='ignore'))\n", 584 | "])\n", 585 | "cat_pipeline.fit(X)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 71, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "data": { 595 | "text/plain": [ 596 | "(32950, 4)" 597 | ] 598 | }, 599 | "execution_count": 71, 600 | "metadata": {}, 601 | "output_type": "execute_result" 602 | } 603 | ], 604 | "source": [ 605 | "cat_ct.fit_transform(X).shape" 606 | ] 607 | }, 608 | { 609 | "cell_type": "code", 610 | "execution_count": 72, 611 | "metadata": {}, 612 | "outputs": [ 613 | { 614 | "data": { 615 | "text/plain": [ 616 | "(32950, 24)" 617 | ] 618 | }, 619 | "execution_count": 72, 620 | "metadata": {}, 621 | "output_type": "execute_result" 622 | } 623 | ], 624 | "source": [ 625 | "# Create the feature union of categorical and numeric attributes\n", 626 | "ft_union = FeatureUnion([\n", 627 | " ('cat_pipeline', cat_pipeline),\n", 628 | " ('num_pipeline', age_campaign_ct)\n", 629 | "])\n", 630 | "\n", 631 | "ft_union.fit(X)\n", 632 | "features = ft_union.transform(X)\n", 633 | "features.shape" 634 | ] 635 | }, 636 | { 637 | "cell_type": "markdown", 638 | "metadata": {}, 639 | "source": [ 640 | "Create categorical feature names 
post-transformation" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": 75, 646 | "metadata": {}, 647 | "outputs": [ 648 | { 649 | "data": { 650 | "text/plain": [ 651 | "['pcontacted_last_campaign_False',\n", 652 | " 'pcontacted_last_campaign_True',\n", 653 | " 'previous_False',\n", 654 | " 'previous_True',\n", 655 | " 'poutcome_0',\n", 656 | " 'poutcome_1',\n", 657 | " 'poutcome_2',\n", 658 | " 'poutcome_3',\n", 659 | " 'poutcome_4',\n", 660 | " 'poutcome_5',\n", 661 | " 'poutcome_6',\n", 662 | " 'poutcome_7',\n", 663 | " 'campaign_gte10_False',\n", 664 | " 'campaign_gte10_True']" 665 | ] 666 | }, 667 | "execution_count": 75, 668 | "metadata": {}, 669 | "output_type": "execute_result" 670 | } 671 | ], 672 | "source": [ 673 | "# Add column names\n", 674 | "feature_names = [\n", 675 | " # Don't incclude the last entry in the `named_transformers_` list since\n", 676 | " # it's the `remainder` parameter for the ColumnTransformer\n", 677 | " ['%s_%s' % (name, value) for value in values] for \n", 678 | " name, values in list(zip(list(cat_ct.named_transformers_.keys())[1:], cat_pipeline.named_steps['ohe'].categories_))]\n", 679 | "\n", 680 | "cat_feature_names = [name for names in feature_names for name in names]\n", 681 | "cat_feature_names" 682 | ] 683 | }, 684 | { 685 | "cell_type": "markdown", 686 | "metadata": {}, 687 | "source": [ 688 | "Create numeric feature names post-transformation" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 87, 694 | "metadata": {}, 695 | "outputs": [], 696 | "source": [ 697 | "# TODO\n", 698 | "\n", 699 | "# numeric_feature_names = age_campaign_ct.transformers_[0][2]\n", 700 | "# p = age_campaign_ct.named_transformers_['age_pipeline']\n", 701 | "# p2 = p.named_steps['kbins']\n", 702 | "# p2.n_bins_" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 85, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "# features_df = 
pd.DataFrame(features.todense(), columns=cat_feature_names + numeric_feature_names)\n", 712 | "# features_df.head()" 713 | ] 714 | }, 715 | { 716 | "cell_type": "markdown", 717 | "metadata": {}, 718 | "source": [ 719 | "Try the file we created as code" 720 | ] 721 | }, 722 | { 723 | "cell_type": "code", 724 | "execution_count": 90, 725 | "metadata": {}, 726 | "outputs": [], 727 | "source": [ 728 | "%load_ext autoreload\n", 729 | "%autoreload 2\n", 730 | "\n", 731 | "from support.experiments import experiment_2" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 94, 737 | "metadata": {}, 738 | "outputs": [], 739 | "source": [ 740 | "pipeline = experiment_2.get_pipeline()\n", 741 | "assert features.shape == pipeline.fit_transform(X).shape" 742 | ] 743 | } 744 | ], 745 | "metadata": { 746 | "kernelspec": { 747 | "display_name": "Python 3", 748 | "language": "python", 749 | "name": "python3" 750 | }, 751 | "language_info": { 752 | "codemirror_mode": { 753 | "name": "ipython", 754 | "version": 3 755 | }, 756 | "file_extension": ".py", 757 | "mimetype": "text/x-python", 758 | "name": "python", 759 | "nbconvert_exporter": "python", 760 | "pygments_lexer": "ipython3", 761 | "version": "3.5.5" 762 | } 763 | }, 764 | "nbformat": 4, 765 | "nbformat_minor": 2 766 | } 767 | -------------------------------------------------------------------------------- /notebooks/4.0-model-evaluation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Model Evaluation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Use the best model from the experiments to estimate generalization performance on the test set." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 62, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stdout", 24 | "output_type": "stream", 25 | "text": [ 26 | "The autoreload extension is already loaded. To reload it, use:\n", 27 | " %reload_ext autoreload\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "%load_ext autoreload\n", 33 | "%autoreload 2\n", 34 | "\n", 35 | "from utils import code" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 140, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import pandas as pd\n", 45 | "import numpy as np\n", 46 | "import scikitplot as skplt\n", 47 | "import matplotlib.pyplot as plt\n", 48 | "\n", 49 | "from sklearn.metrics import classification_report, confusion_matrix\n", 50 | "\n", 51 | "from support.model import Model\n", 52 | "from support.datasets import get_data\n", 53 | "from support.experiments import get_scorer\n", 54 | "from support.experiments.experiment_1 import CATEGORICAL_FEATURES, baseline_model_predictions\n", 55 | "from support.parameters import P_TARGETED, AVG_COST, AVG_REVENUE\n", 56 | "\n", 57 | "%matplotlib inline" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Model" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "Load the best performing model" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 84, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "tuned_model = Model.load('../models/experiment-1-model.pkl')\n", 81 | "model = Model(tuned_model.name, tuned_model.model, tuned_model.pipeline)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "Load the training and test sets" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 85, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "X_train, y_train = 
get_data('../data/train.csv')\n", 98 | "X_test, y_test = get_data('../data/test.csv')" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 128, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "Number of training instances: 32,950\n", 111 | "Number of test instances: 8,238\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "print('Number of training instances: {:,}'.format(len(X_train)))\n", 117 | "print('Number of test instances: {:,}'.format(len(X_test)))" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "Build the model using the training set" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 86, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "" 136 | ] 137 | }, 138 | "execution_count": 86, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "model.train(X_train, y_train)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## Model Performance" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 95, 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "Number of targeted customers: 543\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "n_targeted = int(len(X_test) * P_TARGETED)\n", 169 | "print('Number of targeted customers:', n_targeted)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": {}, 175 | "source": [ 176 | "Predict and score the test set" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 96, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "data": { 186 | "text/plain": [ 187 | "0.6560871090617483" 188 | ] 189 | }, 190 | "execution_count": 96, 191 | "metadata": {}, 192 | 
"output_type": "execute_result" 193 | } 194 | ], 195 | "source": [ 196 | "scorer = get_scorer()\n", 197 | "model.score(X_test, y_test, scorer)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "We're off by about 1.2% from our cross validation results which is well within two standard deviations of the results." 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "## Financial Performance" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "Evaluate financial performance on baseline" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 126, 224 | "metadata": {}, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "Baseline profit: $227,418\n", 231 | "Baseline conversion rate: 0.39\n", 232 | "Lift over random model: 3.5\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "baseline_preds = baseline_model_predictions(X_test, y_test, n_targeted)\n", 238 | "baseline_outcomes = baseline_preds.apply(lambda x: AVG_COST if x == 0 else AVG_COST + AVG_REVENUE)\n", 239 | "baseline_profit = sum(baseline_outcomes)\n", 240 | "baseline_rate = baseline_preds.sum() / len(baseline_preds)\n", 241 | "base_rate = y_test.sum() / len(y_test)\n", 242 | "\n", 243 | "print('Baseline profit: ${:,}'.format(baseline_profit))\n", 244 | "print('Baseline conversion rate: {:.2f}'.format(baseline_rate))\n", 245 | "print('Lift over random model: {:.1f}'.format(baseline_rate / base_rate))" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 98, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "X_test_trans = model.pipeline.transform(X_test)\n", 255 | "preds = model.model.predict(X_test_trans)\n", 256 | "probs = model.model.predict_proba(X_test_trans)" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 111, 
262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "name": "stdout", 266 | "output_type": "stream", 267 | "text": [ 268 | "Model profit: $296,730\n", 269 | "Model lift: 1.3 or $69,312\n", 270 | "Targeted conversion rate: 0.51\n", 271 | "Conversion rate lift: 4.54\n" 272 | ] 273 | } 274 | ], 275 | "source": [ 276 | "# Create a dataframe of probabilities and actual / predicted outcomes\n", 277 | "probs_df = pd.DataFrame(np.hstack([probs, y_test.values.reshape(-1,1), preds.reshape(-1,1)]), columns=['p_no', 'p_yes', 'actual', 'predicted'])\n", 278 | "\n", 279 | "# Sort customers by the probability that they will convert\n", 280 | "model_targets = probs_df.sort_values('p_yes', ascending=False)\n", 281 | "\n", 282 | "# Take the top N\n", 283 | "model_targets = model_targets.head(n_targeted)\n", 284 | "\n", 285 | "# Calculate financial outcomes\n", 286 | "model_outcomes = model_targets.actual.apply(lambda x: AVG_COST if x == 0 else AVG_COST + AVG_REVENUE)\n", 287 | "model_profit = sum(model_outcomes)\n", 288 | "model_conv_rate = model_targets.actual.sum() / len(model_targets)\n", 289 | "\n", 290 | "print('Model profit: ${:,}'.format(model_profit))\n", 291 | "print('Model lift: {:.1f} or ${:,}'.format(model_profit / baseline_profit, model_profit - baseline_profit))\n", 292 | "print('Targeted conversion rate: {:.2f}'.format(model_conv_rate))\n", 293 | "print('Conversion rate lift: {:.2f}'.format(model_conv_rate / base_rate))" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Our model produces the expected lift (see 2.0-experiment-1-baseline)." 
301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 112, 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | " precision recall f1-score support\n", 313 | "\n", 314 | " 0.0 0.00 0.00 0.00 265\n", 315 | " 1.0 0.51 1.00 0.68 278\n", 316 | "\n", 317 | " micro avg 0.51 0.51 0.51 543\n", 318 | " macro avg 0.26 0.50 0.34 543\n", 319 | "weighted avg 0.26 0.51 0.35 543\n", 320 | "\n" 321 | ] 322 | }, 323 | { 324 | "name": "stderr", 325 | "output_type": "stream", 326 | "text": [ 327 | "/anaconda3/envs/py35-ds/lib/python3.5/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", 328 | " 'precision', 'predicted', average, warn_for)\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "print(classification_report(model_targets.actual, model_targets.predicted))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 148, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "Cost of false positives: $2,224\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "print('Cost of false positives: ${:,}'.format(278*8))" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "Given the negligible cost of false positives, we can afford to miss all of the true negatives in the targeted sample - customers who did not end up converting." 
358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 147, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "name": "stderr", 367 | "output_type": "stream", 368 | "text": [ 369 | "/anaconda3/envs/py35-ds/lib/python3.5/site-packages/matplotlib/cbook/deprecation.py:107: MatplotlibDeprecationWarning: Passing one of 'on', 'true', 'off', 'false' as a boolean is deprecated; use an actual boolean (True/False) instead.\n", 370 | " warnings.warn(message, mplDeprecation, stacklevel=1)\n" 371 | ] 372 | }, 373 | { 374 | "data": { 375 | "image/png": "\n", 376 | "text/plain": [ 377 | "
" 378 | ] 379 | }, 380 | "metadata": {}, 381 | "output_type": "display_data" 382 | } 383 | ], 384 | "source": [ 385 | "skplt.metrics.plot_cumulative_gain(y_test, probs);\n", 386 | "plt.axvline(.066, color='red');" 387 | ] 388 | } 389 | ], 390 | "metadata": { 391 | "kernelspec": { 392 | "display_name": "Python 3", 393 | "language": "python", 394 | "name": "python3" 395 | }, 396 | "language_info": { 397 | "codemirror_mode": { 398 | "name": "ipython", 399 | "version": 3 400 | }, 401 | "file_extension": ".py", 402 | "mimetype": "text/x-python", 403 | "name": "python", 404 | "nbconvert_exporter": "python", 405 | "pygments_lexer": "ipython3", 406 | "version": "3.5.5" 407 | } 408 | }, 409 | "nbformat": 4, 410 | "nbformat_minor": 2 411 | } 412 | -------------------------------------------------------------------------------- /notebooks/3.1-experiment-2-model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Experiment #2 - Derived Features" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Overview\n", 15 | "\n", 16 | "The purpose of this experiment is to determine whether derived features improve model performance. 
As part of the feature engineering process, we added the following categorical features:\n", 17 | "\n", 18 | "* `pcontacted_last_campaign` - whether the customer was previously contacted as part of a campaign\n", 19 | "* `pcampaign` - whether the customer was part of the previous campaign\n", 20 | "* `previous` - the original `previous` attribute converted to a discrete value\n", 21 | "* `campaign_gte10` - whether the customer was contacted 10 or more times as part of this campaign" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 232, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "The autoreload extension is already loaded. To reload it, use:\n", 34 | " %reload_ext autoreload\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "%load_ext autoreload\n", 40 | "%autoreload 2\n", 41 | "\n", 42 | "from utils import code" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 233, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "import pandas as pd\n", 52 | "import numpy as np\n", 53 | "import scikitplot as skplt\n", 54 | "\n", 55 | "# Preprocessing\n", 56 | "from sklearn.compose import ColumnTransformer\n", 57 | "from sklearn.pipeline import Pipeline, FeatureUnion\n", 58 | "from sklearn.feature_selection import mutual_info_classif, chi2\n", 59 | "\n", 60 | "# Model evaluation\n", 61 | "from sklearn.model_selection import train_test_split\n", 62 | "from support.evaluation import plot_learning_curve, evaluate_model\n", 63 | "\n", 64 | "# Support\n", 65 | "from support.model import Model, build_tuned_model\n", 66 | "from support.datasets import get_data\n", 67 | "from support.experiments import experiment_1, experiment_2, get_scorer\n", 68 | "from support import parameters as params\n", 69 | "\n", 70 | "# Algos\n", 71 | "from sklearn.naive_bayes import BernoulliNB\n", 72 | "from sklearn.ensemble import ExtraTreesClassifier\n", 73 | "\n", 74 | 
"%matplotlib inline" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "## Data" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 234, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "data": { 91 | "text/html": [ 92 | "
\n", 93 | "\n", 106 | "\n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | "
agejobmaritaleducationdefaulthousingloancontactmonthday_of_weekdurationcampaignpdayspreviouspoutcomeemp.var.ratecons.price.idxcons.conf.idxeuribor3mnr.employed
025blue-collarsinglehigh.schoolnoyesnocellularjulmon61929990nonexistent1.493.918-42.74.9625228.1
140admin.marriedhigh.schoolnononotelephonejunthu9719990nonexistent1.494.465-41.84.9585228.1
251technicianmarrieduniversity.degreenoyesnocellularaugwed51259990nonexistent1.493.444-36.14.9645228.1
337blue-collarmarriedhigh.schoolunknownyesnocellularjultue42319990nonexistent1.493.918-42.74.9615228.1
454housemaidmarrieduniversity.degreeunknownyesnocellularaugthu29719990nonexistent1.493.444-36.14.9635228.1
\n", 250 | "
" 251 | ], 252 | "text/plain": [ 253 | " age job marital education default housing loan \\\n", 254 | "0 25 blue-collar single high.school no yes no \n", 255 | "1 40 admin. married high.school no no no \n", 256 | "2 51 technician married university.degree no yes no \n", 257 | "3 37 blue-collar married high.school unknown yes no \n", 258 | "4 54 housemaid married university.degree unknown yes no \n", 259 | "\n", 260 | " contact month day_of_week duration campaign pdays previous \\\n", 261 | "0 cellular jul mon 619 2 999 0 \n", 262 | "1 telephone jun thu 97 1 999 0 \n", 263 | "2 cellular aug wed 512 5 999 0 \n", 264 | "3 cellular jul tue 423 1 999 0 \n", 265 | "4 cellular aug thu 297 1 999 0 \n", 266 | "\n", 267 | " poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m \\\n", 268 | "0 nonexistent 1.4 93.918 -42.7 4.962 \n", 269 | "1 nonexistent 1.4 94.465 -41.8 4.958 \n", 270 | "2 nonexistent 1.4 93.444 -36.1 4.964 \n", 271 | "3 nonexistent 1.4 93.918 -42.7 4.961 \n", 272 | "4 nonexistent 1.4 93.444 -36.1 4.963 \n", 273 | "\n", 274 | " nr.employed \n", 275 | "0 5228.1 \n", 276 | "1 5228.1 \n", 277 | "2 5228.1 \n", 278 | "3 5228.1 \n", 279 | "4 5228.1 " 280 | ] 281 | }, 282 | "execution_count": 234, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "X, y = get_data('../data/train.csv')\n", 289 | "X.head()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": {}, 295 | "source": [ 296 | "Create a validation set and train the model then score on the test set. The performance will be biased since we are using less training data." 
297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 235, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y, random_state=1)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "## ML Models" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 236, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "scorer = get_scorer()" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "Apply feature computations" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 242, 334 | "metadata": {}, 335 | "outputs": [ 336 | { 337 | "name": "stdout", 338 | "output_type": "stream", 339 | "text": [ 340 | "Instances: 32,950, Features: 61\n" 341 | ] 342 | } 343 | ], 344 | "source": [ 345 | "pipeline_1 = experiment_1.get_pipeline()\n", 346 | "features = pipeline_1.fit_transform(X)\n", 347 | "ps = features.shape\n", 348 | "print('Instances: {:,}, Features: {}'.format(ps[0], ps[1]))" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "### Original Model\n", 356 | "\n", 357 | "Make sure we still get the same results ~.67 AUC" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 245, 363 | "metadata": {}, 364 | "outputs": [ 365 | { 366 | "name": "stdout", 367 | "output_type": "stream", 368 | "text": [ 369 | "==> Starting 5-fold cross validation for nb model, 26360 examples\n", 370 | "==> Elapsed seconds: 3.567\n", 371 | "Best nb model: BernoulliNB(alpha=0.1, binarize=0.0, class_prior=None, fit_prior=False)\n", 372 | "Best nb score: 0.711\n" 373 | ] 374 | } 375 | ], 376 | "source": [ 377 | "# Naive Bayes\n", 378 | "param_grid = [{\n", 379 | " 'nb__alpha': [0, 0.01, 0.1, 1],\n", 380 | " 'nb__fit_prior': [True, False]\n", 381 | "}]\n", 
382 | "\n", 383 | "tuned_model = build_tuned_model('nb', BernoulliNB(), X_train, y_train, param_grid, scorer, pipeline=pipeline_1, cv_folds=5)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "### New Model\n", 391 | "\n", 392 | "How does the performance change with the new features?" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 243, 398 | "metadata": {}, 399 | "outputs": [ 400 | { 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "Instances: 32,950, Features: 67\n" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "pipeline_2 = experiment_2.get_pipeline()\n", 410 | "features = pipeline_2.fit_transform(X)\n", 411 | "ps = features.shape\n", 412 | "print('Instances: {:,}, Features: {}'.format(ps[0], ps[1]))" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 246, 418 | "metadata": {}, 419 | "outputs": [ 420 | { 421 | "name": "stdout", 422 | "output_type": "stream", 423 | "text": [ 424 | "==> Starting 5-fold cross validation for nb model, 26360 examples\n", 425 | "==> Elapsed seconds: 4.290\n", 426 | "Best nb model: BernoulliNB(alpha=0, binarize=0.0, class_prior=None, fit_prior=False)\n", 427 | "Best nb score: 0.658\n" 428 | ] 429 | }, 430 | { 431 | "name": "stderr", 432 | "output_type": "stream", 433 | "text": [ 434 | "/anaconda3/envs/py35-ds/lib/python3.5/site-packages/sklearn/naive_bayes.py:480: UserWarning: alpha too small will result in numeric errors, setting alpha = 1.0e-10\n", 435 | " 'setting alpha = %.1e' % _ALPHA_MIN)\n" 436 | ] 437 | } 438 | ], 439 | "source": [ 440 | "tuned_model = build_tuned_model('nb', BernoulliNB(), X_train, y_train, param_grid, scorer, pipeline=pipeline_2, cv_folds=5)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 247, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "n_targeted_test = int(params.P_TARGETED * len(X_test))\n", 450 | "revenue = 
params.AVG_REVENUE\n", 451 | "cost = params.AVG_COST\n", 452 | "\n", 453 | "probs = tuned_model.model.predict_proba(tuned_model.pipeline.transform(X_test))\n", 454 | "preds = tuned_model.model.predict(tuned_model.pipeline.transform(X_test))\n", 455 | "\n", 456 | "# Create a dataframe of probabilities and actual / predicted outcomes\n", 457 | "probs_df = pd.DataFrame(np.hstack([probs, y_test.values.reshape(-1,1), preds.reshape(-1,1)]), columns=['p_no', 'p_yes', 'actual', 'predicted'])\n", 458 | "\n", 459 | "# Sort customers by the probability that they will convert\n", 460 | "model_targets = probs_df.sort_values('p_yes', ascending=False)\n", 461 | "\n", 462 | "# Take the top 6.6%\n", 463 | "model_targets = model_targets.head(n_targeted_test)\n", 464 | "\n", 465 | "# Calculate financial outcomes\n", 466 | "model_outcomes = model_targets.actual.apply(lambda x: cost if x == 0 else cost + revenue)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 248, 472 | "metadata": {}, 473 | "outputs": [ 474 | { 475 | "name": "stdout", 476 | "output_type": "stream", 477 | "text": [ 478 | " precision recall f1-score support\n", 479 | "\n", 480 | " 0.0 0.00 0.00 0.00 228\n", 481 | " 1.0 0.47 1.00 0.64 206\n", 482 | "\n", 483 | " micro avg 0.47 0.47 0.47 434\n", 484 | " macro avg 0.24 0.50 0.32 434\n", 485 | "weighted avg 0.23 0.47 0.31 434\n", 486 | "\n", 487 | "Expected profit: $219,626\n" 488 | ] 489 | }, 490 | { 491 | "name": "stderr", 492 | "output_type": "stream", 493 | "text": [ 494 | "/anaconda3/envs/py35-ds/lib/python3.5/site-packages/sklearn/metrics/classification.py:1143: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n", 495 | " 'precision', 'predicted', average, warn_for)\n" 496 | ] 497 | } 498 | ], 499 | "source": [ 500 | "print(classification_report(model_targets.actual, model_targets.predicted))\n", 501 | "print('Expected profit: ${:,}'.format(sum(model_outcomes)))" 502 
| ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "The score is lower than the original model's. Is it a high bias or high variance problem?" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 151, 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "name": "stderr", 518 | "output_type": "stream", 519 | "text": [ 520 | "/anaconda3/envs/py35-ds/lib/python3.5/site-packages/sklearn/model_selection/_split.py:1943: FutureWarning: You should specify a value for 'cv' instead of relying on the default value. The default value will change from 3 to 5 in version 0.22.\n", 521 | " warnings.warn(CV_WARNING, FutureWarning)\n" 522 | ] 523 | }, 524 | { 525 | "data": { 526 | "image/png": "\n", 527 | "text/plain": [ 528 | "
" 529 | ] 530 | }, 531 | "metadata": {}, 532 | "output_type": "display_data" 533 | } 534 | ], 535 | "source": [ 536 | "plot_learning_curve(tuned_model.model, 'New Model Learning Curve', experiment_2.get_pipeline().fit_transform(X_train), y_train, scoring=scorer);" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "High bias, and slightly higher variance. Can we improve it with feature selection?" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "### Feature Selection" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 5, 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "data": { 560 | "text/html": [ 561 | "
\n", 562 | "\n", 575 | "\n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | "
agejobmaritaleducationdefaulthousingloancontactmonthday_of_weekdurationcampaignpdayspreviouspoutcomeemp.var.ratecons.price.idxcons.conf.idxeuribor3mnr.employed
025blue-collarsinglehigh.schoolnoyesnocellularjulmon61929990nonexistent1.493.918-42.74.9625228.1
140admin.marriedhigh.schoolnononotelephonejunthu9719990nonexistent1.494.465-41.84.9585228.1
251technicianmarrieduniversity.degreenoyesnocellularaugwed51259990nonexistent1.493.444-36.14.9645228.1
337blue-collarmarriedhigh.schoolunknownyesnocellularjultue42319990nonexistent1.493.918-42.74.9615228.1
454housemaidmarrieduniversity.degreeunknownyesnocellularaugthu29719990nonexistent1.493.444-36.14.9635228.1
\n", 719 | "
" 720 | ], 721 | "text/plain": [ 722 | " age job marital education default housing loan \\\n", 723 | "0 25 blue-collar single high.school no yes no \n", 724 | "1 40 admin. married high.school no no no \n", 725 | "2 51 technician married university.degree no yes no \n", 726 | "3 37 blue-collar married high.school unknown yes no \n", 727 | "4 54 housemaid married university.degree unknown yes no \n", 728 | "\n", 729 | " contact month day_of_week duration campaign pdays previous \\\n", 730 | "0 cellular jul mon 619 2 999 0 \n", 731 | "1 telephone jun thu 97 1 999 0 \n", 732 | "2 cellular aug wed 512 5 999 0 \n", 733 | "3 cellular jul tue 423 1 999 0 \n", 734 | "4 cellular aug thu 297 1 999 0 \n", 735 | "\n", 736 | " poutcome emp.var.rate cons.price.idx cons.conf.idx euribor3m \\\n", 737 | "0 nonexistent 1.4 93.918 -42.7 4.962 \n", 738 | "1 nonexistent 1.4 94.465 -41.8 4.958 \n", 739 | "2 nonexistent 1.4 93.444 -36.1 4.964 \n", 740 | "3 nonexistent 1.4 93.918 -42.7 4.961 \n", 741 | "4 nonexistent 1.4 93.444 -36.1 4.963 \n", 742 | "\n", 743 | " nr.employed \n", 744 | "0 5228.1 \n", 745 | "1 5228.1 \n", 746 | "2 5228.1 \n", 747 | "3 5228.1 \n", 748 | "4 5228.1 " 749 | ] 750 | }, 751 | "execution_count": 5, 752 | "metadata": {}, 753 | "output_type": "execute_result" 754 | } 755 | ], 756 | "source": [ 757 | "X.head()" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": 112, 763 | "metadata": {}, 764 | "outputs": [], 765 | "source": [ 766 | "# Create a copy of the data frame with categorical features only\n", 767 | "cat_ct = experiment_2.get_categorical_ct()\n", 768 | "X_fs = cat_ct.fit_transform(X_train)\n", 769 | "features = experiment_2.CATEGORICAL_FEATURES + experiment_2.NEW_CATEGORICAL_FEATURES\n", 770 | "X_fs_df = pd.DataFrame(X_fs, columns=features)" 771 | ] 772 | }, 773 | { 774 | "cell_type": "code", 775 | "execution_count": 221, 776 | "metadata": {}, 777 | "outputs": [ 778 | { 779 | "name": "stdout", 780 | "output_type": "stream", 781 | 
"text": [ 782 | "Number of features: 66\n" 783 | ] 784 | } 785 | ], 786 | "source": [ 787 | "ohe_features = experiment_2.get_categorical_pipeline().fit_transform(X_train, y_train)\n", 788 | "print('Number of features:', ohe_features.shape[1])" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": 211, 794 | "metadata": {}, 795 | "outputs": [ 796 | { 797 | "name": "stdout", 798 | "output_type": "stream", 799 | "text": [ 800 | "==> Starting 3-fold cross validation for nb_fs model, 26360 examples\n", 801 | "==> Elapsed seconds: 351.680\n", 802 | "Best nb_fs model: BernoulliNB(alpha=0, binarize=0.0, class_prior=None, fit_prior=False)\n", 803 | "Best nb_fs score: 0.658\n" 804 | ] 805 | }, 806 | { 807 | "name": "stderr", 808 | "output_type": "stream", 809 | "text": [ 810 | "/anaconda3/envs/py35-ds/lib/python3.5/site-packages/sklearn/naive_bayes.py:480: UserWarning: alpha too small will result in numeric errors, setting alpha = 1.0e-10\n", 811 | " 'setting alpha = %.1e' % _ALPHA_MIN)\n" 812 | ] 813 | } 814 | ], 815 | "source": [ 816 | "param_grid = [{\n", 817 | " 'nb_fs__alpha': [0, 0.01, 0.1, 1],\n", 818 | " 'nb_fs__fit_prior': [True, False],\n", 819 | " 'kbest__k': np.arange(1, ohe_features.shape[1]+1),\n", 820 | " 'kbest__score_func': [chi2, mutual_info_classif]\n", 821 | "}]\n", 822 | "\n", 823 | "ft_pipeline = Pipeline([\n", 824 | " ('cat', experiment_2.get_categorical_pipeline()),\n", 825 | " ('kbest', SelectKBest())\n", 826 | "])\n", 827 | "\n", 828 | "tuned_model = build_tuned_model('nb_fs', BernoulliNB(), X_train, y_train, param_grid, scorer, pipeline=ft_pipeline, cv_folds=3)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": 212, 834 | "metadata": {}, 835 | "outputs": [ 836 | { 837 | "data": { 838 | "text/plain": [ 839 | "('kbest', SelectKBest(k=18, score_func=))" 840 | ] 841 | }, 842 | "execution_count": 212, 843 | "metadata": {}, 844 | "output_type": "execute_result" 845 | } 846 | ], 847 | "source": [ 848 | 
"tuned_model.pipeline.steps[-1]" 849 | ] 850 | }, 851 | { 852 | "cell_type": "markdown", 853 | "metadata": {}, 854 | "source": [ 855 | "The model achieves maximum performance with 18/66 features. At this point, we could consider adding numeric features and evaluating the performance of more powerful models if stakeholders are not satisfied with a \\$56,316 lift over the baseline heuristic-based model and are willing to incur additional costs to improve the model." 856 | ] 857 | } 858 | ], 859 | "metadata": { 860 | "kernelspec": { 861 | "display_name": "Python 3", 862 | "language": "python", 863 | "name": "python3" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 3 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython3", 875 | "version": "3.5.5" 876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 2 880 | } 881 | --------------------------------------------------------------------------------