├── README.md ├── Features.py ├── .gitignore ├── AttributeRelevance.py ├── statistical_significance_of_information_value.ipynb └── telco_dataset_eda.ipynb /README.md: -------------------------------------------------------------------------------- 1 | # Churn analysis using IV and WOE in Python 2 | 3 | Attribute relevance analysis helps identify the variables that have the greatest impact on the target variable and understand the relations and logic between the most important predictors and the target variable. 4 | 5 | Information value (IV) and weight of evidence (WOE) are simple and powerful techniques for conducting attribute relevance analysis. They provide a great framework for exploratory analysis and have been used extensively in the credit risk world for several decades. 6 | 7 | This repository contains an analysis of churn at a telephone service company (using IV and WOE), a comparison of effect size and information value, and a quick tutorial on how to use the information value module (created for this analysis). 8 | 9 | 10 | 11 | Learn more from the article about this analysis: **[Churn Analysis Using Information Value and Weight of Evidence](https://towardsdatascience.com/churn-analysis-information-value-and-weight-of-evidence-6a35db8b9ec5)** 12 | 13 | ------ 14 | 15 | 16 | 17 | ### Contents 18 | 19 | [**Telco Customer Churn Analysis using IV and WOE**](telco_customer_churn_analysis.ipynb) - Being able to distinguish clients who are likely to churn is key to success and enables businesses to take appropriate actions. See what the churner profile looks like for a telco company. 20 | 21 | [**Statistical Significance of Information Value**](statistical_significance_of_information_value.ipynb) - How are differences in the distribution of 'goods' and 'bads' measured by two methods: IV & WOE and p-value & effect size? See how statistically significant the results obtained with the information value technique are. 
import pandas as pd
import scipy.stats as stats


class CategoricalFeature():
    """Categorical predictor prepared for IV / WOE analysis.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Must contain the column named by ``feature`` and a
        ``label`` column.
    feature : str
        Name of the categorical column to analyse.
    """

    def __init__(self, df, feature):
        self.df = df
        self.feature = feature

    @property
    def df_lite(self):
        """Return a new two-column frame: ``bin`` and ``label``.

        ``bin`` holds the feature values with missing entries replaced by the
        string ``'MISSING'``. The source frame ``self.df`` is left untouched
        (previously a ``bin`` column was written into the caller's frame).
        """
        df_lite = self.df[['label']].copy()
        df_lite.insert(0, 'bin', self.df[self.feature].fillna('MISSING'))
        return df_lite


class ContinuousFeature():
    """Continuous predictor that is quantile-binned for IV / WOE analysis.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data. Must contain the column named by ``feature`` and a
        ``label`` column.
    feature : str
        Name of the numeric column to analyse.

    Attributes
    ----------
    bin_min_size : int
        5% of the number of rows in ``df``; every accepted bin must hold
        more rows than this.
    """

    def __init__(self, df, feature):
        self.df = df
        self.feature = feature
        self.bin_min_size = int(len(self.df) * 0.05)

    def __generate_bins(self, bins_num):
        """Quantile-bin the feature into at most ``bins_num`` bins.

        Returns a copy of the ``feature`` and ``label`` columns with an extra
        float ``bin`` column holding the left edge of each row's quantile
        interval (NaN where the feature value is missing).
        """
        # Explicit copy: assigning into a slice of self.df is a chained
        # assignment and must never leak back into the caller's frame.
        df = self.df[[self.feature, 'label']].copy()
        quantile_bins = pd.qcut(df[self.feature], bins_num, duplicates='drop')
        df['bin'] = quantile_bins.cat \
            .rename_categories(lambda interval: interval.left) \
            .astype(float)
        return df

    def __generate_correct_bins(self, bins_max=20):
        """Search from ``bins_max`` bins downwards for an acceptable binning.

        A binning is accepted when all of the following hold:

        * the Spearman correlation between the bin edge and the per-bin sum
          of ``label`` is +/-1 (monotonic relationship),
        * every bin holds more than ``bin_min_size`` rows,
        * no bin consists only of label-1 rows or only of label-0 rows
          (WOE is undefined for such a bin).

        If no binning qualifies, the last one tried (two bins) is returned.
        """
        df = None
        for bins_num in range(bins_max, 1, -1):
            df = self.__generate_bins(bins_num)
            df_grouped = df.groupby('bin') \
                           .agg({self.feature: 'count', 'label': 'sum'}) \
                           .reset_index()

            # A rank correlation needs at least two bins.
            if len(df_grouped) < 2:
                continue

            r, _ = stats.spearmanr(df_grouped['bin'], df_grouped['label'])
            counts = df_grouped[self.feature]
            goods = df_grouped['label']

            if (
                abs(abs(r) - 1.0) < 1e-9  # tolerant float comparison; NaN fails it
                and counts.min() > self.bin_min_size
                and not (counts == goods).any()  # no bin without label-0 rows
                and not (goods == 0).any()       # no bin without label-1 rows
            ):
                break

        if df is None:
            # bins_max < 2: the loop never ran, fall back to a single bin.
            df = self.__generate_bins(1)
        return df

    @property
    def df_lite(self):
        """Return a two-column frame: ``bin`` and ``label``.

        ``bin`` is the left edge of the quantile interval. Rows whose feature
        value is missing get the string ``'MISSING'``; the column stays float
        when nothing is missing.
        """
        df_lite = self.__generate_correct_bins()
        missing = df_lite['bin'].isna()
        if missing.any():
            # Go through object dtype so a string can sit next to the floats;
            # plain assignment replaces the old chained in-place fillna.
            df_lite['bin'] = df_lite['bin'].astype(object).where(~missing, 'MISSING')
        return df_lite[['bin', 'label']]
50 | # .idea/modules.xml 51 | # .idea/*.iml 52 | # .idea/modules 53 | # *.iml 54 | # *.ipr 55 | 56 | # CMake 57 | cmake-build-*/ 58 | 59 | # Mongo Explorer plugin 60 | .idea/**/mongoSettings.xml 61 | 62 | # File-based project format 63 | *.iws 64 | 65 | # IntelliJ 66 | out/ 67 | 68 | # mpeltonen/sbt-idea plugin 69 | .idea_modules/ 70 | 71 | # JIRA plugin 72 | atlassian-ide-plugin.xml 73 | 74 | # Cursive Clojure plugin 75 | .idea/replstate.xml 76 | 77 | # Crashlytics plugin (for Android Studio and IntelliJ) 78 | com_crashlytics_export_strings.xml 79 | crashlytics.properties 80 | crashlytics-build.properties 81 | fabric.properties 82 | 83 | # Editor-based Rest Client 84 | .idea/httpRequests 85 | 86 | # Android studio 3.1+ serialized cache file 87 | .idea/caches/build_file_checksums.ser 88 | 89 | ### PyCharm Patch ### 90 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 91 | 92 | # *.iml 93 | # modules.xml 94 | # .idea/misc.xml 95 | # *.ipr 96 | 97 | # Sonarlint plugin 98 | .idea/**/sonarlint/ 99 | 100 | # SonarQube Plugin 101 | .idea/**/sonarIssues.xml 102 | 103 | # Markdown Navigator plugin 104 | .idea/**/markdown-navigator.xml 105 | .idea/**/markdown-navigator/ 106 | 107 | ### Python ### 108 | # Byte-compiled / optimized / DLL files 109 | __pycache__/ 110 | *.py[cod] 111 | *$py.class 112 | 113 | # C extensions 114 | *.so 115 | 116 | # Distribution / packaging 117 | .Python 118 | build/ 119 | develop-eggs/ 120 | dist/ 121 | downloads/ 122 | eggs/ 123 | .eggs/ 124 | lib/ 125 | lib64/ 126 | parts/ 127 | sdist/ 128 | var/ 129 | wheels/ 130 | pip-wheel-metadata/ 131 | share/python-wheels/ 132 | *.egg-info/ 133 | .installed.cfg 134 | *.egg 135 | MANIFEST 136 | 137 | # PyInstaller 138 | # Usually these files are written by a python script from a template 139 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
140 | *.manifest 141 | *.spec 142 | 143 | # Installer logs 144 | pip-log.txt 145 | pip-delete-this-directory.txt 146 | 147 | # Unit test / coverage reports 148 | htmlcov/ 149 | .tox/ 150 | .nox/ 151 | .coverage 152 | .coverage.* 153 | .cache 154 | nosetests.xml 155 | coverage.xml 156 | *.cover 157 | .hypothesis/ 158 | .pytest_cache/ 159 | 160 | # Translations 161 | *.mo 162 | *.pot 163 | 164 | # Scrapy stuff: 165 | .scrapy 166 | 167 | # Sphinx documentation 168 | docs/_build/ 169 | 170 | # PyBuilder 171 | target/ 172 | 173 | # pyenv 174 | .python-version 175 | 176 | # pipenv 177 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 178 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 179 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 180 | # install all needed dependencies. 181 | #Pipfile.lock 182 | 183 | # celery beat schedule file 184 | celerybeat-schedule 185 | 186 | # SageMath parsed files 187 | *.sage.py 188 | 189 | # Spyder project settings 190 | .spyderproject 191 | .spyproject 192 | 193 | # Rope project settings 194 | .ropeproject 195 | 196 | # Mr Developer 197 | .mr.developer.cfg 198 | .project 199 | .pydevproject 200 | 201 | # mkdocs documentation 202 | /site 203 | 204 | # mypy 205 | .mypy_cache/ 206 | .dmypy.json 207 | dmypy.json 208 | 209 | # Pyre type checker 210 | .pyre/ 211 | 212 | ### Windows ### 213 | # Windows thumbnail cache files 214 | Thumbs.db 215 | Thumbs.db:encryptable 216 | ehthumbs.db 217 | ehthumbs_vista.db 218 | 219 | # Dump file 220 | *.stackdump 221 | 222 | # Folder config file 223 | [Dd]esktop.ini 224 | 225 | # Recycle Bin used on file shares 226 | $RECYCLE.BIN/ 227 | 228 | # Windows Installer files 229 | *.cab 230 | *.msi 231 | *.msix 232 | *.msm 233 | *.msp 234 | 235 | # Windows shortcuts 236 | *.lnk 237 | 238 | # Custom 239 | /data/ 240 | .idea/ 241 | 242 | # End of 
import pandas as pd
import numpy as np
import scipy.stats as stats

try:
    import matplotlib.pyplot as plt
    import seaborn as sns
except ImportError:
    # Plotting is optional: the IV / chi-square computations below work
    # without it. The names stay defined because notebooks pick them up via
    # ``from AttributeRelevance import *``.
    plt = None
    sns = None

pd.set_option('mode.chained_assignment', None)


def _require_plotting():
    """Raise a clear error when a drawing method is used without the plot libraries."""
    if plt is None or sns is None:
        raise ImportError('matplotlib and seaborn are required for the drawing methods')


def _seq_palette(n_colors):
    """Return the sequential cubehelix palette shared by all bar plots."""
    _require_plotting()
    return sns.cubehelix_palette(n_colors, start=.5, rot=-.75, reverse=True)


class AttributeRelevance():
    """Run IV / chi-square analysis over a list of feature objects.

    Feature objects are expected to expose ``feature`` (the column name) and
    ``df_lite`` (a frame with ``bin`` and ``label`` columns).
    """

    def seq_palette(self, n_colors):
        """Return a sequential palette with ``n_colors`` colours."""
        return _seq_palette(n_colors)

    def bulk_iv(self, feats, iv, woe_extremes=False):
        """Compute the information value of every feature.

        Parameters
        ----------
        feats : iterable of feature objects
        iv : IV
            Calculator providing ``calculate_iv``.
        woe_extremes : bool
            When true, also report the minimum and maximum WOE per feature.

        Returns
        -------
        pandas.DataFrame
            Indexed by feature name with column ``iv`` (plus ``woe_min`` and
            ``woe_max`` when ``woe_extremes`` is set). Empty input yields an
            empty frame with the same columns.
        """
        # Decided up front so an empty ``feats`` no longer hits an unbound name.
        cols = ['iv', 'woe_min', 'woe_max'] if woe_extremes else ['iv']
        iv_dict = {}
        for f in feats:
            iv_df, iv_value = iv.calculate_iv(f)
            if woe_extremes:
                iv_dict[f.feature] = [iv_value, iv_df['woe'].min(), iv_df['woe'].max()]
            else:
                iv_dict[f.feature] = iv_value
        return pd.DataFrame.from_dict(iv_dict, orient='index', columns=cols)

    def bulk_stats(self, feats, s):
        """Compute chi-square p-value and effect size for every feature.

        Returns a frame indexed by feature name with columns ``p-value`` and
        ``effect_size``.
        """
        stats_dict = {f.feature: list(s.calculate_chi(f)) for f in feats}
        return pd.DataFrame.from_dict(stats_dict, orient='index',
                                      columns=['p-value', 'effect_size'])

    def analyze(self, feats, iv, s=None, interpretation=False):
        """Build the summary table, sorted by descending information value.

        Parameters
        ----------
        feats : iterable of feature objects
        iv : IV
        s : StatsSignificance, optional
            When given, ``p-value`` and ``effect_size`` columns are merged in.
        interpretation : bool
            When true, add the textual ``iv_interpretation`` column (and
            ``es_interpretation`` when ``s`` is given).
        """
        df_iv = self.bulk_iv(feats, iv).sort_values(by='iv', ascending=False)
        if s is not None:
            df_stats = self.bulk_stats(feats, s)
            df_iv = df_iv.merge(df_stats, left_index=True, right_index=True)
        if interpretation:
            df_iv['iv_interpretation'] = df_iv['iv'].apply(iv.interpretation)
            if s is not None:
                df_iv['es_interpretation'] = df_iv['effect_size'].apply(s.interpretation)
        return df_iv

    def draw_iv(self, feats, iv):
        """Show a bar chart of information values, highest first."""
        _require_plotting()
        df = self.analyze(feats, iv)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x=df.index, y='iv', data=df, palette=self.seq_palette(len(feats)))
        ax.set_title('IV values')
        plt.xticks(rotation=90)
        plt.show()

    def draw_woe_extremes(self, feats, iv):
        """Show, per feature, the minimum and maximum WOE as overlaid bars."""
        _require_plotting()
        df = self.bulk_iv(feats, iv, woe_extremes=True).sort_values(by='iv', ascending=False)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x=df.index, y='woe_min', data=df, palette=self.seq_palette(len(feats)))
        sns.barplot(x=df.index, y='woe_max', data=df, palette=self.seq_palette(len(feats)))
        ax.axhline(y=0, color='black', linewidth=1)
        ax.set_title('Range of WOE values')
        ax.set_ylabel('WOE')
        plt.xticks(rotation=90)
        plt.show()

    def draw_woe_multiplot(self, feats, iv):
        """Show one WOE bar chart per feature on a grid three columns wide."""
        _require_plotting()
        n = len(feats)
        if n == 0:
            # Nothing to draw; a zero-row grid cannot be created.
            return
        nrows = int(np.ceil(n / 3))
        fig, _ = plt.subplots(nrows=nrows, ncols=3, figsize=(15, nrows * 4))
        for axis, feat in zip(fig.axes, feats):
            iv_df, _ = iv.calculate_iv(feat)
            sns.barplot(x=feat.feature, y='woe', data=iv_df, color='#455872', ax=axis)

        for axis in fig.axes:
            plt.sca(axis)
            plt.xticks(rotation=50)

        plt.tight_layout()
        plt.show()


class Analysis():
    """Shared helpers for the per-feature calculators."""

    def seq_palette(self, n_colors):
        """Return a sequential palette with ``n_colors`` colours."""
        return _seq_palette(n_colors)

    def group_by_feature(self, feat):
        """Aggregate ``feat.df_lite`` per bin.

        Returns a frame with columns ``<feature name>``, ``count`` (rows in
        the bin), ``good`` (sum of ``label``) and ``bad`` (``count - good``).
        """
        df = feat.df_lite \
                 .groupby('bin') \
                 .agg({'label': ['count', 'sum']}) \
                 .reset_index()
        df.columns = [feat.feature, 'count', 'good']
        df['bad'] = df['count'] - df['good']
        return df


class StatsSignificance(Analysis):
    """Chi-square test of independence and Cramér's V effect size."""

    def calculate_chi(self, feat):
        """Return ``(p_value, cramers_v)`` for the bins-by-(good, bad) table.

        Cramér's V is computed as ``sqrt(chi2 / n)``, which assumes the
        smaller table dimension is 2 (good, bad). SciPy applies Yates'
        continuity correction by default when the table is 2x2.
        """
        df = self.group_by_feature(feat)
        df_chi = np.array(df[['good', 'bad']])
        n = df['count'].sum()

        chi = stats.chi2_contingency(df_chi)
        cramers_v = np.sqrt(chi[0] / n)  # assume that k=2 (good, bad)
        return chi[1], cramers_v

    @staticmethod
    def interpretation(cramers_v):
        """Map an effect size to a label (thresholds 0.1, 0.2, 0.4, 0.6)."""
        if cramers_v < 0.1:
            return 'useless'
        elif cramers_v < 0.2:
            return 'weak'
        elif cramers_v < 0.4:
            return 'medium'
        elif cramers_v < 0.6:
            return 'strong'
        else:
            return 'very strong'

    def interpret_chi(self, feat):
        """Return the textual interpretation of the feature's effect size."""
        _, cramers_v = self.calculate_chi(feat)
        return self.interpretation(cramers_v)

    def print_chi(self, feat):
        """Print p-value, effect size and its interpretation for ``feat``."""
        p_value, cramers_v = self.calculate_chi(feat)
        print(f'P-value: {p_value:0.2f}\nEffect size: {cramers_v:0.2f}')
        print(f'{feat.feature.capitalize()} is a {self.interpretation(cramers_v)} predictor')


class IV(Analysis):
    """Weight of evidence (WOE) and information value (IV) calculator."""

    @staticmethod
    def __perc_share(df, group_name):
        """Return each bin's share of the column total."""
        return df[group_name] / df[group_name].sum()

    def __calculate_perc_share(self, feat):
        """Add ``perc_good``, ``perc_bad`` and ``perc_diff`` to the grouped frame."""
        df = self.group_by_feature(feat)
        df['perc_good'] = self.__perc_share(df, 'good')
        df['perc_bad'] = self.__perc_share(df, 'bad')
        df['perc_diff'] = df['perc_good'] - df['perc_bad']
        return df

    def __calculate_woe(self, feat):
        """Add ``woe = ln(perc_good / perc_bad)``; undefined values become 0."""
        df = self.__calculate_perc_share(feat)
        # Bins with no goods or no bads give +/-inf or NaN; they are zeroed
        # right below, so the numpy warnings for them are only noise.
        with np.errstate(divide='ignore', invalid='ignore'):
            df['woe'] = np.log(df['perc_good'] / df['perc_bad'])
        df['woe'] = df['woe'].replace([np.inf, -np.inf], np.nan).fillna(0)
        return df

    def calculate_iv(self, feat):
        """Return ``(per-bin frame, total IV)``.

        The frame has a per-bin ``iv`` column equal to ``perc_diff * woe``;
        the total is the sum of that column.
        """
        df = self.__calculate_woe(feat)
        df['iv'] = df['perc_diff'] * df['woe']
        return df, df['iv'].sum()

    def draw_woe(self, feat):
        """Show a bar chart of WOE per bin for ``feat``."""
        _require_plotting()
        iv_df, _ = self.calculate_iv(feat)
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(x=feat.feature, y='woe', data=iv_df,
                    palette=self.seq_palette(len(iv_df.index)))
        ax.set_title('WOE visualization for: ' + feat.feature)
        plt.show()

    @staticmethod
    def interpretation(iv):
        """Map an information value to a label (thresholds 0.02, 0.1, 0.3, 0.5)."""
        if iv < 0.02:
            return 'useless'
        elif iv < 0.1:
            return 'weak'
        elif iv < 0.3:
            return 'medium'
        elif iv < 0.5:
            return 'strong'
        else:
            return 'suspicious'

    def interpret_iv(self, feat):
        """Return the textual interpretation of the feature's information value."""
        _, iv = self.calculate_iv(feat)
        return self.interpretation(iv)

    def print_iv(self, feat):
        """Print the information value and its interpretation for ``feat``."""
        _, iv = self.calculate_iv(feat)
        print(f'Information value: {iv:0.2f}')
        print(f'{feat.feature.capitalize()} is a {self.interpretation(iv)} predictor')
">Pearson's chi-squared test (χ2) is a statistical test applied to sets of categorical data to evaluate how likely it is that any observed difference between the sets arose by chance. A test of independence assesses whether observations consisting of measures on two variables, expressed in a contingency table, are independent of each other (e.g. polling responses from people of different nationalities to see if one's nationality is related to the response).\n", 28 | "\n", 29 | "Chi-square says that there is a significant relationship between variables, but it does not say just how strong and important this is.\n", 30 | "\n", 31 | ">Cramér's V is a measure of association between two nominal variables. Cramer's V is a post-test to give this additional information. It varies between 0 and 1: close to 0 it shows little association between variables; close to 1, it indicates a strong association." 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "import pandas as pd\n", 41 | "from pandas.api.types import is_numeric_dtype\n", 42 | "\n", 43 | "from Features import *\n", 44 | "from AttributeRelevance import *" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "pd.options.display.float_format = '{:,.2f}'.format\n", 54 | "pd.set_option('display.max_columns', None) " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "df = pd.read_csv('data/telco_churn.csv', na_values=[' '])" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "df.columns = [c[0].lower() + c[1:] for c in df.columns]\n", 73 | "df['label'] = df['churn'].map({'Yes': 0, 'No': 1})\n", 74 | "df['seniorCitizen'] = df['seniorCitizen'].map({1: 'Yes', 0: 'No'})\n", 75 | 
"df.drop(['customerID', 'churn'], axis=1, inplace=True)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 5, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/html": [ 86 | "
\n", 87 | "\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | "
genderseniorCitizenpartnerdependentstenurephoneServicemultipleLinesinternetServiceonlineSecurityonlineBackupdeviceProtectiontechSupportstreamingTVstreamingMoviescontractpaperlessBillingpaymentMethodmonthlyChargestotalChargeslabel
4831FemaleNoNoNo18NoNo phone serviceDSLYesYesNoYesNoNoOne yearYesMailed check40.20711.951
3739MaleNoNoNo24YesYesNoNo internet serviceNo internet serviceNo internet serviceNo internet serviceNo internet serviceNo internet serviceOne yearNoElectronic check24.60592.651
4590FemaleYesNoNo57YesYesFiber opticYesNoYesYesNoYesTwo yearNoCredit card (automatic)101.305,779.601
179MaleNoYesNo61YesNoNoNo internet serviceNo internet serviceNo internet serviceNo internet serviceNo internet serviceNo internet serviceOne yearYesCredit card (automatic)20.551,252.001
882FemaleNoNoNo6YesNoFiber opticNoNoYesNoNoYesMonth-to-monthNoElectronic check83.90497.550
\n", 244 | "
" 245 | ], 246 | "text/plain": [ 247 | " gender seniorCitizen partner dependents tenure phoneService \\\n", 248 | "4831 Female No No No 18 No \n", 249 | "3739 Male No No No 24 Yes \n", 250 | "4590 Female Yes No No 57 Yes \n", 251 | "179 Male No Yes No 61 Yes \n", 252 | "882 Female No No No 6 Yes \n", 253 | "\n", 254 | " multipleLines internetService onlineSecurity \\\n", 255 | "4831 No phone service DSL Yes \n", 256 | "3739 Yes No No internet service \n", 257 | "4590 Yes Fiber optic Yes \n", 258 | "179 No No No internet service \n", 259 | "882 No Fiber optic No \n", 260 | "\n", 261 | " onlineBackup deviceProtection techSupport \\\n", 262 | "4831 Yes No Yes \n", 263 | "3739 No internet service No internet service No internet service \n", 264 | "4590 No Yes Yes \n", 265 | "179 No internet service No internet service No internet service \n", 266 | "882 No Yes No \n", 267 | "\n", 268 | " streamingTV streamingMovies contract \\\n", 269 | "4831 No No One year \n", 270 | "3739 No internet service No internet service One year \n", 271 | "4590 No Yes Two year \n", 272 | "179 No internet service No internet service One year \n", 273 | "882 No Yes Month-to-month \n", 274 | "\n", 275 | " paperlessBilling paymentMethod monthlyCharges totalCharges \\\n", 276 | "4831 Yes Mailed check 40.20 711.95 \n", 277 | "3739 No Electronic check 24.60 592.65 \n", 278 | "4590 No Credit card (automatic) 101.30 5,779.60 \n", 279 | "179 Yes Credit card (automatic) 20.55 1,252.00 \n", 280 | "882 No Electronic check 83.90 497.55 \n", 281 | "\n", 282 | " label \n", 283 | "4831 1 \n", 284 | "3739 1 \n", 285 | "4590 1 \n", 286 | "179 1 \n", 287 | "882 0 " 288 | ] 289 | }, 290 | "execution_count": 5, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "df.sample(5)" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 6, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [ 305 | "feats_dict = {}\n", 306 | "\n", 307 | "for 
col in [c for c in df.columns if c != 'label']:\n", 308 | " if is_numeric_dtype(df[col]):\n", 309 | " feats_dict[col] = ContinuousFeature(df, col)\n", 310 | " else:\n", 311 | " feats_dict[col] = CategoricalFeature(df, col)\n", 312 | "\n", 313 | "feats = list(feats_dict.values())" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 7, 319 | "metadata": {}, 320 | "outputs": [], 321 | "source": [ 322 | "iv = IV()\n", 323 | "s = StatsSignificance()\n", 324 | "\n", 325 | "ar = AttributeRelevance()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "## Information value vs statistical significance\n", 333 | "\n", 334 | "Analysis of features with both IV & WOE and Chi-square test & Cramér's V shows some interesting relations between the results of those two methods.\n", 335 | "\n", 336 | "**P-value vs information value**\n", 337 | "\n", 338 | "The p-value for almost all features is very, very small (less than 0.01, which gives us a 99% confidence level). Only the differences in distribution for two features (with IV = 0) aren't statistically significant. It leads to the conclusion that for features that were recognized as at least medium predictors (the most interesting from the analysis perspective) the differences in distribution of 'goods' and 'bads' are statistically significant. However, it's good to note that a low p-value doesn't give information about the strength of the relationship.\n", 339 | "\n", 340 | "**Information value vs Cramér's V**\n", 341 | "\n", 342 | "There is a strong, almost linear, relationship between information value and effect size. Features with high information value have high effect size as well. 
Correlation coefficient for these values is: 0.94 (Pearson) and 0.98 (Spearman).\n", 343 | "\n", 344 | "**Interpretation of information value and effect size**\n", 345 | "\n", 346 | "Although the interpretation of information value and effect size differs a bit, the rules are very similar: values closer to 0 imply very weak (or lack of) relationship, while higher values suggest stronger relation." 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 32, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "text/html": [ 357 | "
\n", 358 | "\n", 371 | "\n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " 
\n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | "
ivp-valueeffect_sizeiv_interpretationes_interpretation
contract1.240.000.41suspiciousstrong
tenure0.780.000.37suspiciousmedium
onlineSecurity0.720.000.35suspiciousmedium
techSupport0.700.000.34suspiciousmedium
internetService0.620.000.32suspiciousmedium
onlineBackup0.530.000.29suspiciousmedium
deviceProtection0.500.000.28strongmedium
paymentMethod0.460.000.30strongmedium
streamingMovies0.380.000.23strongmedium
streamingTV0.380.000.23strongmedium
totalCharges0.320.000.25strongmedium
paperlessBilling0.200.000.19mediumweak
monthlyCharges0.170.000.18mediumweak
dependents0.160.000.16mediumweak
partner0.120.000.15mediumweak
seniorCitizen0.110.000.15mediumweak
multipleLines0.010.000.04uselessuseless
phoneService0.000.340.01uselessuseless
gender0.000.490.01uselessuseless
\n", 537 | "
" 538 | ], 539 | "text/plain": [ 540 | " iv p-value effect_size iv_interpretation \\\n", 541 | "contract 1.24 0.00 0.41 suspicious \n", 542 | "tenure 0.78 0.00 0.37 suspicious \n", 543 | "onlineSecurity 0.72 0.00 0.35 suspicious \n", 544 | "techSupport 0.70 0.00 0.34 suspicious \n", 545 | "internetService 0.62 0.00 0.32 suspicious \n", 546 | "onlineBackup 0.53 0.00 0.29 suspicious \n", 547 | "deviceProtection 0.50 0.00 0.28 strong \n", 548 | "paymentMethod 0.46 0.00 0.30 strong \n", 549 | "streamingMovies 0.38 0.00 0.23 strong \n", 550 | "streamingTV 0.38 0.00 0.23 strong \n", 551 | "totalCharges 0.32 0.00 0.25 strong \n", 552 | "paperlessBilling 0.20 0.00 0.19 medium \n", 553 | "monthlyCharges 0.17 0.00 0.18 medium \n", 554 | "dependents 0.16 0.00 0.16 medium \n", 555 | "partner 0.12 0.00 0.15 medium \n", 556 | "seniorCitizen 0.11 0.00 0.15 medium \n", 557 | "multipleLines 0.01 0.00 0.04 useless \n", 558 | "phoneService 0.00 0.34 0.01 useless \n", 559 | "gender 0.00 0.49 0.01 useless \n", 560 | "\n", 561 | " es_interpretation \n", 562 | "contract strong \n", 563 | "tenure medium \n", 564 | "onlineSecurity medium \n", 565 | "techSupport medium \n", 566 | "internetService medium \n", 567 | "onlineBackup medium \n", 568 | "deviceProtection medium \n", 569 | "paymentMethod medium \n", 570 | "streamingMovies medium \n", 571 | "streamingTV medium \n", 572 | "totalCharges medium \n", 573 | "paperlessBilling weak \n", 574 | "monthlyCharges weak \n", 575 | "dependents weak \n", 576 | "partner weak \n", 577 | "seniorCitizen weak \n", 578 | "multipleLines useless \n", 579 | "phoneService useless \n", 580 | "gender useless " 581 | ] 582 | }, 583 | "metadata": {}, 584 | "output_type": "display_data" 585 | } 586 | ], 587 | "source": [ 588 | "df_analysis = ar.analyze(feats, iv, s, interpretation=True)\n", 589 | "display(df_analysis)" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 20, 595 | "metadata": {}, 596 | "outputs": [ 597 | { 598 | "data": { 
599 | "image/png": "\n", 600 | "text/plain": [ 601 | "
" 602 | ] 603 | }, 604 | "metadata": { 605 | "needs_background": "light" 606 | }, 607 | "output_type": "display_data" 608 | }, 609 | { 610 | "name": "stdout", 611 | "output_type": "stream", 612 | "text": [ 613 | "Pearson correlation: 0.94\n", 614 | "Spearman correlation: 0.98\n" 615 | ] 616 | } 617 | ], 618 | "source": [ 619 | "df_analysis_sign = df_analysis[df_analysis['p-value']<0.05]\n", 620 | "\n", 621 | "fig, ax = plt.subplots(figsize=(10,6))\n", 622 | "sns.regplot(x='iv', y='effect_size', data=df_analysis_sign, color='#455872')\n", 623 | "ax.set_title('Information value vs effect size')\n", 624 | "plt.show()\n", 625 | "\n", 626 | "print('Pearson correlation: %0.2f' % df_analysis_sign['iv'].corr(df_analysis_sign['effect_size']))\n", 627 | "print('Spearman correlation: %0.2f' % df_analysis_sign['iv'].corr(df_analysis_sign['effect_size'], method='spearman'))" 628 | ] 629 | }, 630 | { 631 | "cell_type": "markdown", 632 | "metadata": {}, 633 | "source": [ 634 | "## Weight of Evidence vs share of 'goods' in total\n", 635 | "\n", 636 | "WOE is one of ways to show the proportion of 'good' observations in a group (bin). Another way of doing it is to calculate a share of 'goods' in total. It's not a surprise that results obtained in two ways are very similar. The relation remains the same, it's just presented on a different scale." 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 23, 642 | "metadata": {}, 643 | "outputs": [ 644 | { 645 | "data": { 646 | "image/png": "\n", 647 | "text/plain": [ 648 | "
" 649 | ] 650 | }, 651 | "metadata": { 652 | "needs_background": "light" 653 | }, 654 | "output_type": "display_data" 655 | } 656 | ], 657 | "source": [ 658 | "iv.draw_woe(feats_dict['tenure'])" 659 | ] 660 | }, 661 | { 662 | "cell_type": "code", 663 | "execution_count": 30, 664 | "metadata": {}, 665 | "outputs": [ 666 | { 667 | "data": { 668 | "image/png": "\n", 669 | "text/plain": [ 670 | "
" 671 | ] 672 | }, 673 | "metadata": { 674 | "needs_background": "light" 675 | }, 676 | "output_type": "display_data" 677 | } 678 | ], 679 | "source": [ 680 | "df_tenure = iv.group_by_feature(feats_dict['tenure'])\n", 681 | "df_tenure['total'] = 1\n", 682 | "df_tenure['good_share'] = df_tenure['good'] / df_tenure['count']\n", 683 | "\n", 684 | "fig, ax = plt.subplots(figsize=(10,6))\n", 685 | "sns.barplot(x='tenure', y='total', data=df_tenure, color='#a9c6a3')\n", 686 | "sns.barplot(x='tenure', y='good_share', data=df_tenure, color='#4a6a7c')\n", 687 | "ax.set_title(\"Share of 'goods' in total\")\n", 688 | "plt.show()" 689 | ] 690 | }, 691 | { 692 | "cell_type": "markdown", 693 | "metadata": {}, 694 | "source": [ 695 | "### Reference\n", 696 | "\n", 697 | "1. https://en.wikipedia.org/wiki/Chi-squared_test\n", 698 | "2. https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V" 699 | ] 700 | } 701 | ], 702 | "metadata": { 703 | "kernelspec": { 704 | "display_name": "master", 705 | "language": "python", 706 | "name": "master" 707 | }, 708 | "language_info": { 709 | "codemirror_mode": { 710 | "name": "ipython", 711 | "version": 3 712 | }, 713 | "file_extension": ".py", 714 | "mimetype": "text/x-python", 715 | "name": "python", 716 | "nbconvert_exporter": "python", 717 | "pygments_lexer": "ipython3", 718 | "version": "3.7.5" 719 | } 720 | }, 721 | "nbformat": 4, 722 | "nbformat_minor": 2 723 | } 724 | -------------------------------------------------------------------------------- /telco_dataset_eda.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "from pandas.api.types import is_numeric_dtype\n", 11 | "import matplotlib.pyplot as plt\n", 12 | "from matplotlib.colors import ListedColormap\n", 13 | "import seaborn as sns\n", 14 | "import math" 15 | ] 16 | }, 17 | { 18 | 
"cell_type": "code", 19 | "execution_count": 13, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "pd.options.display.float_format = '{:,.2f}'.format\n", 24 | "sns.set_style(\"whitegrid\")" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "df = pd.read_csv('data/telco_churn.csv', na_values=[' '])" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 4, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "df.columns = [c[0].lower() + c[1:] for c in df.columns]\n", 43 | "df['label'] = df['churn'].map({'Yes': 0, 'No': 1})\n", 44 | "df['seniorCitizen'] = df['seniorCitizen'].map({1: 'Yes', 0: 'No'})\n", 45 | "df.drop(['customerID', 'churn'], axis=1, inplace=True)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 5, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | "
genderseniorCitizenpartnerdependentstenurephoneServicemultipleLinesinternetServiceonlineSecurityonlineBackupdeviceProtectiontechSupportstreamingTVstreamingMoviescontractpaperlessBillingpaymentMethodmonthlyChargestotalChargeslabel
5075FemaleNoYesYes42YesNoFiber opticNoNoYesNoYesYesMonth-to-monthYesElectronic check92.153875.401
6029MaleNoNoNo4YesNoFiber opticYesYesNoYesNoYesMonth-to-monthYesMailed check94.90360.551
6228MaleNoNoYes70YesYesDSLYesYesNoYesNoNoOne yearNoBank transfer (automatic)64.954551.501
19FemaleNoNoNo21YesNoFiber opticNoYesYesNoNoYesMonth-to-monthYesElectronic check90.051862.901
3220MaleNoYesNo70YesYesDSLYesNoYesYesNoYesTwo yearNoCredit card (automatic)77.305498.201
\n", 214 | "
" 215 | ], 216 | "text/plain": [ 217 | " gender seniorCitizen partner dependents tenure phoneService \\\n", 218 | "5075 Female No Yes Yes 42 Yes \n", 219 | "6029 Male No No No 4 Yes \n", 220 | "6228 Male No No Yes 70 Yes \n", 221 | "19 Female No No No 21 Yes \n", 222 | "3220 Male No Yes No 70 Yes \n", 223 | "\n", 224 | " multipleLines internetService onlineSecurity onlineBackup \\\n", 225 | "5075 No Fiber optic No No \n", 226 | "6029 No Fiber optic Yes Yes \n", 227 | "6228 Yes DSL Yes Yes \n", 228 | "19 No Fiber optic No Yes \n", 229 | "3220 Yes DSL Yes No \n", 230 | "\n", 231 | " deviceProtection techSupport streamingTV streamingMovies contract \\\n", 232 | "5075 Yes No Yes Yes Month-to-month \n", 233 | "6029 No Yes No Yes Month-to-month \n", 234 | "6228 No Yes No No One year \n", 235 | "19 Yes No No Yes Month-to-month \n", 236 | "3220 Yes Yes No Yes Two year \n", 237 | "\n", 238 | " paperlessBilling paymentMethod monthlyCharges \\\n", 239 | "5075 Yes Electronic check 92.15 \n", 240 | "6029 Yes Mailed check 94.90 \n", 241 | "6228 No Bank transfer (automatic) 64.95 \n", 242 | "19 Yes Electronic check 90.05 \n", 243 | "3220 No Credit card (automatic) 77.30 \n", 244 | "\n", 245 | " totalCharges label \n", 246 | "5075 3875.40 1 \n", 247 | "6029 360.55 1 \n", 248 | "6228 4551.50 1 \n", 249 | "19 1862.90 1 \n", 250 | "3220 5498.20 1 " 251 | ] 252 | }, 253 | "execution_count": 5, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "df.sample(5)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 7, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "def eda_categorical(data, variable, ax=None):\n", 269 | " variable_df = data[variable].value_counts(normalize=True).reset_index()\n", 270 | " n_colors = len(variable_df)\n", 271 | " variable_df.set_index('index').T.plot(kind='barh',\n", 272 | " stacked=True,\n", 273 | " colormap=ListedColormap(sns.color_palette(\"Set2\", n_colors)),\n", 
274 | " width=0.15, ax=ax)\n", 275 | "\n", 276 | "def multiple_eda_categorical(data, list_categorical):\n", 277 | " n_rows = math.ceil(len(list_categorical)/2)\n", 278 | " fig = plt.figure(figsize=(12,n_rows*3))\n", 279 | "\n", 280 | " for i, variable in enumerate(list_categorical):\n", 281 | " ax = fig.add_subplot(n_rows,2,i+1)\n", 282 | " eda_categorical(data, variable, ax=ax)\n", 283 | " \n", 284 | " plt.tight_layout()\n", 285 | " plt.show()\n", 286 | "\n", 287 | "def multiple_eda_continuous(data, list_continuous):\n", 288 | " n_rows = math.ceil(len(list_continuous)/3)\n", 289 | " fig = plt.figure(figsize=(12,n_rows*5))\n", 290 | " palette = sns.color_palette('Set2', 3)\n", 291 | "\n", 292 | " for i, variable in enumerate(list_continuous):\n", 293 | " ax = fig.add_subplot(n_rows,3,i+1)\n", 294 | " sns.boxplot(x=variable, data=data, orient='v', palette=[palette[i]], ax=ax)\n", 295 | " ax.set_ylabel('')\n", 296 | " ax.set_title(variable)\n", 297 | "\n", 298 | " plt.tight_layout()\n", 299 | " plt.show()" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 6, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "continuous, categorical = [], []\n", 309 | "black_list = ['onlineSecurity', 'techSupport', 'onlineBackup', 'streamingTV', 'streamingMovies', 'deviceProtection']\n", 310 | "\n", 311 | "for col in [c for c in df.columns if c not in black_list and c != 'label']:\n", 312 | " if is_numeric_dtype(df[col]):\n", 313 | " continuous.append(col)\n", 314 | " else:\n", 315 | " categorical.append(col)" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "## Exploratory Analysis" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 18, 328 | "metadata": {}, 329 | "outputs": [ 330 | { 331 | "name": "stdout", 332 | "output_type": "stream", 333 | "text": [ 334 | "Dataset contains 7043 records\n", 335 | "Number of customers who churned: 1869 (26.54%)\n" 336 | ] 337 
| } 338 | ], 339 | "source": [ 340 | "n = len(df)\n", 341 | "n_not_churn = sum(df['label'])\n", 342 | "n_churn = n - n_not_churn\n", 343 | "\n", 344 | "print('Dataset contains %d records' % (n))\n", 345 | "print('Number of customers who churned: %d (%0.2f%%)' % (n_churn, n_churn*100/n))" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 8, 351 | "metadata": {}, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "image/png": "\n", 356 | "text/plain": [ 357 | "
" 358 | ] 359 | }, 360 | "metadata": {}, 361 | "output_type": "display_data" 362 | } 363 | ], 364 | "source": [ 365 | "multiple_eda_categorical(df, categorical)" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": 9, 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "data": { 375 | "image/png": "\n", 376 | "text/plain": [ 377 | "
" 378 | ] 379 | }, 380 | "metadata": {}, 381 | "output_type": "display_data" 382 | } 383 | ], 384 | "source": [ 385 | "multiple_eda_continuous(df, continuous)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [] 394 | } 395 | ], 396 | "metadata": { 397 | "kernelspec": { 398 | "display_name": "Python 3", 399 | "language": "python", 400 | "name": "python3" 401 | }, 402 | "language_info": { 403 | "codemirror_mode": { 404 | "name": "ipython", 405 | "version": 3 406 | }, 407 | "file_extension": ".py", 408 | "mimetype": "text/x-python", 409 | "name": "python", 410 | "nbconvert_exporter": "python", 411 | "pygments_lexer": "ipython3", 412 | "version": "3.7.4" 413 | } 414 | }, 415 | "nbformat": 4, 416 | "nbformat_minor": 2 417 | } 418 | --------------------------------------------------------------------------------