├── tests ├── __init__.py ├── pymc3_models │ ├── __init__.py │ ├── test_bayesian_model.py │ └── test_HLR.py ├── transformers │ └── __init__.py └── visualizers │ └── __init__.py ├── ps_toolkit ├── transformers │ └── __init__.py ├── visualizers │ ├── __init__.py │ ├── roc_curve_visualizer.py │ └── separation_plot_visualizer.py ├── exc.py ├── __init__.py └── pymc3_models │ ├── __init__.py │ └── HLR.py ├── requirements_to_freeze.txt ├── requirements.txt ├── setup.py ├── README.md └── .gitignore /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/pymc3_models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/visualizers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ps_toolkit/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ps_toolkit/visualizers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ps_toolkit/exc.py: -------------------------------------------------------------------------------- 1 | class PSToolkitError(Exception): 2 | pass 3 | -------------------------------------------------------------------------------- /requirements_to_freeze.txt: -------------------------------------------------------------------------------- 1 | joblib 2 | matplotlib 3 | numpy 4 | pandas>=0.19 5 | pymc3 6 | scikit-learn>=0.18 7 | scipy 8 | seaborn -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib==0.11 2 | matplotlib==2.0.0 3 | numpy==1.12.1 4 | pandas==0.19.2 5 | pymc3==3.2 6 | scikit-learn==0.18.1 7 | scipy==0.19.0 8 | seaborn==0.7.1 9 | -------------------------------------------------------------------------------- /ps_toolkit/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import print_function 3 | from __future__ import unicode_literals 4 | 5 | 6 | from ps_toolkit.pymc3_models.HLR import HLR 7 | 8 | from ps_toolkit.visualizers.roc_curve_visualizer import ROCCurveVisualizer 9 | from ps_toolkit.visualizers.separation_plot_visualizer import SeparationPlotVisualizer 10 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup( 5 | name='PS_Toolkit', 6 | version='2.1.1', 7 | packages=find_packages(), 8 | include_package_data=False, 9 | zip_safe=False, 10 | install_requires=[ 11 | 'joblib', 12 | 'matplotlib', 13 | 'numpy', 14 | 'pandas>=0.19', 15 | 'pymc3>=3.2', 16 | 'scipy', 17 | 'seaborn', 18 | 'sklearn' 19 | ] 20 | ) 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Parsing Science Toolkit 2 | 3 | A repo with custom PyMC3 models, transformers, and visualizers. 4 | 5 | ## Set up 6 | ```sh 7 | git clone https://github.com/parsing-science/ps-toolkit.git 8 | cd ps-toolkit 9 | ``` 10 | 11 | ## Import the repo 12 | To use the package, add it to the requirements.txt of your repo and pip install 13 | 14 | SSH: git+ssh://git@github.com/parsing-science/ps-toolkit.git#egg=PS_Toolkit 15 | 16 | To install a specific version: 17 | 18 | SSH: git+ssh://git@github.com/parsing-science/ps-toolkit.git@[tag]#egg=PS_Toolkit 19 | 20 | where [tag] is a specific version tag, e.g. `v1.0.0`. 21 | 22 | ## To run unittests 23 | ```sh 24 | cd ps-toolkit 25 | virtualenv venv 26 | source venv/bin/activate 27 | pip install -r requirements.txt 28 | python -m unittest discover -cv 29 | ``` 30 | -------------------------------------------------------------------------------- /tests/pymc3_models/test_bayesian_model.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from ps_toolkit.pymc3_models import BayesianModel 4 | 5 | 6 | class BayesianModelTestCase(unittest.TestCase): 7 | def test_create_model_raises_not_implemented_error(self): 8 | with self.assertRaises(NotImplementedError): 9 | BM = BayesianModel() 10 | BM.create_model() 11 | 12 | def test_fit_raises_not_implemented_error(self): 13 | with self.assertRaises(NotImplementedError): 14 | BM = BayesianModel() 15 | BM.fit() 16 | 17 | def test_predict_raises_not_implemented_error(self): 18 | with self.assertRaises(NotImplementedError): 19 | BM = BayesianModel() 20 | BM.predict() 21 | 22 | def test_score_raises_not_implemented_error(self): 23 | with self.assertRaises(NotImplementedError): 24 | BM = BayesianModel() 25 | BM.score() 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | *.ipynb 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | venv/ 84 | ENV/ 85 | 86 | # Spyder project settings 87 | .spyderproject 88 | 89 | # Rope project settings 90 | .ropeproject 91 | 92 | # HLR models 93 | HLR_jar/ 94 | -------------------------------------------------------------------------------- /ps_toolkit/pymc3_models/__init__.py: -------------------------------------------------------------------------------- 1 | import joblib 2 | import matplotlib.pyplot as plt 3 | import pymc3 as pm 4 | import seaborn as sns 5 | from sklearn.base import BaseEstimator 6 | 7 | 8 | class BayesianModel(BaseEstimator): 9 | """ 10 | Bayesian model base class 11 | """ 12 | def __init__(self): 13 | self.advi_hist = None 14 | self.advi_trace = None 15 | self.cached_model = None 16 | self.num_pred = None 17 | self.shared_vars = None 18 | 19 | def create_model(self): 20 | raise NotImplementedError 21 | 22 | def _set_shared_vars(self, shared_vars): 23 | """ 24 | Sets theano shared variables for the PyMC3 model. 25 | """ 26 | for key in shared_vars.keys(): 27 | self.shared_vars[key].set_value(shared_vars[key]) 28 | 29 | def _inference(self, minibatches, n=200000): 30 | """ 31 | Runs minibatch variational ADVI and then sample from those results. 32 | 33 | Parameters 34 | ---------- 35 | minibatches: minibatches for ADVI 36 | 37 | n: number of iterations for ADVI fit, defaults to 200000 38 | """ 39 | with self.cached_model: 40 | advi = pm.ADVI() 41 | approx = pm.fit( 42 | n=n, 43 | method=advi, 44 | more_replacements=minibatches, 45 | callbacks=[pm.callbacks.CheckParametersConvergence()] 46 | ) 47 | 48 | self.advi_trace = approx.sample(draws=10000) 49 | 50 | self.advi_hist = advi.hist 51 | 52 | def fit(self): 53 | raise NotImplementedError 54 | 55 | def predict(self): 56 | raise NotImplementedError 57 | 58 | def score(self): 59 | raise NotImplementedError 60 | 61 | def save(self, file_prefix, custom_params=None): 62 | """ 63 | Saves the advi_trace and custom params to files with the given file_prefix. 64 | 65 | Parameters 66 | ---------- 67 | file_prefix: str, path and prefix used to identify where to save the trace for this model. 68 | Ex: given file_prefix = "path/to/file/" 69 | This will attempt to save to "path/to/file/advi_trace.pickle" 70 | 71 | custom_params: Dictionary of custom parameters to save. Defaults to None 72 | """ 73 | fileObject = open(file_prefix + 'advi_trace.pickle', 'wb') 74 | joblib.dump(self.advi_trace, fileObject) 75 | fileObject.close() 76 | 77 | if custom_params: 78 | fileObject = open(file_prefix + 'params.pickle', 'wb') 79 | joblib.dump(custom_params, fileObject) 80 | fileObject.close() 81 | 82 | def load(self, file_prefix, load_custom_params=False): 83 | """ 84 | Loads a saved version of the advi_trace, v_params, and custom param files with the given file_prefix. 85 | 86 | Parameters 87 | ---------- 88 | file_prefix: str, path and prefix used to identify where to load the saved trace for this model. 89 | Ex: given file_prefix = "path/to/file/" 90 | This will attempt to load "path/to/file/advi_trace.pickle" 91 | 92 | load_custom_params: Boolean flag to indicate whether custom parameters should be loaded. Defaults to False. 93 | 94 | Returns 95 | ---------- 96 | custom_params: Dictionary of custom parameters 97 | """ 98 | self.advi_trace = joblib.load(file_prefix + 'advi_trace.pickle') 99 | 100 | custom_params = None 101 | if load_custom_params: 102 | custom_params = joblib.load(file_prefix + 'params.pickle') 103 | 104 | return custom_params 105 | 106 | def plot_elbo(self): 107 | """ 108 | Plot the ELBO values after running ADVI minibatch. 109 | """ 110 | sns.set_style("white") 111 | plt.plot(-self.advi_hist) 112 | plt.ylabel('ELBO') 113 | plt.xlabel('iteration') 114 | sns.despine() 115 | -------------------------------------------------------------------------------- /ps_toolkit/visualizers/roc_curve_visualizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | from sklearn.exceptions import NotFittedError 7 | from sklearn.metrics import roc_curve, auc 8 | 9 | from ps_toolkit.exc import PSToolkitError 10 | 11 | 12 | class ROCCurveVisualizer(object): 13 | """" A class to create an ROC curve""" 14 | 15 | def calculate_roc(self, probabilities, Y, pos_label=None, sample_weight=None): 16 | """ 17 | Function to calculate the ROC and AUC for a set of binary data. This function is built on top of the sklearn functions for those values. 18 | 19 | Parameters 20 | ---------- 21 | probabilities : a numpy array, list, or tuple of the probabilities of a True outcome for all data points 22 | 23 | Y : pandas Series or 1-column DataFrame, shape [n_samples] 24 | The outcome data used to separate the data points that have outcome=True from those of outcome=False 25 | 26 | Optional from sklearn: 27 | pos_label : int, Label considered as positive and others are considered negative. 28 | 29 | sample_weight : array-like of shape = [n_samples], Sample weights. 30 | """ 31 | 32 | try: 33 | probabilities = np.array(probabilities) 34 | probabilities = probabilities.squeeze() 35 | except: 36 | raise PSToolkitError("The probabilities must be castable to a numpy array.") 37 | 38 | if probabilities.ndim != 1: 39 | raise PSToolkitError("The probabilities must be a one dimensional numpy array, list, or tuple.") 40 | 41 | if type(Y)==pd.DataFrame and len(Y.columns) != 1: 42 | raise PSToolkitError("Y must be a one-column DataFrame or Series.") 43 | 44 | if len(Y) != len(probabilities): 45 | raise PSToolkitError("The probabilities and Y must be the same size.") 46 | 47 | if Y.isnull().any().any(): 48 | raise PSToolkitError("Y contains NaNs.") 49 | 50 | if np.isnan(probabilities).any(): 51 | raise PSToolkitError("The probabilities contains NaNs.") 52 | 53 | if not (probabilities >= 0).all() or not (probabilities <= 1).all(): 54 | raise PSToolkitError("The probabilities must be between 0 and 1.") 55 | 56 | fpr, tpr, thresholds = roc_curve(Y, probabilities, pos_label, sample_weight) 57 | self.fpr_ = fpr 58 | self.tpr_ = tpr 59 | self.thresholds_ = thresholds 60 | 61 | roc_auc = auc(fpr, tpr) 62 | self.roc_auc_ = roc_auc 63 | 64 | def create_roc_curve_plot(self): 65 | """ 66 | Function to plot the ROC curve for a set of binary data. 67 | """ 68 | 69 | if not hasattr(self, "fpr_"): 70 | raise NotFittedError("Call calculate_roc before create_roc_curve_plot") 71 | 72 | plt.figure() 73 | plt.plot(self.fpr_, self.tpr_, label='ROC curve (area = %0.2f)' % self.roc_auc_) 74 | plt.plot([0, 1], [0, 1], 'k--') 75 | plt.xlim([0.0, 1.0]) 76 | plt.ylim([0.0, 1.05]) 77 | plt.xlabel('False Positive Rate') 78 | plt.ylabel('True Positive Rate') 79 | plt.title('Receiver operating characteristic curve') 80 | plt.legend(loc="lower right") 81 | plt.show() 82 | 83 | def calculate_and_plot(self, probabilities, Y, pos_label=None, sample_weight=None): 84 | """ 85 | Function to calculate and plot the ROC curve for a set of data. 86 | 87 | Parameters 88 | ---------- 89 | probabilities : a numpy array, list, or tuple of the probabilities of a True outcome for all data points 90 | 91 | Y : pandas Series or 1-column DataFrame, shape [n_samples] 92 | The outcome data used to separate the data points that have outcome=True from those of outcome=False 93 | 94 | Optional from sklearn: 95 | pos_label : int, Label considered as positive and others are considered negative. 96 | 97 | sample_weight : array-like of shape = [n_samples], Sample weights. 98 | """ 99 | 100 | self.calculate_roc(probabilities, Y, pos_label, sample_weight) 101 | 102 | self.create_roc_curve_plot() 103 | -------------------------------------------------------------------------------- /ps_toolkit/pymc3_models/HLR.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pymc3 as pm 3 | from sklearn.metrics import accuracy_score 4 | import theano 5 | import theano.tensor as T 6 | 7 | from ps_toolkit.exc import PSToolkitError 8 | from ps_toolkit.pymc3_models import BayesianModel 9 | 10 | 11 | class HLR(BayesianModel): 12 | """ 13 | Custom Hierarchical Logistic Regression built using PyMC3. 14 | """ 15 | 16 | def __init__(self): 17 | super(HLR, self).__init__() 18 | self.num_cats = None 19 | 20 | def create_model(self): 21 | """ 22 | Creates and returns the PyMC3 model. 23 | 24 | Returns the model. 25 | """ 26 | model_input = theano.shared(np.zeros([1, self.num_pred])) 27 | 28 | model_output = theano.shared(np.zeros(1)) 29 | 30 | model_cats = theano.shared(np.zeros(1, dtype='int')) 31 | 32 | self.shared_vars = { 33 | 'model_input': model_input, 34 | 'model_output': model_output, 35 | 'model_cats': model_cats 36 | } 37 | 38 | model = pm.Model() 39 | 40 | with model: 41 | mu_alpha = pm.Normal('mu_alpha', mu=0, sd=100) 42 | sigma_alpha = pm.HalfNormal('sigma_alpha', sd=100) 43 | 44 | mu_beta = pm.Normal('mu_beta', mu=0, sd=100) 45 | sigma_beta = pm.HalfNormal('sigma_beta', sd=100) 46 | 47 | alpha = pm.Normal('alpha', mu=mu_alpha, sd=sigma_alpha, shape=(self.num_cats,)) 48 | beta = pm.Normal('beta', mu=mu_beta, sd=sigma_beta, shape=(self.num_cats, self.num_pred)) 49 | 50 | c = model_cats 51 | 52 | temp = alpha[c] + T.sum(beta[c] * model_input, 1) 53 | 54 | p = pm.invlogit(temp) 55 | 56 | o = pm.Bernoulli('o', p, observed=model_output) 57 | 58 | return model 59 | 60 | def fit(self, X, y, cats, n=200000, batch_size=100): 61 | """ 62 | Train the HLR model 63 | 64 | Parameters 65 | ---------- 66 | X : numpy array, shape [n_samples, n_features] 67 | 68 | y : numpy array, shape [n_samples, ] 69 | 70 | cats: numpy array, shape [n_samples, ] 71 | 72 | n: number of iterations for ADVI fit, defaults to 200000 73 | 74 | batch_size: number of samples to include in each minibatch for ADVI, defaults to 100 75 | """ 76 | self.num_cats = len(np.unique(cats)) 77 | num_samples, self.num_pred = X.shape 78 | 79 | if self.cached_model is None: 80 | self.cached_model = self.create_model() 81 | 82 | with self.cached_model: 83 | 84 | minibatches = { 85 | self.shared_vars['model_input']: pm.Minibatch(X, batch_size=batch_size), 86 | self.shared_vars['model_output']: pm.Minibatch(y, batch_size=batch_size), 87 | self.shared_vars['model_cats']: pm.Minibatch(cats, batch_size=batch_size) 88 | } 89 | 90 | self._inference(minibatches, n) 91 | 92 | return self 93 | 94 | def predict_proba(self, X, cats, return_std=False): 95 | """ 96 | Predicts probabilities of new data with a trained HLR 97 | 98 | Parameters 99 | ---------- 100 | X : numpy array, shape [n_samples, n_features] 101 | 102 | cats: numpy array, shape [n_samples, ] 103 | 104 | return_std: Boolean flag of whether to return standard deviations with mean probabilities. Defaults to False. 105 | """ 106 | 107 | if self.advi_trace is None: 108 | raise PSToolkitError('Run fit on the model before predict.') 109 | 110 | num_samples = X.shape[0] 111 | 112 | if self.cached_model is None: 113 | self.cached_model = self.create_model() 114 | 115 | self._set_shared_vars({'model_input': X, 'model_output': np.zeros(num_samples), 'model_cats': cats}) 116 | 117 | ppc = pm.sample_ppc(self.advi_trace, model=self.cached_model, samples=2000) 118 | 119 | if return_std: 120 | return ppc['o'].mean(axis=0), ppc['o'].std(axis=0) 121 | else: 122 | return ppc['o'].mean(axis=0) 123 | 124 | def predict(self, X, cats): 125 | """ 126 | Predicts labels of new data with a trained model 127 | 128 | Parameters 129 | ---------- 130 | X : numpy array, shape [n_samples, n_features] 131 | 132 | cats: numpy array, shape [n_samples, ] 133 | """ 134 | ppc_mean = self.predict_proba(X, cats) 135 | 136 | pred = ppc_mean > 0.5 137 | 138 | return pred 139 | 140 | def score(self, X, y, cats): 141 | """ 142 | Scores new data with a trained model. 143 | 144 | Parameters 145 | ---------- 146 | X : numpy array, shape [n_samples, n_features] 147 | 148 | y : numpy array, shape [n_samples, ] 149 | 150 | cats: numpy array, shape [n_samples, ] 151 | """ 152 | 153 | return accuracy_score(y, self.predict(X, cats)) 154 | 155 | def save(self, file_prefix): 156 | params = {'num_cats': self.num_cats, 'num_pred': self.num_pred} 157 | 158 | super(HLR, self).save(file_prefix, params) 159 | 160 | def load(self, file_prefix): 161 | params = super(HLR, self).load(file_prefix, load_custom_params=True) 162 | 163 | self.num_cats = params['num_cats'] 164 | self.num_pred = params['num_pred'] 165 | -------------------------------------------------------------------------------- /tests/pymc3_models/test_HLR.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import tempfile 3 | import unittest 4 | 5 | import numpy as np 6 | from pymc3 import summary 7 | from sklearn.model_selection import train_test_split 8 | 9 | from ps_toolkit.exc import PSToolkitError 10 | from ps_toolkit import HLR 11 | 12 | 13 | class HLRTestCase(unittest.TestCase): 14 | def setUp(self): 15 | def numpy_invlogit(x): 16 | return 1 / (1 + np.exp(-x)) 17 | 18 | self.num_cats = 3 19 | self.num_pred = 1 20 | self.num_samples_per_cat = 100000 21 | 22 | self.alphas = np.random.randn(self.num_cats) 23 | self.betas = np.random.randn(self.num_cats, self.num_pred) 24 | #TODO: make this more efficient; right now, it's very explicit so I understand it. 25 | x_a = np.random.randn(self.num_samples_per_cat, self.num_pred) 26 | y_a = np.random.binomial(1, numpy_invlogit(self.alphas[0] + np.sum(self.betas[0] * x_a, 1))) 27 | x_b = np.random.randn(self.num_samples_per_cat, self.num_pred) 28 | y_b = np.random.binomial(1, numpy_invlogit(self.alphas[1] + np.sum(self.betas[1] * x_b, 1))) 29 | x_c = np.random.randn(self.num_samples_per_cat, self.num_pred) 30 | y_c = np.random.binomial(1, numpy_invlogit(self.alphas[2] + np.sum(self.betas[2] * x_c, 1))) 31 | 32 | X = np.concatenate([x_a, x_b, x_c]) 33 | Y = np.concatenate([y_a, y_b, y_c]) 34 | cats = np.concatenate([ 35 | np.zeros(self.num_samples_per_cat, dtype=np.int), 36 | np.ones(self.num_samples_per_cat, dtype=np.int), 37 | 2*np.ones(self.num_samples_per_cat, dtype=np.int) 38 | ]) 39 | 40 | self.X_train, self.X_test, self.cat_train, self.cat_test, self.Y_train, self.Y_test = train_test_split( 41 | X, cats, Y, test_size=0.4 42 | ) 43 | 44 | self.test_HLR = HLR() 45 | 46 | self.test_dir = tempfile.mkdtemp() 47 | 48 | def tearDown(self): 49 | shutil.rmtree(self.test_dir) 50 | 51 | 52 | class HLRFitTestCase(HLRTestCase): 53 | def test_fit_returns_correct_model(self): 54 | # Note: print is here so PyMC3 output won't overwrite the test name 55 | print("") 56 | self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train) 57 | 58 | self.assertEqual(self.num_cats, self.test_HLR.num_cats) 59 | self.assertEqual(self.num_pred, self.test_HLR.num_pred) 60 | 61 | #TODO: Figure out best way to test 62 | #np.testing.assert_almost_equal(self.alphas, self.test_HLR.advi_trace['alphas'].mean(), decimal=1) 63 | #np.testing.assert_almost_equal(self.betas, self.test_HLR.advi_trace['betas'].mean(), decimal=1) 64 | 65 | # For now, just check that the estimated parameters have the correct signs 66 | np.testing.assert_equal( 67 | np.sign(self.alphas), 68 | np.sign(self.test_HLR.advi_trace['alpha'].mean(axis=0)) 69 | ) 70 | np.testing.assert_equal( 71 | np.sign(self.betas), 72 | np.sign(self.test_HLR.advi_trace['beta'].mean(axis=0)) 73 | ) 74 | 75 | 76 | class HLRPredictProbaTestCase(HLRTestCase): 77 | def test_predict_proba_returns_probabilities(self): 78 | print("") 79 | self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train) 80 | probs = self.test_HLR.predict_proba(self.X_test, self.cat_test) 81 | self.assertEqual(probs.shape, self.Y_test.shape) 82 | 83 | def test_predict_proba_returns_probabilities_and_std(self): 84 | print("") 85 | self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train) 86 | probs, stds = self.test_HLR.predict_proba(self.X_test, self.cat_test, return_std=True) 87 | self.assertEqual(probs.shape, self.Y_test.shape) 88 | self.assertEqual(stds.shape, self.Y_test.shape) 89 | 90 | def test_predict_proba_raises_error_if_not_fit(self): 91 | with self.assertRaises(PSToolkitError) as no_fit_error: 92 | test_HLR = HLR() 93 | test_HLR.predict_proba(self.X_train, self.cat_train) 94 | 95 | expected = "Run fit on the model before predict." 96 | self.assertEqual(str(no_fit_error.exception), expected) 97 | 98 | 99 | class HLRPredictTestCase(HLRTestCase): 100 | def test_predict_returns_predictions(self): 101 | print("") 102 | self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train) 103 | preds = self.test_HLR.predict(self.X_test, self.cat_test) 104 | self.assertEqual(preds.shape, self.Y_test.shape) 105 | 106 | 107 | class HLRScoreTestCase(HLRTestCase): 108 | def test_score_scores(self): 109 | print("") 110 | self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train) 111 | score = self.test_HLR.score(self.X_test, self.Y_test, self.cat_test) 112 | naive_score = np.mean(self.Y_test) 113 | self.assertGreaterEqual(score, naive_score) 114 | 115 | 116 | class HLRSaveandLoadTestCase(HLRTestCase): 117 | def test_save_and_load_work_correctly(self): 118 | print("") 119 | self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train) 120 | probs1 = self.test_HLR.predict_proba(self.X_test, self.cat_test) 121 | self.test_HLR.save(self.test_dir) 122 | 123 | HLR2 = HLR() 124 | 125 | HLR2.load(self.test_dir) 126 | 127 | self.assertEqual(self.test_HLR.num_cats, HLR2.num_cats) 128 | self.assertEqual(self.test_HLR.num_pred, HLR2.num_pred) 129 | self.assertEqual(summary(self.test_HLR.advi_trace), summary(HLR2.advi_trace)) 130 | 131 | probs2 = HLR2.predict_proba(self.X_test, self.cat_test) 132 | 133 | np.testing.assert_almost_equal(probs2, probs1, decimal=1) 134 | -------------------------------------------------------------------------------- /ps_toolkit/visualizers/separation_plot_visualizer.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import matplotlib.pyplot as plt 6 | from sklearn.exceptions import NotFittedError 7 | 8 | from ps_toolkit.exc import PSToolkitError 9 | 10 | 11 | class SeparationPlotVisualizer(object): 12 | """A class that can create a separation plot for a set of data with a binary outcome.""" 13 | 14 | def separate_probabilities(self, probabilities, Y): 15 | """Function to separate the probabilities for events that are true and those that are false. 16 | Useful for creating a separation plot. 17 | 18 | Parameters 19 | ---------- 20 | probabilities : a numpy array, list, or tuple of the probabilities of a True outcome for all data points 21 | 22 | Y : pandas Series or 1-column DataFrame, shape [n_samples] 23 | The outcome data used to separate the data points that have outcome=True from those of outcome=False 24 | 25 | """ 26 | 27 | if type(Y)==pd.DataFrame and len(Y.columns) != 1: 28 | raise PSToolkitError("Y must be a one-column DataFrame or Series.") 29 | 30 | if len(Y) != len(probabilities): 31 | raise PSToolkitError("The probabilities and Y must be the same size.") 32 | 33 | if Y.isnull().any().any(): 34 | raise PSToolkitError("Y contains NaNs.") 35 | 36 | if type(probabilities) != np.ndarray: 37 | probabilities = np.array(probabilities) 38 | 39 | if np.isnan(probabilities).any(): 40 | raise PSToolkitError("The probabilities contains NaNs.") 41 | 42 | if not (probabilities >= 0).all() or not (probabilities <= 1).all(): 43 | raise PSToolkitError("The probabilities must be between 0 and 1.") 44 | 45 | true_probs = [] 46 | false_probs = [] 47 | 48 | if type(Y)==pd.Series: 49 | Y = pd.DataFrame(Y) 50 | 51 | for i in range(len(Y)): 52 | if Y.iloc[i][0]: 53 | true_probs.append(probabilities[i]) 54 | else: 55 | false_probs.append(probabilities[i]) 56 | 57 | self.true_probs_ = true_probs 58 | self.false_probs_ = false_probs 59 | 60 | 61 | def create_separation_plot(self): 62 | """Function to create a separation plot for a set of true probabilities and false probabilities. 63 | """ 64 | 65 | if not hasattr(self, "true_probs_") or not hasattr(self, "false_probs_"): 66 | raise NotFittedError("Call separate_probabilities before create_separation_plot") 67 | 68 | tints = [ 69 | "#f3e6ed", 70 | "#e7cedc", 71 | "#dcb5ca", 72 | "#d09db9", 73 | "#c584a7", 74 | "#b96c96", 75 | "#ad5384", 76 | "#a23a72", 77 | "#962261", 78 | "#8b0a50" 79 | ] 80 | 81 | a_heights, a_bins = np.histogram(self.true_probs_, 82 | bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] 83 | ) 84 | 85 | a_widths = a_heights/len(self.true_probs_) 86 | 87 | b_heights, b_bins = np.histogram(self.false_probs_, 88 | bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0] 89 | ) 90 | 91 | b_widths = b_heights/len(self.false_probs_) 92 | 93 | plt.subplot(2, 1, 1) 94 | 95 | left_edge=0 96 | 97 | for i in range(10): 98 | plt.bar(left_edge, 99 | 1, 100 | a_widths[i], 101 | color=tints[i], 102 | edgecolor=None, 103 | label=str(i/10)+"-"+str((i+1)/10) 104 | ) 105 | 106 | left_edge+=a_widths[i] 107 | 108 | plt.title("y=True (n={})".format(len(self.true_probs_))) 109 | 110 | plt.tick_params(axis='both', 111 | which='both', 112 | left='off', 113 | top='off', 114 | bottom = 'off', 115 | right='off', 116 | labelleft='off', 117 | labelbottom = 'off' 118 | ) 119 | 120 | plt.legend(bbox_to_anchor=(1.05, 0.7), loc=2) 121 | 122 | x1,x2,y1,y2 = plt.axis() 123 | plt.axis((0,1,y1,y2)) 124 | 125 | plt.subplot(2, 1, 2) 126 | 127 | left_edge=0 128 | 129 | for i in range(10): 130 | plt.bar(left_edge, 131 | 1, 132 | b_widths[i], 133 | color=tints[i], 134 | edgecolor=None 135 | ) 136 | 137 | left_edge+=b_widths[i] 138 | 139 | plt.title("y=False (n={})".format(len(self.false_probs_))) 140 | 141 | plt.tick_params(axis='both', 142 | which='both', 143 | left='off', 144 | top='off', 145 | bottom = 'off', 146 | right='off', 147 | labelleft='off', 148 | labelbottom = 'off') 149 | 150 | x1,x2,y1,y2 = plt.axis() 151 | plt.axis((0,1,y1,y2)) 152 | 153 | def separate_and_plot(self, probabilities, Y): 154 | """A function that combines the functionality of _separate_probabilities and _create_separation_plot. 155 | 156 | Parameters 157 | ---------- 158 | probabilities : a numpy array of the probabilities of a True outcome for all data points 159 | 160 | Y : pandas Series or 1-column DataFrame, shape [n_samples] 161 | The outcome data used to separate the data points that have outcome=True from those of outcome=False 162 | 163 | """ 164 | 165 | self.separate_probabilities(probabilities, Y) 166 | 167 | self.create_separation_plot() 168 | --------------------------------------------------------------------------------