├── tests
    ├── __init__.py
    ├── pymc3_models
    │   ├── __init__.py
    │   ├── test_bayesian_model.py
    │   └── test_HLR.py
    ├── transformers
    │   └── __init__.py
    └── visualizers
    │   └── __init__.py
├── ps_toolkit
    ├── transformers
    │   └── __init__.py
    ├── visualizers
    │   ├── __init__.py
    │   ├── roc_curve_visualizer.py
    │   └── separation_plot_visualizer.py
    ├── exc.py
    ├── __init__.py
    └── pymc3_models
    │   ├── __init__.py
    │   └── HLR.py
├── requirements_to_freeze.txt
├── requirements.txt
├── setup.py
├── README.md
└── .gitignore


/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/pymc3_models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/visualizers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/ps_toolkit/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/ps_toolkit/visualizers/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/ps_toolkit/exc.py:
--------------------------------------------------------------------------------
class PSToolkitError(Exception):
    """Exception type raised by ps_toolkit code (e.g. for invalid visualizer input)."""
3 | 


--------------------------------------------------------------------------------
/requirements_to_freeze.txt:
--------------------------------------------------------------------------------
1 | joblib
2 | matplotlib
3 | numpy
4 | pandas>=0.19
5 | pymc3
6 | scikit-learn>=0.18
7 | scipy
8 | seaborn


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | joblib==0.11
2 | matplotlib==2.0.0
3 | numpy==1.12.1
4 | pandas==0.19.2
5 | pymc3==3.2
6 | scikit-learn==0.18.1
7 | scipy==0.19.0
8 | seaborn==0.7.1
9 | 


--------------------------------------------------------------------------------
/ps_toolkit/__init__.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from __future__ import print_function
 3 | from __future__ import unicode_literals
 4 | 
 5 | 
 6 | from ps_toolkit.pymc3_models.HLR import HLR
 7 | 
 8 | from ps_toolkit.visualizers.roc_curve_visualizer import ROCCurveVisualizer
 9 | from ps_toolkit.visualizers.separation_plot_visualizer import SeparationPlotVisualizer
10 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | 
 4 | setup(
 5 |     name='PS_Toolkit',
 6 |     version='2.1.1',
 7 |     packages=find_packages(),
 8 |     include_package_data=False,
 9 |     zip_safe=False,
10 |     install_requires=[
11 |         'joblib',
12 |         'matplotlib',
13 |         'numpy',
14 |         'pandas>=0.19',
15 |         'pymc3>=3.2',
16 |         'scipy',
17 |         'seaborn',
18 |         'sklearn'
19 |     ]
20 | )
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Parsing Science Toolkit
 2 | 
 3 | A repo with custom PyMC3 models, transformers, and visualizers.
 4 | 
 5 | ## Set up
 6 | ```sh
 7 | git clone https://github.com/parsing-science/ps-toolkit.git
 8 | cd ps-toolkit
 9 | ```
10 | 
11 | ## Import the repo
12 | To use the package, add one of the lines below to your repo's `requirements.txt` and run `pip install -r requirements.txt`.
13 | 
14 | SSH: git+ssh://git@github.com/parsing-science/ps-toolkit.git#egg=PS_Toolkit
15 | 
16 | To install a specific version:
17 | 
18 | SSH: git+ssh://git@github.com/parsing-science/ps-toolkit.git@[tag]#egg=PS_Toolkit
19 | 
20 | where [tag] is a specific version tag, e.g. `v1.0.0`.
21 | 
22 | ## To run unit tests
23 | ```sh
24 | cd ps-toolkit
25 | virtualenv venv
26 | source venv/bin/activate
27 | pip install -r requirements.txt
28 | python -m unittest discover -cv
29 | ```
30 | 


--------------------------------------------------------------------------------
/tests/pymc3_models/test_bayesian_model.py:
--------------------------------------------------------------------------------
 1 | import unittest
 2 | 
 3 | from ps_toolkit.pymc3_models import BayesianModel
 4 | 
 5 | 
 6 | class BayesianModelTestCase(unittest.TestCase):
 7 |     def test_create_model_raises_not_implemented_error(self):
 8 |         with self.assertRaises(NotImplementedError):
 9 |             BM = BayesianModel()
10 |             BM.create_model()
11 | 
12 |     def test_fit_raises_not_implemented_error(self):
13 |         with self.assertRaises(NotImplementedError):
14 |             BM = BayesianModel()
15 |             BM.fit()
16 | 
17 |     def test_predict_raises_not_implemented_error(self):
18 |         with self.assertRaises(NotImplementedError):
19 |             BM = BayesianModel()
20 |             BM.predict()
21 | 
22 |     def test_score_raises_not_implemented_error(self):
23 |         with self.assertRaises(NotImplementedError):
24 |             BM = BayesianModel()
25 |             BM.score()
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | *.ipynb
72 | 
73 | # pyenv
74 | .python-version
75 | 
76 | # celery beat schedule file
77 | celerybeat-schedule
78 | 
79 | # dotenv
80 | .env
81 | 
82 | # virtualenv
83 | venv/
84 | ENV/
85 | 
86 | # Spyder project settings
87 | .spyderproject
88 | 
89 | # Rope project settings
90 | .ropeproject
91 | 
92 | # HLR models
93 | HLR_jar/
94 | 


--------------------------------------------------------------------------------
/ps_toolkit/pymc3_models/__init__.py:
--------------------------------------------------------------------------------
  1 | import joblib
  2 | import matplotlib.pyplot as plt
  3 | import pymc3 as pm
  4 | import seaborn as sns
  5 | from sklearn.base import BaseEstimator
  6 | 
  7 | 
  8 | class BayesianModel(BaseEstimator):
  9 |     """
 10 |     Bayesian model base class
 11 |     """
 12 |     def __init__(self):
 13 |         self.advi_hist = None
 14 |         self.advi_trace = None
 15 |         self.cached_model = None
 16 |         self.num_pred = None
 17 |         self.shared_vars = None
 18 | 
 19 |     def create_model(self):
 20 |         raise NotImplementedError
 21 | 
 22 |     def _set_shared_vars(self, shared_vars):
 23 |         """
 24 |         Sets theano shared variables for the PyMC3 model.
 25 |         """
 26 |         for key in shared_vars.keys():
 27 |             self.shared_vars[key].set_value(shared_vars[key])
 28 | 
 29 |     def _inference(self, minibatches, n=200000):
 30 |         """
 31 |         Runs minibatch variational ADVI and then sample from those results.
 32 | 
 33 |         Parameters
 34 |         ----------
 35 |         minibatches: minibatches for ADVI
 36 | 
 37 |         n: number of iterations for ADVI fit, defaults to 200000
 38 |         """
 39 |         with self.cached_model:
 40 |             advi = pm.ADVI()
 41 |             approx = pm.fit(
 42 |                 n=n,
 43 |                 method=advi,
 44 |                 more_replacements=minibatches,
 45 |                 callbacks=[pm.callbacks.CheckParametersConvergence()]
 46 |             )
 47 | 
 48 |         self.advi_trace = approx.sample(draws=10000)
 49 | 
 50 |         self.advi_hist = advi.hist
 51 | 
 52 |     def fit(self):
 53 |         raise NotImplementedError
 54 | 
 55 |     def predict(self):
 56 |         raise NotImplementedError
 57 | 
 58 |     def score(self):
 59 |         raise NotImplementedError
 60 | 
 61 |     def save(self, file_prefix, custom_params=None):
 62 |         """
 63 |         Saves the advi_trace and custom params to files with the given file_prefix.
 64 | 
 65 |         Parameters
 66 |         ----------
 67 |         file_prefix: str, path and prefix used to identify where to save the trace for this model.
 68 |         Ex: given file_prefix = "path/to/file/"
 69 |         This will attempt to save to "path/to/file/advi_trace.pickle"
 70 | 
 71 |         custom_params: Dictionary of custom parameters to save. Defaults to None
 72 |         """
 73 |         fileObject = open(file_prefix + 'advi_trace.pickle', 'wb')
 74 |         joblib.dump(self.advi_trace, fileObject)
 75 |         fileObject.close()
 76 | 
 77 |         if custom_params:
 78 |             fileObject = open(file_prefix + 'params.pickle', 'wb')
 79 |             joblib.dump(custom_params, fileObject)
 80 |             fileObject.close()
 81 | 
 82 |     def load(self, file_prefix, load_custom_params=False):
 83 |         """
 84 |         Loads a saved version of the advi_trace, v_params, and custom param files with the given file_prefix.
 85 | 
 86 |         Parameters
 87 |         ----------
 88 |         file_prefix: str, path and prefix used to identify where to load the saved trace for this model.
 89 |         Ex: given file_prefix = "path/to/file/"
 90 |         This will attempt to load "path/to/file/advi_trace.pickle"
 91 | 
 92 |         load_custom_params: Boolean flag to indicate whether custom parameters should be loaded. Defaults to False.
 93 | 
 94 |         Returns
 95 |         ----------
 96 |         custom_params: Dictionary of custom parameters
 97 |         """
 98 |         self.advi_trace = joblib.load(file_prefix + 'advi_trace.pickle')
 99 | 
100 |         custom_params = None
101 |         if load_custom_params:
102 |             custom_params = joblib.load(file_prefix + 'params.pickle')
103 | 
104 |         return custom_params
105 | 
106 |     def plot_elbo(self):
107 |         """
108 |         Plot the ELBO values after running ADVI minibatch.
109 |         """
110 |         sns.set_style("white")
111 |         plt.plot(-self.advi_hist)
112 |         plt.ylabel('ELBO')
113 |         plt.xlabel('iteration')
114 |         sns.despine()
115 | 


--------------------------------------------------------------------------------
/ps_toolkit/visualizers/roc_curve_visualizer.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import matplotlib.pyplot as plt
  6 | from sklearn.exceptions import NotFittedError
  7 | from sklearn.metrics import roc_curve, auc
  8 | 
  9 | from ps_toolkit.exc import PSToolkitError
 10 | 
 11 | 
 12 | class ROCCurveVisualizer(object):
 13 |     """" A class to create an ROC curve"""
 14 | 
 15 |     def calculate_roc(self, probabilities, Y, pos_label=None, sample_weight=None):
 16 |         """
 17 |         Function to calculate the ROC and AUC for a set of binary data. This function is built on top of the sklearn functions for those values.
 18 | 
 19 |         Parameters
 20 |         ----------
 21 |         probabilities : a numpy array, list, or tuple of the probabilities of a True outcome for all data points
 22 | 
 23 |         Y : pandas Series or 1-column DataFrame, shape [n_samples]
 24 |             The outcome data used to separate the data points that have outcome=True from those of outcome=False
 25 | 
 26 |         Optional from sklearn:
 27 |         pos_label : int, Label considered as positive and others are considered negative.
 28 |         
 29 |         sample_weight : array-like of shape = [n_samples], Sample weights.
 30 |         """
 31 | 
 32 |         try:
 33 |             probabilities = np.array(probabilities)
 34 |             probabilities = probabilities.squeeze()
 35 |         except:
 36 |             raise PSToolkitError("The probabilities must be castable to a numpy array.")
 37 | 
 38 |         if probabilities.ndim != 1:
 39 |             raise PSToolkitError("The probabilities must be a one dimensional numpy array, list, or tuple.")
 40 | 
 41 |         if type(Y)==pd.DataFrame and len(Y.columns) != 1:
 42 |             raise PSToolkitError("Y must be a one-column DataFrame or Series.")
 43 | 
 44 |         if len(Y) != len(probabilities):
 45 |             raise PSToolkitError("The probabilities and Y must be the same size.")
 46 | 
 47 |         if Y.isnull().any().any():
 48 |             raise PSToolkitError("Y contains NaNs.")
 49 | 
 50 |         if np.isnan(probabilities).any():
 51 |             raise PSToolkitError("The probabilities contains NaNs.")
 52 | 
 53 |         if not (probabilities >= 0).all() or not (probabilities <= 1).all():
 54 |             raise PSToolkitError("The probabilities must be between 0 and 1.")
 55 | 
 56 |         fpr, tpr, thresholds = roc_curve(Y, probabilities, pos_label, sample_weight)
 57 |         self.fpr_ = fpr
 58 |         self.tpr_ = tpr
 59 |         self.thresholds_ = thresholds
 60 | 
 61 |         roc_auc = auc(fpr, tpr)
 62 |         self.roc_auc_ = roc_auc
 63 | 
 64 |     def create_roc_curve_plot(self):
 65 |         """
 66 |         Function to plot the ROC curve for a set of binary data.
 67 |         """
 68 | 
 69 |         if not hasattr(self, "fpr_"):
 70 |             raise NotFittedError("Call calculate_roc before create_roc_curve_plot")
 71 | 
 72 |         plt.figure()
 73 |         plt.plot(self.fpr_, self.tpr_, label='ROC curve (area = %0.2f)' % self.roc_auc_)
 74 |         plt.plot([0, 1], [0, 1], 'k--')
 75 |         plt.xlim([0.0, 1.0])
 76 |         plt.ylim([0.0, 1.05])
 77 |         plt.xlabel('False Positive Rate')
 78 |         plt.ylabel('True Positive Rate')
 79 |         plt.title('Receiver operating characteristic curve')
 80 |         plt.legend(loc="lower right")
 81 |         plt.show()
 82 | 
 83 |     def calculate_and_plot(self, probabilities, Y, pos_label=None, sample_weight=None):
 84 |         """
 85 |         Function to calculate and plot the ROC curve for a set of data.
 86 | 
 87 |         Parameters
 88 |         ----------
 89 |         probabilities : a numpy array, list, or tuple of the probabilities of a True outcome for all data points
 90 | 
 91 |         Y : pandas Series or 1-column DataFrame, shape [n_samples]
 92 |             The outcome data used to separate the data points that have outcome=True from those of outcome=False
 93 | 
 94 |         Optional from sklearn:
 95 |         pos_label : int, Label considered as positive and others are considered negative.
 96 |         
 97 |         sample_weight : array-like of shape = [n_samples], Sample weights.
 98 |         """
 99 | 
100 |         self.calculate_roc(probabilities, Y, pos_label, sample_weight)
101 | 
102 |         self.create_roc_curve_plot()
103 | 


--------------------------------------------------------------------------------
/ps_toolkit/pymc3_models/HLR.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pymc3 as pm
  3 | from sklearn.metrics import accuracy_score
  4 | import theano
  5 | import theano.tensor as T
  6 | 
  7 | from ps_toolkit.exc import PSToolkitError
  8 | from ps_toolkit.pymc3_models import BayesianModel
  9 | 
 10 | 
 11 | class HLR(BayesianModel):
 12 |     """
 13 |     Custom Hierarchical Logistic Regression built using PyMC3.
 14 |     """
 15 | 
 16 |     def __init__(self):
 17 |         super(HLR, self).__init__()
 18 |         self.num_cats = None
 19 | 
 20 |     def create_model(self):
 21 |         """
 22 |         Creates and returns the PyMC3 model.
 23 | 
 24 |         Returns the model.
 25 |         """
 26 |         model_input = theano.shared(np.zeros([1, self.num_pred]))
 27 | 
 28 |         model_output = theano.shared(np.zeros(1))
 29 | 
 30 |         model_cats = theano.shared(np.zeros(1, dtype='int'))
 31 | 
 32 |         self.shared_vars = {
 33 |             'model_input': model_input,
 34 |             'model_output': model_output,
 35 |             'model_cats': model_cats
 36 |         }
 37 | 
 38 |         model = pm.Model()
 39 | 
 40 |         with model:
 41 |             mu_alpha = pm.Normal('mu_alpha', mu=0, sd=100)
 42 |             sigma_alpha = pm.HalfNormal('sigma_alpha', sd=100)
 43 | 
 44 |             mu_beta = pm.Normal('mu_beta', mu=0, sd=100)
 45 |             sigma_beta = pm.HalfNormal('sigma_beta', sd=100)
 46 | 
 47 |             alpha = pm.Normal('alpha', mu=mu_alpha, sd=sigma_alpha, shape=(self.num_cats,))
 48 |             beta = pm.Normal('beta', mu=mu_beta, sd=sigma_beta, shape=(self.num_cats, self.num_pred))
 49 | 
 50 |             c = model_cats
 51 | 
 52 |             temp = alpha[c] + T.sum(beta[c] * model_input, 1)
 53 | 
 54 |             p = pm.invlogit(temp)
 55 | 
 56 |             o = pm.Bernoulli('o', p, observed=model_output)
 57 | 
 58 |         return model
 59 | 
 60 |     def fit(self, X, y, cats, n=200000, batch_size=100):
 61 |         """
 62 |         Train the HLR model
 63 | 
 64 |         Parameters
 65 |         ----------
 66 |         X : numpy array, shape [n_samples, n_features]
 67 | 
 68 |         y : numpy array, shape [n_samples, ]
 69 | 
 70 |         cats: numpy array, shape [n_samples, ]
 71 | 
 72 |         n: number of iterations for ADVI fit, defaults to 200000
 73 | 
 74 |         batch_size: number of samples to include in each minibatch for ADVI, defaults to 100
 75 |         """
 76 |         self.num_cats = len(np.unique(cats))
 77 |         num_samples, self.num_pred = X.shape
 78 | 
 79 |         if self.cached_model is None:
 80 |             self.cached_model = self.create_model()
 81 | 
 82 |         with self.cached_model:
 83 | 
 84 |             minibatches = {
 85 |                 self.shared_vars['model_input']: pm.Minibatch(X, batch_size=batch_size),
 86 |                 self.shared_vars['model_output']: pm.Minibatch(y, batch_size=batch_size),
 87 |                 self.shared_vars['model_cats']: pm.Minibatch(cats, batch_size=batch_size)
 88 |             }
 89 | 
 90 |             self._inference(minibatches, n)
 91 | 
 92 |         return self
 93 | 
 94 |     def predict_proba(self, X, cats, return_std=False):
 95 |         """
 96 |         Predicts probabilities of new data with a trained HLR
 97 | 
 98 |         Parameters
 99 |         ----------
100 |         X : numpy array, shape [n_samples, n_features]
101 | 
102 |         cats: numpy array, shape [n_samples, ]
103 | 
104 |         return_std: Boolean flag of whether to return standard deviations with mean probabilities. Defaults to False.
105 |         """
106 | 
107 |         if self.advi_trace is None:
108 |             raise PSToolkitError('Run fit on the model before predict.')
109 | 
110 |         num_samples = X.shape[0]
111 | 
112 |         if self.cached_model is None:
113 |             self.cached_model = self.create_model()
114 | 
115 |         self._set_shared_vars({'model_input': X, 'model_output': np.zeros(num_samples), 'model_cats': cats})
116 | 
117 |         ppc = pm.sample_ppc(self.advi_trace, model=self.cached_model, samples=2000)
118 | 
119 |         if return_std:
120 |             return ppc['o'].mean(axis=0), ppc['o'].std(axis=0)
121 |         else:
122 |             return ppc['o'].mean(axis=0)
123 | 
124 |     def predict(self, X, cats):
125 |         """
126 |         Predicts labels of new data with a trained model
127 | 
128 |         Parameters
129 |         ----------
130 |         X : numpy array, shape [n_samples, n_features]
131 | 
132 |         cats: numpy array, shape [n_samples, ]
133 |         """
134 |         ppc_mean = self.predict_proba(X, cats)
135 | 
136 |         pred = ppc_mean > 0.5
137 | 
138 |         return pred
139 | 
140 |     def score(self, X, y, cats):
141 |         """
142 |         Scores new data with a trained model.
143 | 
144 |         Parameters
145 |         ----------
146 |         X : numpy array, shape [n_samples, n_features]
147 | 
148 |         y : numpy array, shape [n_samples, ]
149 | 
150 |         cats: numpy array, shape [n_samples, ]
151 |         """
152 | 
153 |         return accuracy_score(y, self.predict(X, cats))
154 | 
155 |     def save(self, file_prefix):
156 |         params = {'num_cats': self.num_cats, 'num_pred': self.num_pred}
157 | 
158 |         super(HLR, self).save(file_prefix, params)
159 | 
160 |     def load(self, file_prefix):
161 |         params = super(HLR, self).load(file_prefix, load_custom_params=True)
162 | 
163 |         self.num_cats = params['num_cats']
164 |         self.num_pred = params['num_pred']
165 | 


--------------------------------------------------------------------------------
/tests/pymc3_models/test_HLR.py:
--------------------------------------------------------------------------------
  1 | import shutil
  2 | import tempfile
  3 | import unittest
  4 | 
  5 | import numpy as np
  6 | from pymc3 import summary
  7 | from sklearn.model_selection import train_test_split
  8 | 
  9 | from ps_toolkit.exc import PSToolkitError
 10 | from ps_toolkit import HLR
 11 | 
 12 | 
 13 | class HLRTestCase(unittest.TestCase):
 14 |     def setUp(self):
 15 |         def numpy_invlogit(x):
 16 |             return 1 / (1 + np.exp(-x))
 17 | 
 18 |         self.num_cats = 3
 19 |         self.num_pred = 1
 20 |         self.num_samples_per_cat = 100000
 21 | 
 22 |         self.alphas = np.random.randn(self.num_cats)
 23 |         self.betas = np.random.randn(self.num_cats, self.num_pred)
 24 |         #TODO: make this more efficient; right now, it's very explicit so I understand it.
 25 |         x_a = np.random.randn(self.num_samples_per_cat, self.num_pred)
 26 |         y_a = np.random.binomial(1, numpy_invlogit(self.alphas[0] + np.sum(self.betas[0] * x_a, 1)))
 27 |         x_b = np.random.randn(self.num_samples_per_cat, self.num_pred)
 28 |         y_b = np.random.binomial(1, numpy_invlogit(self.alphas[1] + np.sum(self.betas[1] * x_b, 1)))
 29 |         x_c = np.random.randn(self.num_samples_per_cat, self.num_pred)
 30 |         y_c = np.random.binomial(1, numpy_invlogit(self.alphas[2] + np.sum(self.betas[2] * x_c, 1)))
 31 | 
 32 |         X = np.concatenate([x_a, x_b, x_c])
 33 |         Y = np.concatenate([y_a, y_b, y_c])
 34 |         cats = np.concatenate([
 35 |             np.zeros(self.num_samples_per_cat, dtype=np.int),
 36 |             np.ones(self.num_samples_per_cat, dtype=np.int),
 37 |             2*np.ones(self.num_samples_per_cat, dtype=np.int)
 38 |         ])
 39 | 
 40 |         self.X_train, self.X_test, self.cat_train, self.cat_test, self.Y_train, self.Y_test = train_test_split(
 41 |             X, cats, Y, test_size=0.4
 42 |         )
 43 | 
 44 |         self.test_HLR = HLR()
 45 | 
 46 |         self.test_dir = tempfile.mkdtemp()
 47 | 
 48 |     def tearDown(self):
 49 |         shutil.rmtree(self.test_dir)
 50 | 
 51 | 
 52 | class HLRFitTestCase(HLRTestCase):
 53 |     def test_fit_returns_correct_model(self):
 54 |         # Note: print is here so PyMC3 output won't overwrite the test name
 55 |         print("")
 56 |         self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train)
 57 | 
 58 |         self.assertEqual(self.num_cats, self.test_HLR.num_cats)
 59 |         self.assertEqual(self.num_pred, self.test_HLR.num_pred)
 60 | 
 61 |         #TODO: Figure out best way to test
 62 |         #np.testing.assert_almost_equal(self.alphas, self.test_HLR.advi_trace['alphas'].mean(), decimal=1)
 63 |         #np.testing.assert_almost_equal(self.betas, self.test_HLR.advi_trace['betas'].mean(), decimal=1)
 64 | 
 65 |         # For now, just check that the estimated parameters have the correct signs
 66 |         np.testing.assert_equal(
 67 |             np.sign(self.alphas),
 68 |             np.sign(self.test_HLR.advi_trace['alpha'].mean(axis=0))
 69 |         )
 70 |         np.testing.assert_equal(
 71 |             np.sign(self.betas),
 72 |             np.sign(self.test_HLR.advi_trace['beta'].mean(axis=0))
 73 |         )
 74 | 
 75 | 
 76 | class HLRPredictProbaTestCase(HLRTestCase):
 77 |     def test_predict_proba_returns_probabilities(self):
 78 |         print("")
 79 |         self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train)
 80 |         probs = self.test_HLR.predict_proba(self.X_test, self.cat_test)
 81 |         self.assertEqual(probs.shape, self.Y_test.shape)
 82 | 
 83 |     def test_predict_proba_returns_probabilities_and_std(self):
 84 |         print("")
 85 |         self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train)
 86 |         probs, stds = self.test_HLR.predict_proba(self.X_test, self.cat_test, return_std=True)
 87 |         self.assertEqual(probs.shape, self.Y_test.shape)
 88 |         self.assertEqual(stds.shape, self.Y_test.shape)
 89 | 
 90 |     def test_predict_proba_raises_error_if_not_fit(self):
 91 |         with self.assertRaises(PSToolkitError) as no_fit_error:
 92 |             test_HLR = HLR()
 93 |             test_HLR.predict_proba(self.X_train, self.cat_train)
 94 | 
 95 |         expected = "Run fit on the model before predict."
 96 |         self.assertEqual(str(no_fit_error.exception), expected)
 97 | 
 98 | 
 99 | class HLRPredictTestCase(HLRTestCase):
100 |     def test_predict_returns_predictions(self):
101 |         print("")
102 |         self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train)
103 |         preds = self.test_HLR.predict(self.X_test, self.cat_test)
104 |         self.assertEqual(preds.shape, self.Y_test.shape)
105 | 
106 | 
class HLRScoreTestCase(HLRTestCase):
    """Checks that the fitted model scores at least as well as a naive baseline."""

    def test_score_scores(self):
        print("")
        model = self.test_HLR
        model.fit(self.X_train, self.Y_train, self.cat_train)
        accuracy = model.score(self.X_test, self.Y_test, self.cat_test)
        # Baseline: the mean of the test labels.
        baseline = np.mean(self.Y_test)
        self.assertGreaterEqual(accuracy, baseline)
114 | 
115 | 
class HLRSaveandLoadTestCase(HLRTestCase):
    """Checks that a saved model can be loaded into a fresh HLR and predicts alike."""

    def test_save_and_load_work_correctly(self):
        import os  # local import keeps this test self-contained

        print("")
        self.test_HLR.fit(self.X_train, self.Y_train, self.cat_train)
        probs1 = self.test_HLR.predict_proba(self.X_test, self.cat_test)

        # save/load concatenate the prefix with the file name, so the prefix needs a
        # trailing separator for the pickles to land inside the temp dir that
        # tearDown removes (instead of next to it).
        file_prefix = os.path.join(self.test_dir, '')
        self.test_HLR.save(file_prefix)

        HLR2 = HLR()

        HLR2.load(file_prefix)

        self.assertEqual(self.test_HLR.num_cats, HLR2.num_cats)
        self.assertEqual(self.test_HLR.num_pred, HLR2.num_pred)
        # NOTE(review): confirm summary() returns values assertEqual can compare
        # under the pinned pymc3 version.
        self.assertEqual(summary(self.test_HLR.advi_trace), summary(HLR2.advi_trace))

        probs2 = HLR2.predict_proba(self.X_test, self.cat_test)

        np.testing.assert_almost_equal(probs2, probs1, decimal=1)
134 | 


--------------------------------------------------------------------------------
/ps_toolkit/visualizers/separation_plot_visualizer.py:
--------------------------------------------------------------------------------
  1 | from __future__ import division
  2 | 
  3 | import numpy as np
  4 | import pandas as pd
  5 | import matplotlib.pyplot as plt
  6 | from sklearn.exceptions import NotFittedError
  7 | 
  8 | from ps_toolkit.exc import PSToolkitError
  9 | 
 10 | 
 11 | class SeparationPlotVisualizer(object):
 12 |     """A class that can create a separation plot for a set of data with a binary outcome."""
 13 | 
 14 |     def separate_probabilities(self, probabilities, Y):
 15 |         """Function to separate the probabilities for events that are true and those that are false.
 16 |             Useful for creating a separation plot.
 17 | 
 18 |         Parameters
 19 |         ----------
 20 |         probabilities : a numpy array, list, or tuple of the probabilities of a True outcome for all data points
 21 | 
 22 |         Y : pandas Series or 1-column DataFrame, shape [n_samples]
 23 |             The outcome data used to separate the data points that have outcome=True from those of outcome=False
 24 | 
 25 |         """
 26 | 
 27 |         if type(Y)==pd.DataFrame and len(Y.columns) != 1:
 28 |             raise PSToolkitError("Y must be a one-column DataFrame or Series.")
 29 | 
 30 |         if len(Y) != len(probabilities):
 31 |             raise PSToolkitError("The probabilities and Y must be the same size.")
 32 | 
 33 |         if Y.isnull().any().any():
 34 |             raise PSToolkitError("Y contains NaNs.")
 35 | 
 36 |         if type(probabilities) != np.ndarray:
 37 |             probabilities = np.array(probabilities)
 38 | 
 39 |         if np.isnan(probabilities).any():
 40 |             raise PSToolkitError("The probabilities contains NaNs.")
 41 | 
 42 |         if not (probabilities >= 0).all() or not (probabilities <= 1).all():
 43 |             raise PSToolkitError("The probabilities must be between 0 and 1.")
 44 | 
 45 |         true_probs = []
 46 |         false_probs = []
 47 | 
 48 |         if type(Y)==pd.Series:
 49 |             Y = pd.DataFrame(Y)
 50 | 
 51 |         for i in range(len(Y)):
 52 |             if Y.iloc[i][0]:
 53 |                 true_probs.append(probabilities[i])
 54 |             else:
 55 |                 false_probs.append(probabilities[i])
 56 | 
 57 |         self.true_probs_ = true_probs
 58 |         self.false_probs_ = false_probs
 59 |      
 60 | 
 61 |     def create_separation_plot(self):
 62 |         """Function to create a separation plot for a set of true probabilities and false probabilities.
 63 |         """
 64 | 
 65 |         if not hasattr(self, "true_probs_") or not hasattr(self, "false_probs_"):
 66 |             raise NotFittedError("Call separate_probabilities before create_separation_plot")
 67 | 
 68 |         tints = [
 69 |             "#f3e6ed", 
 70 |             "#e7cedc", 
 71 |             "#dcb5ca", 
 72 |             "#d09db9", 
 73 |             "#c584a7", 
 74 |             "#b96c96", 
 75 |             "#ad5384", 
 76 |             "#a23a72",  
 77 |             "#962261", 
 78 |             "#8b0a50"
 79 |         ]
 80 | 
 81 |         a_heights, a_bins = np.histogram(self.true_probs_, 
 82 |             bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
 83 |         )
 84 |  
 85 |         a_widths = a_heights/len(self.true_probs_)
 86 |      
 87 |         b_heights, b_bins = np.histogram(self.false_probs_, 
 88 |             bins=[0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
 89 |         )
 90 |      
 91 |         b_widths = b_heights/len(self.false_probs_)
 92 | 
 93 |         plt.subplot(2, 1, 1)
 94 | 
 95 |         left_edge=0
 96 | 
 97 |         for i in range(10):
 98 |             plt.bar(left_edge, 
 99 |                 1, 
100 |                 a_widths[i], 
101 |                 color=tints[i], 
102 |                 edgecolor=None, 
103 |                 label=str(i/10)+"-"+str((i+1)/10)
104 |             )
105 | 
106 |             left_edge+=a_widths[i]
107 |      
108 |         plt.title("y=True (n={})".format(len(self.true_probs_)))
109 | 
110 |         plt.tick_params(axis='both',
111 |             which='both', 
112 |             left='off', 
113 |             top='off', 
114 |             bottom = 'off', 
115 |             right='off', 
116 |             labelleft='off', 
117 |             labelbottom = 'off'
118 |         )
119 | 
120 |         plt.legend(bbox_to_anchor=(1.05, 0.7), loc=2)   
121 | 
122 |         x1,x2,y1,y2 = plt.axis()
123 |         plt.axis((0,1,y1,y2))
124 | 
125 |         plt.subplot(2, 1, 2)
126 | 
127 |         left_edge=0
128 | 
129 |         for i in range(10):
130 |             plt.bar(left_edge, 
131 |                 1, 
132 |                 b_widths[i], 
133 |                 color=tints[i], 
134 |                 edgecolor=None
135 |             )
136 | 
137 |             left_edge+=b_widths[i]
138 | 
139 |         plt.title("y=False (n={})".format(len(self.false_probs_)))
140 | 
141 |         plt.tick_params(axis='both',
142 |             which='both', 
143 |             left='off', 
144 |             top='off', 
145 |             bottom = 'off', 
146 |             right='off', 
147 |             labelleft='off', 
148 |             labelbottom = 'off')
149 | 
150 |         x1,x2,y1,y2 = plt.axis()
151 |         plt.axis((0,1,y1,y2))
152 | 
153 |     def separate_and_plot(self, probabilities, Y):
154 |         """A function that combines the functionality of _separate_probabilities and _create_separation_plot.
155 | 
156 |         Parameters
157 |         ----------
158 |         probabilities : a numpy array of the probabilities of a True outcome for all data points
159 | 
160 |         Y : pandas Series or 1-column DataFrame, shape [n_samples]
161 |             The outcome data used to separate the data points that have outcome=True from those of outcome=False
162 | 
163 |         """
164 | 
165 |         self.separate_probabilities(probabilities, Y)
166 | 
167 |         self.create_separation_plot()
168 | 


--------------------------------------------------------------------------------