├── .gitignore ├── .travis.yml ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── bdranalytics ├── __init__.py ├── images │ ├── __init__.py │ └── bdr.gif ├── keras │ ├── __init__.py │ ├── callbacks.py │ ├── generators.py │ ├── layers.py │ └── tests │ │ ├── __init__.py │ │ └── test_generators.py ├── pdlearn │ ├── __init__.py │ ├── pipeline.py │ ├── preprocessing.py │ └── tests │ │ ├── __init__.py │ │ ├── test_pipeline.py │ │ └── test_preprocessing.py ├── plot │ ├── __init__.py │ ├── classification.py │ └── tests │ │ └── __init__.py └── sklearn │ ├── __init__.py │ ├── encoders.py │ ├── model_selection.py │ ├── preprocessing │ ├── __init__.py │ ├── encoding.py │ ├── preprocessing.py │ ├── scaling.py │ └── tests │ │ ├── __init__.py │ │ ├── test_encoding.py │ │ └── test_scaling.py │ └── tests │ ├── __init__.py │ └── test_model_selection.py ├── data ├── recruit.dat ├── soi.dat ├── soi_description.txt └── test.dat ├── doc ├── Makefile ├── conf.py ├── index.rst └── push_to_pages.sh ├── environment-dev.yml ├── environment.yml ├── notebooks ├── .gitkeep ├── Spark Cross Sell Frequent Pairs.ipynb ├── bdr-imbalanced-classification.ipynb ├── bdr-regression.ipynb ├── bdr-timeseries-classic-approach.ipynb └── bdr-timeseries-neuralnetwork-lstm.ipynb ├── requirements-dev.txt ├── requirements.txt ├── sample └── __init__.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Documentation 2 | doc/_build 3 | doc/source 4 | 5 | # Python / Anaconda 6 | *.egg-info 7 | .eggs 8 | *.iml 9 | *.pyc 10 | 11 | # IntelliJ / PyCharm 12 | /.idea 13 | /build 14 | /src 15 | /dist 16 | 17 | # Jupyter notebook 18 | .ipynb_checkpoints 19 | 20 | # Joblib 21 | .cache 22 | 23 | # OS X 24 | .DS_Store 25 | 26 | # Tests 27 | .pytest_cache/ 28 | 29 | # Tensorflow 30 | tensorboardlog 31 | notebooks/logs/ 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.6' 4 | before_install: 5 | - sudo apt-get update 6 | - sudo apt-get install -qq python-numpy python-scipy 7 | install: pip install -r requirements-dev.txt 8 | script: cd doc && make html && cd .. 9 | after_success: 10 | - "doc/push_to_pages.sh" 11 | deploy: 12 | provider: pypi 13 | user: bigdatarepublic 14 | password: 15 | secure: c8287YylQ5wz4WJMk6DrOlstCuDbao05LpKDF2mMzTRXjwOxNGatexq04frciizuupUkKP1ui9JdOYOEgFI0pYFffmXuqmiXkIK9MG9U4AJV3CH9URV81VmgGgsN1rbHcFGKvOrxelDSY6TqQYMaHt5JCGFcPUxpvkE76KAssmzL9wYavqDuHRKhjTcYiNqw22u7V1i3Cp/7zFzBHYz0BlkzzbkWPuphMnxiTsz+HE9bTqa7Jwj1pduyAuwdDSkAVcGUSjc3GZifzY5rD77vOuovgCCtD0aY9hj3YHV4oY6+4ErkbpJkuF9urkMQV1FPgoOb7YrjxBzXTnXZWzx2E4sXlzQYgXsLF5bI+5+qPwQXtInarFSH/QGyRhTppg9RT2ItX2rri+Do3biEunRtxd3pZSaZP7I2fSOYNxnvrcx8/qC1s8X8P6lLEKkb6hH3aRMeO8L2e/X0vu/4nVua09yW1e0QZzJN6GYUe8N+vA4OQNlL6NW2Mx6GvrKSlUI0l2jAVjvJk6N9HdGEjQeKzGqtqC7FHsg49sBzZRkqmVpV+UbJWHQ9B43jtg0dFE0lf/F4VdUMoD1A6GjZGMatV5NP5/Jb1gCWsU9vyPPxcDVa/N6GxAikjHyMIJM84NzNFjFJZOHK9aHOqz9B0tqS2ZfICbrRm4eRb8FgsqXcJaI= 16 | on: 17 | branch: master 18 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 BigData Republic 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include bdranalytics/images/bdr.gif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI version](https://badge.fury.io/py/bdranalytics.svg)](https://badge.fury.io/py/bdranalytics) 2 | [![Build Status](https://travis-ci.org/BigDataRepublic/bdr-analytics-py.svg?branch=master)](https://travis-ci.org/BigDataRepublic/bdr-analytics-py) 3 | 4 | # BigData Republic Analytics (Python) 5 | Our analytics library to quickly get our data scientists up to speed on the Python platform. 6 | 7 | User documentation can be found at https://bigdatarepublic.github.io/bdr-analytics-py/ 8 | 9 | ## Installation 10 | 11 | Installation is done through the pip command-line utility. 12 | 13 | ``` 14 | pip install bdranalytics 15 | ``` 16 | 17 | ## Using the Spark notebooks 18 | Some notebooks in the `notebooks` folder use Spark. Check the [Spark documentation](http://spark.apache.org/docs/2.0.1/programming-guide.html) for running Jupyter with a Spark context. 19 | 20 | In short, for **Windows**: 21 | ``` 22 | set PYSPARK_DRIVER_PYTHON_OPTS=notebook 23 | set PYSPARK_DRIVER_PYTHON=jupyter 24 | [spark_install_dir]\bin\pyspark 25 | ``` 26 | 27 | And for **\*nix**: 28 | ``` 29 | export PYSPARK_DRIVER_PYTHON_OPTS=notebook 30 | export PYSPARK_DRIVER_PYTHON=jupyter 31 | [spark_install_dir]/bin/pyspark 32 | ``` 33 | 34 | ## Contributing 35 | To contribute, please fork or branch from `master` and submit a pull request. 36 | Guidelines for an acceptable pull request: 37 | 38 | - PEP8-compliant code. 39 | - At least one line of documentation per class, function and method. 40 | - Tests covering edge cases of your code (see the example below).
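For example, to run only the tests for the subpackage you are changing (this assumes `pytest` is available in your development environment, as it is already used by the existing test modules):

```
pytest bdranalytics/pdlearn/tests
```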
41 | 42 | ### Development environment 43 | To create the development environment with conda, run: 44 | 45 | > conda env create -f environment-dev.yml 46 | 47 | > source activate bdranalytics-dev 48 | 49 | ### Running the tests 50 | 51 | To run all tests: 52 | > source activate bdranalytics-dev 53 | > python setup.py test 54 | 55 | ### Creating a package dist 56 | 57 | To create a dist from a local checkout (when developing on this module): 58 | > source activate bdranalytics-dev 59 | > python setup.py sdist 60 | 61 | ### Running the installation script 62 | This uses the setup.py script directly, which is useful for testing how the dist will be installed without actually creating it. 63 | 64 | To just install the package and its main dependencies from a local checkout (when going to use this module): 65 | > python setup.py install 66 | 67 | ### Creating the Sphinx documentation 68 | 69 | To update the HTML files: 70 | ``` 71 | source activate bdranalytics-dev 72 | cd doc 73 | make clean && make source && make html 74 | ``` 75 | -------------------------------------------------------------------------------- /bdranalytics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/__init__.py -------------------------------------------------------------------------------- /bdranalytics/images/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/images/__init__.py -------------------------------------------------------------------------------- /bdranalytics/images/bdr.gif: -------------------------------------------------------------------------------- 1 | TODO: our logo -------------------------------------------------------------------------------- /bdranalytics/keras/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/keras/__init__.py -------------------------------------------------------------------------------- /bdranalytics/keras/callbacks.py: -------------------------------------------------------------------------------- 1 | from keras.callbacks import Callback 2 | from keras.models import Sequential 3 | from sklearn.metrics import roc_auc_score, average_precision_score, \ 4 | precision_score, recall_score 5 | 6 | 7 | class EpochEvaluation(Callback):  # scores a validation set (ROC AUC, PR AUC, recall, precision) after every epoch 8 | def __init__(self, validation_data=()): 9 | super(EpochEvaluation, self).__init__() 10 | self.X_val, self.y_val = validation_data 11 | self.metrics = {} 12 | 13 | def on_epoch_begin(self, epoch, logs={}): 14 | if epoch > 0: 15 | print(" - ".join(["val_{:s}: {:.4f}".format(k, v) 16 | for k, v in self.metrics.items()])) 17 | 18 | def on_epoch_end(self, epoch, logs={}): 19 | if isinstance(self.model, Sequential): 20 | predict = self.model.predict_proba 21 | else: 22 | predict = self.model.predict 23 | 24 | y_pred = predict(self.X_val, verbose=0) 25 | y_pred_bin = y_pred > 0.5 26 | 27 | y_true = self.y_val 28 | self.metrics['roc_auc'] = roc_auc_score(y_true, y_pred) 29 | self.metrics['pr_auc'] = average_precision_score( 30 | y_true, y_pred, average="micro") 31 | self.metrics['recall'] = recall_score(y_true, y_pred_bin) 32 | self.metrics['precision'] = precision_score(y_true, y_pred_bin) 33 |
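# Usage sketch (illustrative addition, not part of the original file): how the
# EpochEvaluation callback might be attached to a Keras model. The toy model
# and the random data below are hypothetical placeholders.
if __name__ == '__main__':
    import numpy as np
    from keras.layers import Dense

    X_train, y_train = np.random.rand(200, 10), (np.random.rand(200, 1) > 0.5) * 1.
    X_val, y_val = np.random.rand(50, 10), (np.random.rand(50, 1) > 0.5) * 1.

    model = Sequential()
    model.add(Dense(1, activation='sigmoid', input_shape=(10,)))
    model.compile(loss='binary_crossentropy', optimizer='adam')

    # the callback scores the validation set at the end of each epoch and
    # prints the collected metrics at the start of the next epoch
    model.fit(X_train, y_train, epochs=3, verbose=0,
              callbacks=[EpochEvaluation(validation_data=(X_val, y_val))])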
-------------------------------------------------------------------------------- /bdranalytics/keras/generators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.preprocessing.image import Iterator 3 | 4 | 5 | class StratifiedIndexGenerator: 6 | """ 7 | Stratified index generator. 8 | """ 9 | 10 | def __init__(self, shuffle=True): 11 | self.shuffle = shuffle 12 | 13 | def flow(self, strata=None, batch_size=128, strata_weights=None): 14 | """ 15 | Batch generator function. 16 | 17 | :param strata: vector with n_samples that denotes the strata 18 | :param batch_size: number of samples in a batch 19 | :param strata_weights: dictionary with strata weights, should sum to 1.0 20 | :return: an iterator that yields sample indices 21 | """ 22 | 23 | strata = strata.ravel() 24 | strata_labels = np.unique(strata) 25 | n_strata = len(strata_labels) 26 | n_samples_total = len(strata) # total number of samples 27 | indices = np.arange(n_samples_total) 28 | 29 | generators = {} 30 | surplus = {} 31 | 32 | if strata_weights is None: 33 | strata_weights = {} 34 | 35 | for stratum_label in strata_labels: 36 | mask = (strata == stratum_label) 37 | generators[stratum_label] = self.sample_generator(indices[mask]) 38 | surplus[stratum_label] = 0. 39 | if strata_weights.get(stratum_label) is None: 40 | strata_weights[stratum_label] = 1 / n_strata 41 | 42 | if sum(strata_weights.values()) != 1.0: 43 | raise ValueError("strata weights should sum to 1.0") 44 | 45 | # preallocate 46 | return_indices = np.empty(batch_size, dtype=int) 47 | 48 | while True: 49 | 50 | # shift labels array to make sure every label is last label equal amount of times 51 | # better dispersed surplus 52 | strata_labels = np.roll(strata_labels, 1) 53 | 54 | # reset total sample counter 55 | i_sample = 0 56 | for stratum_label in strata_labels: 57 | 58 | # float indicating number of samples to draw from this stratum 59 | n_samples_float = ( 60 | batch_size * strata_weights[stratum_label]) - surplus[stratum_label] 61 | 62 | # exception when reaching last stratum 63 | if stratum_label == strata_labels[-1]: 64 | n_samples = batch_size - i_sample 65 | else: 66 | n_samples = round(n_samples_float) 67 | 68 | # store remainder 69 | surplus[stratum_label] = (1. * n_samples) - n_samples_float 70 | 71 | if n_samples == 0: 72 | continue 73 | 74 | # draw samples from generator 75 | for _ in range(n_samples): 76 | return_indices[i_sample] = next(generators[stratum_label]) 77 | i_sample += 1 # increment total sample counter 78 | 79 | # yield result 80 | yield return_indices 81 | 82 | def sample_generator(self, indices): 83 | """ 84 | Basic single element generator from a list, in shuffled order. 85 | 86 | :param indices: list of indices to yield 87 | :return: a generator 88 | """ 89 | while True: 90 | if self.shuffle: 91 | indices = np.random.permutation(indices) 92 | for selected_row in indices: 93 | yield selected_row 94 | 95 | 96 | class DataGenerator: 97 | """ 98 | Keras-API compatible data generator class for in-memory (X, y) samples. 99 | Comparable to keras.preprocessing.image.ImageDataGenerator 100 | """ 101 | 102 | def __init__(self): 103 | pass 104 | 105 | def flow(self, X, y, batch_size=128, seed=42, shuffle=True, strata=None, strata_weights=None): 106 | """ 107 | Returns a data iterator that can be looped over to return batches. 
108 | 109 | :param X: array-like, input data 110 | :param y: array-like, target data 111 | :param batch_size: int, number of samples in the batch 112 | :param seed: int, seed for randomness, set globally 113 | :param shuffle: bool, whether to shuffle the dataset 114 | :param strata: array-like, size n_samples that denotes the subpopulation (stratum) ID, which 115 | is sampled independently. 116 | :param strata_weights: dictionary, containing strata weights, should sum to 1.0 117 | :return: an iterator 118 | """ 119 | return DataIterator(X, y, batch_size=batch_size, n=X.shape[0], seed=seed, shuffle=shuffle, 120 | strata=strata, strata_weights=strata_weights) 121 | 122 | 123 | class DataIterator(Iterator): 124 | """ 125 | Data iterator stratification capability. Keras-API compatible. 126 | Comparable to keras.preprocessing.image.NumpyDataIterator 127 | """ 128 | 129 | def __init__(self, X, y, strata=None, strata_weights=None, batch_size=128, shuffle=True, **kwargs): 130 | self.X = X 131 | self.y = y 132 | self.strata = strata 133 | self.strata_weights = strata_weights 134 | 135 | super(DataIterator, self).__init__( 136 | batch_size=batch_size, shuffle=shuffle, **kwargs) 137 | 138 | if self.strata is not None: 139 | self.index_generator = StratifiedIndexGenerator(shuffle=shuffle).flow(batch_size=batch_size, 140 | strata=self.strata, 141 | strata_weights=self.strata_weights) 142 | 143 | def _get_batches_of_transformed_samples(self, index_array): 144 | return self.X[index_array, ], self.y[index_array, ] 145 | 146 | def next(self): 147 | with self.lock: 148 | index_array = next(self.index_generator) 149 | return self._get_batches_of_transformed_samples(index_array) 150 | -------------------------------------------------------------------------------- /bdranalytics/keras/layers.py: -------------------------------------------------------------------------------- 1 | from keras.engine import Layer, InputSpec 2 | from keras.layers import Flatten 3 | import tensorflow as tf 4 | 5 | 6 | class KMaxPooling(Layer): 7 | """ 8 | K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension). 9 | TensorFlow backend. 
10 | """ 11 | 12 | def __init__(self, k=1, **kwargs): 13 | super().__init__(**kwargs) 14 | self.input_spec = InputSpec(ndim=3) 15 | self.k = k 16 | 17 | def compute_output_shape(self, input_shape): 18 | return input_shape[0], (input_shape[2] * self.k) 19 | 20 | def call(self, inputs, **kwargs): 21 | # swap last two dimensions since top_k will be applied along the last dimension 22 | shifted_input = tf.transpose(inputs, [0, 2, 1]) 23 | 24 | # extract top_k, returns two tensors [values, indices] 25 | top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0] 26 | 27 | # return flattened output 28 | return Flatten()(top_k) 29 | -------------------------------------------------------------------------------- /bdranalytics/keras/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/keras/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/keras/tests/test_generators.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from bdranalytics.keras.generators import * 3 | import numpy as np 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.preprocessing.image import ImageDataGenerator 7 | 8 | 9 | @pytest.yield_fixture(scope='class') 10 | def params(request): 11 | request.cls.batch_size = 3 12 | request.cls.X = np.random.rand(100, 10) 13 | request.cls.X_image = np.random.rand(100, 1, 1, 1) 14 | request.cls.y = 1. * (np.random.rand(100, 1) > 0.5) 15 | request.cls.strata_weights = {1.0: 0.2, 0.0: 0.8} 16 | yield 17 | 18 | 19 | @pytest.mark.usefixtures("params") 20 | class TestGenerators: 21 | 22 | def test_stratified_index_generator(self): 23 | iterator = StratifiedIndexGenerator().flow( 24 | strata=self.y, batch_size=self.batch_size, 25 | strata_weights=self.strata_weights 26 | ) 27 | total, positives = 0, 0 28 | 29 | for i in range(100): 30 | indices = next(iterator) 31 | positives += self.y[indices].sum() 32 | total += len(indices) 33 | 34 | np.testing.assert_almost_equal( 35 | np.array([positives / total]), np.array([0.2]) 36 | ) 37 | 38 | def test_data_generator(self): 39 | iterator = DataGenerator().flow( 40 | self.X, self.y, strata=self.y, strata_weights=self.strata_weights) 41 | 42 | total, positives = 0, 0 43 | 44 | for i in range(100): 45 | X, y = next(iterator) 46 | positives += y.sum() 47 | total += len(y) 48 | 49 | np.testing.assert_almost_equal( 50 | np.array([positives / total]), np.array([0.2]) 51 | ) 52 | 53 | model = Sequential() 54 | model.add(Dense(units=1, input_shape=self.X.shape[1:])) 55 | model.compile(loss='mean_squared_error', optimizer='sgd') 56 | model.fit_generator( 57 | iterator, steps_per_epoch=(len(self.X)/self.batch_size)) 58 | 59 | def test_stratified_image_data_generator(self): 60 | 61 | iterator = ImageDataGenerator().flow(self.X_image, self.y) 62 | 63 | iterator.index_generator = StratifiedIndexGenerator().flow( 64 | batch_size=self.batch_size, 65 | strata=self.y, 66 | strata_weights=self.strata_weights) 67 | 68 | total, positives = 0, 0 69 | 70 | for i in range(100): 71 | X, y = next(iterator) 72 | positives += y.sum() 73 | total += len(y) 74 | 75 | np.testing.assert_almost_equal( 76 | np.array([positives / total]), np.array([0.2]) 77 | ) 78 | -------------------------------------------------------------------------------- 
/bdranalytics/pdlearn/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`bdranalytics.pdlearn` module contains adapters that allow you 3 | to put :class:`pandas.DataFrame` instances into :mod:`sklearn` without 4 | losing the column names. 5 | :mod:`sklearn` already allows you to provide instances of :class:`pandas.DataFrame`, 6 | but as it internally works with :class:`numpy.array`, column names are lost during transformation. 7 | Here we provide adapters, which re-add the column names after the :mod:`sklearn` modifications. 8 | """ 9 | -------------------------------------------------------------------------------- /bdranalytics/pdlearn/pipeline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import six 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | 6 | class PdFeatureUnion(BaseEstimator, TransformerMixin): 7 | """Concatenates the results of multiple transformers while preserving column names""" 8 | 9 | def __init__(self, transformer_list, n_jobs=1, transformer_weights=None, debug=False): 10 | self.transformer_list = transformer_list 11 | self.n_jobs, self.transformer_weights, self.debug = n_jobs, transformer_weights, debug  # store all init args so get_params()/clone() work 12 | 13 | def fit(self, X, y=None, **fit_params): 14 | fit_params_steps = dict((name, {}) for name, step in self.transformer_list 15 | if step is not None) 16 | for pname, pval in six.iteritems(fit_params): 17 | step, param = pname.split('__', 1) 18 | fit_params_steps[step][param] = pval 19 | 20 | for name, transform in self.transformer_list: 21 | if transform is None: 22 | continue  # skip disabled (None) transformers 23 | transform.fit(X, y, **fit_params_steps[name]) 24 | return self 25 | 26 | def transformgen(self, X): 27 | for name, transform in self.transformer_list: 28 | if transform is None: 29 | continue  # skip disabled (None) transformers 30 | Xt = transform.transform(X) 31 | columns = Xt.columns if hasattr(Xt, "columns") else [ 32 | "{}-{}".format(name, c) for c in range(Xt.shape[1])] 33 | Xt = pd.DataFrame(Xt, index=X.index, columns=columns) 34 | assert len(Xt) == len(X), "Transformer {} shouldn't change nr of rows. " \ 35 | "Returned {} while original is {}".format( 36 | name, len(Xt), len(X)) 37 | yield Xt 38 | 39 | def _print_columns(self, xts): 40 | for xt in xts: 41 | print(xt.columns) 42 | print("\r\n") 43 | 44 | def transform(self, X): 45 | xts = list(self.transformgen(X)) 46 | if self.debug: 47 | self._print_columns(xts) 48 | try: 49 | return pd.concat(xts, axis=1, verify_integrity=True, join_axes=None) 50 | except Exception: 51 | self._print_columns(xts) 52 | raise 53 | 54 | 55 | class PdFeatureChain(BaseEstimator, TransformerMixin): 56 | """Passes a data set through a pipeline / chain of transformers. 57 | The output of the first transformer is fed into the next transformer.
58 | 59 | Similar to sklearn Pipeline, but does not work with predictor in final step.""" 60 | 61 | def __init__(self, steps): 62 | self.steps = steps 63 | 64 | def fit(self, X, y=None, **fit_params): 65 | fit_params_steps = dict((name, {}) for name, step in self.steps 66 | if step is not None) 67 | for pname, pval in six.iteritems(fit_params): 68 | step, param = pname.split('__', 1) 69 | fit_params_steps[step][param] = pval 70 | 71 | Xt = X 72 | for name, transform in self.steps: 73 | Xt = pd.DataFrame(Xt) 74 | if transform is None: 75 | pass 76 | elif hasattr(transform, "fit_transform"): 77 | Xt = transform.fit_transform(Xt, y, **fit_params_steps[name]) 78 | else: 79 | Xt = transform.fit( 80 | Xt, y, **fit_params_steps[name]).transform(Xt) 81 | assert len(Xt) == len(X), "Transformer {} shouldn't change nr of rows. " \ 82 | "Returned {} while original is {}".format( 83 | name, len(Xt), len(X)) 84 | return self 85 | 86 | def transform(self, X): 87 | Xt = X 88 | for name, transform in self.steps: 89 | if transform is not None: 90 | Xt = pd.DataFrame(Xt) 91 | Xt = transform.transform(Xt) 92 | assert len(Xt) == len(X), "Transformer {} shouldn't change nr of rows. " \ 93 | "Returned {} while original is {}".format( 94 | name, len(Xt), len(X)) 95 | return pd.DataFrame(Xt) 96 | 97 | def fit_transform(self, X, y=None, **fit_params): 98 | return self.fit(X, y, **fit_params).transform(X) 99 | -------------------------------------------------------------------------------- /bdranalytics/pdlearn/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.preprocessing import OneHotEncoder 6 | 7 | from bdranalytics.sklearn.preprocessing import StringIndexer 8 | 9 | 10 | def format_colname(prefix, suffix): 11 | return "{:s}_{:s}".format(prefix, suffix) 12 | 13 | 14 | """A dictionary to get a date part cardinality given a general name""" 15 | __date_part_cardinality = { 16 | "MONTH": 12, 17 | "DAY": 31, 18 | "DAY_OF_WEEK": 7, 19 | "HOUR": 24, 20 | "MINUTE": 60, 21 | "SECOND": 60 22 | } 23 | 24 | """A dictionary to get a date part extractor given a general name""" 25 | __date_part_funcs = { 26 | "MONTH": lambda x: x.month, 27 | "DAY": lambda x: x.day, 28 | "DAY_OF_WEEK": lambda x: x.dayofweek, 29 | "HOUR": lambda x: x.hour, 30 | "MINUTE": lambda x: x.minute, 31 | "SECOND": lambda x: x.second 32 | } 33 | 34 | 35 | def date_to_dateparts(df, col_name, parts=list(__date_part_funcs.keys()), new_col_name_prefix=None): 36 | if new_col_name_prefix is None: 37 | new_col_name_prefix = col_name 38 | for part in parts: 39 | assert part in list(__date_part_funcs.keys()), \ 40 | "part '{}' is not known. Available are {}".format( 41 | part, ", ".join(list(__date_part_funcs.keys()))) 42 | return pd.DataFrame({ 43 | format_colname(new_col_name_prefix, part): 44 | df[col_name].apply(__date_part_funcs.get(part)) 45 | for part in parts}, index=df.index) 46 | 47 | 48 | def date_to_cyclical(df, col_name, parts=list(__date_part_funcs.keys()), new_col_name_prefix=None): 49 | if new_col_name_prefix is None: 50 | new_col_name_prefix = col_name 51 | for part in parts: 52 | assert part in list(__date_part_funcs.keys()), \ 53 | "part '{}' is not known. 
Available are {}".format( 54 | part, ", ".join(list(__date_part_funcs.keys()))) 55 | names = [format_colname(new_col_name_prefix, part) for part in parts] 56 | names_sin = ["{:s}_SIN".format(name) for name in names] 57 | names_cos = ["{:s}_COS".format(name) for name in names] 58 | values = [df[col_name].apply(__date_part_funcs.get(part)) / 59 | (2.0 * np.pi * __date_part_cardinality.get(part)) for part in parts] 60 | values_sin = [col.apply(np.sin) for col in values] 61 | values_cos = [col.apply(np.cos) for col in values] 62 | result = pd.concat(values_sin + values_cos, axis=1) 63 | result.columns = names_sin + names_cos 64 | return result 65 | 66 | 67 | def to_circular_variable(df, col_name, cardinality): 68 | return pd.DataFrame({ 69 | # note that np.sin(df[col_name] / float(cardinalilty...)) gives different values, probably rounding 70 | "{:s}_SIN".format(col_name): df[col_name].apply(lambda x: np.sin(x / float(cardinality * 2 * np.pi))), 71 | "{:s}_COS".format(col_name): df[col_name].apply(lambda x: np.cos(x / float(cardinality * 2 * np.pi))) 72 | }, index=df.index) 73 | 74 | 75 | class DateOneHotEncoding(BaseEstimator, TransformerMixin): 76 | """ 77 | Feature-engineering class that transforms date columns into one hot encoding of the parts (day, hour, ..). 78 | The original date column will be removed. 79 | To be used by sklearn pipelines 80 | """ 81 | 82 | def __init__(self, date_columns, parts=list(["DAY", "DAY_OF_WEEK", "HOUR", "MINUTE", "MONTH", "SECOND"]), 83 | new_column_names=None, drop=True): 84 | """ 85 | :param date_columns: the column names of the date columns to be expanded in one hot encodings 86 | :param new_column_names: the names to use as prefix for the generated column names 87 | :param drop: whether or not to drop the original column 88 | :param parts: the parts to extract from the date columns, and to then transform into one-hot encodings 89 | """ 90 | self.drop = drop 91 | self.parts = parts 92 | if new_column_names is None: 93 | self.new_column_names = date_columns 94 | else: 95 | self.new_column_names = new_column_names 96 | self.date_columns = date_columns 97 | self.one_hot_encoding_model = OneHotEncoder(sparse=False, handle_unknown='ignore' 98 | # , n_values=datepart_maxvalue 99 | ) 100 | self.encoding_pipeline = Pipeline([ 101 | ('labeler', StringIndexer()), 102 | ('encoder', self.one_hot_encoding_model) 103 | ]) 104 | assert (len(self.date_columns) == len(self.new_column_names)), \ 105 | "length of new column names is not equal to given column names" 106 | 107 | def all_to_parts(self, X): 108 | parts = [date_to_dateparts(X, old_name, self.parts, new_name) 109 | for old_name, new_name in zip(self.date_columns, self.new_column_names)] 110 | result = pd.concat(parts, axis=1, join_axes=[X.index]) 111 | return result 112 | 113 | def fit(self, X, y): 114 | parts = self.all_to_parts(X) 115 | self.encoding_pipeline.fit(parts) 116 | # original column i is mapped to values in range resulting_indices[i] .. 
resulting_indices[i+1] 117 | resulting_indices = self.one_hot_encoding_model.feature_indices_ 118 | active_features = self.one_hot_encoding_model.active_features_ 119 | new_names = [''] * (np.max(resulting_indices) + 1) 120 | for i, item in enumerate(parts.columns): 121 | for j in range(resulting_indices[i], resulting_indices[i + 1]): 122 | new_names[j] = "{}-{}".format(item, j) 123 | self.fitted_names = [new_names[i] for i in active_features] 124 | return self 125 | 126 | def transform_one_hots(self, X): 127 | np_frame = self.encoding_pipeline.transform(self.all_to_parts(X)) 128 | return pd.DataFrame(np_frame, columns=self.fitted_names) 129 | 130 | def transform(self, X): 131 | new_columns = self.transform_one_hots(X) 132 | old_columns = X.drop(self.date_columns, axis=1, 133 | inplace=False) if self.drop else X 134 | 135 | return pd.concat([old_columns, new_columns], axis=1, join_axes=[X.index]) 136 | 137 | 138 | class DateCyclicalEncoding(BaseEstimator, TransformerMixin): 139 | """ 140 | Feature-engineering class that transforms date columns into cyclical numerical columns. 141 | The original date column will be removed. 142 | To be used by sklearn pipelines 143 | """ 144 | 145 | def __init__(self, date_columns, parts=list(["DAY", "DAY_OF_WEEK", "HOUR", "MINUTE", "MONTH", "SECOND"]), 146 | new_column_names=None, drop=True): 147 | """ 148 | :param date_columns: the column names of the date columns to be expanded in one hot encodings 149 | :param new_column_names: the names to use as prefix for the generated column names 150 | :param drop: whether or not to drop the original column 151 | :param parts: the parts to extract from the date columns, and to then transform into one-hot encodings 152 | """ 153 | self.parts = parts 154 | self.drop = drop 155 | if new_column_names is None: 156 | self.new_column_names = date_columns 157 | else: 158 | self.new_column_names = new_column_names 159 | self.date_columns = date_columns 160 | assert (len(self.date_columns) == len(self.new_column_names)) 161 | 162 | def all_to_cyclical_parts(self, X): 163 | parts = [date_to_cyclical(X, old_name, self.parts, new_name) 164 | for old_name, new_name in zip(self.date_columns, self.new_column_names)] 165 | return pd.concat(parts, axis=1, join_axes=[X.index]) 166 | 167 | def fit(self, X, y): 168 | return self 169 | 170 | def transform(self, X): 171 | new_columns = self.all_to_cyclical_parts(X) 172 | old_columns = X.drop(self.date_columns, axis=1, 173 | inplace=False) if self.drop else X 174 | return pd.concat([old_columns, new_columns], axis=1, join_axes=[X.index]) 175 | 176 | 177 | # like sklearn's transformers, but then on pandas DataFrame 178 | class PdLagTransformer(BaseEstimator, TransformerMixin): 179 | def __init__(self, lag): 180 | self.lag = lag 181 | 182 | def fit(self, X, y=None, **fit_params): 183 | return self 184 | 185 | def do_transform(self, dataframe): 186 | return (dataframe.shift(self.lag) 187 | .rename(columns=lambda c: "{}_lag{}".format(c, self.lag))) 188 | 189 | def transform(self, X): 190 | try: 191 | return self.do_transform(X) 192 | except AttributeError: 193 | return self.do_transform(pd.DataFrame(X)) 194 | 195 | def fit_transform(self, X, y=None, **fit_params): 196 | return self.fit(X, y, **fit_params).transform(X) 197 | 198 | 199 | class PdWindowTransformer(BaseEstimator, TransformerMixin): 200 | def __init__(self, func, **rolling_params): 201 | self.func = func 202 | self.rolling_params = rolling_params 203 | 204 | def fit(self, X, y=None, **fit_params): 205 | return self 206 | 207 | def 
do_transform(self, dataframe): 208 | return (self.func(dataframe.rolling(**self.rolling_params)) 209 | .rename(columns=lambda c: "{}_{}".format(c, "".join( 210 | ["{}{}".format(k, v) for k, v in self.rolling_params.items()])))) 211 | 212 | def transform(self, X): 213 | try: 214 | return self.do_transform(X) 215 | except AttributeError: 216 | return self.do_transform(pd.DataFrame(X)) 217 | 218 | def fit_transform(self, X, y=None, **fit_params): 219 | return self.fit(X, y, **fit_params).transform(X) 220 | -------------------------------------------------------------------------------- /bdranalytics/pdlearn/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/pdlearn/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/pdlearn/tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import unittest 4 | from sklearn.pipeline import FeatureUnion, Pipeline 5 | 6 | from bdranalytics.pdlearn.pipeline import PdFeatureUnion, PdFeatureChain 7 | from bdranalytics.pdlearn.preprocessing import PdLagTransformer, PdWindowTransformer 8 | 9 | 10 | class TestLagTransformer(unittest.TestCase): 11 | def test_lagtransformer(self): 12 | orig_data = pd.DataFrame(data=np.arange(15).reshape( 13 | 5, 3), columns=["col1", "col2", "col3"]) 14 | lagged = PdLagTransformer(1).fit_transform(orig_data) 15 | np.testing.assert_array_equal( 16 | lagged.columns, ["col1_lag1", "col2_lag1", "col3_lag1"]) 17 | np.testing.assert_array_equal(lagged.iloc[1, :], orig_data.iloc[0, :]) 18 | np.testing.assert_array_equal(lagged.iloc[0, :], np.repeat(np.nan, 3)) 19 | 20 | def test_lagtransformer_on_numpy(self): 21 | orig_data = np.arange(15).reshape(5, 3) 22 | lagged = PdLagTransformer(1).fit_transform(orig_data) 23 | np.testing.assert_array_equal( 24 | lagged.columns, ["0_lag1", "1_lag1", "2_lag1"]) 25 | np.testing.assert_array_equal(lagged.iloc[1, :], orig_data[0, :]) 26 | np.testing.assert_array_equal(lagged.iloc[0, :], np.repeat(np.nan, 3)) 27 | 28 | def test_windowtransformer(self): 29 | orig_data = pd.DataFrame(data=np.arange( 30 | 14, -1, -1).reshape(5, 3), columns=["col1", "col2", "col3"]) 31 | result = PdWindowTransformer( 32 | lambda window: window.max(), window=2).fit_transform(orig_data) 33 | np.testing.assert_array_equal( 34 | result.columns, ["col1_window2", "col2_window2", "col3_window2"]) 35 | np.testing.assert_array_equal(result.iloc[0, :], np.repeat(np.nan, 3)) 36 | # orig data is [ [14, 13, 12], [11, 10, 9],.., thus rolling max at row 1 should be values of row 0 37 | np.testing.assert_array_equal(result.iloc[1, :], orig_data.iloc[0, :]) 38 | 39 | def test_windowtransformer_on_numpy(self): 40 | orig_data = np.arange(14, -1, -1).reshape(5, 3) 41 | result = PdWindowTransformer( 42 | lambda window: window.max(), window=2).fit_transform(orig_data) 43 | np.testing.assert_array_equal( 44 | result.columns, ["0_window2", "1_window2", "2_window2"]) 45 | np.testing.assert_array_equal(result.iloc[0, :], np.repeat(np.nan, 3)) 46 | # orig data is [ [14, 13, 12], [11, 10, 9],.., thus rolling max at row 1 should be values of row 0 47 | np.testing.assert_array_equal(result.iloc[1, :], orig_data[0, :]) 48 | 49 | def test_featureunion(self): 50 | orig_data = pd.DataFrame(data=np.arange(15).reshape( 51 | 5, 3), 
columns=["col1", "col2", "col3"]) 52 | result = PdFeatureUnion([ 53 | ('lag', PdLagTransformer(1)), 54 | ('window', PdWindowTransformer(lambda window: window.max(), window=2))] 55 | ).fit_transform(orig_data) 56 | np.testing.assert_array_equal(result.columns, 57 | ["col1_lag1", "col2_lag1", "col3_lag1", "col1_window2", "col2_window2", 58 | "col3_window2"]) 59 | np.testing.assert_array_equal( 60 | result.iloc[:, 0:3], 61 | PdLagTransformer(1).fit_transform(orig_data)) 62 | np.testing.assert_array_equal( 63 | result.iloc[:, 3:6], 64 | PdWindowTransformer(lambda window: window.max(), window=2).fit_transform(orig_data)) 65 | np.testing.assert_array_equal(result, 66 | FeatureUnion([ 67 | ("lag", PdLagTransformer(1)), 68 | ("window", PdWindowTransformer( 69 | lambda window: window.max(), window=2)) 70 | ]).fit_transform(orig_data)) 71 | 72 | def test_featurechain(self): 73 | orig_data = pd.DataFrame(data=np.arange(15).reshape( 74 | 5, 3), columns=["col1", "col2", "col3"]) 75 | result = PdFeatureChain([ 76 | ('lag', PdLagTransformer(1)), 77 | ('window', PdWindowTransformer(lambda window: window.max(), window=2))]).fit_transform(orig_data) 78 | np.testing.assert_array_equal(result.columns, 79 | ["col1_lag1_window2", "col2_lag1_window2", "col3_lag1_window2"]) 80 | np.testing.assert_array_equal( 81 | result, 82 | PdWindowTransformer(lambda window: window.max(), window=2).fit_transform( 83 | PdLagTransformer(1).fit_transform(orig_data) 84 | ) 85 | ) 86 | np.testing.assert_array_equal(result, 87 | Pipeline(steps=[ 88 | ("lag", PdLagTransformer(1)), 89 | ("window", PdWindowTransformer( 90 | lambda window: window.max(), window=2)) 91 | ]).fit_transform(orig_data)) 92 | -------------------------------------------------------------------------------- /bdranalytics/pdlearn/tests/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import unittest 4 | 5 | from bdranalytics.pdlearn.preprocessing import DateCyclicalEncoding, \ 6 | DateOneHotEncoding 7 | from bdranalytics.pdlearn.preprocessing import date_to_dateparts, \ 8 | date_to_cyclical 9 | 10 | 11 | class TestDatePartitioner(unittest.TestCase): 12 | def test_date_to_dateparts(self): 13 | orig_data = pd.DataFrame(data=np.arange( 14 | np.datetime64('2011-07-11'), np.datetime64('2011-07-18') 15 | ).reshape(7, 1), columns=["thedate"]) 16 | splitted_data = date_to_dateparts(orig_data, 'thedate', 17 | new_col_name_prefix='prefix') 18 | 19 | expected_columns = ["prefix_{}".format(x) for x in 20 | ["DAY", "DAY_OF_WEEK", "HOUR", "MINUTE", "MONTH", 21 | "SECOND"]] 22 | # no additional columns 23 | np.testing.assert_array_equal( 24 | list(set(splitted_data.columns) - set(expected_columns)), list()) 25 | # no missing columns 26 | np.testing.assert_array_equal( 27 | list(set(expected_columns) - set(splitted_data.columns)), list()) 28 | monday = 0 29 | tuesday = 1 30 | np.testing.assert_array_equal(splitted_data.loc[0, expected_columns], 31 | [11, monday, 0, 0, 7, 0]) 32 | np.testing.assert_array_equal(splitted_data.loc[1, expected_columns], 33 | [12, tuesday, 0, 0, 7, 0]) 34 | 35 | def test_dateparts_to_circular(self): 36 | orig_data = pd.DataFrame(data=np.arange( 37 | np.datetime64('2011-07-11'), np.datetime64('2011-07-18') 38 | ).reshape(7, 1), columns=["thedate"]) 39 | circular_data = date_to_cyclical(orig_data, 'thedate', 40 | new_col_name_prefix='prefix') 41 | 42 | intermediate_columns = ["prefix_{}".format(x) for x in 43 | ["DAY", "DAY_OF_WEEK", "HOUR", 
"MINUTE", 44 | "MONTH", "SECOND"]] 45 | expected_columns = ["{}_{}".format(x, y) for y in ["COS", "SIN"] for x 46 | in intermediate_columns] 47 | # no additional columns 48 | np.testing.assert_array_equal( 49 | list(set(circular_data.columns) - set(expected_columns)), list()) 50 | # no missing columns 51 | np.testing.assert_array_equal( 52 | list(set(expected_columns) - set(circular_data.columns)), list()) 53 | # correct result compared to just splitting the columns 54 | splitted_data = date_to_dateparts(orig_data, 'thedate', 55 | new_col_name_prefix='prefix') 56 | sin_columns = ["{}_{}".format(x, y) for y in ["SIN"] for x in 57 | intermediate_columns] 58 | np.testing.assert_array_equal(circular_data.loc[:, sin_columns], np.sin( 59 | splitted_data.loc[:, intermediate_columns] / ( 60 | 2.0 * np.pi * np.array([31, 7, 24, 60, 12, 60])))) 61 | cos_columns = ["{}_{}".format(x, y) for y in ["COS"] for x in 62 | intermediate_columns] 63 | np.testing.assert_array_equal(circular_data.loc[:, cos_columns], np.cos( 64 | splitted_data.loc[:, intermediate_columns] / ( 65 | 2.0 * np.pi * np.array([31, 7, 24, 60, 12, 60])))) 66 | 67 | def test_dateonehotencoding(self): 68 | orig_data = pd.DataFrame(data=np.arange( 69 | np.datetime64('2011-07-11'), np.datetime64('2011-07-18') 70 | ).reshape(7, 1), columns=["thedate"]) 71 | y = np.repeat(0, 7) 72 | onehot = DateOneHotEncoding(['thedate'], drop=True).fit_transform( 73 | orig_data, y) 74 | print(onehot) 75 | 76 | def test_datecyclicalencoding(self): 77 | orig_data = pd.DataFrame(data=np.arange( 78 | np.datetime64('2011-07-11'), np.datetime64('2011-07-18') 79 | ).reshape(7, 1), columns=["thedate"]) 80 | y = np.repeat(0, 7) 81 | 82 | # create splitted to also be able to calculate values 83 | splitted_data = date_to_dateparts(orig_data, 'thedate') 84 | 85 | circular_data = DateCyclicalEncoding(['thedate'], 86 | drop=True).fit_transform(orig_data, 87 | y) 88 | intermediate_columns = ["thedate_{}".format(x) for x in 89 | ["DAY", "DAY_OF_WEEK", "HOUR", "MINUTE", 90 | "MONTH", "SECOND"]] 91 | expected_columns = ["{}_{}".format(x, y) for y in ["COS", "SIN"] for x 92 | in intermediate_columns] 93 | # no additional columns 94 | np.testing.assert_array_equal( 95 | list(set(circular_data.columns) - set(expected_columns)), list()) 96 | # no missing columns 97 | np.testing.assert_array_equal( 98 | list(set(expected_columns) - set(circular_data.columns)), list()) 99 | sin_columns = ["{}_{}".format(x, y) for y in ["SIN"] for x in 100 | intermediate_columns] 101 | np.testing.assert_array_equal(circular_data.loc[:, sin_columns], np.sin( 102 | splitted_data.loc[:, intermediate_columns] / ( 103 | 2.0 * np.pi * np.array([31, 7, 24, 60, 12, 60])))) 104 | cos_columns = ["{}_{}".format(x, y) for y in ["COS"] for x in 105 | intermediate_columns] 106 | np.testing.assert_array_equal(circular_data.loc[:, cos_columns], np.cos( 107 | splitted_data.loc[:, intermediate_columns] / ( 108 | 2.0 * np.pi * np.array([31, 7, 24, 60, 12, 60])))) 109 | -------------------------------------------------------------------------------- /bdranalytics/plot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/plot/__init__.py -------------------------------------------------------------------------------- /bdranalytics/plot/classification.py: -------------------------------------------------------------------------------- 1 | import 
matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | from sklearn.externals.joblib import Parallel, delayed 6 | from sklearn.metrics import ( 7 | confusion_matrix, 8 | accuracy_score, 9 | f1_score, 10 | roc_curve, 11 | auc, 12 | precision_recall_curve, 13 | average_precision_score 14 | ) 15 | 16 | primary_color = plt.rcParams['axes.prop_cycle'].by_key()['color'][0] 17 | 18 | default_names = ('negative', 'positive') 19 | 20 | 21 | def compute_parallel_metric(metric, y_true, y_pred): 22 | 23 | thresholds = np.arange(0, 1, .02) 24 | 25 | return Parallel(n_jobs=-1)( 26 | delayed(metric)( 27 | y_true, 28 | y_pred > threshold 29 | ) 30 | for threshold in thresholds 31 | ), thresholds 32 | 33 | 34 | def plot_accuracy(y_true, y_pred): 35 | acc, thresholds = compute_parallel_metric(accuracy_score, y_true, y_pred) 36 | 37 | lower_baseline = sum(y_true) / len(y_true) 38 | upper_baseline = 1 - lower_baseline 39 | 40 | plt.plot([0, 1], [lower_baseline, lower_baseline], 'k--') 41 | plt.plot([0, 1], [upper_baseline, upper_baseline], 'k--') 42 | plt.plot(thresholds, acc) 43 | plt.title('Accuracy across thresholds') 44 | plt.xlabel('classifier threshold') 45 | plt.ylabel('accuracy') 46 | plt.xlim([0.0, 1.0]) 47 | plt.ylim([0.0, 1.0]) 48 | 49 | 50 | def plot_f1_score(y_true, y_pred): 51 | f1s, thresholds = compute_parallel_metric(f1_score, y_true, y_pred) 52 | 53 | plt.plot(thresholds, f1s) 54 | plt.title('F1 score across thresholds') 55 | plt.xlabel('classifier threshold') 56 | plt.ylabel('F1 score') 57 | plt.xlim([0.0, 1.0]) 58 | plt.ylim([0.0, 1.0]) 59 | 60 | 61 | def plot_confusion_matrix( 62 | y_true, y_pred_bin, target_names=default_names, normalize=False): 63 | 64 | c = confusion_matrix(y_true, y_pred_bin) 65 | 66 | if normalize: 67 | c = c / c.sum() 68 | fmt = '.3f' 69 | else: 70 | fmt = 'd' 71 | 72 | confusion = pd.DataFrame(c, index=target_names, columns=target_names) 73 | sns.heatmap(confusion, annot=True, fmt=fmt) 74 | plt.xlabel('predicted label') 75 | plt.ylabel('true label') 76 | plt.title('Confusion matrix') 77 | plt.show() 78 | 79 | 80 | def plot_roc_curve(y_true, y_pred): 81 | fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1, 82 | drop_intermediate=True) 83 | roc_auc = auc(fpr, tpr) 84 | 85 | plt.plot(fpr, tpr, label="ROC curve (area = {:.2f})".format(roc_auc)) 86 | plt.plot([0, 1], [0, 1], 'k--') 87 | plt.xlim([0.0, 1.0]) 88 | plt.ylim([0.0, 1.0]) 89 | plt.xlabel('false positive rate') 90 | plt.ylabel('true positive rate') 91 | plt.title('Receiver-operating characteristic') 92 | plt.legend(loc="lower right") 93 | 94 | 95 | def plot_pr_curve(y_true, y_pred): 96 | precision, recall, thresholds = precision_recall_curve(y_true, y_pred, 97 | pos_label=1) 98 | 99 | average_precision = average_precision_score( 100 | y_true, y_pred, average="micro") 101 | 102 | baseline = sum(y_true) / len(y_true) 103 | 104 | plt.plot(recall, precision, 105 | label="PR curve (area = {:.2f})".format(average_precision)) 106 | plt.plot([0, 1], [baseline, baseline], 'k--') 107 | plt.xlim([0.0, 1.0]) 108 | plt.ylim([0.0, 1.0]) 109 | plt.xlabel('recall') 110 | plt.ylabel('precision') 111 | plt.title('Precision-recall curve') 112 | plt.legend(loc="lower right") 113 | 114 | 115 | def plot_benefits(y_true, y_pred, benefit_func=None, recalibrate=False, 116 | ax=None): 117 | if benefit_func is None: 118 | def net_benefit(tpr, fpr): 119 | cost_fp, benefit_tp = (1, 1) # equal weights 120 | n_positives = sum(y_true) 121 | n_tp = tpr * n_positives # number of true 
positives (benefits) 122 | n_fp = fpr * len( 123 | y_true) - n_positives # number of false positives (costs) 124 | fp_costs = n_fp * cost_fp 125 | tp_benefits = n_tp * benefit_tp 126 | return tp_benefits - fp_costs 127 | 128 | benefit_func = net_benefit 129 | 130 | fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1, 131 | drop_intermediate=True) 132 | 133 | benefits = np.zeros_like(thresholds) 134 | for i, _ in enumerate(thresholds): 135 | benefits[i] = benefit_func(tpr[i], fpr[i]) 136 | 137 | i_max = np.argmax(benefits) 138 | print( 139 | "max benefits: {:.0f} units on {:,} samples, " 140 | "tpr: {:.3f}, fpr: {:.3f}, threshold: {:.3f}" 141 | .format( 142 | benefits[i_max], len(y_true), 143 | tpr[i_max], fpr[i_max], thresholds[i_max] 144 | ) 145 | ) 146 | 147 | if ax is not None: 148 | ax1 = ax 149 | else: 150 | _, ax1 = plt.subplots() 151 | 152 | ax2 = ax1.twinx() 153 | ax2.vlines(thresholds[i_max], 0, 1, linestyles='dashed') 154 | ax1.set_xlim([0, 1]) 155 | ax1.plot(thresholds, benefits, c=primary_color) 156 | ax1.set_ylim([0, np.max(benefits)]) 157 | ax2.plot(thresholds, tpr, 'g-') 158 | ax2.plot(thresholds, fpr, 'r-') 159 | ax2.set_ylim([0, 1]) 160 | ax1.set_xlabel('classifier threshold') 161 | ax1.set_ylabel('units') 162 | ax2.set_ylabel('rate') 163 | ax2.legend(labels=['TP', 'FP'], loc="upper right") 164 | ax1.set_title('Benefits across thresholds') 165 | ax1.legend(labels=['benefit'], loc="center right") 166 | ax1.grid(1) 167 | ax2.grid(0) 168 | 169 | if recalibrate: 170 | y_pred_bin = (y_pred > thresholds[i_max]) * 1. 171 | return y_pred_bin 172 | 173 | 174 | def subplot_evaluation_curves(y_true, y_pred, benefit_func=None, 175 | figsize=(12, 12)): 176 | 177 | fig, axarr = plt.subplots(3, 2, figsize=figsize) 178 | fig.subplots_adjust(hspace=0.4, wspace=0.3) 179 | 180 | plt.sca(axarr[0, 0]) 181 | plot_roc_curve(y_true, y_pred) 182 | 183 | plt.sca(axarr[0, 1]) 184 | plot_pr_curve(y_true, y_pred) 185 | 186 | plt.sca(axarr[1, 0]) 187 | plot_accuracy(y_true, y_pred) 188 | 189 | plt.sca(axarr[1, 1]) 190 | plot_f1_score(y_true, y_pred) 191 | 192 | plot_benefits(y_true, y_pred, ax=axarr[2, 0], benefit_func=benefit_func) 193 | plt.show() 194 | -------------------------------------------------------------------------------- /bdranalytics/plot/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/plot/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/sklearn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/sklearn/__init__.py -------------------------------------------------------------------------------- /bdranalytics/sklearn/encoders.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | 6 | class WeightOfEvidenceEncoder(BaseEstimator, TransformerMixin): 7 | """ 8 | Feature-engineering class that transforms a high-capacity categorical value 9 | into Weigh of Evidence scores. Can be used in sklearn pipelines. 
10 | """ 11 | 12 | def __init__(self, verbose=0, cols=None, return_df=True, 13 | smooth=0.5, fillna=0, dependent_variable_values=None): 14 | """ 15 | :param smooth: value for additive smoothing, to prevent divide by zero 16 | """ 17 | # make sure cols is a list of strings 18 | if not isinstance(cols, list): 19 | cols = [cols] 20 | 21 | self.stat = {} 22 | self.return_df = return_df 23 | self.verbose = verbose 24 | self.cols = cols 25 | self.smooth = smooth 26 | self.fillna = fillna 27 | self.dependent_variable_values = dependent_variable_values 28 | 29 | def fit(self, X, y): 30 | 31 | if not isinstance(X, pd.DataFrame): 32 | raise TypeError( 33 | 'Input should be an instance of pandas.DataFrame()') 34 | 35 | if self.dependent_variable_values is not None: 36 | y = self.dependent_variable_values 37 | 38 | df = X[self.cols].copy() 39 | y_col_index = len(df.columns) + 1 40 | df[y_col_index] = np.array(y) 41 | 42 | def get_totals(x): 43 | total = np.size(x) 44 | pos = max(float(np.sum(x)), self.smooth) 45 | neg = max(float(total - pos), self.smooth) 46 | return pos, neg 47 | 48 | # get the totals per class 49 | total_positive, total_negative = get_totals(y) 50 | if self.verbose: 51 | print("total positives {:.0f}, total negatives {:.0f}".format( 52 | total_positive, total_negative)) 53 | 54 | def compute_bucket_woe(x): 55 | bucket_positive, bucket_negative = get_totals(x) 56 | return np.log(bucket_positive / bucket_negative) 57 | 58 | # compute WoE scores per bucket (category) 59 | stat = {} 60 | for col in self.cols: 61 | 62 | if self.verbose: 63 | print( 64 | "computing weight of evidence for column {:s}".format(col)) 65 | 66 | stat[col] = ((df.groupby(col)[y_col_index].agg(compute_bucket_woe) 67 | + np.log(total_negative / total_positive)).to_dict()) 68 | 69 | self.stat = stat 70 | 71 | return self 72 | 73 | def transform(self, X, y=None): 74 | 75 | if not isinstance(X, pd.DataFrame): 76 | raise TypeError( 77 | 'Input should be an instance of pandas.DataFrame()') 78 | 79 | df = X.copy() 80 | 81 | # join the WoE stats with the data 82 | for col in self.cols: 83 | 84 | if self.verbose: 85 | print("transforming categorical column {:s}".format(col)) 86 | 87 | stat = pd.DataFrame.from_dict(self.stat[col], orient='index') 88 | 89 | ser = (pd.merge(df, stat, left_on=col, right_index=True, how='left') 90 | .sort_index() 91 | .reindex(df.index))[0] 92 | 93 | # fill missing values with 94 | if self.verbose: 95 | print("{:.0f} NaNs in transformed data".format( 96 | ser.isnull().sum())) 97 | print("{:.4f} mean weight of evidence".format(ser.mean())) 98 | 99 | df[col] = np.array(ser.fillna(self.fillna)) 100 | 101 | if not self.return_df: 102 | out = np.array(df) 103 | else: 104 | out = df 105 | 106 | return out 107 | 108 | 109 | class ColumnSelector(BaseEstimator, TransformerMixin): 110 | def __init__(self, columns): 111 | self.columns = columns 112 | 113 | def fit(self, X, y=None): 114 | return self 115 | 116 | def transform(self, X): 117 | try: 118 | return X[self.columns] 119 | except: 120 | print("Could not find selected columns {:s} in available columns {:s}".format( 121 | self.columns, X.columns)) 122 | raise 123 | 124 | 125 | class StringIndexer(BaseEstimator, TransformerMixin): 126 | def __init__(self): 127 | self.dictionaries = dict() 128 | self.columns = list() 129 | 130 | def fit(self, X, y=None): 131 | self.columns = X.columns.values 132 | for col in self.columns: 133 | categories = np.unique(X[col]) 134 | self.dictionaries[col] = dict( 135 | zip(categories, range(len(categories)))) 136 | 
return self 137 | 138 | def transform(self, X): 139 | column_array = [] 140 | for col in self.columns: 141 | dictionary = self.dictionaries[col] 142 | na_value = len(dictionary) + 1 143 | transformed_column = X[col].apply( 144 | lambda x: dictionary.get(x, na_value)) 145 | column_array.append(transformed_column.values.reshape(-1, 1)) 146 | return np.hstack(column_array) 147 | 148 | 149 | class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): 150 | """ 151 | Leave one out transformation for high-capacity categorical variables. 152 | """ 153 | 154 | def __init__(self, with_stdevs=True): 155 | 156 | self.with_stdevs = with_stdevs 157 | self.means = {} 158 | self.stdevs = {} 159 | 160 | def fit(self, X, y=None): 161 | return self 162 | 163 | def transform(self, X): 164 | df = X.copy() 165 | for col in self.means.keys(): 166 | 167 | mean_col_name = "{:s}_MEAN".format(col) 168 | df[mean_col_name] = df.merge(pd.DataFrame(self.means[col]), 169 | how='left', left_on=[col], right_index=True)['y'] 170 | if self.with_stdevs: 171 | std_col_name = "{:s}_STD".format(col) 172 | df[std_col_name] = df.merge(pd.DataFrame(self.stdevs[col]), 173 | how='left', left_on=[col], right_index=True)['y'] 174 | 175 | df.drop(col, axis=1, inplace=True) 176 | 177 | return df 178 | 179 | def fit_transform(self, X, y): 180 | """will be used during pipeline fit""" 181 | df = X.copy() 182 | df['y'] = y 183 | for col in df.columns.difference(['y']): 184 | 185 | mean_col_name = "{:s}_MEAN".format(col) 186 | 187 | grouped = df.groupby(col)['y'] 188 | 189 | self.means[col] = grouped.mean() 190 | df[mean_col_name] = grouped.transform(self._loo_means) 191 | 192 | if self.with_stdevs: 193 | std_col_name = "{:s}_STD".format(col) 194 | self.stdevs[col] = grouped.std() 195 | df[std_col_name] = grouped.transform(self._loo_stdevs) 196 | 197 | df.drop(col, axis=1, inplace=True) 198 | 199 | df.drop('y', axis=1, inplace=True) 200 | return df 201 | 202 | def _loo_means(self, s): 203 | n = len(s) 204 | loo_means = (s.sum() - s) / (n - 1) 205 | return loo_means * np.random.normal(loc=1.0, scale=0.01, size=n) 206 | 207 | def _loo_stdevs(self, s): 208 | n = len(s) 209 | if n > 1: 210 | loo_means = self._loo_means(s) 211 | sum_of_sq = n * s.std() ** 2 212 | loo_stdevs = np.sqrt( 213 | abs((sum_of_sq - (s - s.mean()) * (s - loo_means))) / (n - 1)) 214 | else: 215 | loo_stdevs = np.array([0]) 216 | 217 | return loo_stdevs * np.random.normal(loc=1.0, scale=0.01, size=n) 218 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/model_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from abc import ABCMeta 4 | from sklearn.externals.six import with_metaclass 5 | from sklearn.utils.validation import _num_samples 6 | 7 | 8 | class GrowingWindow(with_metaclass(ABCMeta)): 9 | """Growing Window cross validator 10 | 11 | Provides train/test indices to split data in train/test sets. 12 | Divides the data in n_folds+1 slices. 13 | For split i [1..n_folds], slices [0..i} are train, slice i is test 14 | 15 | Parameters: 16 | n_folds : int, default=3 17 | Number of folds. Must be at least 1 18 | """ 19 | 20 | def __init__(self, n_folds=3): 21 | self.n_folds = n_folds 22 | 23 | def __repr__(self): 24 | return _build_repr(self) 25 | 26 | def split(self, X, y=None, labels=None): 27 | """Generate indices to split data into training and test set. 
28 | Parameters 29 | ---------- 30 | X : array-like, shape (n_samples, n_features) 31 | Training data, where n_samples is the number of samples 32 | and n_features is the number of features. 33 | y : array-like, of length n_samples 34 | The target variable for supervised learning problems. 35 | ignored 36 | labels : array-like, with shape (n_samples,), optional 37 | Group labels for the samples used while splitting the dataset into 38 | train/test set. 39 | ignored 40 | Returns 41 | ------- 42 | train : ndarray 43 | The training set indices for that split. 44 | test : ndarray 45 | The testing set indices for that split. 46 | """ 47 | n = _num_samples(X) 48 | n_slices = self.n_folds + 1 49 | # loop from the first 2 folds to the total number of folds 50 | for i in range(2, n_slices + 1): 51 | # the split is the percentage at which to split the folds into train 52 | # and test. For example when i = 2 we are taking the first 2 folds out 53 | # of the total available. In this specific case we have to split the 54 | # two of them in half (train on the first, test on the second), 55 | # so split = 1/2 = 0.5 = 50%. When i = 3 we are taking the first 3 folds 56 | # out of the total available, meaning that we have to split the three of them 57 | # in two at split = 2/3 = 0.66 = 66% (train on the first 2 and test on the 58 | # following) 59 | split = float(i - 1) / i 60 | # as we loop over the folds X and y are updated and increase in size. 61 | # This is the data that is going to be split and it increases in size 62 | # in the loop as we account for more folds. If k = 300, with i starting from 2 63 | # the result is the following in the loop 64 | # i = 2 65 | # X = X_train[:(600)] 66 | # y = y_train[:(600)] 67 | # 68 | # i = 3 69 | # X = X_train[:(900)] 70 | # y = y_train[:(900)] 71 | # .... 72 | n_sub = int(np.floor(float(n * i) / n_slices)) 73 | subset = range(0, n_sub) 74 | # X and y contain both the folds to train and the fold to test. 75 | # index is the integer telling us where to split, according to the 76 | # split percentage we have set above 77 | n_train = int(np.floor(n_sub * split)) 78 | train_index = np.arange(0, n_train) 79 | test_index = np.arange(n_train, n_sub) 80 | yield train_index, test_index 81 | 82 | def get_n_splits(self, X, y=None, labels=None): 83 | """Returns the number of splitting iterations in the cross-validator 84 | Parameters 85 | ---------- 86 | X : array-like, shape (n_samples, n_features) 87 | Training data, where n_samples is the number of samples 88 | and n_features is the number of features. 89 | y : object 90 | Always ignored, exists for compatibility. 91 | labels : object 92 | Always ignored, exists for compatibility. 93 | Returns 94 | ------- 95 | n_splits : int 96 | Returns the number of splitting iterations in the cross-validator. 
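
        Examples
        --------
        An illustrative sketch (not taken from this library's docs):

        >>> import numpy as np
        >>> X = np.arange(20).reshape(10, 2)
        >>> cv = GrowingWindow(n_folds=4)
        >>> cv.get_n_splits(X)
        4
        >>> [len(test) for _, test in cv.split(X)]
        [2, 2, 2, 2]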
97 | """ 98 | if X is None: 99 | raise ValueError("The X parameter should not be None") 100 | return self.n_folds 101 | 102 | 103 | class IntervalGrowingWindow(with_metaclass(ABCMeta)): 104 | """Growing Window cross-validator based on time intervals""" 105 | 106 | def __init__(self, test_start_date, timestamps='index', test_end_date=None, 107 | test_size=None, train_size=None): 108 | 109 | self.test_start_date = pd.to_datetime(test_start_date) 110 | self.test_end_date = pd.to_datetime(test_end_date) 111 | self.test_size = pd.to_timedelta(test_size) 112 | self.train_size = pd.to_timedelta(train_size) 113 | 114 | self.timestamps = timestamps 115 | if timestamps is not 'index': 116 | self.timestamps = pd.to_datetime(timestamps) 117 | 118 | def generate_intervals(self, timestamps): 119 | 120 | # infer test interval end date if not specified 121 | # has to be done here to work with timestamps from DataFrame index 122 | # NOTE: test_end_date is NOT included 123 | if self.test_end_date is None: 124 | # can be overridden for reuse 125 | self.test_end_date = max(timestamps) 126 | 127 | # determine start date of the test intervals 128 | intervals_start = pd.to_datetime(pd.date_range(self.test_start_date, 129 | self.test_end_date, 130 | freq=self.test_size) 131 | .values) 132 | 133 | # convert to (start, end) tuples 134 | intervals = list(zip(intervals_start[:-1], intervals_start[1:])) 135 | 136 | return intervals 137 | 138 | def get_timeseries(self, X): 139 | """Returns the numpy array of timestamps for the given dataset""" 140 | if self.timestamps is 'index': 141 | return pd.to_datetime(X.index.values) 142 | else: 143 | return self.timestamps 144 | 145 | def split(self, X, y=None, labels=None): 146 | """Generate indices to split data into training and test sets based on time stamps""" 147 | if X is None: 148 | raise ValueError("The X parameter should not be None") 149 | 150 | # extract timestamps from DataFrame index, if needed 151 | timestamps = self.get_timeseries(X) 152 | intervals = self.generate_intervals(timestamps) 153 | 154 | # extract first sample for unlimited train size 155 | first_sample_date = min(timestamps) 156 | 157 | # number of samples 158 | n = _num_samples(X) 159 | 160 | # list of indices, to convert booleans later on 161 | index = np.arange(n) 162 | 163 | # loop over each interval 164 | for test_start, test_end in intervals: 165 | 166 | if self.train_size is not None: 167 | train_start = test_start - self.train_size 168 | else: 169 | train_start = first_sample_date 170 | 171 | train_interval_bool = np.array(list(map(lambda date: 172 | train_start <= date < test_start, 173 | timestamps))) 174 | 175 | test_interval_bool = np.array(list(map(lambda date: 176 | test_start <= date < test_end, 177 | timestamps))) 178 | 179 | # convert boolean to integer indices 180 | train_index = index[train_interval_bool] 181 | test_index = index[test_interval_bool] 182 | 183 | yield train_index, test_index 184 | 185 | def get_n_splits(self, X, y=None, labels=None): 186 | if X is None: 187 | raise ValueError("The X parameter should not be None") 188 | 189 | # extract timestamps from DataFrame index, if needed 190 | timestamps = self.get_timeseries(X) 191 | intervals = self.generate_intervals(timestamps) 192 | 193 | # compute number of folds 194 | return len(intervals) 195 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoding 
import WeightOfEvidenceEncoder, StringIndexer, LeaveOneOutEncoder 2 | from .scaling import ScaledRegressor 3 | 4 | __all__ = ['ScaledRegressor', 5 | 'WeightOfEvidenceEncoder', 6 | 'StringIndexer', 7 | 'LeaveOneOutEncoder'] 8 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/encoding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator 4 | from sklearn.base import TransformerMixin 5 | 6 | 7 | class WeightOfEvidenceEncoder(BaseEstimator, TransformerMixin): 8 | """ 9 | Feature-engineering class that transforms a high-capacity categorical value 10 | into Weigh of Evidence scores. Can be used in sklearn pipelines. 11 | """ 12 | 13 | def __init__(self, verbose=0, cols=None, return_df=True, 14 | smooth=0.5, fillna=0, dependent_variable_values=None): 15 | """ 16 | :param smooth: value for additive smoothing, to prevent divide by zero 17 | """ 18 | # make sure cols is a list of strings 19 | if not isinstance(cols, list): 20 | cols = [cols] 21 | 22 | self.stat = {} 23 | self.return_df = return_df 24 | self.verbose = verbose 25 | self.cols = cols 26 | self.smooth = smooth 27 | self.fillna = fillna 28 | self.dependent_variable_values = dependent_variable_values 29 | 30 | def fit(self, X, y): 31 | 32 | if not isinstance(X, pd.DataFrame): 33 | raise TypeError( 34 | 'Input should be an instance of pandas.DataFrame()') 35 | 36 | if self.dependent_variable_values is not None: 37 | y = self.dependent_variable_values 38 | 39 | df = X[self.cols].copy() 40 | y_col_index = len(df.columns) + 1 41 | df[y_col_index] = np.array(y) 42 | 43 | def get_totals(x): 44 | total = np.size(x) 45 | pos = max(float(np.sum(x)), self.smooth) 46 | neg = max(float(total - pos), self.smooth) 47 | return pos, neg 48 | 49 | # get the totals per class 50 | total_positive, total_negative = get_totals(y) 51 | if self.verbose: 52 | print("total positives {:.0f}, total negatives {:.0f}".format( 53 | total_positive, total_negative)) 54 | 55 | def compute_bucket_woe(x): 56 | bucket_positive, bucket_negative = get_totals(x) 57 | return np.log(bucket_positive / bucket_negative) 58 | 59 | # compute WoE scores per bucket (category) 60 | stat = {} 61 | for col in self.cols: 62 | 63 | if self.verbose: 64 | print( 65 | "computing weight of evidence for column {:s}".format(col)) 66 | 67 | stat[col] = ((df.groupby(col)[y_col_index].agg(compute_bucket_woe) 68 | + np.log(total_negative / total_positive)).to_dict()) 69 | 70 | self.stat = stat 71 | 72 | return self 73 | 74 | def transform(self, X, y=None): 75 | 76 | if not isinstance(X, pd.DataFrame): 77 | raise TypeError( 78 | 'Input should be an instance of pandas.DataFrame()') 79 | 80 | df = X.copy() 81 | 82 | # join the WoE stats with the data 83 | for col in self.cols: 84 | 85 | if self.verbose: 86 | print("transforming categorical column {:s}".format(col)) 87 | 88 | stat = pd.DataFrame.from_dict(self.stat[col], orient='index') 89 | 90 | ser = (pd.merge(df, stat, left_on=col, right_index=True, how='left') 91 | .sort_index() 92 | .reindex(df.index))[0] 93 | 94 | # fill missing values with 95 | if self.verbose: 96 | print("{:.0f} NaNs in transformed data".format( 97 | ser.isnull().sum())) 98 | print("{:.4f} mean weight of evidence".format(ser.mean())) 99 | 100 | df[col] = np.array(ser.fillna(self.fillna)) 101 | 102 | if not self.return_df: 103 | out = np.array(df) 104 | else: 105 | out = df 106 | 107 | return out 
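
# Usage sketch (illustrative only, not part of this module): the encoder follows
# the sklearn transformer API, so it can sit in front of any estimator in a
# Pipeline. The column names, data and classifier below are invented:
#
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.pipeline import Pipeline
#
#     pipe = Pipeline([
#         ('woe', WeightOfEvidenceEncoder(cols=['city', 'product'])),
#         ('clf', LogisticRegression()),
#     ])
#     pipe.fit(X_train, y_train)            # X_train must be a pandas DataFrame
#     probabilities = pipe.predict_proba(X_test)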
108 | 109 | 110 | class StringIndexer(BaseEstimator, TransformerMixin): 111 | def __init__(self): 112 | self.dictionaries = dict() 113 | self.columns = list() 114 | 115 | def fit(self, X, y=None): 116 | self.columns = X.columns.values 117 | for col in self.columns: 118 | categories = np.unique(X[col]) 119 | self.dictionaries[col] = dict( 120 | zip(categories, range(len(categories)))) 121 | return self 122 | 123 | def transform(self, X): 124 | column_array = [] 125 | for col in self.columns: 126 | dictionary = self.dictionaries[col] 127 | na_value = len(dictionary) + 1 128 | transformed_column = X[col].apply( 129 | lambda x: dictionary.get(x, na_value)).astype(int) 130 | column_array.append(transformed_column.values.reshape(-1, 1)) 131 | return np.hstack(column_array) 132 | 133 | 134 | class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): 135 | def __init__(self, with_stdevs=True): 136 | 137 | self.with_stdevs = with_stdevs 138 | self.means = {} 139 | self.stdevs = {} 140 | 141 | def fit(self, X, y=None): 142 | return self 143 | 144 | def transform(self, X): 145 | df = X.copy() 146 | for col in self.means.keys(): 147 | 148 | mean_col_name = "{:s}_MEAN".format(col) 149 | df[mean_col_name] = df.merge(pd.DataFrame(self.means[col]), 150 | how='left', left_on=[col], right_index=True)['y'] 151 | if self.with_stdevs: 152 | std_col_name = "{:s}_STD".format(col) 153 | df[std_col_name] = df.merge(pd.DataFrame(self.stdevs[col]), 154 | how='left', left_on=[col], right_index=True)['y'] 155 | 156 | df.drop(col, axis=1, inplace=True) 157 | 158 | return df 159 | 160 | def fit_transform(self, X, y): 161 | """will be used during pipeline fit""" 162 | df = X.copy() 163 | df['y'] = y 164 | for col in df.columns.difference(['y']): 165 | 166 | mean_col_name = "{:s}_MEAN".format(col) 167 | 168 | grouped = df.groupby(col)['y'] 169 | 170 | self.means[col] = grouped.mean() 171 | df[mean_col_name] = grouped.transform(self._loo_means) 172 | 173 | if self.with_stdevs: 174 | std_col_name = "{:s}_STD".format(col) 175 | self.stdevs[col] = grouped.std() 176 | df[std_col_name] = grouped.transform(self._loo_stdevs) 177 | 178 | df.drop(col, axis=1, inplace=True) 179 | 180 | df.drop('y', axis=1, inplace=True) 181 | return df 182 | 183 | def _loo_means(self, s): 184 | n = len(s) 185 | loo_means = (s.sum() - s) / (n - 1) 186 | return loo_means * np.random.normal(loc=1.0, scale=0.01, size=n) 187 | 188 | def _loo_stdevs(self, s): 189 | n = len(s) 190 | if n > 1: 191 | loo_means = self._loo_means(s) 192 | sum_of_sq = n * s.std() ** 2 193 | loo_stdevs = np.sqrt( 194 | abs((sum_of_sq - (s - s.mean()) * (s - loo_means))) / (n - 1)) 195 | else: 196 | loo_stdevs = np.array([0]) 197 | 198 | return loo_stdevs * np.random.normal(loc=1.0, scale=0.01, size=n) 199 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/preprocessing.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from sklearn.base import TransformerMixin 3 | 4 | 5 | class ColumnSelector(BaseEstimator, TransformerMixin): 6 | def __init__(self, columns): 7 | self.columns = columns 8 | 9 | def fit(self, X, y=None): 10 | return self 11 | 12 | def transform(self, X): 13 | try: 14 | return X[self.columns] 15 | except: 16 | print("Could not find selected columns {:s} in available columns {:s}".format( 17 | self.columns, X.columns)) 18 | raise 19 | -------------------------------------------------------------------------------- 
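
A minimal usage sketch combining the transformers above (illustrative only; the
column names and data are invented, and importing ColumnSelector from its
submodule is an assumption based on the package layout shown here, since it is
not re-exported in the package __init__):

    import pandas as pd
    from sklearn.pipeline import Pipeline
    from bdranalytics.sklearn.preprocessing import StringIndexer
    from bdranalytics.sklearn.preprocessing.preprocessing import ColumnSelector

    X = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'],
                      'size': ['S', 'M', 'M', 'L'],
                      'price': [10.0, 12.5, 9.9, 11.0]})

    # keep only the categorical columns, then map each category to an integer code
    pipe = Pipeline([
        ('select', ColumnSelector(['color', 'size'])),
        ('index', StringIndexer()),
    ])
    X_indexed = pipe.fit_transform(X)  # numpy array of integer codes, one column per selected column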
/bdranalytics/sklearn/preprocessing/scaling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, RegressorMixin 3 | 4 | 5 | class ScaledRegressor(BaseEstimator, RegressorMixin): 6 | """Allows a regressor to work with a scaled target if it does not allow scaling itself. 7 | 8 | When fitting, the `y` will be transform using the `scaler`, before being passed to the `model.fit`. 9 | When predicting, the predicted y will be inverse transformed to obtain a y_hat in the original range of values. 10 | 11 | For example, this allows your regressor to predict manipulated targets (ie `log(y)`), without additional pre and 12 | postprocessing outside your sklearn pipeline 13 | 14 | Parameters 15 | ---------- 16 | scaler : TransformerMixin 17 | The transformer which will be applied on the target before it is passed to the `model` 18 | 19 | estimator : RegressorMixin 20 | The regressor which will work in transformed target space 21 | 22 | Attributes 23 | ---------- 24 | 25 | Examples 26 | >>> from sklearn.linear_model import LinearRegression 27 | >>> from sklearn.preprocessing import StandardScaler 28 | >>> from sklearn.pipeline import Pipeline 29 | >>> n_rows = 10 30 | >>> X = np.random.rand(n_rows, 2) 31 | >>> y = np.random.rand(n_rows) 32 | >>> regressor = LinearRegression() 33 | >>> scaler = StandardScaler() 34 | >>> pipeline = Pipeline([("predict", ScaledRegressor(scaler, regressor))]) 35 | >>> y_hat = pipeline.fit(X, y).predict(X) 36 | """ 37 | 38 | def __init__(self, scaler, estimator): 39 | self.estimator = estimator 40 | self.scaler = scaler 41 | 42 | @staticmethod 43 | def _to_matrix(vector): 44 | return np.reshape(vector, (-1, 1)) 45 | 46 | @staticmethod 47 | def _to_vector(matrix): 48 | return np.reshape(matrix, -1) 49 | 50 | def fit(self, X, y): 51 | y_scaled = self.scaler.fit_transform(self._to_matrix(y)) 52 | self.estimator.fit(X, self._to_vector(y_scaled)) 53 | 54 | def predict(self, X): 55 | return self._to_vector( 56 | self.scaler.inverse_transform( 57 | self._to_matrix( 58 | self.estimator.predict(X) 59 | ) 60 | ) 61 | ) 62 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/sklearn/preprocessing/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/tests/test_encoding.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from bdranalytics.sklearn.preprocessing import WeightOfEvidenceEncoder 8 | 9 | 10 | class TestEncoding(unittest.TestCase): 11 | 12 | def verify_numeric(self, X_test): 13 | for dt in X_test.dtypes: 14 | numeric = False 15 | if np.issubdtype(dt, int) or np.issubdtype(dt, float): 16 | numeric = True 17 | 18 | self.assertTrue(numeric) 19 | 20 | @staticmethod 21 | def create_dataset(n_rows=1000): 22 | """ 23 | Creates a data set with some categorical variables 24 | """ 25 | ds = [[ 26 | random.random(), 27 | random.random(), 28 | random.choice(['A', 'B', 'C']), 29 | random.choice(['A', 'B', 'C']), 30 | random.choice(['A', 'B', 'C', None]), 31 | random.choice(['A', 'B', 'C']) 32 | ] for _ 
in range(n_rows)] 33 | 34 | X = pd.DataFrame(ds, columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']) 35 | y = np.random.randint(2, size=(n_rows,)) 36 | 37 | return X, y 38 | 39 | def test_weight_of_evidence(self): 40 | """ 41 | Unit test for WeightOfEvidenceEncoder class 42 | """ 43 | # generate some training data 44 | cols = ['c3', 'c4', 'c5', 'c6'] 45 | n_rows = 100 46 | X_train, y_train = self.create_dataset(n_rows=n_rows) 47 | 48 | # independent data set to-be-transformed 49 | X_test, _ = self.create_dataset(n_rows=10) 50 | 51 | # add unseen category to catch NaN filling behavior 52 | X_test.loc[0, 'c3'] = 'Z' 53 | 54 | # data frame case 55 | enc = WeightOfEvidenceEncoder(verbose=1, cols=cols) 56 | enc.fit(X_train, y_train) 57 | self.verify_numeric(enc.transform(X_test)) 58 | 59 | # numpy array case 60 | enc_np = WeightOfEvidenceEncoder(verbose=0, return_df=False, cols=cols) 61 | enc_np.fit(X_train, y_train) 62 | output_array_enc_np = enc_np.transform( 63 | X_test) # save for following tests 64 | self.assertTrue(isinstance(output_array_enc_np, np.ndarray)) 65 | 66 | # external dep var, DIFFERENT from y_train 67 | enc_ext = WeightOfEvidenceEncoder(verbose=1, cols=cols, return_df=False, 68 | dependent_variable_values=np.random.randint(2, size=(n_rows,))) 69 | enc_ext.fit(X_train, y_train) 70 | self.assertTrue(np.array_equal(output_array_enc_np, 71 | enc_ext.transform(X_test)) is False) 72 | 73 | # external dep var, SAME y_train 74 | enc_ext = WeightOfEvidenceEncoder(verbose=1, cols=cols, return_df=False, 75 | dependent_variable_values=y_train) 76 | enc_ext.fit(X_train, y_train) 77 | self.assertTrue(np.array_equal(output_array_enc_np, 78 | enc_ext.transform(X_test)) is True) 79 | 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/tests/test_scaling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.dummy import DummyRegressor 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | from bdranalytics.sklearn.preprocessing import ScaledRegressor 9 | 10 | 11 | class TestPreprocessing(unittest.TestCase): 12 | 13 | @staticmethod 14 | def create_regression_dataset(n_rows=1000): 15 | """ 16 | Creates a data set with only numerical data 17 | """ 18 | X = np.random.rand(n_rows, 2) 19 | y = np.random.rand(n_rows) 20 | return X, y 21 | 22 | def test_dummy_pipeline(self): 23 | """ 24 | Just checking setup of a dummy regressor in a pipeline 25 | :return: None 26 | """ 27 | X, y = self.create_regression_dataset(n_rows=20) 28 | predictor_constant = 3 29 | predictor = DummyRegressor( 30 | strategy="constant", constant=predictor_constant) 31 | y_hat = Pipeline([("predict", predictor)]).fit(X, y).predict(X) 32 | np.allclose(y_hat, np.repeat(predictor_constant, len(y))) 33 | 34 | def test_scaled_target(self): 35 | X, y = self.create_regression_dataset(n_rows=20) 36 | y_mean = np.mean(y) 37 | predictor_constant = 0 # 0 will be multiplied by std , and then added to the mean 38 | predictor = DummyRegressor( 39 | strategy="constant", constant=predictor_constant) 40 | scaler = StandardScaler() 41 | y_hat = Pipeline([("predict", ScaledRegressor(scaler, predictor))]).fit( 42 | X, y).predict(X) 43 | np.allclose(y_hat, np.repeat(y_mean, len(y))) 44 | 45 | def test_scaled_target_with_set_params(self): 46 | X, y = self.create_regression_dataset(n_rows=20) 
47 | y_mean = np.mean(y) 48 | predictor_constant = 10 # 0 will be multiplied by std , and then added to the mean 49 | predictor = DummyRegressor( 50 | strategy="constant", constant=predictor_constant) 51 | scaler = StandardScaler() 52 | pipeline = Pipeline([("predict", ScaledRegressor(scaler, predictor))]) 53 | pipeline.set_params(predict__estimator__constant=0) 54 | y_hat = pipeline.fit(X, y).predict(X) 55 | np.allclose(y_hat, np.repeat(y_mean, len(y))) 56 | 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/sklearn/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/sklearn/tests/test_model_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import unittest 4 | 5 | from bdranalytics.sklearn.model_selection import GrowingWindow, IntervalGrowingWindow 6 | 7 | 8 | def create_time_series_data_set(start_date=pd.datetime(year=2000, month=1, day=1), n_rows=100): 9 | 10 | end_date = start_date + pd.Timedelta(days=n_rows-1) 11 | 12 | ds = np.random.rand(n_rows) 13 | 14 | X = pd.DataFrame(ds, 15 | columns=['variable'], 16 | index=pd.date_range(start_date, end_date)) 17 | 18 | y = np.random.randint(2, size=(n_rows,)) 19 | 20 | return X, y 21 | 22 | 23 | class TestGrowingWindow(unittest.TestCase): 24 | 25 | def test_n_splits(self): 26 | assert GrowingWindow(4).get_n_splits(np.arange(15).reshape(3, 5)) == 4 27 | 28 | def test_n_splits_returned(self): 29 | assert len(list(GrowingWindow(4).split( 30 | np.arange(15).reshape(3, 5), np.arange(3)))) == 4 31 | 32 | def test_n_splits_testsize(self): 33 | for train, test in GrowingWindow(4).split(np.arange(15).reshape(5, 3), np.arange(5)): 34 | assert len(test) == 1 35 | 36 | def test_n_splits_testsize2(self): 37 | for i, (train, test) in zip(range(4), GrowingWindow(4).split(np.arange(15).reshape(5, 3), np.arange(5))): 38 | assert len(train) == i+1 39 | 40 | 41 | class TestIntervalGrowingWindow(unittest.TestCase): 42 | 43 | def test_split_on_index(self): 44 | 45 | X, y = create_time_series_data_set() 46 | 47 | cv = IntervalGrowingWindow( 48 | test_start_date=pd.datetime(year=2000, month=2, day=1), 49 | test_end_date=pd.datetime(year=2000, month=3, day=1), 50 | test_size='7D') 51 | 52 | self.assertTrue(len(list(cv.split(X, y))) == 4) 53 | 54 | def test_split_on_array(self): 55 | 56 | X, y = create_time_series_data_set() 57 | 58 | test_size_in_days = 7 59 | 60 | cv = IntervalGrowingWindow( 61 | timestamps=X.index.values, 62 | test_start_date=pd.datetime(year=2000, month=2, day=1), 63 | test_end_date=pd.datetime(year=2000, month=3, day=1), 64 | test_size=pd.Timedelta(days=test_size_in_days)) 65 | 66 | self.assertTrue(len(list(cv.split(X, y))) == 4) 67 | 68 | def test_split_test_size(self): 69 | 70 | X, y = create_time_series_data_set() 71 | 72 | test_size_in_days = 7 73 | 74 | cv = IntervalGrowingWindow( 75 | test_start_date=pd.datetime(year=2000, month=2, day=1), 76 | test_end_date=pd.datetime(year=2000, month=3, day=1), 77 | test_size=pd.Timedelta(days=test_size_in_days)) 78 | 79 | for _, test in cv.split(X, y): 80 | self.assertTrue(len(test) == test_size_in_days) 81 | 82 | def 
test_split_with_train_size(self): 83 | 84 | X, y = create_time_series_data_set() 85 | 86 | train_size_in_days = 14 87 | 88 | cv = IntervalGrowingWindow( 89 | test_start_date=pd.datetime(year=2000, month=2, day=1), 90 | test_end_date=pd.datetime(year=2000, month=3, day=1), 91 | test_size=pd.Timedelta(days=7), 92 | train_size=pd.Timedelta(days=train_size_in_days)) 93 | 94 | for train, _ in cv.split(X, y): 95 | self.assertTrue(len(train) == train_size_in_days) 96 | 97 | def test_n_splits(self): 98 | 99 | X, y = create_time_series_data_set() 100 | 101 | cv = IntervalGrowingWindow( 102 | test_start_date=pd.datetime(year=2000, month=2, day=1), 103 | test_end_date=pd.datetime(year=2000, month=3, day=1), 104 | test_size=pd.Timedelta(days=7)) 105 | 106 | self.assertTrue(cv.get_n_splits(X) == 4) 107 | -------------------------------------------------------------------------------- /data/recruit.dat: -------------------------------------------------------------------------------- 1 | 68.63 2 | 68.63 3 | 68.63 4 | 68.63 5 | 68.63 6 | 68.63 7 | 59.16 8 | 48.7 9 | 47.54 10 | 50.91 11 | 44.7 12 | 42.85 13 | 39.62 14 | 44.45 15 | 38.98 16 | 42.62 17 | 48.27 18 | 59.39 19 | 51.66 20 | 38.55 21 | 60.33 22 | 72.27 23 | 68.62 24 | 69.63 25 | 72.2 26 | 67.87 27 | 64.91001 28 | 53.85 29 | 37.96 30 | 23.23 31 | 12.68 32 | 9.84 33 | 7.82 34 | 11.78 35 | 10.22 36 | 12.19 37 | 18.6 38 | 26.97 39 | 22.52 40 | 19.18 41 | 17.14 42 | 18.61 43 | 20.02 44 | 22.65 45 | 38.99 46 | 76.55 47 | 87.99 48 | 99.8 49 | 96.69 50 | 87.45 51 | 88.57 52 | 97.43 53 | 99.99 54 | 94.88 55 | 86.99 56 | 79.73001 57 | 92.35 58 | 91.29 59 | 94.31 60 | 84.95 61 | 82.97 62 | 92.98001 63 | 81.06 64 | 62.37 65 | 52.99 66 | 39.53 67 | 42.9 68 | 33.76 69 | 40.97 70 | 60.5 71 | 66.61 72 | 80.38 73 | 95.86 74 | 97.74 75 | 80.24 76 | 73.44 77 | 65.67 78 | 47.81 79 | 33.51 80 | 34.22 81 | 32.95 82 | 32.55 83 | 46.92 84 | 44.64 85 | 53.02 86 | 41.98 87 | 30.43 88 | 24.43 89 | 18.05 90 | 20.98 91 | 12.37 92 | 12.03 93 | 12.41 94 | 15.89 95 | 20.46 96 | 26.95 97 | 30.29 98 | 26.21 99 | 23.34 100 | 25.55 101 | 25.4 102 | 24.16 103 | 23.34 104 | 24.38 105 | 27.2 106 | 29.18 107 | 43.3 108 | 53.92 109 | 59.76 110 | 64.52 111 | 65.84 112 | 70.19 113 | 75.27 114 | 77.63 115 | 76.96 116 | 77.7 117 | 85.13 118 | 99.33999 119 | 97.7 120 | 97.01999 121 | 98.83 122 | 99.8 123 | 96.74 124 | 93.54 125 | 82.08 126 | 71.51 127 | 49.61 128 | 41.76 129 | 59.78 130 | 90.97 131 | 85.26 132 | 100 133 | 98.5 134 | 98.7 135 | 93.1 136 | 79.37 137 | 81.97 138 | 55.94 139 | 50.39 140 | 48.64 141 | 40.12 142 | 63.04 143 | 60.51 144 | 78.72 145 | 71.37 146 | 76.43 147 | 71.25 148 | 56.46 149 | 41.84 150 | 41.24 151 | 35.28 152 | 39.86 153 | 45.13 154 | 53.8 155 | 77.54 156 | 80.02 157 | 81.28 158 | 73.58 159 | 66.06 160 | 59.46 161 | 59.49 162 | 51.9 163 | 35.21 164 | 39.65 165 | 31.9 166 | 61.56 167 | 88.25 168 | 96.46 169 | 83.94 170 | 89.05 171 | 92.97 172 | 98.29 173 | 99.79 174 | 94.01 175 | 87.06 176 | 80.46 177 | 74.86 178 | 67.18 179 | 73.85 180 | 80.15 181 | 69.45 182 | 50.39 183 | 31.68 184 | 31.25 185 | 23.11 186 | 11.32 187 | 8.96 188 | 6.03 189 | 11.7 190 | 34.63 191 | 58.31 192 | 58.66 193 | 72.62 194 | 85.76 195 | 94.29 196 | 92.76999 197 | 93.18 198 | 89.32 199 | 81.63 200 | 71.44 201 | 66.42 202 | 80.02 203 | 76.52 204 | 77.51 205 | 67.73001 206 | 50.52 207 | 48.97 208 | 50.64 209 | 38.73 210 | 30.79 211 | 23.75 212 | 26.28 213 | 36.67 214 | 68.91001 215 | 97.39 216 | 96.1 217 | 90.3 218 | 84.92 219 | 91.41001 220 | 92.54 221 | 98.04 222 | 99.96 223 | 88.83 224 
| 83.07 225 | 86.32 226 | 99.83 227 | 96.62 228 | 99.94 229 | 96.89 230 | 85.12 231 | 77.97 232 | 67.38 233 | 44.5 234 | 26.72 235 | 13.25 236 | 10.64 237 | 23.83 238 | 29.18 239 | 26.91 240 | 20.09 241 | 22.33 242 | 22.07 243 | 26.2 244 | 29.81 245 | 30.1 246 | 24.25 247 | 25.3 248 | 23.5 249 | 35.62 250 | 52.11 251 | 56.79 252 | 69.09 253 | 86.64 254 | 99.28 255 | 98.48001 256 | 98.45 257 | 94.76999 258 | 93.58 259 | 78.07 260 | 66.88 261 | 77.04 262 | 88.72 263 | 94.88 264 | 99.67 265 | 100 266 | 99.9 267 | 96.91001 268 | 66.88 269 | 52.39 270 | 40.61 271 | 30.65 272 | 32.04 273 | 45.28 274 | 35 275 | 35.62 276 | 36.98 277 | 39.89 278 | 36.88 279 | 30.85 280 | 19.33 281 | 13.26 282 | 11.12 283 | 9.140001 284 | 8.21 285 | 10.76 286 | 10.43 287 | 13.75 288 | 37.91 289 | 41.85 290 | 44.67 291 | 50.57 292 | 50.34 293 | 49.54 294 | 56.93 295 | 60.16 296 | 57.47 297 | 71.68 298 | 97.28 299 | 62.09 300 | 59.97 301 | 51.18 302 | 51.48 303 | 66.08 304 | 86.39 305 | 93.58 306 | 99.9 307 | 93.86 308 | 82.82 309 | 84.83999 310 | 89.51 311 | 86.89 312 | 87.15 313 | 78.47 314 | 55.93 315 | 41.27 316 | 19.66 317 | 9.439999 318 | 4.66 319 | 2.36 320 | 1.72 321 | 3.32 322 | 12.13 323 | 16.81 324 | 24.3 325 | 52.42 326 | 58.05 327 | 59.42 328 | 57.52 329 | 60.13 330 | 64.68 331 | 74.94 332 | 69.73001 333 | 77.11 334 | 97.93 335 | 98.74 336 | 98.88 337 | 90.41001 338 | 77.86 339 | 61.48 340 | 47.66 341 | 30.74 342 | 20.11 343 | 12.08 344 | 8.97 345 | 20.03 346 | 71.54 347 | 97.51999 348 | 95.14 349 | 92.22 350 | 80.09 351 | 74.59 352 | 83.66001 353 | 87.36 354 | 96.63 355 | 93.36 356 | 94.7 357 | 99.66001 358 | 91.6 359 | 89.98001 360 | 99.39 361 | 99.46 362 | 99.37 363 | 99.51999 364 | 96.64 365 | 89.55 366 | 68.67 367 | 65.02 368 | 61.82 369 | 76.92 370 | 80.17 371 | 77.48001 372 | 82.34 373 | 74.11 374 | 69.03 375 | 79.48001 376 | 78.76 377 | 67.55 378 | 59.98 379 | 44.35 380 | 41.18 381 | 71.53 382 | 95.51999 383 | 93.48001 384 | 98.18 385 | 70.48001 386 | 77.63 387 | 88.11 388 | 93.15 389 | 99.01 390 | 93.31 391 | 81.21 392 | 79.63 393 | 80.67 394 | 85.63 395 | 88.66001 396 | 93.65 397 | 95.49 398 | 98.26999 399 | 86.19 400 | 79.69 401 | 72.26 402 | 35.06 403 | 20.98 404 | 29.67 405 | 42.09 406 | 52.96 407 | 69.45 408 | 76.86 409 | 86.19 410 | 96 411 | 96.07 412 | 86.85 413 | 76.66001 414 | 61.47 415 | 46.26 416 | 40.15 417 | 72.59 418 | 85.17 419 | 91.74 420 | 99.22 421 | 76.55 422 | 64.17 423 | 69.2 424 | 70.37 425 | 79.55 426 | 74.79 427 | 70.9 428 | 78.86 429 | 84.28 430 | 83.43 431 | 85.55 432 | 80.17 433 | 90.82 434 | 99.39 435 | 99.18 436 | 89.1 437 | 82.18 438 | 77.64 439 | 55.93 440 | 49.73 441 | 70.12 442 | 79.2 443 | 87.83 444 | 88.2 445 | 94.83 446 | 98.66001 447 | 94.83999 448 | 83.06 449 | 61.42 450 | 47.47 451 | 31.81 452 | 22.95 453 | 17.87 454 | -------------------------------------------------------------------------------- /data/soi.dat: -------------------------------------------------------------------------------- 1 | .377 2 | .246 3 | .311 4 | .104 5 | -.016 6 | .235 7 | .137 8 | .191 9 | -.016 10 | .29 11 | .038 12 | -.016 13 | -.158 14 | .366 15 | .607 16 | -.355 17 | -.18 18 | .268 19 | .093 20 | .027 21 | .246 22 | .202 23 | .432 24 | .617 25 | .76 26 | .891 27 | .607 28 | .574 29 | .005 30 | .475 31 | .202 32 | -.027 33 | -.038 34 | .716 35 | .836 36 | .891 37 | .53 38 | .53 39 | .377 40 | -.235 41 | -.585 42 | -.18 43 | -.53 44 | -.464 45 | -.443 46 | .049 47 | .454 48 | .257 49 | .41 50 | .224 51 | .148 52 | -.432 53 | -.093 54 | -.268 55 | .158 56 | -.06 57 | -.399 58 | 
.235 59 | .366 60 | .202 61 | .344 62 | -.038 63 | .29 64 | -.126 65 | -.366 66 | -.115 67 | -.301 68 | -.486 69 | -.137 70 | .738 71 | .366 72 | .366 73 | .65 74 | .628 75 | .126 76 | .169 77 | .137 78 | -.257 79 | .169 80 | -.093 81 | .475 82 | .639 83 | .596 84 | .749 85 | .191 86 | 1 87 | .486 88 | .41 89 | .158 90 | .126 91 | .06 92 | .246 93 | .738 94 | .803 95 | .421 96 | .617 97 | .705 98 | .639 99 | .454 100 | .311 101 | .355 102 | -.158 103 | -.038 104 | .115 105 | .137 106 | .257 107 | .115 108 | .038 109 | .082 110 | .148 111 | .06 112 | -.191 113 | -.607 114 | -.585 115 | -.268 116 | -.093 117 | -.093 118 | .257 119 | -.005 120 | .224 121 | .169 122 | .432 123 | .202 124 | -.366 125 | -.661 126 | .093 127 | -.716 128 | .148 129 | -.093 130 | .279 131 | .432 132 | -.104 133 | .607 134 | .18 135 | .071 136 | .246 137 | -.432 138 | .06 139 | -.388 140 | .202 141 | -.104 142 | .191 143 | .475 144 | .552 145 | .169 146 | .333 147 | -.005 148 | -.038 149 | -.126 150 | -.443 151 | -.016 152 | .027 153 | .333 154 | .355 155 | .344 156 | .115 157 | .311 158 | .596 159 | .005 160 | .388 161 | -.5080001 162 | -.552 163 | -.858 164 | -.596 165 | .06 166 | .071 167 | .224 168 | .279 169 | .322 170 | .18 171 | .126 172 | .082 173 | .126 174 | -.213 175 | -.235 176 | .169 177 | .388 178 | .519 179 | .126 180 | .41 181 | .738 182 | .443 183 | .596 184 | -.137 185 | -.486 186 | -.377 187 | .268 188 | -.169 189 | -.213 190 | -.191 191 | .377 192 | .126 193 | .311 194 | .388 195 | .388 196 | .18 197 | -.333 198 | .049 199 | -.082 200 | .191 201 | .399 202 | .093 203 | .005 204 | .366 205 | .333 206 | .377 207 | .071 208 | -.158 209 | -.541 210 | -.683 211 | -.683 212 | -.344 213 | -.301 214 | .202 215 | -.126 216 | .082 217 | .115 218 | .464 219 | .104 220 | -.18 221 | -.672 222 | .104 223 | -.486 224 | .093 225 | .279 226 | .104 227 | .18 228 | .475 229 | .53 230 | .661 231 | .355 232 | -.344 233 | -.038 234 | .235 235 | .486 236 | .169 237 | .333 238 | .137 239 | .169 240 | .333 241 | .705 242 | .377 243 | .585 244 | -.126 245 | -.213 246 | .148 247 | -.126 248 | -.344 249 | -.5080001 250 | .279 251 | .082 252 | .355 253 | .082 254 | .574 255 | .355 256 | -.257 257 | -.377 258 | -.344 259 | -.65 260 | -.093 261 | -.137 262 | .115 263 | .705 264 | .399 265 | .366 266 | .366 267 | .038 268 | -.268 269 | .322 270 | .082 271 | .082 272 | .06 273 | .268 274 | .421 275 | .77 276 | .803 277 | .661 278 | .716 279 | .628 280 | .191 281 | .53 282 | .18 283 | -.464 284 | .279 285 | .355 286 | .202 287 | .552 288 | .617 289 | .104 290 | .224 291 | .432 292 | -.213 293 | -.967 294 | -1 295 | -.432 296 | -.683 297 | -.5080001 298 | -.137 299 | .082 300 | -.06 301 | .333 302 | .355 303 | .388 304 | -.06 305 | -.202 306 | .005 307 | -.115 308 | .137 309 | .454 310 | .388 311 | .738 312 | .77 313 | .803 314 | .836 315 | .65 316 | .005 317 | -.366 318 | .071 319 | -.005 320 | -.454 321 | .191 322 | .432 323 | .683 324 | .388 325 | .235 326 | -.093 327 | .475 328 | -.082 329 | -.617 330 | -.093 331 | -.071 332 | .432 333 | .421 334 | .497 335 | .454 336 | .596 337 | .552 338 | .607 339 | .464 340 | -.301 341 | -.77 342 | -.65 343 | -.694 344 | -.137 345 | -.464 346 | -.246 347 | .169 348 | -.202 349 | .082 350 | .607 351 | -.158 352 | -.454 353 | -.781 354 | -.388 355 | .18 356 | .06 357 | -.5080001 358 | -.268 359 | .224 360 | .115 361 | .399 362 | .027 363 | .016 364 | -.377 365 | -.18 366 | -.038 367 | -.202 368 | .137 369 | .093 370 | -.279 371 | -.038 372 | .235 373 | .169 374 | .399 375 | .158 376 | -.541 
377 | -.628 378 | .027 379 | -.738 380 | -.913 381 | -.027 382 | .071 383 | -.126 384 | .049 385 | .454 386 | .301 387 | -.005 388 | -.093 389 | -.224 390 | -.213 391 | -.311 392 | -.235 393 | -.322 394 | .322 395 | .093 396 | .115 397 | .77 398 | .607 399 | -.158 400 | -.235 401 | -.202 402 | -.333 403 | -.158 404 | -.224 405 | -.355 406 | .005 407 | .454 408 | .41 409 | .519 410 | .541 411 | .301 412 | -.574 413 | -.344 414 | -.279 415 | -.716 416 | -.869 417 | -.596 418 | -.29 419 | -.454 420 | -.246 421 | -.607 422 | -.563 423 | -.235 424 | -.246 425 | -.399 426 | -.333 427 | -.53 428 | -.049 429 | .158 430 | .115 431 | .322 432 | .115 433 | .049 434 | .454 435 | .158 436 | -.421 437 | -.268 438 | -.311 439 | -.115 440 | -.322 441 | -.322 442 | .126 443 | .333 444 | .519 445 | .399 446 | .519 447 | .432 448 | .355 449 | -.126 450 | -.5080001 451 | -.388 452 | .388 453 | .071 454 | -------------------------------------------------------------------------------- /data/soi_description.txt: -------------------------------------------------------------------------------- 1 | https://crudata.uea.ac.uk/cru/data/soi/ 2 | -------------------------------------------------------------------------------- /data/test.dat: -------------------------------------------------------------------------------- 1 | stub -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = bdr-analytics-py 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Internal variables. 12 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 13 | 14 | # Put it first so that "make" without argument is like "make help". 15 | help: 16 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 17 | 18 | .PHONY: help Makefile clean 19 | 20 | clean: 21 | rm -Rf source 22 | rm -Rf $(BUILDDIR) 23 | 24 | # This target depends on all .py files in bdranalytics folder 25 | # Thus if any one changed, also the api doc will be updated 26 | # If there are py files for which no apidoc is created, using -f does not work: 27 | # None of the apidocs will be updated (nothing changed), but the py file will 28 | # be newer (as there is no corresponding api doc file) 29 | # Therefore we remove the source dir first 30 | #source: ../bdranalytics 31 | source: $(shell find ../bdranalytics -type f -name '*.py') 32 | -rm -Rf source 33 | sphinx-apidoc -f -M -T -o source/ -H "$(SPHINXPROJ)" ../bdranalytics 34 | 35 | html: Makefile source 36 | mkdir -p $(BUILDDIR)/html $(BUILDDIR)/doctrees 37 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 38 | @echo 39 | @echo "Build finished. The HTML pages are in build/html." 40 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # codepy documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Feb 4 16:32:10 2009. 5 | 6 | # encoding=utf8 7 | import sys 8 | import os 9 | 10 | # This file is execfile()d with the current directory set to its containing dir. 
11 | # 12 | # The contents of this file are pickled, so don't put values in the namespace 13 | # that aren't pickleable (module imports are okay, they're removed automatically). 14 | # 15 | # Note that not all possible configuration values are present in this 16 | # autogenerated file. 17 | # 18 | # All configuration values have a default; values that are commented out 19 | # serve to show the default. 20 | 21 | # If your extensions are in another directory, add it here. If the directory 22 | # is relative to the documentation root, use os.path.abspath to make it 23 | # absolute, like shown here. 24 | sys.path.insert(0, os.path.abspath('..')) 25 | 26 | 27 | # General configuration 28 | # --------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be extensions 31 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 32 | extensions = [ 33 | 'sphinx.ext.autodoc', 34 | 'sphinx.ext.doctest', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.viewcode', 37 | 'sphinx.ext.githubpages', 38 | 'sphinx.ext.intersphinx' 39 | ] 40 | 41 | # pngmath / imgmath compatibility layer for different sphinx versions 42 | import sphinx 43 | from distutils.version import LooseVersion 44 | if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): 45 | extensions.append('sphinx.ext.pngmath') 46 | else: 47 | extensions.append('sphinx.ext.imgmath') 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix of source filenames. 53 | source_suffix = '.rst' 54 | 55 | # The encoding of source files. 56 | #source_encoding = 'utf-8' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # General information about the project. 62 | project = u'bdr-analytics-py' 63 | copyright = u'2017, BigData Republic' 64 | author = u'Gerben Oostra, Benoit Descamps, Alexander Backus, Steven Reitsma, Tom de Ruijter' 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | import re 72 | ver_re = re.compile(r'version\s*=\s*\'([0-9a-z.]+)\'') 73 | version = [ver_re.search(line).group(1) 74 | for line in open("../setup.py").readlines() 75 | if ver_re.search(line) 76 | ][0] 77 | # The full version, including alpha/beta/rc tags. 78 | release = version 79 | 80 | # The language for content autogenerated by Sphinx. Refer to documentation 81 | # for a list of supported languages. 82 | language = None 83 | 84 | # There are two options for replacing |today|: either, you set today to some 85 | # non-false value, then it is used: 86 | #today = '' 87 | # Else, today_fmt is used as the format for a strftime call. 88 | #today_fmt = '%B %d, %Y' 89 | 90 | # List of documents that shouldn't be included in the build. 91 | #unused_docs = [] 92 | 93 | # List of patterns, relative to source directory, that match files and 94 | # directories to ignore when looking for source files. 95 | # This patterns also effect to html_static_path and html_extra_path 96 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 97 | 98 | # List of directories, relative to source directory, that shouldn't be searched 99 | # for source files. 100 | exclude_trees = [] 101 | 102 | # The reST default role (used for this markup: `text`) to use for all documents. 103 | #default_role = None 104 | 105 | # If true, '()' will be appended to :func: etc. 
cross-reference text. 106 | #add_function_parentheses = True 107 | 108 | # If true, the current module name will be prepended to all description 109 | # unit titles (such as .. function::). 110 | #add_module_names = True 111 | 112 | # If true, sectionauthor and moduleauthor directives will be shown in the 113 | # output. They are ignored by default. 114 | #show_authors = False 115 | 116 | # The name of the Pygments (syntax highlighting) style to use. 117 | pygments_style = 'sphinx' 118 | 119 | # If true, `todo` and `todoList` produce output, else they produce nothing. 120 | todo_include_todos = False 121 | 122 | # Options for HTML output 123 | # ----------------------- 124 | 125 | html_theme = "sphinx_rtd_theme" 126 | 127 | html_theme_options = {} 128 | 129 | html_sidebars = { 130 | '**': [ 131 | 'about.html', 132 | 'navigation.html', 133 | 'relations.html', 134 | 'searchbox.html', 135 | ] 136 | } 137 | 138 | # The style sheet to use for HTML and HTML Help pages. A file of that name 139 | # must exist either in Sphinx' static/ path, or in one of the custom paths 140 | # given in html_static_path. 141 | #html_style = 'default.css' 142 | 143 | # The name for this set of Sphinx documents. If None, it defaults to 144 | # " v documentation". 145 | #html_title = None 146 | 147 | # A shorter title for the navigation bar. Default is the same as html_title. 148 | #html_short_title = None 149 | 150 | # The name of an image file (relative to this directory) to place at the top 151 | # of the sidebar. 152 | #html_logo = None 153 | 154 | # The name of an image file (within the static path) to use as favicon of the 155 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 156 | # pixels large. 157 | #html_favicon = None 158 | 159 | # Add any paths that contain custom static files (such as style sheets) here, 160 | # relative to this directory. They are copied after the builtin static files, 161 | # so a file named "default.css" will overwrite the builtin "default.css". 162 | html_static_path = ['_static'] 163 | 164 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 165 | # using the given strftime format. 166 | #html_last_updated_fmt = '%b %d, %Y' 167 | 168 | # If true, SmartyPants will be used to convert quotes and dashes to 169 | # typographically correct entities. 170 | #html_use_smartypants = True 171 | 172 | # Custom sidebar templates, maps document names to template names. 173 | #html_sidebars = {} 174 | 175 | # Additional templates that should be rendered to pages, maps page names to 176 | # template names. 177 | #html_additional_pages = {} 178 | 179 | # If false, no module index is generated. 180 | html_use_modindex = False 181 | 182 | # If false, no index is generated. 183 | html_use_index = False 184 | 185 | # If true, the index is split into individual pages for each letter. 186 | #html_split_index = False 187 | 188 | # If true, the reST sources are included in the HTML build as _sources/. 189 | #html_copy_source = True 190 | 191 | # If true, an OpenSearch description file will be output, and all pages will 192 | # contain a tag referring to it. The value of this option must be the 193 | # base URL from which the finished HTML is served. 194 | #html_use_opensearch = '' 195 | 196 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 197 | #html_file_suffix = '' 198 | 199 | # Output file base name for HTML help builder. 
200 | htmlhelp_basename = 'bdr-analytics-pydoc' 201 | 202 | 203 | # Options for LaTeX output 204 | # ------------------------ 205 | 206 | # The paper size ('letter' or 'a4'). 207 | #latex_paper_size = 'letter' 208 | 209 | # The font size ('10pt', '11pt' or '12pt'). 210 | #latex_font_size = '10pt' 211 | 212 | # Grouping the document tree into LaTeX files. List of tuples 213 | # (source start file, target name, title, author, document class [howto/manual]). 214 | latex_documents = [ 215 | (master_doc, 'bdr-analytics-py.tex', u'bdr-analytics-py Documentation', 216 | u'Gerben Oostra, Benoit Descamps, Alexander Backus, Steven Reitsma', 'manual'), 217 | ] 218 | 219 | man_pages = [ 220 | (master_doc, 'bdr-analytics-py', u'bdr-analytics-py Documentation', 221 | [author], 1) 222 | ] 223 | 224 | # The name of an image file (relative to this directory) to place at the top of 225 | # the title page. 226 | #latex_logo = None 227 | 228 | # For "manual" documents, if this is true, then toplevel headings are parts, 229 | # not chapters. 230 | #latex_use_parts = False 231 | 232 | # Additional stuff for the LaTeX preamble. 233 | #latex_preamble = '' 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #latex_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #latex_use_modindex = True 240 | 241 | autoclass_content = "both" 242 | 243 | intersphinx_mapping = { 244 | 'http://docs.python.org/dev': None, 245 | 'http://docs.scipy.org/doc/numpy/': None 246 | } 247 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to bdr-analytics-py's documentation! 2 | ============================================ 3 | 4 | bdr-analytics-py is a collection of tools to simplify data science. 5 | 6 | Contents 7 | -------- 8 | .. toctree:: 9 | :maxdepth: 1 10 | 11 | source/bdranalytics.images 12 | source/bdranalytics.keras 13 | source/bdranalytics.pdlearn 14 | source/bdranalytics.plot 15 | source/bdranalytics.sklearn 16 | -------------------------------------------------------------------------------- /doc/push_to_pages.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Push HTML files to gh-pages automatically. 3 | 4 | if [[ $TRAVIS_BRANCH != "master" || $TRAVIS_PULL_REQUEST != false ]]; then exit; fi 5 | 6 | # Fill this out with the correct org/repo 7 | ORG=BigDataRepublic 8 | REPO=bdr-analytics-py 9 | # This probably should match an email for one of your users. 10 | EMAIL=info@bigdatarepublic.nl 11 | 12 | set -e 13 | 14 | # Script is located in and initiated from project root. 15 | # Clone the gh-pages branch outside of the repo and cd into it. 16 | cd .. 17 | git clone -b gh-pages "https://$GH_TOKEN@github.com/$ORG/$REPO.git" gh-pages 18 | cd gh-pages 19 | 20 | # Update git configuration so I can push. 21 | if [ "$1" != "dry" ]; then 22 | # Update git config. 23 | git config user.name "Travis Builder" 24 | git config user.email "$EMAIL" 25 | fi 26 | 27 | # Copy in the HTML. You may want to change this with your documentation path. 28 | cp -R ../$REPO/doc/_build/html/* ./ 29 | 30 | # Add and commit changes. 31 | git add -A . 32 | git commit -m "[ci skip] Autodoc commit for $COMMIT." 33 | if [ "$1" != "dry" ]; then 34 | # -q is very important, otherwise you leak your GH_TOKEN 35 | git push -q origin gh-pages 36 | fi 37 | 38 | # Move back into project root. 
39 | cd ../$REPO 40 | -------------------------------------------------------------------------------- /environment-dev.yml: -------------------------------------------------------------------------------- 1 | name: bdranalytics-dev 2 | # This environment is designed to work ON the bdranalytics module: it includes its dependencies, but not bdranalytics itself. 3 | dependencies: 4 | - python=3.6 5 | - cmake 6 | - boost 7 | - pkgconfig 8 | - jupyter 9 | - tensorflow 10 | - conda-forge::xgboost=0.6a2 11 | - pip 12 | - pip: 13 | - -rrequirements-dev.txt 14 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: bdranalytics 2 | # This environment is designed to work WITH the bdranalytics module: it includes the module including its dependencies 3 | dependencies: 4 | - python=3.6 5 | - cmake 6 | - boost 7 | - tensorflow 8 | - keras 9 | - pkgconfig 10 | - jupyter 11 | - conda-forge::xgboost=0.6a2 12 | - pip 13 | - pip: 14 | - -rrequirements.txt 15 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/notebooks/.gitkeep -------------------------------------------------------------------------------- /notebooks/Spark Cross Sell Frequent Pairs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Recommendations based on frequent pairs\n", 8 | "\n", 9 | "This notebook shows how to calculate frequent pairs based on shopping lists, using a scoring function that is relevant for recommendatins." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Generating artificial shopping lists" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 39, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import numpy as np\n", 28 | "from itertools import groupby, permutations, chain, islice\n", 29 | "from operator import itemgetter, add\n", 30 | "\n", 31 | "def get_random_element(x):\n", 32 | " if x is None:\n", 33 | " return -1\n", 34 | " else:\n", 35 | " return x[np.random.randint(len(x))]\n", 36 | "\n", 37 | "def split_every(n, iterable):\n", 38 | " i = iter(iterable)\n", 39 | " piece = list(islice(i, n))\n", 40 | " while piece:\n", 41 | " yield piece\n", 42 | " piece = list(islice(i, n))\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Let's generate some random shopping lists:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 40, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "num_unique_items = 5000\n", 61 | "num_paired_items = 100\n", 62 | "num_bought_items = 250000" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Select some random pairs which are usually bought together" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 41, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "We generate example pairs which will always be bought together. Some examples:\n" 84 | ] 85 | }, 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "[(2048, [2804]), (3086, [1164]), (22, [4816]), (547, [3575]), (41, [2330])]" 90 | ] 91 | }, 92 | "execution_count": 41, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "test_givenpairs = np.random.randint(num_unique_items, size=(num_paired_items, 2))\n", 99 | "test_pair_dict = dict([(k, [itemgetter(1)(f) for f in v]) for k, v in groupby(sorted(test_givenpairs,key=itemgetter(0)), key = itemgetter(0))])\n", 100 | "print \"We generate example pairs which will always be bought together. Some examples:\"\n", 101 | "test_pair_dict.items()[0:5]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "Determine the items that are bought. By running them modulo their index, we create an exponential distribution" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 42, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "test_purchases_bought = [j % (i+1) for i, j in enumerate(np.random.randint(num_unique_items, size=(num_bought_items)))]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "Now map all bought items to a random item which is by our definition bought together. 
If one is available" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 43, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "vectorized_select_paired_items = np.vectorize(test_pair_dict.get)\n", 138 | "vectorized_select_random_item = np.vectorize(get_random_element)\n", 139 | "test_purchases_added = ( \n", 140 | " vectorized_select_random_item( # select one random item from the list of paired items\n", 141 | " vectorized_select_paired_items( # map them to the list of paired items\n", 142 | " test_purchases_bought # the items we bought\n", 143 | " )\n", 144 | " ))\n", 145 | "test_purchases_pairs = np.transpose(np.vstack((test_purchases_bought, test_purchases_added)))\n", 146 | "np.random.shuffle(test_purchases_pairs)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "Given these pairs, we unravel them to create one long list, which is then split into different shopping baskets" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 44, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "Each few items are joined together to form a list. Because some items are removed, lists have variable length. Some examples:\n", 168 | "[[266, 2538, 3794, 4274], [1099, 4111, 4177], [1442, 3249, 4480], [3652, 3769, 4565], [347, 1253, 2298]]\n", 169 | "Total number of generated shopping lists = 83334\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "test_sequence = test_purchases_pairs.ravel()\n", 175 | "test_shopping_baskets = [nonempty for nonempty in [np.unique(a[a>0]).tolist() for a in [np.array(a) for a in list(split_every(6, test_sequence))]] if len(nonempty)>0]\n", 176 | "print \"Each few items are joined together to form a list. Because some items are removed, lists have variable length. Some examples:\"\n", 177 | "print test_shopping_baskets[0:5]\n", 178 | "print \"Total number of generated shopping lists = {}\".format(len(test_shopping_baskets))" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "## Step 0: Loading the data in Spark" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 45, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "There are in total 83334 shopping lists\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "shopping_baskets = sc.parallelize(test_shopping_baskets)\n", 205 | "shopping_baskets_count = shopping_baskets.count()\n", 206 | "print \"There are in total {} shopping lists\".format(shopping_baskets_count)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "## Step 1: Calculating item frequencies" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "Getting item buy frequency.
While the shopping lists do not have to fit in memory, a list of unique items should" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 46, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "def check_uniqueness(t):\n", 232 | " if len(set(t))!=len(t):\n", 233 | " raise ValueError(\"Items in a transaction must be unique but got {}\".format(t))\n", 234 | " return t\n", 235 | " else:\n", 236 | " return t\n", 237 | "item_freq = dict(shopping_baskets.flatMap(check_uniqueness).map(lambda v: (v, 1L)).reduceByKey(add).collect())\n" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 47, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "The most popular item is bought in 126 shopping baskets\n", 252 | "As example, the first few items with their frequencies:\n", 253 | "[(1, 60L), (2, 59L), (3, 53L), (4, 69L), (5, 50L)]\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "print \"The most popular item is bought in {} shopping baskets\".format(max(item_freq.values()))\n", 259 | "print \"As example, the first few items with their frequencies:\"\n", 260 | "print item_freq.items()[0:5]" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "## Step 2: Calculating item pair scores" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "We use the following score:\n", 275 | "$$ score = \\dfrac{\\bigg(\\dfrac{X\\ and\\ Y}{X}\\bigg)}{\\bigg(\\dfrac{(not\\ X)\\ and\\ Y}{not\\ X}\\bigg)}$$" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 48, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "from __future__ import division\n", 287 | "def calculate_score( xy, xy_count):\n", 288 | " \"\"\"\n", 289 | " xy is a tuple of item ids\n", 290 | " xy_count is the observation count\n", 291 | " calculates:\n", 292 | " x and y / x\n", 293 | " / \n", 294 | " not x and y / not x\"\"\"\n", 295 | " x_item, y_item = xy\n", 296 | " x = item_freq[x_item]\n", 297 | " y = item_freq[y_item]\n", 298 | " notx = shopping_baskets_count - x\n", 299 | " x_y = xy_count\n", 300 | " notx_y = y - x_y\n", 301 | " if notx_y==0:\n", 302 | " return (xy, np.Inf)\n", 303 | " else:\n", 304 | " return (xy, (notx/x) * (x_y/notx_y))\n", 305 | " \n", 306 | "def all_pairs(x):\n", 307 | " return list(permutations(x, 2)) # permutations also generates the pairs with _1 and _2 flipped\n", 308 | " \n", 309 | "def as_key_with_value(i):\n", 310 | " def as_key(x):\n", 311 | " return x, i\n", 312 | " return as_key" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 49, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "pairs = shopping_baskets\\\n", 324 | " .flatMap(all_pairs)\\\n", 325 | " .map(as_key_with_value(1))\\\n", 326 | " .reduceByKey(add)\\\n", 327 | " .map(lambda x:calculate_score(*x))\\\n", 328 | " .cache()" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "Now we have the score for every pair of products ever bought" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 50, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 
| "There are in total 513862 pairs of bought products\n", 350 | "The first few pairs with their score:\n" 351 | ] 352 | }, 353 | { 354 | "data": { 355 | "text/plain": [ 356 | "[((3075, 3343), 41.066074950690336),\n", 357 | " ((4171, 535), 41.3125),\n", 358 | " ((2261, 2169), 26.089598997493734),\n", 359 | " ((810, 3728), 42.48877551020408),\n", 360 | " ((4785, 2165), 29.65811965811966),\n", 361 | " ((2635, 4663), 17.135802469135804),\n", 362 | " ((112, 4246), 41.647),\n", 363 | " ((4649, 717), 26.0531914893617),\n", 364 | " ((2514, 2968), 31.7472359893252),\n", 365 | " ((607, 3139), 39.06801125703565)]" 366 | ] 367 | }, 368 | "execution_count": 50, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "pairs_count = pairs.count()\n", 375 | "print \"There are in total {} pairs of bought products\".format(pairs_count)\n", 376 | "print \"The first few pairs with their score:\"\n", 377 | "pairs.take(10)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "## Step 3, Option 1: Selecting pairs based on score threshold" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "As context, let's get the histogram" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 61, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/plain": [ 404 | "([5.683559866129364,\n", 405 | " 250.08873329128113,\n", 406 | " 494.4939067164329,\n", 407 | " 738.8990801415846,\n", 408 | " 983.3042535667364,\n", 409 | " 1227.7094269918882,\n", 410 | " 1472.1146004170398,\n", 411 | " 1716.5197738421916,\n", 412 | " 1960.9249472673434,\n", 413 | " 2205.330120692495,\n", 414 | " 2449.735294117647],\n", 415 | " [513662, 0, 0, 4, 4, 17, 43, 24, 9, 2])" 416 | ] 417 | }, 418 | "execution_count": 61, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "pairs.map(lambda k_v:k_v[1]).filter(lambda score: not np.isinf(score)).histogram(10)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 62, 430 | "metadata": { 431 | "collapsed": false 432 | }, 433 | "outputs": [ 434 | { 435 | "name": "stdout", 436 | "output_type": "stream", 437 | "text": [ 438 | "The number of frequent pairs = 200\n" 439 | ] 440 | } 441 | ], 442 | "source": [ 443 | "frequent_pairs = pairs.filter(lambda k_v:k_v[1]>250).collect()\n", 444 | "print \"The number of frequent pairs = {}\".format(len(frequent_pairs))" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 52, 450 | "metadata": { 451 | "collapsed": false 452 | }, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/plain": [ 457 | "[((4257, 2061), 1321.952380952381),\n", 458 | " ((3627, 479), 930.1998710509349),\n", 459 | " ((2061, 4257), inf),\n", 460 | " ((547, 3575), 1665.88),\n", 461 | " ((2926, 1436), inf)]" 462 | ] 463 | }, 464 | "execution_count": 52, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "frequent_pairs[0:5]" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "## Step 3, Option 2: Selecting top N scoring cross selling items." 
478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "Here we define some helper functions to keep the highest N co occurring items" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 53, 490 | "metadata": { 491 | "collapsed": false 492 | }, 493 | "outputs": [], 494 | "source": [ 495 | "def aggregate_zero():\n", 496 | " return []\n", 497 | "\n", 498 | "def aggregate_seq(n):\n", 499 | " def sequenceadd(seq, item):\n", 500 | " seq.append(item)\n", 501 | " seq.sort(key=lambda x:x[1], reverse=True)\n", 502 | " return seq[0:n]\n", 503 | " return sequenceadd\n", 504 | "\n", 505 | "def aggregate_combine(n):\n", 506 | " def combine(seq1, seq2):\n", 507 | " return sorted(seq1+seq2, key=lambda x:x[1], reverse=True)[0:n]\n", 508 | " return combine" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "Instead of hard thresholds, we can just find the most cross sellable product for each product. Some examples:" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 54, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "item_with_cross_sells = pairs\\\n", 527 | " .map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))\\\n", 528 | " .aggregateByKey(aggregate_zero(), aggregate_seq(5), aggregate_combine(5)).cache()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 55, 534 | "metadata": { 535 | "collapsed": false 536 | }, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "[(2048,\n", 542 | " [(2804, 2191.3947368421054),\n", 543 | " (3589, 53.53455480552877),\n", 544 | " (3468, 44.03648863035431),\n", 545 | " (2428, 44.03648863035431),\n", 546 | " (4481, 42.66034836065574)]),\n", 547 | " (3072,\n", 548 | " [(63, 57.431724137931035),\n", 549 | " (1603, 55.222811671087534),\n", 550 | " (2161, 53.17752234993614),\n", 551 | " (1396, 51.278325123152705),\n", 552 | " (3171, 41.02266009852217)]),\n", 553 | " (8,\n", 554 | " [(4082, 60.69825072886297),\n", 555 | " (897, 41.30853174603175),\n", 556 | " (4982, 38.13095238095238),\n", 557 | " (4459, 38.13095238095238),\n", 558 | " (1525, 36.27090592334495)]),\n", 559 | " (16,\n", 560 | " [(2027, 54.422222222222224),\n", 561 | " (599, 42.224137931034484),\n", 562 | " (1828, 40.81666666666666),\n", 563 | " (1460, 38.265625),\n", 564 | " (3584, 34.98571428571429)]),\n", 565 | " (344,\n", 566 | " [(636, 59.140625),\n", 567 | " (3622, 56.56929347826087),\n", 568 | " (2246, 50.042067307692314),\n", 569 | " (4331, 44.10487288135593),\n", 570 | " (3839, 37.17410714285714)])]" 571 | ] 572 | }, 573 | "execution_count": 55, 574 | "metadata": {}, 575 | "output_type": "execute_result" 576 | } 577 | ], 578 | "source": [ 579 | "item_with_cross_sells.take(5)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "Let's find a perfect pair, one with score infinity" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 56, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [ 596 | { 597 | "data": { 598 | "text/plain": [ 599 | "((2061, 4257), inf)" 600 | ] 601 | }, 602 | "execution_count": 56, 603 | "metadata": {}, 604 | "output_type": "execute_result" 605 | } 606 | ], 607 | "source": [ 608 | "perfect_pair = pairs.filter(lambda x: np.isinf(x[1])).take(1)[0]\n", 609 | "perfect_pair" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | 
"source": [ 616 | "And show with which other items that occurs:" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 57, 622 | "metadata": { 623 | "collapsed": false 624 | }, 625 | "outputs": [ 626 | { 627 | "data": { 628 | "text/plain": [ 629 | "[[(4257, inf),\n", 630 | " (1971, 39.45945945945946),\n", 631 | " (2047, 30.416666666666664),\n", 632 | " (3390, 28.07692307692308),\n", 633 | " (2232, 25.17241379310345)]]" 634 | ] 635 | }, 636 | "execution_count": 57, 637 | "metadata": {}, 638 | "output_type": "execute_result" 639 | } 640 | ], 641 | "source": [ 642 | "item_with_cross_sells.lookup(perfect_pair[0][0])" 643 | ] 644 | } 645 | ], 646 | "metadata": { 647 | "anaconda-cloud": {}, 648 | "kernelspec": { 649 | "display_name": "Python [bdranalytics]", 650 | "language": "python", 651 | "name": "Python [bdranalytics]" 652 | }, 653 | "language_info": { 654 | "codemirror_mode": { 655 | "name": "ipython", 656 | "version": 2 657 | }, 658 | "file_extension": ".py", 659 | "mimetype": "text/x-python", 660 | "name": "python", 661 | "nbconvert_exporter": "python", 662 | "pygments_lexer": "ipython2", 663 | "version": "2.7.12" 664 | } 665 | }, 666 | "nbformat": 4, 667 | "nbformat_minor": 0 668 | } 669 | -------------------------------------------------------------------------------- /notebooks/bdr-imbalanced-classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Classification model\n", 11 | "Here we use machine learning techniques to create and validate a model that can predict the probability of a relatively rare event (imbalanced classes problem)." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "collapsed": false, 19 | "deletable": true, 20 | "editable": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import sys\n", 25 | "sys.path.append('../')\n", 26 | "\n", 27 | "# import generic packages\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "# pd.options.display.max_columns = None\n", 31 | "# pd.options.display.max_colwidth = 100\n", 32 | "from IPython.display import display\n", 33 | "\n", 34 | "# visualization packages\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import matplotlib\n", 37 | "import seaborn as sns\n", 38 | "sns.set(style=\"white\")\n", 39 | "%matplotlib inline\n", 40 | "\n", 41 | "# module loading settings\n", 42 | "%load_ext autoreload\n", 43 | "%autoreload 2" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": false, 51 | "deletable": true, 52 | "editable": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "# load to data frame\n", 57 | "df = pd.read_csv('')\n", 58 | "\n", 59 | "# extract and remove timestamps from data frame\n", 60 | "timestamps = df['timestamp']\n", 61 | "df.drop('timestamp', axis=1, inplace=True)\n", 62 | "\n", 63 | "# determine categoricals\n", 64 | "high_capacity = df.columns.values[~np.array(df.dtypes == np.number)].tolist()\n", 65 | "print \"high capacity categorical feature columns:\"\n", 66 | "print high_capacity\n", 67 | "\n", 68 | "# print some info\n", 69 | "print \"{:d} observations\".format(len(df))\n", 70 | "df.head()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "deletable": true, 77 | "editable": true 78 | }, 79 | "source": [ 80 | "## Model specification\n", 81 | "Here we set some specifications for the model: type, how it should be fitted, optimized and validated." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true, 89 | "deletable": true, 90 | "editable": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "model_type = 'rf' # the classification algorithm\n", 95 | "tune_model = False # optimize hyperparameters\n", 96 | "\n", 97 | "cross_val_method = 'temporal' # cross-validation routine\n", 98 | "\n", 99 | "cost_fp = 1000 # preferably in euros!\n", 100 | "benefit_tp = 3000\n", 101 | "class_weights = {0: cost_fp, 1: benefit_tp} # costs for fn and fp" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "deletable": true, 108 | "editable": true 109 | }, 110 | "source": [ 111 | "## Cross-validation procedure\n", 112 | "To validate whether the model makes sensible predictions, we need to perform cross-validation. The exact procedure for this is specified below. Random cross-validation (set-aside a random sample for testing) is fast, but temporal cross-validation (set-aside a time period for testing) gives the most realistic results due to the resemblence of real-world model usage." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false, 120 | "deletable": true, 121 | "editable": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split\n", 126 | "\n", 127 | "#source: https://github.com/BigDataRepublic/bdr-analytics-py\n", 128 | "#! 
pip install -e git+ssh://git@github.com/BigDataRepublic/bdr-analytics.git#egg=bdranalytics-0.1\n", 129 | "from bdranalytics.pipeline.encoders import WeightOfEvidenceEncoder\n", 130 | "from bdranalytics.model_selection.growingwindow import IntervalGrowingWindow\n", 131 | "\n", 132 | "from sklearn.metrics import average_precision_score, make_scorer, roc_auc_score\n", 133 | "\n", 134 | "if cross_val_method is 'random':\n", 135 | " \n", 136 | " # split train data into stratified random folds\n", 137 | " cv_dev = StratifiedShuffleSplit(test_size=0.1, train_size=0.1, n_splits=5, random_state=1)\n", 138 | " \n", 139 | " cv_test = StratifiedShuffleSplit(test_size=0.33, n_splits=1, random_state=2)\n", 140 | "\n", 141 | "elif cross_val_method is 'temporal':\n", 142 | " \n", 143 | " train_size = pd.Timedelta(days=365 * 4 )\n", 144 | " \n", 145 | " # create a cross-validation routine for parameter tuning\n", 146 | " cv_dev = IntervalGrowingWindow(timestamps=timestamps,\n", 147 | " test_start_date=pd.datetime(year=2015, month=1, day=1),\n", 148 | " test_end_date=pd.datetime(year=2015, month=12, day=31),\n", 149 | " test_size=pd.Timedelta(days=30), \n", 150 | " train_size=train_size)\n", 151 | " \n", 152 | " # create a cross-validation routine for model evaluation\n", 153 | " cv_test = IntervalGrowingWindow(timestamps=timestamps,\n", 154 | " test_start_date=pd.datetime(year=2016, month=1, day=1),\n", 155 | " test_end_date=pd.datetime(year=2016, month=8, day=31),\n", 156 | " test_size=pd.Timedelta(days=2*30),\n", 157 | " train_size=train_size) \n", 158 | "\n", 159 | "# number of parallel jobs for cross-validation\n", 160 | "n_jobs = 1\n", 161 | "\n", 162 | "# two functions for advanced performance evaluation metrics\n", 163 | "def roc_auc(y_true, y_pred):\n", 164 | " return roc_auc_score(pd.get_dummies(y_true), y_pred)\n", 165 | "\n", 166 | "roc_auc_scorer = make_scorer(roc_auc, needs_proba=True)\n", 167 | "\n", 168 | "def pr_auc(y_true, y_pred):\n", 169 | " return average_precision_score(pd.get_dummies(y_true), y_pred, average=\"micro\")\n", 170 | "\n", 171 | "pr_auc_scorer = make_scorer(pr_auc, needs_proba=True)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false, 179 | "deletable": true, 180 | "editable": true 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "from sklearn.preprocessing import StandardScaler, Imputer\n", 185 | "\n", 186 | "from sklearn.pipeline import Pipeline\n", 187 | "\n", 188 | "from sklearn.linear_model import LogisticRegression\n", 189 | "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", 190 | "from sklearn.dummy import DummyClassifier\n", 191 | "from xgboost import XGBClassifier\n", 192 | "\n", 193 | "# convert date frame to bare X and y variables for the model pipeline\n", 194 | "y_col = 'target'\n", 195 | "X = df.copy().drop(y_col, axis=1)\n", 196 | "y = np.array(df[y_col])\n", 197 | "n_features = X.shape[1]\n", 198 | "\n", 199 | "# define preprocessing steps\n", 200 | "preproc_steps = [('woe', WeightOfEvidenceEncoder(cols=high_capacity)),\n", 201 | " ('imputer', Imputer(missing_values='NaN', strategy='median', axis=0)),\n", 202 | " ('standardizer', StandardScaler(with_mean=True, with_std=True))]\n", 203 | "\n", 204 | "# specification of different model types and their defaults\n", 205 | "model_steps_dict = {'lr': [('lr', LogisticRegression(C=0.001, penalty='l2', tol=0.01,\n", 206 | " class_weight=class_weights))],\n", 207 | " 'rf': [('rf', 
RandomForestClassifier(n_estimators=400, max_features='auto',\n", 208 | " class_weight=class_weights))],\n", 209 | " 'gbc': [('gbc', GradientBoostingClassifier(n_estimators=400, max_depth=3))],\n", 210 | " 'xgb': [('xgb', XGBClassifier(scale_pos_weight=class_weights[1],\n", 211 | " n_estimators=100, max_depth=4))],\n", 212 | " 'dummy': [('dummy', DummyClassifier(strategy='prior'))]\n", 213 | " }\n", 214 | "\n", 215 | "# specification of the different model hyperparameters and tuning space\n", 216 | "model_params_grid = {'lr': {'lr__C': [1e-4, 1e-3, 1e-2, 1e-1]},\n", 217 | " 'rf': {'rf__max_features': [3, n_features, np.sqrt(n_features)],\n", 218 | " 'rf__n_estimators': [10, 100, 1000]},\n", 219 | " 'gbc': {'gbc__n_estimators': [100, 200]},\n", 220 | " 'xgb': {'xgb__max_depth': [3,6,9],\n", 221 | " 'xgb__reg_alpha': [0,5,15],\n", 222 | " 'xgb__reg_lambda': [0,5,15],\n", 223 | " 'xgb__gamma' : [0,10,50,100]},\n", 224 | " 'dummy': {}}\n", 225 | "\n", 226 | "# store the model step\n", 227 | "model_steps = model_steps_dict[model_type]\n", 228 | "\n", 229 | "# combine everything in one pipeline\n", 230 | "estimator = Pipeline(steps=(preproc_steps + model_steps))\n", 231 | "print estimator" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "deletable": true, 238 | "editable": true 239 | }, 240 | "source": [ 241 | "## Model parameter tuning\n", 242 | "If desired, we can optimize the model hyperparameters to get the best possible model." 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false, 250 | "deletable": true, 251 | "editable": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "# procedure depends on cross-validation type\n", 256 | "if cross_val_method is 'random': \n", 257 | " train_index = next(cv_test.split(X, y))[0]\n", 258 | " X_dev = X.iloc[train_index,:]\n", 259 | " y_dev = y[train_index]\n", 260 | "elif cross_val_method is 'temporal':\n", 261 | " X_dev = X\n", 262 | " y_dev = y\n", 263 | "\n", 264 | "# setting to include class weights in the gradient boosting model\n", 265 | "if model_type is 'gbc':\n", 266 | " sample_weights = np.array(map(lambda x: class_weights[x], y_dev))\n", 267 | " fit_params = {'gbc__sample_weight': sample_weights}\n", 268 | "else: \n", 269 | " fit_params = {}\n", 270 | "\n", 271 | "# tune model with a parameter grid search if desired\n", 272 | "if tune_model:\n", 273 | " \n", 274 | " grid_search = GridSearchCV(estimator, cv=cv_dev, n_jobs=n_jobs, refit=False,\n", 275 | " param_grid=model_params_grid[model_type],\n", 276 | " scoring=pr_auc_scorer, fit_params=fit_params)\n", 277 | "\n", 278 | " grid_search.fit(X_dev, y_dev)\n", 279 | " \n", 280 | " # show grid search results\n", 281 | " display(pd.DataFrame(grid_search.cv_results_))\n", 282 | " \n", 283 | " # set best parameters for estimator\n", 284 | " estimator.set_params(**grid_search.best_params_)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "deletable": true, 291 | "editable": true 292 | }, 293 | "source": [ 294 | "## Model validation\n", 295 | "The final test on the holdout." 
296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": false, 303 | "deletable": true, 304 | "editable": true, 305 | "scrolled": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "y_pred_proba = [] # initialize empty predictions array\n", 310 | "y_true = [] # initialize empty ground-truth array\n", 311 | "\n", 312 | "# loop over the test folds\n", 313 | "for i_fold, (train_index, test_index) in enumerate(cv_test.split(X, y)):\n", 314 | " \n", 315 | " print \"validation fold {:d}\".format(i_fold)\n", 316 | " \n", 317 | " X_train = X.iloc[train_index,:]\n", 318 | " y_train = y[train_index]\n", 319 | " \n", 320 | " X_test = X.iloc[test_index,:]\n", 321 | " y_test = y[test_index]\n", 322 | " \n", 323 | " if model_type is 'gbc':\n", 324 | " sample_weights = map(lambda x: class_weights[x], y_train)\n", 325 | " fit_params = {'gbc__sample_weight': sample_weights}\n", 326 | " else: \n", 327 | " fit_params = {}\n", 328 | " \n", 329 | " # fit the model\n", 330 | " estimator.fit(X_train, y_train, **fit_params)\n", 331 | "\n", 332 | " # probability outputs for class 1\n", 333 | " y_pred_proba.append(map(lambda x: x[1], estimator.predict_proba(X_test)))\n", 334 | " \n", 335 | " # store the true y labels for each fold\n", 336 | " y_true.append(np.array(y_test))\n", 337 | "\n", 338 | "# postprocess the results\n", 339 | "y_true = np.concatenate(y_true)\n", 340 | "y_pred_proba = np.concatenate(y_pred_proba) \n", 341 | "y_pred_bin = (y_pred_proba > 0.5) * 1.\n", 342 | "\n", 343 | "# print some stats\n", 344 | "n_samples_test = len(y_true)\n", 345 | "n_pos_test = sum(y_true)\n", 346 | "n_neg_test = n_samples_test - n_pos_test\n", 347 | "print \"events: {}\".format(n_pos_test)\n", 348 | "print \"p_no_event: {}\".format(n_neg_test / n_samples_test)\n", 349 | "print \"test accuracy: {}\".format((np.equal(y_pred_bin, y_true) * 1.).mean())" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": { 355 | "deletable": true, 356 | "editable": true 357 | }, 358 | "source": [ 359 | "### Receiver-operator characteristics\n", 360 | "Line is constructed by applying various threshold to the model output. \n", 361 | "Y-axis: proportion of events correctly identified, hit-rate \n", 362 | "X-axis: proportion of false positives, usually results in waste of resources \n", 363 | "Dotted line is guessing (no model). Blue line above the dotted line means there is information in the features." 
364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "collapsed": false, 371 | "deletable": true, 372 | "editable": true 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "from sklearn.metrics import roc_curve, auc\n", 377 | "\n", 378 | "fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba, pos_label=1)\n", 379 | "roc_auc = auc(fpr, tpr)\n", 380 | " \n", 381 | "# plot ROC curve\n", 382 | "plt.figure()\n", 383 | "plt.plot(fpr, tpr, label=\"ROC curve (area = {:.2f})\".format(roc_auc))\n", 384 | "plt.plot([0, 1], [0, 1], 'k--')\n", 385 | "plt.xlim([0.0, 1.0])\n", 386 | "plt.ylim([0.0, 1.0])\n", 387 | "plt.xlabel('False positive rate')\n", 388 | "plt.ylabel('True positive rate')\n", 389 | "plt.title('Receiver-operating characteristic')\n", 390 | "plt.legend(loc=\"lower right\")\n", 391 | "plt.show()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "deletable": true, 398 | "editable": true 399 | }, 400 | "source": [ 401 | "## Costs and benefits\n", 402 | "ROC optimization with cost matrix. Critical information: cost of FP and cost of FN (i.e. benefit of TP). Also used to train the model with `class_weights`." 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": { 409 | "collapsed": false, 410 | "deletable": true, 411 | "editable": true 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "def benefit(tpr, fpr):\n", 416 | "\n", 417 | " n_tp = tpr * n_pos_test # number of true positives (benefits)\n", 418 | " n_fp = fpr * n_neg_test # number of false positives (extra costs)\n", 419 | " \n", 420 | " fp_costs = n_fp * cost_fp\n", 421 | " tp_benefits = n_tp * benefit_tp\n", 422 | " \n", 423 | " return tp_benefits - fp_costs\n", 424 | "\n", 425 | "benefits = np.zeros_like(thresholds)\n", 426 | "for i, _ in enumerate(thresholds):\n", 427 | " benefits[i] = benefit(tpr[i], fpr[i])\n", 428 | "\n", 429 | "i_max = np.argmax(benefits)\n", 430 | "print (\"max benefits: {:.0f}k euros, tpr: {:.3f}, fpr: {:.3f}, threshold: {:.3f}\"\n", 431 | " .format(benefits[i_max]/ 1e3, benefits[i_max]/ 1e3 / 8, tpr[i_max], fpr[i_max], thresholds[i_max]))\n", 432 | "\n", 433 | "plt.plot(thresholds, benefits)\n", 434 | "plt.xlim([0,1])\n", 435 | "plt.ylim([0,np.max(benefits)])\n", 436 | "plt.show()" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "collapsed": false, 444 | "deletable": true, 445 | "editable": true 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "# recalibrate threshold based on benefits (optional, should still be around 0.5)\n", 450 | "y_pred_bin = (y_pred_proba > thresholds[i_max]) * 1." 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": { 456 | "deletable": true, 457 | "editable": true 458 | }, 459 | "source": [ 460 | "### Precision-recall curve\n", 461 | "Another way to look at it. Note that models which perform well in PR-space are necessarily also dominating ROC-space. The opposite is not the case! Line is constructed by applying various threshold to the model output. 
\n", 462 | "Y-axis: proportion of events among all positives (precision) \n", 463 | "X-axis: proportion of events correctly identified (recall, hit rate)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "collapsed": false, 471 | "deletable": true, 472 | "editable": true 473 | }, 474 | "outputs": [], 475 | "source": [ 476 | "from sklearn.metrics import precision_recall_curve\n", 477 | "\n", 478 | "precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba, pos_label=1)\n", 479 | "\n", 480 | "average_precision = average_precision_score(y_true, y_pred_proba, average=\"micro\")\n", 481 | "\n", 482 | "baseline = n_pos_test / n_samples_test\n", 483 | "\n", 484 | "# plot PR curve\n", 485 | "plt.figure()\n", 486 | "plt.plot(recall, precision, label=\"PR curve (area = {:.2f})\".format(average_precision))\n", 487 | "plt.plot([0, 1], [baseline, baseline], 'k--')\n", 488 | "plt.xlim([0.0, 1.0])\n", 489 | "plt.ylim([0.0, 1.0])\n", 490 | "plt.xlabel('Recall')\n", 491 | "plt.ylabel('Precision')\n", 492 | "plt.title('Precision-recall curve')\n", 493 | "plt.legend(loc=\"lower right\")\n", 494 | "plt.show()\n", 495 | "\n", 496 | "if model_type is 'dummy':\n", 497 | " print 'DummyClassifier only has endpoints in PR-curve'" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "deletable": true, 504 | "editable": true 505 | }, 506 | "source": [ 507 | "### Classification report" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": { 514 | "collapsed": false, 515 | "deletable": true, 516 | "editable": true 517 | }, 518 | "outputs": [], 519 | "source": [ 520 | "from sklearn.metrics import classification_report\n", 521 | "\n", 522 | "target_names = ['no event','event']\n", 523 | "\n", 524 | "print classification_report(y_true, y_pred_bin, target_names=target_names, digits=3)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": { 530 | "deletable": true, 531 | "editable": true 532 | }, 533 | "source": [ 534 | "### Confusion matrix" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": { 541 | "collapsed": false, 542 | "deletable": true, 543 | "editable": true 544 | }, 545 | "outputs": [], 546 | "source": [ 547 | "from sklearn.metrics import confusion_matrix\n", 548 | "\n", 549 | "confusion = pd.DataFrame(confusion_matrix(y_true, y_pred_bin), index=target_names, columns=target_names)\n", 550 | "sns.heatmap(confusion, annot=True, fmt=\"d\")\n", 551 | "plt.xlabel('predicted label')\n", 552 | "plt.ylabel('true label')" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": { 558 | "deletable": true, 559 | "editable": true 560 | }, 561 | "source": [ 562 | "### Accuracies at different classifier thresholds" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": { 569 | "collapsed": false, 570 | "deletable": true, 571 | "editable": true 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "from sklearn.metrics import accuracy_score\n", 576 | "\n", 577 | "thresholds = (np.arange(0,100,1) / 100.)\n", 578 | "acc = map(lambda thresh: accuracy_score(y_true, map(lambda prob: prob > thresh, y_pred_proba)), thresholds)\n", 579 | "plt.hist(acc, bins=20);" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": { 585 | "deletable": true, 586 | "editable": true 587 | }, 588 | "source": [ 589 | "### Thresholds versus accuracy" 590 
| ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": { 596 | "collapsed": false, 597 | "deletable": true, 598 | "editable": true 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "plt.plot(thresholds, acc);" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": { 608 | "deletable": true, 609 | "editable": true 610 | }, 611 | "source": [ 612 | "### Feature importance\n", 613 | "Note that these models are optimized to make accurate predictions, and **not** to make solid statistical inferences." 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": { 620 | "collapsed": false, 621 | "deletable": true, 622 | "editable": true 623 | }, 624 | "outputs": [], 625 | "source": [ 626 | "feature_labels = filter(lambda k: y_col not in k, df.columns.values) \n", 627 | "\n", 628 | "if model_type is 'lr':\n", 629 | " weights = estimator._final_estimator.coef_[0]\n", 630 | "elif model_type in ['rf','gbc']:\n", 631 | " weights = estimator._final_estimator.feature_importances_\n", 632 | "elif model_type is 'dummy':\n", 633 | " print 'DummyClassifier does not have weights'\n", 634 | " weights = np.zeros(len(feature_labels))\n", 635 | " \n", 636 | "feature_weights = pd.Series(weights, index=feature_labels)\n", 637 | "feature_weights.plot.barh(title='Feature importance', fontsize=8, figsize=(12,30), grid=True);" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": { 644 | "collapsed": false, 645 | "deletable": true, 646 | "editable": true 647 | }, 648 | "outputs": [], 649 | "source": [ 650 | "from sklearn.ensemble.partial_dependence import plot_partial_dependence\n", 651 | "\n", 652 | "if model_type is 'gbc':\n", 653 | " preproc_pipe = Pipeline(steps=preproc_steps)\n", 654 | " X_transformed = preproc_pipe.fit_transform(X_dev, y_dev)\n", 655 | "\n", 656 | " plot_partial_dependence(estimator._final_estimator, X_transformed,\n", 657 | " features=range(n_features), feature_names=feature_labels,\n", 658 | " figsize=(12,180), n_cols=4, percentiles=(0.2,0.8));\n", 659 | "else:\n", 660 | " print \"No partial dependence plots available for this model type.\"" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": { 667 | "collapsed": true, 668 | "deletable": true, 669 | "editable": true 670 | }, 671 | "outputs": [], 672 | "source": [] 673 | } 674 | ], 675 | "metadata": { 676 | "kernelspec": { 677 | "display_name": "Python [Root]", 678 | "language": "python", 679 | "name": "Python [Root]" 680 | }, 681 | "language_info": { 682 | "codemirror_mode": { 683 | "name": "ipython", 684 | "version": 2 685 | }, 686 | "file_extension": ".py", 687 | "mimetype": "text/x-python", 688 | "name": "python", 689 | "nbconvert_exporter": "python", 690 | "pygments_lexer": "ipython2", 691 | "version": "2.7.13" 692 | } 693 | }, 694 | "nbformat": 4, 695 | "nbformat_minor": 1 696 | } 697 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # This environment is designed to work ON the bdranalytics module: it includes its dependencies, but not bdranalytics itself. 
2 | # This allows you to easily set up an environment in which you work on / contribute to the bdranalytics module 3 | # It therefore also includes the test tools 4 | NumPy>=1.6.1 5 | SciPy>=0.9 6 | scikit-learn>=0.18 7 | pandas 8 | keras 9 | matplotlib 10 | pytest 11 | pytest-runner 12 | sphinx 13 | sphinx_rtd_theme 14 | recommonmark 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This environment is designed to work WITH the bdranalytics module: it includes the module itself as well as its dependencies 2 | # Use this file if you want to install the module with pip instead of building it straight from setup.py 3 | # By including the module here, the module and all of its dependencies will be installed 4 | # This makes it easy to include the module in your own environment 5 | # We use the --editable flag so that you can reinstall the package after you have modified it 6 | -e . -------------------------------------------------------------------------------- /sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/sample/__init__.py -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='bdranalytics', 5 | version='0.3', 6 | license='Apache License 2.0', 7 | author='bigdatarepublic', 8 | author_email='info@bigdatarepublic.nl', 9 | url='http://www.bigdatarepublic.nl', 10 | long_description="README.md", 11 | packages=['bdranalytics', 12 | 'bdranalytics.images', 13 | 'bdranalytics.keras', 14 | 'bdranalytics.pdlearn', 15 | 'bdranalytics.plot', 16 | 'bdranalytics.sklearn'], 17 | include_package_data=True, 18 | package_data={'bdranalytics': ['data/*.dat'], 19 | 'bdranalytics.images': ['bdr.gif']}, 20 | description="Making data science workflows easier.", 21 | python_requires='>3.5', 22 | install_requires=[ 23 | "NumPy>=1.6.1", 24 | "SciPy>=0.9", 25 | "scikit-learn>=0.18", 26 | "keras", 27 | "pandas", 28 | "matplotlib", 29 | ], 30 | setup_requires=["pytest-runner"], 31 | tests_require=["pytest"] 32 | ) 33 | --------------------------------------------------------------------------------