├── .gitignore ├── .travis.yml ├── LICENSE.md ├── MANIFEST.in ├── README.md ├── bdranalytics ├── __init__.py ├── images │ ├── __init__.py │ └── bdr.gif ├── keras │ ├── __init__.py │ ├── callbacks.py │ ├── generators.py │ ├── layers.py │ └── tests │ │ ├── __init__.py │ │ └── test_generators.py ├── pdlearn │ ├── __init__.py │ ├── pipeline.py │ ├── preprocessing.py │ └── tests │ │ ├── __init__.py │ │ ├── test_pipeline.py │ │ └── test_preprocessing.py ├── plot │ ├── __init__.py │ ├── classification.py │ └── tests │ │ └── __init__.py └── sklearn │ ├── __init__.py │ ├── encoders.py │ ├── model_selection.py │ ├── preprocessing │ ├── __init__.py │ ├── encoding.py │ ├── preprocessing.py │ ├── scaling.py │ └── tests │ │ ├── __init__.py │ │ ├── test_encoding.py │ │ └── test_scaling.py │ └── tests │ ├── __init__.py │ └── test_model_selection.py ├── data ├── recruit.dat ├── soi.dat ├── soi_description.txt └── test.dat ├── doc ├── Makefile ├── conf.py ├── index.rst └── push_to_pages.sh ├── environment-dev.yml ├── environment.yml ├── notebooks ├── .gitkeep ├── Spark Cross Sell Frequent Pairs.ipynb ├── bdr-imbalanced-classification.ipynb ├── bdr-regression.ipynb ├── bdr-timeseries-classic-approach.ipynb └── bdr-timeseries-neuralnetwork-lstm.ipynb ├── requirements-dev.txt ├── requirements.txt ├── sample └── __init__.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Documentation 2 | doc/_build 3 | doc/source 4 | 5 | # Python / Anaconda 6 | *.egg-info 7 | .eggs 8 | *.iml 9 | *.pyc 10 | 11 | # IntelliJ / PyCharm 12 | /.idea 13 | /build 14 | /src 15 | /dist 16 | 17 | # Jupyter notebook 18 | .ipynb_checkpoints 19 | 20 | # Joblib 21 | .cache 22 | 23 | # OS X 24 | .DS_Store 25 | 26 | # Tests 27 | .pytest_cache/ 28 | 29 | # Tensorflow 30 | tensorboardlog 31 | notebooks/logs/ 32 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - '3.6' 4 | before_install: 5 | - sudo apt-get update 6 | - sudo apt-get install -qq python-numpy python-scipy 7 | install: pip install -r requirements-dev.txt 8 | script: cd doc && make html && cd .. 9 | after_success: 10 | - "doc/push_to_pages.sh" 11 | deploy: 12 | provider: pypi 13 | user: bigdatarepublic 14 | password: 15 | secure: c8287YylQ5wz4WJMk6DrOlstCuDbao05LpKDF2mMzTRXjwOxNGatexq04frciizuupUkKP1ui9JdOYOEgFI0pYFffmXuqmiXkIK9MG9U4AJV3CH9URV81VmgGgsN1rbHcFGKvOrxelDSY6TqQYMaHt5JCGFcPUxpvkE76KAssmzL9wYavqDuHRKhjTcYiNqw22u7V1i3Cp/7zFzBHYz0BlkzzbkWPuphMnxiTsz+HE9bTqa7Jwj1pduyAuwdDSkAVcGUSjc3GZifzY5rD77vOuovgCCtD0aY9hj3YHV4oY6+4ErkbpJkuF9urkMQV1FPgoOb7YrjxBzXTnXZWzx2E4sXlzQYgXsLF5bI+5+qPwQXtInarFSH/QGyRhTppg9RT2ItX2rri+Do3biEunRtxd3pZSaZP7I2fSOYNxnvrcx8/qC1s8X8P6lLEKkb6hH3aRMeO8L2e/X0vu/4nVua09yW1e0QZzJN6GYUe8N+vA4OQNlL6NW2Mx6GvrKSlUI0l2jAVjvJk6N9HdGEjQeKzGqtqC7FHsg49sBzZRkqmVpV+UbJWHQ9B43jtg0dFE0lf/F4VdUMoD1A6GjZGMatV5NP5/Jb1gCWsU9vyPPxcDVa/N6GxAikjHyMIJM84NzNFjFJZOHK9aHOqz9B0tqS2ZfICbrRm4eRb8FgsqXcJaI= 16 | on: 17 | branch: master 18 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2016 BigData Republic 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include bdranalytics/images/bdr.gif -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![PyPI version](https://badge.fury.io/py/bdranalytics.svg)](https://badge.fury.io/py/bdranalytics) 2 | [![Build Status](https://travis-ci.org/BigDataRepublic/bdr-analytics-py.svg?branch=master)](https://travis-ci.org/BigDataRepublic/bdr-analytics-py) 3 | 4 | # BigData Republic Analytics (Python) 5 | Our analytics library to quickly get our data scientists up to speed on the Python platform. 6 | 7 | User documentation can be found at https://bigdatarepublic.github.io/bdr-analytics-py/ 8 | 9 | ## Installation 10 | 11 | Installation is done through the pip command-line utility. 12 | 13 | ``` 14 | pip install bdranalytics 15 | ``` 16 | 17 | ## Using the Spark notebooks 18 | Some notebooks in the `notebooks` folder use Spark. Check the [Spark documentation](http://spark.apache.org/docs/2.0.1/programming-guide.html) for running Jupyter with a Spark context. 19 | 20 | In short, for **Windows**: 21 | ``` 22 | set PYSPARK_DRIVER_PYTHON_OPTS=notebook 23 | set PYSPARK_DRIVER_PYTHON=jupyter 24 | [spark_install_dir]\bin\pyspark 25 | ``` 26 | 27 | And for **\*nix**: 28 | ``` 29 | export PYSPARK_DRIVER_PYTHON_OPTS=notebook 30 | export PYSPARK_DRIVER_PYTHON=jupyter 31 | [spark_install_dir]/bin/pyspark 32 | ``` 33 | 34 | ## Contributing 35 | To contribute, please fork or branch from `master` and submit a pull request. 36 | Guidelines for an acceptable pull request: 37 | 38 | - PEP8-compliant code. 39 | - At least one line of documentation per class, function and method. 40 | - Tests covering edge cases of your code (see the example below).
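For example, to run only the tests for the subpackage you are changing (this assumes `pytest` is available in your development environment, as it is already used by the existing test modules):

```
pytest bdranalytics/pdlearn/tests
```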
41 | 42 | ### Development environment 43 | To create the development environment with conda, run: 44 | 45 | > conda env create -f environment-dev.yml 46 | 47 | > source activate bdranalytics-dev 48 | 49 | ### Running the tests 50 | 51 | To run all tests: 52 | > source activate bdranalytics-dev 53 | > python setup.py test 54 | 55 | ### Creating a package dist 56 | 57 | To create a dist from a local checkout (when developing on this module): 58 | > source activate bdranalytics-dev 59 | > python setup.py sdist 60 | 61 | ### Running the installation script 62 | This uses the setup.py script directly, which is useful for testing how the dist will be installed without actually creating it. 63 | 64 | To just install the package and its main dependencies from a local checkout (when going to use this module): 65 | > python setup.py install 66 | 67 | ### Creating the Sphinx documentation 68 | 69 | To update the HTML files: 70 | ``` 71 | source activate bdranalytics-dev 72 | cd doc 73 | make clean && make source && make html 74 | ``` 75 | -------------------------------------------------------------------------------- /bdranalytics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/__init__.py -------------------------------------------------------------------------------- /bdranalytics/images/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/images/__init__.py -------------------------------------------------------------------------------- /bdranalytics/images/bdr.gif: -------------------------------------------------------------------------------- 1 | TODO: our logo -------------------------------------------------------------------------------- /bdranalytics/keras/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/keras/__init__.py -------------------------------------------------------------------------------- /bdranalytics/keras/callbacks.py: -------------------------------------------------------------------------------- 1 | from keras.callbacks import Callback 2 | from keras.models import Sequential 3 | from sklearn.metrics import roc_auc_score, average_precision_score, \ 4 | precision_score, recall_score 5 | 6 | 7 | class EpochEvaluation(Callback):  # scores a validation set (ROC AUC, PR AUC, recall, precision) after every epoch 8 | def __init__(self, validation_data=()): 9 | super(EpochEvaluation, self).__init__() 10 | self.X_val, self.y_val = validation_data 11 | self.metrics = {} 12 | 13 | def on_epoch_begin(self, epoch, logs={}): 14 | if epoch > 0: 15 | print(" - ".join(["val_{:s}: {:.4f}".format(k, v) 16 | for k, v in self.metrics.items()])) 17 | 18 | def on_epoch_end(self, epoch, logs={}): 19 | if isinstance(self.model, Sequential): 20 | predict = self.model.predict_proba 21 | else: 22 | predict = self.model.predict 23 | 24 | y_pred = predict(self.X_val, verbose=0) 25 | y_pred_bin = y_pred > 0.5 26 | 27 | y_true = self.y_val 28 | self.metrics['roc_auc'] = roc_auc_score(y_true, y_pred) 29 | self.metrics['pr_auc'] = average_precision_score( 30 | y_true, y_pred, average="micro") 31 | self.metrics['recall'] = recall_score(y_true, y_pred_bin) 32 | self.metrics['precision'] = precision_score(y_true, y_pred_bin) 33 |
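# Usage sketch (illustrative addition, not part of the original file): how the
# EpochEvaluation callback might be attached to a Keras model. The toy model
# and the random data below are hypothetical placeholders.
if __name__ == '__main__':
    import numpy as np
    from keras.layers import Dense

    X_train, y_train = np.random.rand(200, 10), (np.random.rand(200, 1) > 0.5) * 1.
    X_val, y_val = np.random.rand(50, 10), (np.random.rand(50, 1) > 0.5) * 1.

    model = Sequential()
    model.add(Dense(1, activation='sigmoid', input_shape=(10,)))
    model.compile(loss='binary_crossentropy', optimizer='adam')

    # the callback scores the validation set at the end of each epoch and
    # prints the collected metrics at the start of the next epoch
    model.fit(X_train, y_train, epochs=3, verbose=0,
              callbacks=[EpochEvaluation(validation_data=(X_val, y_val))])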
-------------------------------------------------------------------------------- /bdranalytics/keras/generators.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from keras.preprocessing.image import Iterator 3 | 4 | 5 | class StratifiedIndexGenerator: 6 | """ 7 | Stratified index generator. 8 | """ 9 | 10 | def __init__(self, shuffle=True): 11 | self.shuffle = shuffle 12 | 13 | def flow(self, strata=None, batch_size=128, strata_weights=None): 14 | """ 15 | Batch generator function. 16 | 17 | :param strata: vector with n_samples that denotes the strata 18 | :param batch_size: number of samples in a batch 19 | :param strata_weights: dictionary with strata weights, should sum to 1.0 20 | :return: an iterator that yields sample indices 21 | """ 22 | 23 | strata = strata.ravel() 24 | strata_labels = np.unique(strata) 25 | n_strata = len(strata_labels) 26 | n_samples_total = len(strata) # total number of samples 27 | indices = np.arange(n_samples_total) 28 | 29 | generators = {} 30 | surplus = {} 31 | 32 | if strata_weights is None: 33 | strata_weights = {} 34 | 35 | for stratum_label in strata_labels: 36 | mask = (strata == stratum_label) 37 | generators[stratum_label] = self.sample_generator(indices[mask]) 38 | surplus[stratum_label] = 0. 39 | if strata_weights.get(stratum_label) is None: 40 | strata_weights[stratum_label] = 1 / n_strata 41 | 42 | if sum(strata_weights.values()) != 1.0: 43 | raise ValueError("strata weights should sum to 1.0") 44 | 45 | # preallocate 46 | return_indices = np.empty(batch_size, dtype=int) 47 | 48 | while True: 49 | 50 | # shift labels array to make sure every label is last label equal amount of times 51 | # better dispersed surplus 52 | strata_labels = np.roll(strata_labels, 1) 53 | 54 | # reset total sample counter 55 | i_sample = 0 56 | for stratum_label in strata_labels: 57 | 58 | # float indicating number of samples to draw from this stratum 59 | n_samples_float = ( 60 | batch_size * strata_weights[stratum_label]) - surplus[stratum_label] 61 | 62 | # exception when reaching last stratum 63 | if stratum_label == strata_labels[-1]: 64 | n_samples = batch_size - i_sample 65 | else: 66 | n_samples = round(n_samples_float) 67 | 68 | # store remainder 69 | surplus[stratum_label] = (1. * n_samples) - n_samples_float 70 | 71 | if n_samples == 0: 72 | continue 73 | 74 | # draw samples from generator 75 | for _ in range(n_samples): 76 | return_indices[i_sample] = next(generators[stratum_label]) 77 | i_sample += 1 # increment total sample counter 78 | 79 | # yield result 80 | yield return_indices 81 | 82 | def sample_generator(self, indices): 83 | """ 84 | Basic single element generator from a list, in shuffled order. 85 | 86 | :param indices: list of indices to yield 87 | :return: a generator 88 | """ 89 | while True: 90 | if self.shuffle: 91 | indices = np.random.permutation(indices) 92 | for selected_row in indices: 93 | yield selected_row 94 | 95 | 96 | class DataGenerator: 97 | """ 98 | Keras-API compatible data generator class for in-memory (X, y) samples. 99 | Comparable to keras.preprocessing.image.ImageDataGenerator 100 | """ 101 | 102 | def __init__(self): 103 | pass 104 | 105 | def flow(self, X, y, batch_size=128, seed=42, shuffle=True, strata=None, strata_weights=None): 106 | """ 107 | Returns a data iterator that can be looped over to return batches. 
108 | 109 | :param X: array-like, input data 110 | :param y: array-like, target data 111 | :param batch_size: int, number of samples in the batch 112 | :param seed: int, seed for randomness, set globally 113 | :param shuffle: bool, whether to shuffle the dataset 114 | :param strata: array-like, size n_samples that denotes the subpopulation (stratum) ID, which 115 | is sampled independently. 116 | :param strata_weights: dictionary, containing strata weights, should sum to 1.0 117 | :return: an iterator 118 | """ 119 | return DataIterator(X, y, batch_size=batch_size, n=X.shape[0], seed=seed, shuffle=shuffle, 120 | strata=strata, strata_weights=strata_weights) 121 | 122 | 123 | class DataIterator(Iterator): 124 | """ 125 | Data iterator stratification capability. Keras-API compatible. 126 | Comparable to keras.preprocessing.image.NumpyDataIterator 127 | """ 128 | 129 | def __init__(self, X, y, strata=None, strata_weights=None, batch_size=128, shuffle=True, **kwargs): 130 | self.X = X 131 | self.y = y 132 | self.strata = strata 133 | self.strata_weights = strata_weights 134 | 135 | super(DataIterator, self).__init__( 136 | batch_size=batch_size, shuffle=shuffle, **kwargs) 137 | 138 | if self.strata is not None: 139 | self.index_generator = StratifiedIndexGenerator(shuffle=shuffle).flow(batch_size=batch_size, 140 | strata=self.strata, 141 | strata_weights=self.strata_weights) 142 | 143 | def _get_batches_of_transformed_samples(self, index_array): 144 | return self.X[index_array, ], self.y[index_array, ] 145 | 146 | def next(self): 147 | with self.lock: 148 | index_array = next(self.index_generator) 149 | return self._get_batches_of_transformed_samples(index_array) 150 | -------------------------------------------------------------------------------- /bdranalytics/keras/layers.py: -------------------------------------------------------------------------------- 1 | from keras.engine import Layer, InputSpec 2 | from keras.layers import Flatten 3 | import tensorflow as tf 4 | 5 | 6 | class KMaxPooling(Layer): 7 | """ 8 | K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension). 9 | TensorFlow backend. 
10 | """ 11 | 12 | def __init__(self, k=1, **kwargs): 13 | super().__init__(**kwargs) 14 | self.input_spec = InputSpec(ndim=3) 15 | self.k = k 16 | 17 | def compute_output_shape(self, input_shape): 18 | return input_shape[0], (input_shape[2] * self.k) 19 | 20 | def call(self, inputs, **kwargs): 21 | # swap last two dimensions since top_k will be applied along the last dimension 22 | shifted_input = tf.transpose(inputs, [0, 2, 1]) 23 | 24 | # extract top_k, returns two tensors [values, indices] 25 | top_k = tf.nn.top_k(shifted_input, k=self.k, sorted=True, name=None)[0] 26 | 27 | # return flattened output 28 | return Flatten()(top_k) 29 | -------------------------------------------------------------------------------- /bdranalytics/keras/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/keras/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/keras/tests/test_generators.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from bdranalytics.keras.generators import * 3 | import numpy as np 4 | from keras.models import Sequential 5 | from keras.layers import Dense 6 | from keras.preprocessing.image import ImageDataGenerator 7 | 8 | 9 | @pytest.yield_fixture(scope='class') 10 | def params(request): 11 | request.cls.batch_size = 3 12 | request.cls.X = np.random.rand(100, 10) 13 | request.cls.X_image = np.random.rand(100, 1, 1, 1) 14 | request.cls.y = 1. * (np.random.rand(100, 1) > 0.5) 15 | request.cls.strata_weights = {1.0: 0.2, 0.0: 0.8} 16 | yield 17 | 18 | 19 | @pytest.mark.usefixtures("params") 20 | class TestGenerators: 21 | 22 | def test_stratified_index_generator(self): 23 | iterator = StratifiedIndexGenerator().flow( 24 | strata=self.y, batch_size=self.batch_size, 25 | strata_weights=self.strata_weights 26 | ) 27 | total, positives = 0, 0 28 | 29 | for i in range(100): 30 | indices = next(iterator) 31 | positives += self.y[indices].sum() 32 | total += len(indices) 33 | 34 | np.testing.assert_almost_equal( 35 | np.array([positives / total]), np.array([0.2]) 36 | ) 37 | 38 | def test_data_generator(self): 39 | iterator = DataGenerator().flow( 40 | self.X, self.y, strata=self.y, strata_weights=self.strata_weights) 41 | 42 | total, positives = 0, 0 43 | 44 | for i in range(100): 45 | X, y = next(iterator) 46 | positives += y.sum() 47 | total += len(y) 48 | 49 | np.testing.assert_almost_equal( 50 | np.array([positives / total]), np.array([0.2]) 51 | ) 52 | 53 | model = Sequential() 54 | model.add(Dense(units=1, input_shape=self.X.shape[1:])) 55 | model.compile(loss='mean_squared_error', optimizer='sgd') 56 | model.fit_generator( 57 | iterator, steps_per_epoch=(len(self.X)/self.batch_size)) 58 | 59 | def test_stratified_image_data_generator(self): 60 | 61 | iterator = ImageDataGenerator().flow(self.X_image, self.y) 62 | 63 | iterator.index_generator = StratifiedIndexGenerator().flow( 64 | batch_size=self.batch_size, 65 | strata=self.y, 66 | strata_weights=self.strata_weights) 67 | 68 | total, positives = 0, 0 69 | 70 | for i in range(100): 71 | X, y = next(iterator) 72 | positives += y.sum() 73 | total += len(y) 74 | 75 | np.testing.assert_almost_equal( 76 | np.array([positives / total]), np.array([0.2]) 77 | ) 78 | -------------------------------------------------------------------------------- 
/bdranalytics/pdlearn/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`bdranalytics.pdlearn` module contains adapters that allow you 3 | to put :class:`pandas.DataFrame` instances into :mod:`sklearn` without 4 | losing the column names. 5 | :mod:`sklearn` already allows you to provide instances of :class:`pandas.DataFrame`, 6 | but as it internally works with :class:`numpy.array`, column names are lost during transformation. 7 | Here we provide adapters, which re-add the column names after the :mod:`sklearn` modifications. 8 | """ 9 | -------------------------------------------------------------------------------- /bdranalytics/pdlearn/pipeline.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import six 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | 6 | class PdFeatureUnion(BaseEstimator, TransformerMixin): 7 | """Concatenates the results of multiple transformers while preserving column names""" 8 | 9 | def __init__(self, transformer_list, n_jobs=1, transformer_weights=None, debug=False): 10 | self.transformer_list = transformer_list 11 | self.n_jobs, self.transformer_weights, self.debug = n_jobs, transformer_weights, debug  # store all init args so get_params()/clone() work 12 | 13 | def fit(self, X, y=None, **fit_params): 14 | fit_params_steps = dict((name, {}) for name, step in self.transformer_list 15 | if step is not None) 16 | for pname, pval in six.iteritems(fit_params): 17 | step, param = pname.split('__', 1) 18 | fit_params_steps[step][param] = pval 19 | 20 | for name, transform in self.transformer_list: 21 | if transform is None: 22 | continue  # skip disabled (None) transformers 23 | transform.fit(X, y, **fit_params_steps[name]) 24 | return self 25 | 26 | def transformgen(self, X): 27 | for name, transform in self.transformer_list: 28 | if transform is None: 29 | continue  # skip disabled (None) transformers 30 | Xt = transform.transform(X) 31 | columns = Xt.columns if hasattr(Xt, "columns") else [ 32 | "{}-{}".format(name, c) for c in range(Xt.shape[1])] 33 | Xt = pd.DataFrame(Xt, index=X.index, columns=columns) 34 | assert len(Xt) == len(X), "Transformer {} shouldn't change nr of rows. " \ 35 | "Returned {} while original is {}".format( 36 | name, len(Xt), len(X)) 37 | yield Xt 38 | 39 | def _print_columns(self, xts): 40 | for xt in xts: 41 | print(xt.columns) 42 | print("\r\n") 43 | 44 | def transform(self, X): 45 | xts = list(self.transformgen(X)) 46 | if self.debug: 47 | self._print_columns(xts) 48 | try: 49 | return pd.concat(xts, axis=1, verify_integrity=True, join_axes=None) 50 | except Exception: 51 | self._print_columns(xts) 52 | raise 53 | 54 | 55 | class PdFeatureChain(BaseEstimator, TransformerMixin): 56 | """Passes a data set through a pipeline / chain of transformers. 57 | The output of the first transformer is fed into the next transformer.
58 | 59 | Similar to sklearn Pipeline, but does not work with predictor in final step.""" 60 | 61 | def __init__(self, steps): 62 | self.steps = steps 63 | 64 | def fit(self, X, y=None, **fit_params): 65 | fit_params_steps = dict((name, {}) for name, step in self.steps 66 | if step is not None) 67 | for pname, pval in six.iteritems(fit_params): 68 | step, param = pname.split('__', 1) 69 | fit_params_steps[step][param] = pval 70 | 71 | Xt = X 72 | for name, transform in self.steps: 73 | Xt = pd.DataFrame(Xt) 74 | if transform is None: 75 | pass 76 | elif hasattr(transform, "fit_transform"): 77 | Xt = transform.fit_transform(Xt, y, **fit_params_steps[name]) 78 | else: 79 | Xt = transform.fit( 80 | Xt, y, **fit_params_steps[name]).transform(Xt) 81 | assert len(Xt) == len(X), "Transformer {} shouldn't change nr of rows. " \ 82 | "Returned {} while original is {}".format( 83 | name, len(Xt), len(X)) 84 | return self 85 | 86 | def transform(self, X): 87 | Xt = X 88 | for name, transform in self.steps: 89 | if transform is not None: 90 | Xt = pd.DataFrame(Xt) 91 | Xt = transform.transform(Xt) 92 | assert len(Xt) == len(X), "Transformer {} shouldn't change nr of rows. " \ 93 | "Returned {} while original is {}".format( 94 | name, len(Xt), len(X)) 95 | return pd.DataFrame(Xt) 96 | 97 | def fit_transform(self, X, y=None, **fit_params): 98 | return self.fit(X, y, **fit_params).transform(X) 99 | -------------------------------------------------------------------------------- /bdranalytics/pdlearn/preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | from sklearn.pipeline import Pipeline 5 | from sklearn.preprocessing import OneHotEncoder 6 | 7 | from bdranalytics.sklearn.preprocessing import StringIndexer 8 | 9 | 10 | def format_colname(prefix, suffix): 11 | return "{:s}_{:s}".format(prefix, suffix) 12 | 13 | 14 | """A dictionary to get a date part cardinality given a general name""" 15 | __date_part_cardinality = { 16 | "MONTH": 12, 17 | "DAY": 31, 18 | "DAY_OF_WEEK": 7, 19 | "HOUR": 24, 20 | "MINUTE": 60, 21 | "SECOND": 60 22 | } 23 | 24 | """A dictionary to get a date part extractor given a general name""" 25 | __date_part_funcs = { 26 | "MONTH": lambda x: x.month, 27 | "DAY": lambda x: x.day, 28 | "DAY_OF_WEEK": lambda x: x.dayofweek, 29 | "HOUR": lambda x: x.hour, 30 | "MINUTE": lambda x: x.minute, 31 | "SECOND": lambda x: x.second 32 | } 33 | 34 | 35 | def date_to_dateparts(df, col_name, parts=list(__date_part_funcs.keys()), new_col_name_prefix=None): 36 | if new_col_name_prefix is None: 37 | new_col_name_prefix = col_name 38 | for part in parts: 39 | assert part in list(__date_part_funcs.keys()), \ 40 | "part '{}' is not known. Available are {}".format( 41 | part, ", ".join(list(__date_part_funcs.keys()))) 42 | return pd.DataFrame({ 43 | format_colname(new_col_name_prefix, part): 44 | df[col_name].apply(__date_part_funcs.get(part)) 45 | for part in parts}, index=df.index) 46 | 47 | 48 | def date_to_cyclical(df, col_name, parts=list(__date_part_funcs.keys()), new_col_name_prefix=None): 49 | if new_col_name_prefix is None: 50 | new_col_name_prefix = col_name 51 | for part in parts: 52 | assert part in list(__date_part_funcs.keys()), \ 53 | "part '{}' is not known. 
Available are {}".format( 54 | part, ", ".join(list(__date_part_funcs.keys()))) 55 | names = [format_colname(new_col_name_prefix, part) for part in parts] 56 | names_sin = ["{:s}_SIN".format(name) for name in names] 57 | names_cos = ["{:s}_COS".format(name) for name in names] 58 | values = [df[col_name].apply(__date_part_funcs.get(part)) / 59 | (2.0 * np.pi * __date_part_cardinality.get(part)) for part in parts] 60 | values_sin = [col.apply(np.sin) for col in values] 61 | values_cos = [col.apply(np.cos) for col in values] 62 | result = pd.concat(values_sin + values_cos, axis=1) 63 | result.columns = names_sin + names_cos 64 | return result 65 | 66 | 67 | def to_circular_variable(df, col_name, cardinality): 68 | return pd.DataFrame({ 69 | # note that np.sin(df[col_name] / float(cardinalilty...)) gives different values, probably rounding 70 | "{:s}_SIN".format(col_name): df[col_name].apply(lambda x: np.sin(x / float(cardinality * 2 * np.pi))), 71 | "{:s}_COS".format(col_name): df[col_name].apply(lambda x: np.cos(x / float(cardinality * 2 * np.pi))) 72 | }, index=df.index) 73 | 74 | 75 | class DateOneHotEncoding(BaseEstimator, TransformerMixin): 76 | """ 77 | Feature-engineering class that transforms date columns into one hot encoding of the parts (day, hour, ..). 78 | The original date column will be removed. 79 | To be used by sklearn pipelines 80 | """ 81 | 82 | def __init__(self, date_columns, parts=list(["DAY", "DAY_OF_WEEK", "HOUR", "MINUTE", "MONTH", "SECOND"]), 83 | new_column_names=None, drop=True): 84 | """ 85 | :param date_columns: the column names of the date columns to be expanded in one hot encodings 86 | :param new_column_names: the names to use as prefix for the generated column names 87 | :param drop: whether or not to drop the original column 88 | :param parts: the parts to extract from the date columns, and to then transform into one-hot encodings 89 | """ 90 | self.drop = drop 91 | self.parts = parts 92 | if new_column_names is None: 93 | self.new_column_names = date_columns 94 | else: 95 | self.new_column_names = new_column_names 96 | self.date_columns = date_columns 97 | self.one_hot_encoding_model = OneHotEncoder(sparse=False, handle_unknown='ignore' 98 | # , n_values=datepart_maxvalue 99 | ) 100 | self.encoding_pipeline = Pipeline([ 101 | ('labeler', StringIndexer()), 102 | ('encoder', self.one_hot_encoding_model) 103 | ]) 104 | assert (len(self.date_columns) == len(self.new_column_names)), \ 105 | "length of new column names is not equal to given column names" 106 | 107 | def all_to_parts(self, X): 108 | parts = [date_to_dateparts(X, old_name, self.parts, new_name) 109 | for old_name, new_name in zip(self.date_columns, self.new_column_names)] 110 | result = pd.concat(parts, axis=1, join_axes=[X.index]) 111 | return result 112 | 113 | def fit(self, X, y): 114 | parts = self.all_to_parts(X) 115 | self.encoding_pipeline.fit(parts) 116 | # original column i is mapped to values in range resulting_indices[i] .. 
resulting_indices[i+1] 117 | resulting_indices = self.one_hot_encoding_model.feature_indices_ 118 | active_features = self.one_hot_encoding_model.active_features_ 119 | new_names = [''] * (np.max(resulting_indices) + 1) 120 | for i, item in enumerate(parts.columns): 121 | for j in range(resulting_indices[i], resulting_indices[i + 1]): 122 | new_names[j] = "{}-{}".format(item, j) 123 | self.fitted_names = [new_names[i] for i in active_features] 124 | return self 125 | 126 | def transform_one_hots(self, X): 127 | np_frame = self.encoding_pipeline.transform(self.all_to_parts(X)) 128 | return pd.DataFrame(np_frame, columns=self.fitted_names) 129 | 130 | def transform(self, X): 131 | new_columns = self.transform_one_hots(X) 132 | old_columns = X.drop(self.date_columns, axis=1, 133 | inplace=False) if self.drop else X 134 | 135 | return pd.concat([old_columns, new_columns], axis=1, join_axes=[X.index]) 136 | 137 | 138 | class DateCyclicalEncoding(BaseEstimator, TransformerMixin): 139 | """ 140 | Feature-engineering class that transforms date columns into cyclical numerical columns. 141 | The original date column will be removed. 142 | To be used by sklearn pipelines 143 | """ 144 | 145 | def __init__(self, date_columns, parts=list(["DAY", "DAY_OF_WEEK", "HOUR", "MINUTE", "MONTH", "SECOND"]), 146 | new_column_names=None, drop=True): 147 | """ 148 | :param date_columns: the column names of the date columns to be expanded in one hot encodings 149 | :param new_column_names: the names to use as prefix for the generated column names 150 | :param drop: whether or not to drop the original column 151 | :param parts: the parts to extract from the date columns, and to then transform into one-hot encodings 152 | """ 153 | self.parts = parts 154 | self.drop = drop 155 | if new_column_names is None: 156 | self.new_column_names = date_columns 157 | else: 158 | self.new_column_names = new_column_names 159 | self.date_columns = date_columns 160 | assert (len(self.date_columns) == len(self.new_column_names)) 161 | 162 | def all_to_cyclical_parts(self, X): 163 | parts = [date_to_cyclical(X, old_name, self.parts, new_name) 164 | for old_name, new_name in zip(self.date_columns, self.new_column_names)] 165 | return pd.concat(parts, axis=1, join_axes=[X.index]) 166 | 167 | def fit(self, X, y): 168 | return self 169 | 170 | def transform(self, X): 171 | new_columns = self.all_to_cyclical_parts(X) 172 | old_columns = X.drop(self.date_columns, axis=1, 173 | inplace=False) if self.drop else X 174 | return pd.concat([old_columns, new_columns], axis=1, join_axes=[X.index]) 175 | 176 | 177 | # like sklearn's transformers, but then on pandas DataFrame 178 | class PdLagTransformer(BaseEstimator, TransformerMixin): 179 | def __init__(self, lag): 180 | self.lag = lag 181 | 182 | def fit(self, X, y=None, **fit_params): 183 | return self 184 | 185 | def do_transform(self, dataframe): 186 | return (dataframe.shift(self.lag) 187 | .rename(columns=lambda c: "{}_lag{}".format(c, self.lag))) 188 | 189 | def transform(self, X): 190 | try: 191 | return self.do_transform(X) 192 | except AttributeError: 193 | return self.do_transform(pd.DataFrame(X)) 194 | 195 | def fit_transform(self, X, y=None, **fit_params): 196 | return self.fit(X, y, **fit_params).transform(X) 197 | 198 | 199 | class PdWindowTransformer(BaseEstimator, TransformerMixin): 200 | def __init__(self, func, **rolling_params): 201 | self.func = func 202 | self.rolling_params = rolling_params 203 | 204 | def fit(self, X, y=None, **fit_params): 205 | return self 206 | 207 | def 
do_transform(self, dataframe): 208 | return (self.func(dataframe.rolling(**self.rolling_params)) 209 | .rename(columns=lambda c: "{}_{}".format(c, "".join( 210 | ["{}{}".format(k, v) for k, v in self.rolling_params.items()])))) 211 | 212 | def transform(self, X): 213 | try: 214 | return self.do_transform(X) 215 | except AttributeError: 216 | return self.do_transform(pd.DataFrame(X)) 217 | 218 | def fit_transform(self, X, y=None, **fit_params): 219 | return self.fit(X, y, **fit_params).transform(X) 220 | -------------------------------------------------------------------------------- /bdranalytics/pdlearn/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/pdlearn/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/pdlearn/tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import unittest 4 | from sklearn.pipeline import FeatureUnion, Pipeline 5 | 6 | from bdranalytics.pdlearn.pipeline import PdFeatureUnion, PdFeatureChain 7 | from bdranalytics.pdlearn.preprocessing import PdLagTransformer, PdWindowTransformer 8 | 9 | 10 | class TestLagTransformer(unittest.TestCase): 11 | def test_lagtransformer(self): 12 | orig_data = pd.DataFrame(data=np.arange(15).reshape( 13 | 5, 3), columns=["col1", "col2", "col3"]) 14 | lagged = PdLagTransformer(1).fit_transform(orig_data) 15 | np.testing.assert_array_equal( 16 | lagged.columns, ["col1_lag1", "col2_lag1", "col3_lag1"]) 17 | np.testing.assert_array_equal(lagged.iloc[1, :], orig_data.iloc[0, :]) 18 | np.testing.assert_array_equal(lagged.iloc[0, :], np.repeat(np.nan, 3)) 19 | 20 | def test_lagtransformer_on_numpy(self): 21 | orig_data = np.arange(15).reshape(5, 3) 22 | lagged = PdLagTransformer(1).fit_transform(orig_data) 23 | np.testing.assert_array_equal( 24 | lagged.columns, ["0_lag1", "1_lag1", "2_lag1"]) 25 | np.testing.assert_array_equal(lagged.iloc[1, :], orig_data[0, :]) 26 | np.testing.assert_array_equal(lagged.iloc[0, :], np.repeat(np.nan, 3)) 27 | 28 | def test_windowtransformer(self): 29 | orig_data = pd.DataFrame(data=np.arange( 30 | 14, -1, -1).reshape(5, 3), columns=["col1", "col2", "col3"]) 31 | result = PdWindowTransformer( 32 | lambda window: window.max(), window=2).fit_transform(orig_data) 33 | np.testing.assert_array_equal( 34 | result.columns, ["col1_window2", "col2_window2", "col3_window2"]) 35 | np.testing.assert_array_equal(result.iloc[0, :], np.repeat(np.nan, 3)) 36 | # orig data is [ [14, 13, 12], [11, 10, 9],.., thus rolling max at row 1 should be values of row 0 37 | np.testing.assert_array_equal(result.iloc[1, :], orig_data.iloc[0, :]) 38 | 39 | def test_windowtransformer_on_numpy(self): 40 | orig_data = np.arange(14, -1, -1).reshape(5, 3) 41 | result = PdWindowTransformer( 42 | lambda window: window.max(), window=2).fit_transform(orig_data) 43 | np.testing.assert_array_equal( 44 | result.columns, ["0_window2", "1_window2", "2_window2"]) 45 | np.testing.assert_array_equal(result.iloc[0, :], np.repeat(np.nan, 3)) 46 | # orig data is [ [14, 13, 12], [11, 10, 9],.., thus rolling max at row 1 should be values of row 0 47 | np.testing.assert_array_equal(result.iloc[1, :], orig_data[0, :]) 48 | 49 | def test_featureunion(self): 50 | orig_data = pd.DataFrame(data=np.arange(15).reshape( 51 | 5, 3), 
columns=["col1", "col2", "col3"]) 52 | result = PdFeatureUnion([ 53 | ('lag', PdLagTransformer(1)), 54 | ('window', PdWindowTransformer(lambda window: window.max(), window=2))] 55 | ).fit_transform(orig_data) 56 | np.testing.assert_array_equal(result.columns, 57 | ["col1_lag1", "col2_lag1", "col3_lag1", "col1_window2", "col2_window2", 58 | "col3_window2"]) 59 | np.testing.assert_array_equal( 60 | result.iloc[:, 0:3], 61 | PdLagTransformer(1).fit_transform(orig_data)) 62 | np.testing.assert_array_equal( 63 | result.iloc[:, 3:6], 64 | PdWindowTransformer(lambda window: window.max(), window=2).fit_transform(orig_data)) 65 | np.testing.assert_array_equal(result, 66 | FeatureUnion([ 67 | ("lag", PdLagTransformer(1)), 68 | ("window", PdWindowTransformer( 69 | lambda window: window.max(), window=2)) 70 | ]).fit_transform(orig_data)) 71 | 72 | def test_featurechain(self): 73 | orig_data = pd.DataFrame(data=np.arange(15).reshape( 74 | 5, 3), columns=["col1", "col2", "col3"]) 75 | result = PdFeatureChain([ 76 | ('lag', PdLagTransformer(1)), 77 | ('window', PdWindowTransformer(lambda window: window.max(), window=2))]).fit_transform(orig_data) 78 | np.testing.assert_array_equal(result.columns, 79 | ["col1_lag1_window2", "col2_lag1_window2", "col3_lag1_window2"]) 80 | np.testing.assert_array_equal( 81 | result, 82 | PdWindowTransformer(lambda window: window.max(), window=2).fit_transform( 83 | PdLagTransformer(1).fit_transform(orig_data) 84 | ) 85 | ) 86 | np.testing.assert_array_equal(result, 87 | Pipeline(steps=[ 88 | ("lag", PdLagTransformer(1)), 89 | ("window", PdWindowTransformer( 90 | lambda window: window.max(), window=2)) 91 | ]).fit_transform(orig_data)) 92 | -------------------------------------------------------------------------------- /bdranalytics/pdlearn/tests/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import unittest 4 | 5 | from bdranalytics.pdlearn.preprocessing import DateCyclicalEncoding, \ 6 | DateOneHotEncoding 7 | from bdranalytics.pdlearn.preprocessing import date_to_dateparts, \ 8 | date_to_cyclical 9 | 10 | 11 | class TestDatePartitioner(unittest.TestCase): 12 | def test_date_to_dateparts(self): 13 | orig_data = pd.DataFrame(data=np.arange( 14 | np.datetime64('2011-07-11'), np.datetime64('2011-07-18') 15 | ).reshape(7, 1), columns=["thedate"]) 16 | splitted_data = date_to_dateparts(orig_data, 'thedate', 17 | new_col_name_prefix='prefix') 18 | 19 | expected_columns = ["prefix_{}".format(x) for x in 20 | ["DAY", "DAY_OF_WEEK", "HOUR", "MINUTE", "MONTH", 21 | "SECOND"]] 22 | # no additional columns 23 | np.testing.assert_array_equal( 24 | list(set(splitted_data.columns) - set(expected_columns)), list()) 25 | # no missing columns 26 | np.testing.assert_array_equal( 27 | list(set(expected_columns) - set(splitted_data.columns)), list()) 28 | monday = 0 29 | tuesday = 1 30 | np.testing.assert_array_equal(splitted_data.loc[0, expected_columns], 31 | [11, monday, 0, 0, 7, 0]) 32 | np.testing.assert_array_equal(splitted_data.loc[1, expected_columns], 33 | [12, tuesday, 0, 0, 7, 0]) 34 | 35 | def test_dateparts_to_circular(self): 36 | orig_data = pd.DataFrame(data=np.arange( 37 | np.datetime64('2011-07-11'), np.datetime64('2011-07-18') 38 | ).reshape(7, 1), columns=["thedate"]) 39 | circular_data = date_to_cyclical(orig_data, 'thedate', 40 | new_col_name_prefix='prefix') 41 | 42 | intermediate_columns = ["prefix_{}".format(x) for x in 43 | ["DAY", "DAY_OF_WEEK", "HOUR", 
"MINUTE", 44 | "MONTH", "SECOND"]] 45 | expected_columns = ["{}_{}".format(x, y) for y in ["COS", "SIN"] for x 46 | in intermediate_columns] 47 | # no additional columns 48 | np.testing.assert_array_equal( 49 | list(set(circular_data.columns) - set(expected_columns)), list()) 50 | # no missing columns 51 | np.testing.assert_array_equal( 52 | list(set(expected_columns) - set(circular_data.columns)), list()) 53 | # correct result compared to just splitting the columns 54 | splitted_data = date_to_dateparts(orig_data, 'thedate', 55 | new_col_name_prefix='prefix') 56 | sin_columns = ["{}_{}".format(x, y) for y in ["SIN"] for x in 57 | intermediate_columns] 58 | np.testing.assert_array_equal(circular_data.loc[:, sin_columns], np.sin( 59 | splitted_data.loc[:, intermediate_columns] / ( 60 | 2.0 * np.pi * np.array([31, 7, 24, 60, 12, 60])))) 61 | cos_columns = ["{}_{}".format(x, y) for y in ["COS"] for x in 62 | intermediate_columns] 63 | np.testing.assert_array_equal(circular_data.loc[:, cos_columns], np.cos( 64 | splitted_data.loc[:, intermediate_columns] / ( 65 | 2.0 * np.pi * np.array([31, 7, 24, 60, 12, 60])))) 66 | 67 | def test_dateonehotencoding(self): 68 | orig_data = pd.DataFrame(data=np.arange( 69 | np.datetime64('2011-07-11'), np.datetime64('2011-07-18') 70 | ).reshape(7, 1), columns=["thedate"]) 71 | y = np.repeat(0, 7) 72 | onehot = DateOneHotEncoding(['thedate'], drop=True).fit_transform( 73 | orig_data, y) 74 | print(onehot) 75 | 76 | def test_datecyclicalencoding(self): 77 | orig_data = pd.DataFrame(data=np.arange( 78 | np.datetime64('2011-07-11'), np.datetime64('2011-07-18') 79 | ).reshape(7, 1), columns=["thedate"]) 80 | y = np.repeat(0, 7) 81 | 82 | # create splitted to also be able to calculate values 83 | splitted_data = date_to_dateparts(orig_data, 'thedate') 84 | 85 | circular_data = DateCyclicalEncoding(['thedate'], 86 | drop=True).fit_transform(orig_data, 87 | y) 88 | intermediate_columns = ["thedate_{}".format(x) for x in 89 | ["DAY", "DAY_OF_WEEK", "HOUR", "MINUTE", 90 | "MONTH", "SECOND"]] 91 | expected_columns = ["{}_{}".format(x, y) for y in ["COS", "SIN"] for x 92 | in intermediate_columns] 93 | # no additional columns 94 | np.testing.assert_array_equal( 95 | list(set(circular_data.columns) - set(expected_columns)), list()) 96 | # no missing columns 97 | np.testing.assert_array_equal( 98 | list(set(expected_columns) - set(circular_data.columns)), list()) 99 | sin_columns = ["{}_{}".format(x, y) for y in ["SIN"] for x in 100 | intermediate_columns] 101 | np.testing.assert_array_equal(circular_data.loc[:, sin_columns], np.sin( 102 | splitted_data.loc[:, intermediate_columns] / ( 103 | 2.0 * np.pi * np.array([31, 7, 24, 60, 12, 60])))) 104 | cos_columns = ["{}_{}".format(x, y) for y in ["COS"] for x in 105 | intermediate_columns] 106 | np.testing.assert_array_equal(circular_data.loc[:, cos_columns], np.cos( 107 | splitted_data.loc[:, intermediate_columns] / ( 108 | 2.0 * np.pi * np.array([31, 7, 24, 60, 12, 60])))) 109 | -------------------------------------------------------------------------------- /bdranalytics/plot/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/plot/__init__.py -------------------------------------------------------------------------------- /bdranalytics/plot/classification.py: -------------------------------------------------------------------------------- 1 | import 
matplotlib.pyplot as plt 2 | import numpy as np 3 | import pandas as pd 4 | import seaborn as sns 5 | from sklearn.externals.joblib import Parallel, delayed 6 | from sklearn.metrics import ( 7 | confusion_matrix, 8 | accuracy_score, 9 | f1_score, 10 | roc_curve, 11 | auc, 12 | precision_recall_curve, 13 | average_precision_score 14 | ) 15 | 16 | primary_color = plt.rcParams['axes.prop_cycle'].by_key()['color'][0] 17 | 18 | default_names = ('negative', 'positive') 19 | 20 | 21 | def compute_parallel_metric(metric, y_true, y_pred): 22 | 23 | thresholds = np.arange(0, 1, .02) 24 | 25 | return Parallel(n_jobs=-1)( 26 | delayed(metric)( 27 | y_true, 28 | y_pred > threshold 29 | ) 30 | for threshold in thresholds 31 | ), thresholds 32 | 33 | 34 | def plot_accuracy(y_true, y_pred): 35 | acc, thresholds = compute_parallel_metric(accuracy_score, y_true, y_pred) 36 | 37 | lower_baseline = sum(y_true) / len(y_true) 38 | upper_baseline = 1 - lower_baseline 39 | 40 | plt.plot([0, 1], [lower_baseline, lower_baseline], 'k--') 41 | plt.plot([0, 1], [upper_baseline, upper_baseline], 'k--') 42 | plt.plot(thresholds, acc) 43 | plt.title('Accuracy across thresholds') 44 | plt.xlabel('classifier threshold') 45 | plt.ylabel('accuracy') 46 | plt.xlim([0.0, 1.0]) 47 | plt.ylim([0.0, 1.0]) 48 | 49 | 50 | def plot_f1_score(y_true, y_pred): 51 | f1s, thresholds = compute_parallel_metric(f1_score, y_true, y_pred) 52 | 53 | plt.plot(thresholds, f1s) 54 | plt.title('F1 score across thresholds') 55 | plt.xlabel('classifier threshold') 56 | plt.ylabel('F1 score') 57 | plt.xlim([0.0, 1.0]) 58 | plt.ylim([0.0, 1.0]) 59 | 60 | 61 | def plot_confusion_matrix( 62 | y_true, y_pred_bin, target_names=default_names, normalize=False): 63 | 64 | c = confusion_matrix(y_true, y_pred_bin) 65 | 66 | if normalize: 67 | c = c / c.sum() 68 | fmt = '.3f' 69 | else: 70 | fmt = 'd' 71 | 72 | confusion = pd.DataFrame(c, index=target_names, columns=target_names) 73 | sns.heatmap(confusion, annot=True, fmt=fmt) 74 | plt.xlabel('predicted label') 75 | plt.ylabel('true label') 76 | plt.title('Confusion matrix') 77 | plt.show() 78 | 79 | 80 | def plot_roc_curve(y_true, y_pred): 81 | fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1, 82 | drop_intermediate=True) 83 | roc_auc = auc(fpr, tpr) 84 | 85 | plt.plot(fpr, tpr, label="ROC curve (area = {:.2f})".format(roc_auc)) 86 | plt.plot([0, 1], [0, 1], 'k--') 87 | plt.xlim([0.0, 1.0]) 88 | plt.ylim([0.0, 1.0]) 89 | plt.xlabel('false positive rate') 90 | plt.ylabel('true positive rate') 91 | plt.title('Receiver-operating characteristic') 92 | plt.legend(loc="lower right") 93 | 94 | 95 | def plot_pr_curve(y_true, y_pred): 96 | precision, recall, thresholds = precision_recall_curve(y_true, y_pred, 97 | pos_label=1) 98 | 99 | average_precision = average_precision_score( 100 | y_true, y_pred, average="micro") 101 | 102 | baseline = sum(y_true) / len(y_true) 103 | 104 | plt.plot(recall, precision, 105 | label="PR curve (area = {:.2f})".format(average_precision)) 106 | plt.plot([0, 1], [baseline, baseline], 'k--') 107 | plt.xlim([0.0, 1.0]) 108 | plt.ylim([0.0, 1.0]) 109 | plt.xlabel('recall') 110 | plt.ylabel('precision') 111 | plt.title('Precision-recall curve') 112 | plt.legend(loc="lower right") 113 | 114 | 115 | def plot_benefits(y_true, y_pred, benefit_func=None, recalibrate=False, 116 | ax=None): 117 | if benefit_func is None: 118 | def net_benefit(tpr, fpr): 119 | cost_fp, benefit_tp = (1, 1) # equal weights 120 | n_positives = sum(y_true) 121 | n_tp = tpr * n_positives # number of true 
positives (benefits) 122 | n_fp = fpr * len( 123 | y_true) - n_positives # number of false positives (costs) 124 | fp_costs = n_fp * cost_fp 125 | tp_benefits = n_tp * benefit_tp 126 | return tp_benefits - fp_costs 127 | 128 | benefit_func = net_benefit 129 | 130 | fpr, tpr, thresholds = roc_curve(y_true, y_pred, pos_label=1, 131 | drop_intermediate=True) 132 | 133 | benefits = np.zeros_like(thresholds) 134 | for i, _ in enumerate(thresholds): 135 | benefits[i] = benefit_func(tpr[i], fpr[i]) 136 | 137 | i_max = np.argmax(benefits) 138 | print( 139 | "max benefits: {:.0f} units on {:,} samples, " 140 | "tpr: {:.3f}, fpr: {:.3f}, threshold: {:.3f}" 141 | .format( 142 | benefits[i_max], len(y_true), 143 | tpr[i_max], fpr[i_max], thresholds[i_max] 144 | ) 145 | ) 146 | 147 | if ax is not None: 148 | ax1 = ax 149 | else: 150 | _, ax1 = plt.subplots() 151 | 152 | ax2 = ax1.twinx() 153 | ax2.vlines(thresholds[i_max], 0, 1, linestyles='dashed') 154 | ax1.set_xlim([0, 1]) 155 | ax1.plot(thresholds, benefits, c=primary_color) 156 | ax1.set_ylim([0, np.max(benefits)]) 157 | ax2.plot(thresholds, tpr, 'g-') 158 | ax2.plot(thresholds, fpr, 'r-') 159 | ax2.set_ylim([0, 1]) 160 | ax1.set_xlabel('classifier threshold') 161 | ax1.set_ylabel('units') 162 | ax2.set_ylabel('rate') 163 | ax2.legend(labels=['TP', 'FP'], loc="upper right") 164 | ax1.set_title('Benefits across thresholds') 165 | ax1.legend(labels=['benefit'], loc="center right") 166 | ax1.grid(1) 167 | ax2.grid(0) 168 | 169 | if recalibrate: 170 | y_pred_bin = (y_pred > thresholds[i_max]) * 1. 171 | return y_pred_bin 172 | 173 | 174 | def subplot_evaluation_curves(y_true, y_pred, benefit_func=None, 175 | figsize=(12, 12)): 176 | 177 | fig, axarr = plt.subplots(3, 2, figsize=figsize) 178 | fig.subplots_adjust(hspace=0.4, wspace=0.3) 179 | 180 | plt.sca(axarr[0, 0]) 181 | plot_roc_curve(y_true, y_pred) 182 | 183 | plt.sca(axarr[0, 1]) 184 | plot_pr_curve(y_true, y_pred) 185 | 186 | plt.sca(axarr[1, 0]) 187 | plot_accuracy(y_true, y_pred) 188 | 189 | plt.sca(axarr[1, 1]) 190 | plot_f1_score(y_true, y_pred) 191 | 192 | plot_benefits(y_true, y_pred, ax=axarr[2, 0], benefit_func=benefit_func) 193 | plt.show() 194 | -------------------------------------------------------------------------------- /bdranalytics/plot/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/plot/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/sklearn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/sklearn/__init__.py -------------------------------------------------------------------------------- /bdranalytics/sklearn/encoders.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.base import BaseEstimator, TransformerMixin 4 | 5 | 6 | class WeightOfEvidenceEncoder(BaseEstimator, TransformerMixin): 7 | """ 8 | Feature-engineering class that transforms a high-capacity categorical value 9 | into Weigh of Evidence scores. Can be used in sklearn pipelines. 
10 | """ 11 | 12 | def __init__(self, verbose=0, cols=None, return_df=True, 13 | smooth=0.5, fillna=0, dependent_variable_values=None): 14 | """ 15 | :param smooth: value for additive smoothing, to prevent divide by zero 16 | """ 17 | # make sure cols is a list of strings 18 | if not isinstance(cols, list): 19 | cols = [cols] 20 | 21 | self.stat = {} 22 | self.return_df = return_df 23 | self.verbose = verbose 24 | self.cols = cols 25 | self.smooth = smooth 26 | self.fillna = fillna 27 | self.dependent_variable_values = dependent_variable_values 28 | 29 | def fit(self, X, y): 30 | 31 | if not isinstance(X, pd.DataFrame): 32 | raise TypeError( 33 | 'Input should be an instance of pandas.DataFrame()') 34 | 35 | if self.dependent_variable_values is not None: 36 | y = self.dependent_variable_values 37 | 38 | df = X[self.cols].copy() 39 | y_col_index = len(df.columns) + 1 40 | df[y_col_index] = np.array(y) 41 | 42 | def get_totals(x): 43 | total = np.size(x) 44 | pos = max(float(np.sum(x)), self.smooth) 45 | neg = max(float(total - pos), self.smooth) 46 | return pos, neg 47 | 48 | # get the totals per class 49 | total_positive, total_negative = get_totals(y) 50 | if self.verbose: 51 | print("total positives {:.0f}, total negatives {:.0f}".format( 52 | total_positive, total_negative)) 53 | 54 | def compute_bucket_woe(x): 55 | bucket_positive, bucket_negative = get_totals(x) 56 | return np.log(bucket_positive / bucket_negative) 57 | 58 | # compute WoE scores per bucket (category) 59 | stat = {} 60 | for col in self.cols: 61 | 62 | if self.verbose: 63 | print( 64 | "computing weight of evidence for column {:s}".format(col)) 65 | 66 | stat[col] = ((df.groupby(col)[y_col_index].agg(compute_bucket_woe) 67 | + np.log(total_negative / total_positive)).to_dict()) 68 | 69 | self.stat = stat 70 | 71 | return self 72 | 73 | def transform(self, X, y=None): 74 | 75 | if not isinstance(X, pd.DataFrame): 76 | raise TypeError( 77 | 'Input should be an instance of pandas.DataFrame()') 78 | 79 | df = X.copy() 80 | 81 | # join the WoE stats with the data 82 | for col in self.cols: 83 | 84 | if self.verbose: 85 | print("transforming categorical column {:s}".format(col)) 86 | 87 | stat = pd.DataFrame.from_dict(self.stat[col], orient='index') 88 | 89 | ser = (pd.merge(df, stat, left_on=col, right_index=True, how='left') 90 | .sort_index() 91 | .reindex(df.index))[0] 92 | 93 | # fill missing values with 94 | if self.verbose: 95 | print("{:.0f} NaNs in transformed data".format( 96 | ser.isnull().sum())) 97 | print("{:.4f} mean weight of evidence".format(ser.mean())) 98 | 99 | df[col] = np.array(ser.fillna(self.fillna)) 100 | 101 | if not self.return_df: 102 | out = np.array(df) 103 | else: 104 | out = df 105 | 106 | return out 107 | 108 | 109 | class ColumnSelector(BaseEstimator, TransformerMixin): 110 | def __init__(self, columns): 111 | self.columns = columns 112 | 113 | def fit(self, X, y=None): 114 | return self 115 | 116 | def transform(self, X): 117 | try: 118 | return X[self.columns] 119 | except: 120 | print("Could not find selected columns {:s} in available columns {:s}".format( 121 | self.columns, X.columns)) 122 | raise 123 | 124 | 125 | class StringIndexer(BaseEstimator, TransformerMixin): 126 | def __init__(self): 127 | self.dictionaries = dict() 128 | self.columns = list() 129 | 130 | def fit(self, X, y=None): 131 | self.columns = X.columns.values 132 | for col in self.columns: 133 | categories = np.unique(X[col]) 134 | self.dictionaries[col] = dict( 135 | zip(categories, range(len(categories)))) 136 | 
return self 137 | 138 | def transform(self, X): 139 | column_array = [] 140 | for col in self.columns: 141 | dictionary = self.dictionaries[col] 142 | na_value = len(dictionary) + 1 143 | transformed_column = X[col].apply( 144 | lambda x: dictionary.get(x, na_value)) 145 | column_array.append(transformed_column.values.reshape(-1, 1)) 146 | return np.hstack(column_array) 147 | 148 | 149 | class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): 150 | """ 151 | Leave one out transformation for high-capacity categorical variables. 152 | """ 153 | 154 | def __init__(self, with_stdevs=True): 155 | 156 | self.with_stdevs = with_stdevs 157 | self.means = {} 158 | self.stdevs = {} 159 | 160 | def fit(self, X, y=None): 161 | return self 162 | 163 | def transform(self, X): 164 | df = X.copy() 165 | for col in self.means.keys(): 166 | 167 | mean_col_name = "{:s}_MEAN".format(col) 168 | df[mean_col_name] = df.merge(pd.DataFrame(self.means[col]), 169 | how='left', left_on=[col], right_index=True)['y'] 170 | if self.with_stdevs: 171 | std_col_name = "{:s}_STD".format(col) 172 | df[std_col_name] = df.merge(pd.DataFrame(self.stdevs[col]), 173 | how='left', left_on=[col], right_index=True)['y'] 174 | 175 | df.drop(col, axis=1, inplace=True) 176 | 177 | return df 178 | 179 | def fit_transform(self, X, y): 180 | """will be used during pipeline fit""" 181 | df = X.copy() 182 | df['y'] = y 183 | for col in df.columns.difference(['y']): 184 | 185 | mean_col_name = "{:s}_MEAN".format(col) 186 | 187 | grouped = df.groupby(col)['y'] 188 | 189 | self.means[col] = grouped.mean() 190 | df[mean_col_name] = grouped.transform(self._loo_means) 191 | 192 | if self.with_stdevs: 193 | std_col_name = "{:s}_STD".format(col) 194 | self.stdevs[col] = grouped.std() 195 | df[std_col_name] = grouped.transform(self._loo_stdevs) 196 | 197 | df.drop(col, axis=1, inplace=True) 198 | 199 | df.drop('y', axis=1, inplace=True) 200 | return df 201 | 202 | def _loo_means(self, s): 203 | n = len(s) 204 | loo_means = (s.sum() - s) / (n - 1) 205 | return loo_means * np.random.normal(loc=1.0, scale=0.01, size=n) 206 | 207 | def _loo_stdevs(self, s): 208 | n = len(s) 209 | if n > 1: 210 | loo_means = self._loo_means(s) 211 | sum_of_sq = n * s.std() ** 2 212 | loo_stdevs = np.sqrt( 213 | abs((sum_of_sq - (s - s.mean()) * (s - loo_means))) / (n - 1)) 214 | else: 215 | loo_stdevs = np.array([0]) 216 | 217 | return loo_stdevs * np.random.normal(loc=1.0, scale=0.01, size=n) 218 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/model_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from abc import ABCMeta 4 | from sklearn.externals.six import with_metaclass 5 | from sklearn.utils.validation import _num_samples 6 | 7 | 8 | class GrowingWindow(with_metaclass(ABCMeta)): 9 | """Growing Window cross validator 10 | 11 | Provides train/test indices to split data in train/test sets. 12 | Divides the data in n_folds+1 slices. 13 | For split i [1..n_folds], slices [0..i} are train, slice i is test 14 | 15 | Parameters: 16 | n_folds : int, default=3 17 | Number of folds. Must be at least 1 18 | """ 19 | 20 | def __init__(self, n_folds=3): 21 | self.n_folds = n_folds 22 | 23 | def __repr__(self): 24 | return _build_repr(self) 25 | 26 | def split(self, X, y=None, labels=None): 27 | """Generate indices to split data into training and test set. 
28 | Parameters 29 | ---------- 30 | X : array-like, shape (n_samples, n_features) 31 | Training data, where n_samples is the number of samples 32 | and n_features is the number of features. 33 | y : array-like, of length n_samples 34 | The target variable for supervised learning problems. 35 | ignored 36 | labels : array-like, with shape (n_samples,), optional 37 | Group labels for the samples used while splitting the dataset into 38 | train/test set. 39 | ignored 40 | Returns 41 | ------- 42 | train : ndarray 43 | The training set indices for that split. 44 | test : ndarray 45 | The testing set indices for that split. 46 | """ 47 | n = _num_samples(X) 48 | n_slices = self.n_folds + 1 49 | # loop from the first 2 folds to the total number of folds 50 | for i in range(2, n_slices + 1): 51 | # the split is the percentage at which to split the folds into train 52 | # and test. For example when i = 2 we are taking the first 2 folds out 53 | # of the total available. In this specific case we have to split the 54 | # two of them in half (train on the first, test on the second), 55 | # so split = 1/2 = 0.5 = 50%. When i = 3 we are taking the first 3 folds 56 | # out of the total available, meaning that we have to split the three of them 57 | # in two at split = 2/3 = 0.66 = 66% (train on the first 2 and test on the 58 | # following) 59 | split = float(i - 1) / i 60 | # as we loop over the folds X and y are updated and increase in size. 61 | # This is the data that is going to be split and it increases in size 62 | # in the loop as we account for more folds. If k = 300, with i starting from 2 63 | # the result is the following in the loop 64 | # i = 2 65 | # X = X_train[:(600)] 66 | # y = y_train[:(600)] 67 | # 68 | # i = 3 69 | # X = X_train[:(900)] 70 | # y = y_train[:(900)] 71 | # .... 72 | n_sub = int(np.floor(float(n * i) / n_slices)) 73 | subset = range(0, n_sub) 74 | # X and y contain both the folds to train and the fold to test. 75 | # index is the integer telling us where to split, according to the 76 | # split percentage we have set above 77 | n_train = int(np.floor(n_sub * split)) 78 | train_index = np.arange(0, n_train) 79 | test_index = np.arange(n_train, n_sub) 80 | yield train_index, test_index 81 | 82 | def get_n_splits(self, X, y=None, labels=None): 83 | """Returns the number of splitting iterations in the cross-validator 84 | Parameters 85 | ---------- 86 | X : array-like, shape (n_samples, n_features) 87 | Training data, where n_samples is the number of samples 88 | and n_features is the number of features. 89 | y : object 90 | Always ignored, exists for compatibility. 91 | labels : object 92 | Always ignored, exists for compatibility. 93 | Returns 94 | ------- 95 | n_splits : int 96 | Returns the number of splitting iterations in the cross-validator. 
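
        Examples
        --------
        An illustrative sketch (not taken from this library's docs):

        >>> import numpy as np
        >>> X = np.arange(20).reshape(10, 2)
        >>> cv = GrowingWindow(n_folds=4)
        >>> cv.get_n_splits(X)
        4
        >>> [len(test) for _, test in cv.split(X)]
        [2, 2, 2, 2]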
97 | """ 98 | if X is None: 99 | raise ValueError("The X parameter should not be None") 100 | return self.n_folds 101 | 102 | 103 | class IntervalGrowingWindow(with_metaclass(ABCMeta)): 104 | """Growing Window cross-validator based on time intervals""" 105 | 106 | def __init__(self, test_start_date, timestamps='index', test_end_date=None, 107 | test_size=None, train_size=None): 108 | 109 | self.test_start_date = pd.to_datetime(test_start_date) 110 | self.test_end_date = pd.to_datetime(test_end_date) 111 | self.test_size = pd.to_timedelta(test_size) 112 | self.train_size = pd.to_timedelta(train_size) 113 | 114 | self.timestamps = timestamps 115 | if timestamps is not 'index': 116 | self.timestamps = pd.to_datetime(timestamps) 117 | 118 | def generate_intervals(self, timestamps): 119 | 120 | # infer test interval end date if not specified 121 | # has to be done here to work with timestamps from DataFrame index 122 | # NOTE: test_end_date is NOT included 123 | if self.test_end_date is None: 124 | # can be overridden for reuse 125 | self.test_end_date = max(timestamps) 126 | 127 | # determine start date of the test intervals 128 | intervals_start = pd.to_datetime(pd.date_range(self.test_start_date, 129 | self.test_end_date, 130 | freq=self.test_size) 131 | .values) 132 | 133 | # convert to (start, end) tuples 134 | intervals = list(zip(intervals_start[:-1], intervals_start[1:])) 135 | 136 | return intervals 137 | 138 | def get_timeseries(self, X): 139 | """Returns the numpy array of timestamps for the given dataset""" 140 | if self.timestamps is 'index': 141 | return pd.to_datetime(X.index.values) 142 | else: 143 | return self.timestamps 144 | 145 | def split(self, X, y=None, labels=None): 146 | """Generate indices to split data into training and test sets based on time stamps""" 147 | if X is None: 148 | raise ValueError("The X parameter should not be None") 149 | 150 | # extract timestamps from DataFrame index, if needed 151 | timestamps = self.get_timeseries(X) 152 | intervals = self.generate_intervals(timestamps) 153 | 154 | # extract first sample for unlimited train size 155 | first_sample_date = min(timestamps) 156 | 157 | # number of samples 158 | n = _num_samples(X) 159 | 160 | # list of indices, to convert booleans later on 161 | index = np.arange(n) 162 | 163 | # loop over each interval 164 | for test_start, test_end in intervals: 165 | 166 | if self.train_size is not None: 167 | train_start = test_start - self.train_size 168 | else: 169 | train_start = first_sample_date 170 | 171 | train_interval_bool = np.array(list(map(lambda date: 172 | train_start <= date < test_start, 173 | timestamps))) 174 | 175 | test_interval_bool = np.array(list(map(lambda date: 176 | test_start <= date < test_end, 177 | timestamps))) 178 | 179 | # convert boolean to integer indices 180 | train_index = index[train_interval_bool] 181 | test_index = index[test_interval_bool] 182 | 183 | yield train_index, test_index 184 | 185 | def get_n_splits(self, X, y=None, labels=None): 186 | if X is None: 187 | raise ValueError("The X parameter should not be None") 188 | 189 | # extract timestamps from DataFrame index, if needed 190 | timestamps = self.get_timeseries(X) 191 | intervals = self.generate_intervals(timestamps) 192 | 193 | # compute number of folds 194 | return len(intervals) 195 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoding 
import WeightOfEvidenceEncoder, StringIndexer, LeaveOneOutEncoder 2 | from .scaling import ScaledRegressor 3 | 4 | __all__ = ['ScaledRegressor', 5 | 'WeightOfEvidenceEncoder', 6 | 'StringIndexer', 7 | 'LeaveOneOutEncoder'] 8 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/encoding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import BaseEstimator 4 | from sklearn.base import TransformerMixin 5 | 6 | 7 | class WeightOfEvidenceEncoder(BaseEstimator, TransformerMixin): 8 | """ 9 | Feature-engineering class that transforms a high-capacity categorical value 10 | into Weigh of Evidence scores. Can be used in sklearn pipelines. 11 | """ 12 | 13 | def __init__(self, verbose=0, cols=None, return_df=True, 14 | smooth=0.5, fillna=0, dependent_variable_values=None): 15 | """ 16 | :param smooth: value for additive smoothing, to prevent divide by zero 17 | """ 18 | # make sure cols is a list of strings 19 | if not isinstance(cols, list): 20 | cols = [cols] 21 | 22 | self.stat = {} 23 | self.return_df = return_df 24 | self.verbose = verbose 25 | self.cols = cols 26 | self.smooth = smooth 27 | self.fillna = fillna 28 | self.dependent_variable_values = dependent_variable_values 29 | 30 | def fit(self, X, y): 31 | 32 | if not isinstance(X, pd.DataFrame): 33 | raise TypeError( 34 | 'Input should be an instance of pandas.DataFrame()') 35 | 36 | if self.dependent_variable_values is not None: 37 | y = self.dependent_variable_values 38 | 39 | df = X[self.cols].copy() 40 | y_col_index = len(df.columns) + 1 41 | df[y_col_index] = np.array(y) 42 | 43 | def get_totals(x): 44 | total = np.size(x) 45 | pos = max(float(np.sum(x)), self.smooth) 46 | neg = max(float(total - pos), self.smooth) 47 | return pos, neg 48 | 49 | # get the totals per class 50 | total_positive, total_negative = get_totals(y) 51 | if self.verbose: 52 | print("total positives {:.0f}, total negatives {:.0f}".format( 53 | total_positive, total_negative)) 54 | 55 | def compute_bucket_woe(x): 56 | bucket_positive, bucket_negative = get_totals(x) 57 | return np.log(bucket_positive / bucket_negative) 58 | 59 | # compute WoE scores per bucket (category) 60 | stat = {} 61 | for col in self.cols: 62 | 63 | if self.verbose: 64 | print( 65 | "computing weight of evidence for column {:s}".format(col)) 66 | 67 | stat[col] = ((df.groupby(col)[y_col_index].agg(compute_bucket_woe) 68 | + np.log(total_negative / total_positive)).to_dict()) 69 | 70 | self.stat = stat 71 | 72 | return self 73 | 74 | def transform(self, X, y=None): 75 | 76 | if not isinstance(X, pd.DataFrame): 77 | raise TypeError( 78 | 'Input should be an instance of pandas.DataFrame()') 79 | 80 | df = X.copy() 81 | 82 | # join the WoE stats with the data 83 | for col in self.cols: 84 | 85 | if self.verbose: 86 | print("transforming categorical column {:s}".format(col)) 87 | 88 | stat = pd.DataFrame.from_dict(self.stat[col], orient='index') 89 | 90 | ser = (pd.merge(df, stat, left_on=col, right_index=True, how='left') 91 | .sort_index() 92 | .reindex(df.index))[0] 93 | 94 | # fill missing values with 95 | if self.verbose: 96 | print("{:.0f} NaNs in transformed data".format( 97 | ser.isnull().sum())) 98 | print("{:.4f} mean weight of evidence".format(ser.mean())) 99 | 100 | df[col] = np.array(ser.fillna(self.fillna)) 101 | 102 | if not self.return_df: 103 | out = np.array(df) 104 | else: 105 | out = df 106 | 107 | return out 
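
# Usage sketch (illustrative only, not part of this module): the encoder follows
# the sklearn transformer API, so it can sit in front of any estimator in a
# Pipeline. The column names, data and classifier below are invented:
#
#     from sklearn.linear_model import LogisticRegression
#     from sklearn.pipeline import Pipeline
#
#     pipe = Pipeline([
#         ('woe', WeightOfEvidenceEncoder(cols=['city', 'product'])),
#         ('clf', LogisticRegression()),
#     ])
#     pipe.fit(X_train, y_train)            # X_train must be a pandas DataFrame
#     probabilities = pipe.predict_proba(X_test)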
108 | 109 | 110 | class StringIndexer(BaseEstimator, TransformerMixin): 111 | def __init__(self): 112 | self.dictionaries = dict() 113 | self.columns = list() 114 | 115 | def fit(self, X, y=None): 116 | self.columns = X.columns.values 117 | for col in self.columns: 118 | categories = np.unique(X[col]) 119 | self.dictionaries[col] = dict( 120 | zip(categories, range(len(categories)))) 121 | return self 122 | 123 | def transform(self, X): 124 | column_array = [] 125 | for col in self.columns: 126 | dictionary = self.dictionaries[col] 127 | na_value = len(dictionary) + 1 128 | transformed_column = X[col].apply( 129 | lambda x: dictionary.get(x, na_value)).astype(int) 130 | column_array.append(transformed_column.values.reshape(-1, 1)) 131 | return np.hstack(column_array) 132 | 133 | 134 | class LeaveOneOutEncoder(BaseEstimator, TransformerMixin): 135 | def __init__(self, with_stdevs=True): 136 | 137 | self.with_stdevs = with_stdevs 138 | self.means = {} 139 | self.stdevs = {} 140 | 141 | def fit(self, X, y=None): 142 | return self 143 | 144 | def transform(self, X): 145 | df = X.copy() 146 | for col in self.means.keys(): 147 | 148 | mean_col_name = "{:s}_MEAN".format(col) 149 | df[mean_col_name] = df.merge(pd.DataFrame(self.means[col]), 150 | how='left', left_on=[col], right_index=True)['y'] 151 | if self.with_stdevs: 152 | std_col_name = "{:s}_STD".format(col) 153 | df[std_col_name] = df.merge(pd.DataFrame(self.stdevs[col]), 154 | how='left', left_on=[col], right_index=True)['y'] 155 | 156 | df.drop(col, axis=1, inplace=True) 157 | 158 | return df 159 | 160 | def fit_transform(self, X, y): 161 | """will be used during pipeline fit""" 162 | df = X.copy() 163 | df['y'] = y 164 | for col in df.columns.difference(['y']): 165 | 166 | mean_col_name = "{:s}_MEAN".format(col) 167 | 168 | grouped = df.groupby(col)['y'] 169 | 170 | self.means[col] = grouped.mean() 171 | df[mean_col_name] = grouped.transform(self._loo_means) 172 | 173 | if self.with_stdevs: 174 | std_col_name = "{:s}_STD".format(col) 175 | self.stdevs[col] = grouped.std() 176 | df[std_col_name] = grouped.transform(self._loo_stdevs) 177 | 178 | df.drop(col, axis=1, inplace=True) 179 | 180 | df.drop('y', axis=1, inplace=True) 181 | return df 182 | 183 | def _loo_means(self, s): 184 | n = len(s) 185 | loo_means = (s.sum() - s) / (n - 1) 186 | return loo_means * np.random.normal(loc=1.0, scale=0.01, size=n) 187 | 188 | def _loo_stdevs(self, s): 189 | n = len(s) 190 | if n > 1: 191 | loo_means = self._loo_means(s) 192 | sum_of_sq = n * s.std() ** 2 193 | loo_stdevs = np.sqrt( 194 | abs((sum_of_sq - (s - s.mean()) * (s - loo_means))) / (n - 1)) 195 | else: 196 | loo_stdevs = np.array([0]) 197 | 198 | return loo_stdevs * np.random.normal(loc=1.0, scale=0.01, size=n) 199 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/preprocessing.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator 2 | from sklearn.base import TransformerMixin 3 | 4 | 5 | class ColumnSelector(BaseEstimator, TransformerMixin): 6 | def __init__(self, columns): 7 | self.columns = columns 8 | 9 | def fit(self, X, y=None): 10 | return self 11 | 12 | def transform(self, X): 13 | try: 14 | return X[self.columns] 15 | except: 16 | print("Could not find selected columns {:s} in available columns {:s}".format( 17 | self.columns, X.columns)) 18 | raise 19 | -------------------------------------------------------------------------------- 
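
A minimal usage sketch combining the transformers above (illustrative only; the
column names and data are invented, and importing ColumnSelector from its
submodule is an assumption based on the package layout shown here, since it is
not re-exported in the package __init__):

    import pandas as pd
    from sklearn.pipeline import Pipeline
    from bdranalytics.sklearn.preprocessing import StringIndexer
    from bdranalytics.sklearn.preprocessing.preprocessing import ColumnSelector

    X = pd.DataFrame({'color': ['red', 'blue', 'red', 'green'],
                      'size': ['S', 'M', 'M', 'L'],
                      'price': [10.0, 12.5, 9.9, 11.0]})

    # keep only the categorical columns, then map each category to an integer code
    pipe = Pipeline([
        ('select', ColumnSelector(['color', 'size'])),
        ('index', StringIndexer()),
    ])
    X_indexed = pipe.fit_transform(X)  # numpy array of integer codes, one column per selected column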
/bdranalytics/sklearn/preprocessing/scaling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.base import BaseEstimator, RegressorMixin 3 | 4 | 5 | class ScaledRegressor(BaseEstimator, RegressorMixin): 6 | """Allows a regressor to work with a scaled target if it does not allow scaling itself. 7 | 8 | When fitting, the `y` will be transform using the `scaler`, before being passed to the `model.fit`. 9 | When predicting, the predicted y will be inverse transformed to obtain a y_hat in the original range of values. 10 | 11 | For example, this allows your regressor to predict manipulated targets (ie `log(y)`), without additional pre and 12 | postprocessing outside your sklearn pipeline 13 | 14 | Parameters 15 | ---------- 16 | scaler : TransformerMixin 17 | The transformer which will be applied on the target before it is passed to the `model` 18 | 19 | estimator : RegressorMixin 20 | The regressor which will work in transformed target space 21 | 22 | Attributes 23 | ---------- 24 | 25 | Examples 26 | >>> from sklearn.linear_model import LinearRegression 27 | >>> from sklearn.preprocessing import StandardScaler 28 | >>> from sklearn.pipeline import Pipeline 29 | >>> n_rows = 10 30 | >>> X = np.random.rand(n_rows, 2) 31 | >>> y = np.random.rand(n_rows) 32 | >>> regressor = LinearRegression() 33 | >>> scaler = StandardScaler() 34 | >>> pipeline = Pipeline([("predict", ScaledRegressor(scaler, regressor))]) 35 | >>> y_hat = pipeline.fit(X, y).predict(X) 36 | """ 37 | 38 | def __init__(self, scaler, estimator): 39 | self.estimator = estimator 40 | self.scaler = scaler 41 | 42 | @staticmethod 43 | def _to_matrix(vector): 44 | return np.reshape(vector, (-1, 1)) 45 | 46 | @staticmethod 47 | def _to_vector(matrix): 48 | return np.reshape(matrix, -1) 49 | 50 | def fit(self, X, y): 51 | y_scaled = self.scaler.fit_transform(self._to_matrix(y)) 52 | self.estimator.fit(X, self._to_vector(y_scaled)) 53 | 54 | def predict(self, X): 55 | return self._to_vector( 56 | self.scaler.inverse_transform( 57 | self._to_matrix( 58 | self.estimator.predict(X) 59 | ) 60 | ) 61 | ) 62 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/sklearn/preprocessing/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/tests/test_encoding.py: -------------------------------------------------------------------------------- 1 | import random 2 | import unittest 3 | 4 | import numpy as np 5 | import pandas as pd 6 | 7 | from bdranalytics.sklearn.preprocessing import WeightOfEvidenceEncoder 8 | 9 | 10 | class TestEncoding(unittest.TestCase): 11 | 12 | def verify_numeric(self, X_test): 13 | for dt in X_test.dtypes: 14 | numeric = False 15 | if np.issubdtype(dt, int) or np.issubdtype(dt, float): 16 | numeric = True 17 | 18 | self.assertTrue(numeric) 19 | 20 | @staticmethod 21 | def create_dataset(n_rows=1000): 22 | """ 23 | Creates a data set with some categorical variables 24 | """ 25 | ds = [[ 26 | random.random(), 27 | random.random(), 28 | random.choice(['A', 'B', 'C']), 29 | random.choice(['A', 'B', 'C']), 30 | random.choice(['A', 'B', 'C', None]), 31 | random.choice(['A', 'B', 'C']) 32 | ] for _ 
in range(n_rows)] 33 | 34 | X = pd.DataFrame(ds, columns=['c1', 'c2', 'c3', 'c4', 'c5', 'c6']) 35 | y = np.random.randint(2, size=(n_rows,)) 36 | 37 | return X, y 38 | 39 | def test_weight_of_evidence(self): 40 | """ 41 | Unit test for WeightOfEvidenceEncoder class 42 | """ 43 | # generate some training data 44 | cols = ['c3', 'c4', 'c5', 'c6'] 45 | n_rows = 100 46 | X_train, y_train = self.create_dataset(n_rows=n_rows) 47 | 48 | # independent data set to-be-transformed 49 | X_test, _ = self.create_dataset(n_rows=10) 50 | 51 | # add unseen category to catch NaN filling behavior 52 | X_test.loc[0, 'c3'] = 'Z' 53 | 54 | # data frame case 55 | enc = WeightOfEvidenceEncoder(verbose=1, cols=cols) 56 | enc.fit(X_train, y_train) 57 | self.verify_numeric(enc.transform(X_test)) 58 | 59 | # numpy array case 60 | enc_np = WeightOfEvidenceEncoder(verbose=0, return_df=False, cols=cols) 61 | enc_np.fit(X_train, y_train) 62 | output_array_enc_np = enc_np.transform( 63 | X_test) # save for following tests 64 | self.assertTrue(isinstance(output_array_enc_np, np.ndarray)) 65 | 66 | # external dep var, DIFFERENT from y_train 67 | enc_ext = WeightOfEvidenceEncoder(verbose=1, cols=cols, return_df=False, 68 | dependent_variable_values=np.random.randint(2, size=(n_rows,))) 69 | enc_ext.fit(X_train, y_train) 70 | self.assertTrue(np.array_equal(output_array_enc_np, 71 | enc_ext.transform(X_test)) is False) 72 | 73 | # external dep var, SAME y_train 74 | enc_ext = WeightOfEvidenceEncoder(verbose=1, cols=cols, return_df=False, 75 | dependent_variable_values=y_train) 76 | enc_ext.fit(X_train, y_train) 77 | self.assertTrue(np.array_equal(output_array_enc_np, 78 | enc_ext.transform(X_test)) is True) 79 | 80 | 81 | if __name__ == '__main__': 82 | unittest.main() 83 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/preprocessing/tests/test_scaling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | from sklearn.dummy import DummyRegressor 5 | from sklearn.pipeline import Pipeline 6 | from sklearn.preprocessing import StandardScaler 7 | 8 | from bdranalytics.sklearn.preprocessing import ScaledRegressor 9 | 10 | 11 | class TestPreprocessing(unittest.TestCase): 12 | 13 | @staticmethod 14 | def create_regression_dataset(n_rows=1000): 15 | """ 16 | Creates a data set with only numerical data 17 | """ 18 | X = np.random.rand(n_rows, 2) 19 | y = np.random.rand(n_rows) 20 | return X, y 21 | 22 | def test_dummy_pipeline(self): 23 | """ 24 | Just checking setup of a dummy regressor in a pipeline 25 | :return: None 26 | """ 27 | X, y = self.create_regression_dataset(n_rows=20) 28 | predictor_constant = 3 29 | predictor = DummyRegressor( 30 | strategy="constant", constant=predictor_constant) 31 | y_hat = Pipeline([("predict", predictor)]).fit(X, y).predict(X) 32 | np.allclose(y_hat, np.repeat(predictor_constant, len(y))) 33 | 34 | def test_scaled_target(self): 35 | X, y = self.create_regression_dataset(n_rows=20) 36 | y_mean = np.mean(y) 37 | predictor_constant = 0 # 0 will be multiplied by std , and then added to the mean 38 | predictor = DummyRegressor( 39 | strategy="constant", constant=predictor_constant) 40 | scaler = StandardScaler() 41 | y_hat = Pipeline([("predict", ScaledRegressor(scaler, predictor))]).fit( 42 | X, y).predict(X) 43 | np.allclose(y_hat, np.repeat(y_mean, len(y))) 44 | 45 | def test_scaled_target_with_set_params(self): 46 | X, y = self.create_regression_dataset(n_rows=20) 
47 | y_mean = np.mean(y) 48 | predictor_constant = 10 # 0 will be multiplied by std , and then added to the mean 49 | predictor = DummyRegressor( 50 | strategy="constant", constant=predictor_constant) 51 | scaler = StandardScaler() 52 | pipeline = Pipeline([("predict", ScaledRegressor(scaler, predictor))]) 53 | pipeline.set_params(predict__estimator__constant=0) 54 | y_hat = pipeline.fit(X, y).predict(X) 55 | np.allclose(y_hat, np.repeat(y_mean, len(y))) 56 | 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | -------------------------------------------------------------------------------- /bdranalytics/sklearn/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/bdranalytics/sklearn/tests/__init__.py -------------------------------------------------------------------------------- /bdranalytics/sklearn/tests/test_model_selection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import unittest 4 | 5 | from bdranalytics.sklearn.model_selection import GrowingWindow, IntervalGrowingWindow 6 | 7 | 8 | def create_time_series_data_set(start_date=pd.datetime(year=2000, month=1, day=1), n_rows=100): 9 | 10 | end_date = start_date + pd.Timedelta(days=n_rows-1) 11 | 12 | ds = np.random.rand(n_rows) 13 | 14 | X = pd.DataFrame(ds, 15 | columns=['variable'], 16 | index=pd.date_range(start_date, end_date)) 17 | 18 | y = np.random.randint(2, size=(n_rows,)) 19 | 20 | return X, y 21 | 22 | 23 | class TestGrowingWindow(unittest.TestCase): 24 | 25 | def test_n_splits(self): 26 | assert GrowingWindow(4).get_n_splits(np.arange(15).reshape(3, 5)) == 4 27 | 28 | def test_n_splits_returned(self): 29 | assert len(list(GrowingWindow(4).split( 30 | np.arange(15).reshape(3, 5), np.arange(3)))) == 4 31 | 32 | def test_n_splits_testsize(self): 33 | for train, test in GrowingWindow(4).split(np.arange(15).reshape(5, 3), np.arange(5)): 34 | assert len(test) == 1 35 | 36 | def test_n_splits_testsize2(self): 37 | for i, (train, test) in zip(range(4), GrowingWindow(4).split(np.arange(15).reshape(5, 3), np.arange(5))): 38 | assert len(train) == i+1 39 | 40 | 41 | class TestIntervalGrowingWindow(unittest.TestCase): 42 | 43 | def test_split_on_index(self): 44 | 45 | X, y = create_time_series_data_set() 46 | 47 | cv = IntervalGrowingWindow( 48 | test_start_date=pd.datetime(year=2000, month=2, day=1), 49 | test_end_date=pd.datetime(year=2000, month=3, day=1), 50 | test_size='7D') 51 | 52 | self.assertTrue(len(list(cv.split(X, y))) == 4) 53 | 54 | def test_split_on_array(self): 55 | 56 | X, y = create_time_series_data_set() 57 | 58 | test_size_in_days = 7 59 | 60 | cv = IntervalGrowingWindow( 61 | timestamps=X.index.values, 62 | test_start_date=pd.datetime(year=2000, month=2, day=1), 63 | test_end_date=pd.datetime(year=2000, month=3, day=1), 64 | test_size=pd.Timedelta(days=test_size_in_days)) 65 | 66 | self.assertTrue(len(list(cv.split(X, y))) == 4) 67 | 68 | def test_split_test_size(self): 69 | 70 | X, y = create_time_series_data_set() 71 | 72 | test_size_in_days = 7 73 | 74 | cv = IntervalGrowingWindow( 75 | test_start_date=pd.datetime(year=2000, month=2, day=1), 76 | test_end_date=pd.datetime(year=2000, month=3, day=1), 77 | test_size=pd.Timedelta(days=test_size_in_days)) 78 | 79 | for _, test in cv.split(X, y): 80 | self.assertTrue(len(test) == test_size_in_days) 81 | 82 | def 
test_split_with_train_size(self): 83 | 84 | X, y = create_time_series_data_set() 85 | 86 | train_size_in_days = 14 87 | 88 | cv = IntervalGrowingWindow( 89 | test_start_date=pd.datetime(year=2000, month=2, day=1), 90 | test_end_date=pd.datetime(year=2000, month=3, day=1), 91 | test_size=pd.Timedelta(days=7), 92 | train_size=pd.Timedelta(days=train_size_in_days)) 93 | 94 | for train, _ in cv.split(X, y): 95 | self.assertTrue(len(train) == train_size_in_days) 96 | 97 | def test_n_splits(self): 98 | 99 | X, y = create_time_series_data_set() 100 | 101 | cv = IntervalGrowingWindow( 102 | test_start_date=pd.datetime(year=2000, month=2, day=1), 103 | test_end_date=pd.datetime(year=2000, month=3, day=1), 104 | test_size=pd.Timedelta(days=7)) 105 | 106 | self.assertTrue(cv.get_n_splits(X) == 4) 107 | -------------------------------------------------------------------------------- /data/recruit.dat: -------------------------------------------------------------------------------- 1 | 68.63 2 | 68.63 3 | 68.63 4 | 68.63 5 | 68.63 6 | 68.63 7 | 59.16 8 | 48.7 9 | 47.54 10 | 50.91 11 | 44.7 12 | 42.85 13 | 39.62 14 | 44.45 15 | 38.98 16 | 42.62 17 | 48.27 18 | 59.39 19 | 51.66 20 | 38.55 21 | 60.33 22 | 72.27 23 | 68.62 24 | 69.63 25 | 72.2 26 | 67.87 27 | 64.91001 28 | 53.85 29 | 37.96 30 | 23.23 31 | 12.68 32 | 9.84 33 | 7.82 34 | 11.78 35 | 10.22 36 | 12.19 37 | 18.6 38 | 26.97 39 | 22.52 40 | 19.18 41 | 17.14 42 | 18.61 43 | 20.02 44 | 22.65 45 | 38.99 46 | 76.55 47 | 87.99 48 | 99.8 49 | 96.69 50 | 87.45 51 | 88.57 52 | 97.43 53 | 99.99 54 | 94.88 55 | 86.99 56 | 79.73001 57 | 92.35 58 | 91.29 59 | 94.31 60 | 84.95 61 | 82.97 62 | 92.98001 63 | 81.06 64 | 62.37 65 | 52.99 66 | 39.53 67 | 42.9 68 | 33.76 69 | 40.97 70 | 60.5 71 | 66.61 72 | 80.38 73 | 95.86 74 | 97.74 75 | 80.24 76 | 73.44 77 | 65.67 78 | 47.81 79 | 33.51 80 | 34.22 81 | 32.95 82 | 32.55 83 | 46.92 84 | 44.64 85 | 53.02 86 | 41.98 87 | 30.43 88 | 24.43 89 | 18.05 90 | 20.98 91 | 12.37 92 | 12.03 93 | 12.41 94 | 15.89 95 | 20.46 96 | 26.95 97 | 30.29 98 | 26.21 99 | 23.34 100 | 25.55 101 | 25.4 102 | 24.16 103 | 23.34 104 | 24.38 105 | 27.2 106 | 29.18 107 | 43.3 108 | 53.92 109 | 59.76 110 | 64.52 111 | 65.84 112 | 70.19 113 | 75.27 114 | 77.63 115 | 76.96 116 | 77.7 117 | 85.13 118 | 99.33999 119 | 97.7 120 | 97.01999 121 | 98.83 122 | 99.8 123 | 96.74 124 | 93.54 125 | 82.08 126 | 71.51 127 | 49.61 128 | 41.76 129 | 59.78 130 | 90.97 131 | 85.26 132 | 100 133 | 98.5 134 | 98.7 135 | 93.1 136 | 79.37 137 | 81.97 138 | 55.94 139 | 50.39 140 | 48.64 141 | 40.12 142 | 63.04 143 | 60.51 144 | 78.72 145 | 71.37 146 | 76.43 147 | 71.25 148 | 56.46 149 | 41.84 150 | 41.24 151 | 35.28 152 | 39.86 153 | 45.13 154 | 53.8 155 | 77.54 156 | 80.02 157 | 81.28 158 | 73.58 159 | 66.06 160 | 59.46 161 | 59.49 162 | 51.9 163 | 35.21 164 | 39.65 165 | 31.9 166 | 61.56 167 | 88.25 168 | 96.46 169 | 83.94 170 | 89.05 171 | 92.97 172 | 98.29 173 | 99.79 174 | 94.01 175 | 87.06 176 | 80.46 177 | 74.86 178 | 67.18 179 | 73.85 180 | 80.15 181 | 69.45 182 | 50.39 183 | 31.68 184 | 31.25 185 | 23.11 186 | 11.32 187 | 8.96 188 | 6.03 189 | 11.7 190 | 34.63 191 | 58.31 192 | 58.66 193 | 72.62 194 | 85.76 195 | 94.29 196 | 92.76999 197 | 93.18 198 | 89.32 199 | 81.63 200 | 71.44 201 | 66.42 202 | 80.02 203 | 76.52 204 | 77.51 205 | 67.73001 206 | 50.52 207 | 48.97 208 | 50.64 209 | 38.73 210 | 30.79 211 | 23.75 212 | 26.28 213 | 36.67 214 | 68.91001 215 | 97.39 216 | 96.1 217 | 90.3 218 | 84.92 219 | 91.41001 220 | 92.54 221 | 98.04 222 | 99.96 223 | 88.83 224 
| 83.07 225 | 86.32 226 | 99.83 227 | 96.62 228 | 99.94 229 | 96.89 230 | 85.12 231 | 77.97 232 | 67.38 233 | 44.5 234 | 26.72 235 | 13.25 236 | 10.64 237 | 23.83 238 | 29.18 239 | 26.91 240 | 20.09 241 | 22.33 242 | 22.07 243 | 26.2 244 | 29.81 245 | 30.1 246 | 24.25 247 | 25.3 248 | 23.5 249 | 35.62 250 | 52.11 251 | 56.79 252 | 69.09 253 | 86.64 254 | 99.28 255 | 98.48001 256 | 98.45 257 | 94.76999 258 | 93.58 259 | 78.07 260 | 66.88 261 | 77.04 262 | 88.72 263 | 94.88 264 | 99.67 265 | 100 266 | 99.9 267 | 96.91001 268 | 66.88 269 | 52.39 270 | 40.61 271 | 30.65 272 | 32.04 273 | 45.28 274 | 35 275 | 35.62 276 | 36.98 277 | 39.89 278 | 36.88 279 | 30.85 280 | 19.33 281 | 13.26 282 | 11.12 283 | 9.140001 284 | 8.21 285 | 10.76 286 | 10.43 287 | 13.75 288 | 37.91 289 | 41.85 290 | 44.67 291 | 50.57 292 | 50.34 293 | 49.54 294 | 56.93 295 | 60.16 296 | 57.47 297 | 71.68 298 | 97.28 299 | 62.09 300 | 59.97 301 | 51.18 302 | 51.48 303 | 66.08 304 | 86.39 305 | 93.58 306 | 99.9 307 | 93.86 308 | 82.82 309 | 84.83999 310 | 89.51 311 | 86.89 312 | 87.15 313 | 78.47 314 | 55.93 315 | 41.27 316 | 19.66 317 | 9.439999 318 | 4.66 319 | 2.36 320 | 1.72 321 | 3.32 322 | 12.13 323 | 16.81 324 | 24.3 325 | 52.42 326 | 58.05 327 | 59.42 328 | 57.52 329 | 60.13 330 | 64.68 331 | 74.94 332 | 69.73001 333 | 77.11 334 | 97.93 335 | 98.74 336 | 98.88 337 | 90.41001 338 | 77.86 339 | 61.48 340 | 47.66 341 | 30.74 342 | 20.11 343 | 12.08 344 | 8.97 345 | 20.03 346 | 71.54 347 | 97.51999 348 | 95.14 349 | 92.22 350 | 80.09 351 | 74.59 352 | 83.66001 353 | 87.36 354 | 96.63 355 | 93.36 356 | 94.7 357 | 99.66001 358 | 91.6 359 | 89.98001 360 | 99.39 361 | 99.46 362 | 99.37 363 | 99.51999 364 | 96.64 365 | 89.55 366 | 68.67 367 | 65.02 368 | 61.82 369 | 76.92 370 | 80.17 371 | 77.48001 372 | 82.34 373 | 74.11 374 | 69.03 375 | 79.48001 376 | 78.76 377 | 67.55 378 | 59.98 379 | 44.35 380 | 41.18 381 | 71.53 382 | 95.51999 383 | 93.48001 384 | 98.18 385 | 70.48001 386 | 77.63 387 | 88.11 388 | 93.15 389 | 99.01 390 | 93.31 391 | 81.21 392 | 79.63 393 | 80.67 394 | 85.63 395 | 88.66001 396 | 93.65 397 | 95.49 398 | 98.26999 399 | 86.19 400 | 79.69 401 | 72.26 402 | 35.06 403 | 20.98 404 | 29.67 405 | 42.09 406 | 52.96 407 | 69.45 408 | 76.86 409 | 86.19 410 | 96 411 | 96.07 412 | 86.85 413 | 76.66001 414 | 61.47 415 | 46.26 416 | 40.15 417 | 72.59 418 | 85.17 419 | 91.74 420 | 99.22 421 | 76.55 422 | 64.17 423 | 69.2 424 | 70.37 425 | 79.55 426 | 74.79 427 | 70.9 428 | 78.86 429 | 84.28 430 | 83.43 431 | 85.55 432 | 80.17 433 | 90.82 434 | 99.39 435 | 99.18 436 | 89.1 437 | 82.18 438 | 77.64 439 | 55.93 440 | 49.73 441 | 70.12 442 | 79.2 443 | 87.83 444 | 88.2 445 | 94.83 446 | 98.66001 447 | 94.83999 448 | 83.06 449 | 61.42 450 | 47.47 451 | 31.81 452 | 22.95 453 | 17.87 454 | -------------------------------------------------------------------------------- /data/soi.dat: -------------------------------------------------------------------------------- 1 | .377 2 | .246 3 | .311 4 | .104 5 | -.016 6 | .235 7 | .137 8 | .191 9 | -.016 10 | .29 11 | .038 12 | -.016 13 | -.158 14 | .366 15 | .607 16 | -.355 17 | -.18 18 | .268 19 | .093 20 | .027 21 | .246 22 | .202 23 | .432 24 | .617 25 | .76 26 | .891 27 | .607 28 | .574 29 | .005 30 | .475 31 | .202 32 | -.027 33 | -.038 34 | .716 35 | .836 36 | .891 37 | .53 38 | .53 39 | .377 40 | -.235 41 | -.585 42 | -.18 43 | -.53 44 | -.464 45 | -.443 46 | .049 47 | .454 48 | .257 49 | .41 50 | .224 51 | .148 52 | -.432 53 | -.093 54 | -.268 55 | .158 56 | -.06 57 | -.399 58 | 
.235 59 | .366 60 | .202 61 | .344 62 | -.038 63 | .29 64 | -.126 65 | -.366 66 | -.115 67 | -.301 68 | -.486 69 | -.137 70 | .738 71 | .366 72 | .366 73 | .65 74 | .628 75 | .126 76 | .169 77 | .137 78 | -.257 79 | .169 80 | -.093 81 | .475 82 | .639 83 | .596 84 | .749 85 | .191 86 | 1 87 | .486 88 | .41 89 | .158 90 | .126 91 | .06 92 | .246 93 | .738 94 | .803 95 | .421 96 | .617 97 | .705 98 | .639 99 | .454 100 | .311 101 | .355 102 | -.158 103 | -.038 104 | .115 105 | .137 106 | .257 107 | .115 108 | .038 109 | .082 110 | .148 111 | .06 112 | -.191 113 | -.607 114 | -.585 115 | -.268 116 | -.093 117 | -.093 118 | .257 119 | -.005 120 | .224 121 | .169 122 | .432 123 | .202 124 | -.366 125 | -.661 126 | .093 127 | -.716 128 | .148 129 | -.093 130 | .279 131 | .432 132 | -.104 133 | .607 134 | .18 135 | .071 136 | .246 137 | -.432 138 | .06 139 | -.388 140 | .202 141 | -.104 142 | .191 143 | .475 144 | .552 145 | .169 146 | .333 147 | -.005 148 | -.038 149 | -.126 150 | -.443 151 | -.016 152 | .027 153 | .333 154 | .355 155 | .344 156 | .115 157 | .311 158 | .596 159 | .005 160 | .388 161 | -.5080001 162 | -.552 163 | -.858 164 | -.596 165 | .06 166 | .071 167 | .224 168 | .279 169 | .322 170 | .18 171 | .126 172 | .082 173 | .126 174 | -.213 175 | -.235 176 | .169 177 | .388 178 | .519 179 | .126 180 | .41 181 | .738 182 | .443 183 | .596 184 | -.137 185 | -.486 186 | -.377 187 | .268 188 | -.169 189 | -.213 190 | -.191 191 | .377 192 | .126 193 | .311 194 | .388 195 | .388 196 | .18 197 | -.333 198 | .049 199 | -.082 200 | .191 201 | .399 202 | .093 203 | .005 204 | .366 205 | .333 206 | .377 207 | .071 208 | -.158 209 | -.541 210 | -.683 211 | -.683 212 | -.344 213 | -.301 214 | .202 215 | -.126 216 | .082 217 | .115 218 | .464 219 | .104 220 | -.18 221 | -.672 222 | .104 223 | -.486 224 | .093 225 | .279 226 | .104 227 | .18 228 | .475 229 | .53 230 | .661 231 | .355 232 | -.344 233 | -.038 234 | .235 235 | .486 236 | .169 237 | .333 238 | .137 239 | .169 240 | .333 241 | .705 242 | .377 243 | .585 244 | -.126 245 | -.213 246 | .148 247 | -.126 248 | -.344 249 | -.5080001 250 | .279 251 | .082 252 | .355 253 | .082 254 | .574 255 | .355 256 | -.257 257 | -.377 258 | -.344 259 | -.65 260 | -.093 261 | -.137 262 | .115 263 | .705 264 | .399 265 | .366 266 | .366 267 | .038 268 | -.268 269 | .322 270 | .082 271 | .082 272 | .06 273 | .268 274 | .421 275 | .77 276 | .803 277 | .661 278 | .716 279 | .628 280 | .191 281 | .53 282 | .18 283 | -.464 284 | .279 285 | .355 286 | .202 287 | .552 288 | .617 289 | .104 290 | .224 291 | .432 292 | -.213 293 | -.967 294 | -1 295 | -.432 296 | -.683 297 | -.5080001 298 | -.137 299 | .082 300 | -.06 301 | .333 302 | .355 303 | .388 304 | -.06 305 | -.202 306 | .005 307 | -.115 308 | .137 309 | .454 310 | .388 311 | .738 312 | .77 313 | .803 314 | .836 315 | .65 316 | .005 317 | -.366 318 | .071 319 | -.005 320 | -.454 321 | .191 322 | .432 323 | .683 324 | .388 325 | .235 326 | -.093 327 | .475 328 | -.082 329 | -.617 330 | -.093 331 | -.071 332 | .432 333 | .421 334 | .497 335 | .454 336 | .596 337 | .552 338 | .607 339 | .464 340 | -.301 341 | -.77 342 | -.65 343 | -.694 344 | -.137 345 | -.464 346 | -.246 347 | .169 348 | -.202 349 | .082 350 | .607 351 | -.158 352 | -.454 353 | -.781 354 | -.388 355 | .18 356 | .06 357 | -.5080001 358 | -.268 359 | .224 360 | .115 361 | .399 362 | .027 363 | .016 364 | -.377 365 | -.18 366 | -.038 367 | -.202 368 | .137 369 | .093 370 | -.279 371 | -.038 372 | .235 373 | .169 374 | .399 375 | .158 376 | -.541 
377 | -.628 378 | .027 379 | -.738 380 | -.913 381 | -.027 382 | .071 383 | -.126 384 | .049 385 | .454 386 | .301 387 | -.005 388 | -.093 389 | -.224 390 | -.213 391 | -.311 392 | -.235 393 | -.322 394 | .322 395 | .093 396 | .115 397 | .77 398 | .607 399 | -.158 400 | -.235 401 | -.202 402 | -.333 403 | -.158 404 | -.224 405 | -.355 406 | .005 407 | .454 408 | .41 409 | .519 410 | .541 411 | .301 412 | -.574 413 | -.344 414 | -.279 415 | -.716 416 | -.869 417 | -.596 418 | -.29 419 | -.454 420 | -.246 421 | -.607 422 | -.563 423 | -.235 424 | -.246 425 | -.399 426 | -.333 427 | -.53 428 | -.049 429 | .158 430 | .115 431 | .322 432 | .115 433 | .049 434 | .454 435 | .158 436 | -.421 437 | -.268 438 | -.311 439 | -.115 440 | -.322 441 | -.322 442 | .126 443 | .333 444 | .519 445 | .399 446 | .519 447 | .432 448 | .355 449 | -.126 450 | -.5080001 451 | -.388 452 | .388 453 | .071 454 | -------------------------------------------------------------------------------- /data/soi_description.txt: -------------------------------------------------------------------------------- 1 | https://crudata.uea.ac.uk/cru/data/soi/ 2 | -------------------------------------------------------------------------------- /data/test.dat: -------------------------------------------------------------------------------- 1 | stub -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = bdr-analytics-py 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Internal variables. 12 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 13 | 14 | # Put it first so that "make" without argument is like "make help". 15 | help: 16 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 17 | 18 | .PHONY: help Makefile clean 19 | 20 | clean: 21 | rm -Rf source 22 | rm -Rf $(BUILDDIR) 23 | 24 | # This target depends on all .py files in bdranalytics folder 25 | # Thus if any one changed, also the api doc will be updated 26 | # If there are py files for which no apidoc is created, using -f does not work: 27 | # None of the apidocs will be updated (nothing changed), but the py file will 28 | # be newer (as there is no corresponding api doc file) 29 | # Therefore we remove the source dir first 30 | #source: ../bdranalytics 31 | source: $(shell find ../bdranalytics -type f -name '*.py') 32 | -rm -Rf source 33 | sphinx-apidoc -f -M -T -o source/ -H "$(SPHINXPROJ)" ../bdranalytics 34 | 35 | html: Makefile source 36 | mkdir -p $(BUILDDIR)/html $(BUILDDIR)/doctrees 37 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 38 | @echo 39 | @echo "Build finished. The HTML pages are in build/html." 40 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # codepy documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Feb 4 16:32:10 2009. 5 | 6 | # encoding=utf8 7 | import sys 8 | import os 9 | 10 | # This file is execfile()d with the current directory set to its containing dir. 
11 | # 12 | # The contents of this file are pickled, so don't put values in the namespace 13 | # that aren't pickleable (module imports are okay, they're removed automatically). 14 | # 15 | # Note that not all possible configuration values are present in this 16 | # autogenerated file. 17 | # 18 | # All configuration values have a default; values that are commented out 19 | # serve to show the default. 20 | 21 | # If your extensions are in another directory, add it here. If the directory 22 | # is relative to the documentation root, use os.path.abspath to make it 23 | # absolute, like shown here. 24 | sys.path.insert(0, os.path.abspath('..')) 25 | 26 | 27 | # General configuration 28 | # --------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be extensions 31 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 32 | extensions = [ 33 | 'sphinx.ext.autodoc', 34 | 'sphinx.ext.doctest', 35 | 'sphinx.ext.coverage', 36 | 'sphinx.ext.viewcode', 37 | 'sphinx.ext.githubpages', 38 | 'sphinx.ext.intersphinx' 39 | ] 40 | 41 | # pngmath / imgmath compatibility layer for different sphinx versions 42 | import sphinx 43 | from distutils.version import LooseVersion 44 | if LooseVersion(sphinx.__version__) < LooseVersion('1.4'): 45 | extensions.append('sphinx.ext.pngmath') 46 | else: 47 | extensions.append('sphinx.ext.imgmath') 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix of source filenames. 53 | source_suffix = '.rst' 54 | 55 | # The encoding of source files. 56 | #source_encoding = 'utf-8' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # General information about the project. 62 | project = u'bdr-analytics-py' 63 | copyright = u'2017, BigData Republic' 64 | author = u'Gerben Oostra, Benoit Descamps, Alexander Backus, Steven Reitsma, Tom de Ruijter' 65 | 66 | # The version info for the project you're documenting, acts as replacement for 67 | # |version| and |release|, also used in various other places throughout the 68 | # built documents. 69 | # 70 | # The short X.Y version. 71 | import re 72 | ver_re = re.compile(r'version\s*=\s*\'([0-9a-z.]+)\'') 73 | version = [ver_re.search(line).group(1) 74 | for line in open("../setup.py").readlines() 75 | if ver_re.search(line) 76 | ][0] 77 | # The full version, including alpha/beta/rc tags. 78 | release = version 79 | 80 | # The language for content autogenerated by Sphinx. Refer to documentation 81 | # for a list of supported languages. 82 | language = None 83 | 84 | # There are two options for replacing |today|: either, you set today to some 85 | # non-false value, then it is used: 86 | #today = '' 87 | # Else, today_fmt is used as the format for a strftime call. 88 | #today_fmt = '%B %d, %Y' 89 | 90 | # List of documents that shouldn't be included in the build. 91 | #unused_docs = [] 92 | 93 | # List of patterns, relative to source directory, that match files and 94 | # directories to ignore when looking for source files. 95 | # This patterns also effect to html_static_path and html_extra_path 96 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 97 | 98 | # List of directories, relative to source directory, that shouldn't be searched 99 | # for source files. 100 | exclude_trees = [] 101 | 102 | # The reST default role (used for this markup: `text`) to use for all documents. 103 | #default_role = None 104 | 105 | # If true, '()' will be appended to :func: etc. 
cross-reference text. 106 | #add_function_parentheses = True 107 | 108 | # If true, the current module name will be prepended to all description 109 | # unit titles (such as .. function::). 110 | #add_module_names = True 111 | 112 | # If true, sectionauthor and moduleauthor directives will be shown in the 113 | # output. They are ignored by default. 114 | #show_authors = False 115 | 116 | # The name of the Pygments (syntax highlighting) style to use. 117 | pygments_style = 'sphinx' 118 | 119 | # If true, `todo` and `todoList` produce output, else they produce nothing. 120 | todo_include_todos = False 121 | 122 | # Options for HTML output 123 | # ----------------------- 124 | 125 | html_theme = "sphinx_rtd_theme" 126 | 127 | html_theme_options = {} 128 | 129 | html_sidebars = { 130 | '**': [ 131 | 'about.html', 132 | 'navigation.html', 133 | 'relations.html', 134 | 'searchbox.html', 135 | ] 136 | } 137 | 138 | # The style sheet to use for HTML and HTML Help pages. A file of that name 139 | # must exist either in Sphinx' static/ path, or in one of the custom paths 140 | # given in html_static_path. 141 | #html_style = 'default.css' 142 | 143 | # The name for this set of Sphinx documents. If None, it defaults to 144 | # " v documentation". 145 | #html_title = None 146 | 147 | # A shorter title for the navigation bar. Default is the same as html_title. 148 | #html_short_title = None 149 | 150 | # The name of an image file (relative to this directory) to place at the top 151 | # of the sidebar. 152 | #html_logo = None 153 | 154 | # The name of an image file (within the static path) to use as favicon of the 155 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 156 | # pixels large. 157 | #html_favicon = None 158 | 159 | # Add any paths that contain custom static files (such as style sheets) here, 160 | # relative to this directory. They are copied after the builtin static files, 161 | # so a file named "default.css" will overwrite the builtin "default.css". 162 | html_static_path = ['_static'] 163 | 164 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 165 | # using the given strftime format. 166 | #html_last_updated_fmt = '%b %d, %Y' 167 | 168 | # If true, SmartyPants will be used to convert quotes and dashes to 169 | # typographically correct entities. 170 | #html_use_smartypants = True 171 | 172 | # Custom sidebar templates, maps document names to template names. 173 | #html_sidebars = {} 174 | 175 | # Additional templates that should be rendered to pages, maps page names to 176 | # template names. 177 | #html_additional_pages = {} 178 | 179 | # If false, no module index is generated. 180 | html_use_modindex = False 181 | 182 | # If false, no index is generated. 183 | html_use_index = False 184 | 185 | # If true, the index is split into individual pages for each letter. 186 | #html_split_index = False 187 | 188 | # If true, the reST sources are included in the HTML build as _sources/. 189 | #html_copy_source = True 190 | 191 | # If true, an OpenSearch description file will be output, and all pages will 192 | # contain a tag referring to it. The value of this option must be the 193 | # base URL from which the finished HTML is served. 194 | #html_use_opensearch = '' 195 | 196 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 197 | #html_file_suffix = '' 198 | 199 | # Output file base name for HTML help builder. 
200 | htmlhelp_basename = 'bdr-analytics-pydoc' 201 | 202 | 203 | # Options for LaTeX output 204 | # ------------------------ 205 | 206 | # The paper size ('letter' or 'a4'). 207 | #latex_paper_size = 'letter' 208 | 209 | # The font size ('10pt', '11pt' or '12pt'). 210 | #latex_font_size = '10pt' 211 | 212 | # Grouping the document tree into LaTeX files. List of tuples 213 | # (source start file, target name, title, author, document class [howto/manual]). 214 | latex_documents = [ 215 | (master_doc, 'bdr-analytics-py.tex', u'bdr-analytics-py Documentation', 216 | u'Gerben Oostra, Benoit Descamps, Alexander Backus, Steven Reitsma', 'manual'), 217 | ] 218 | 219 | man_pages = [ 220 | (master_doc, 'bdr-analytics-py', u'bdr-analytics-py Documentation', 221 | [author], 1) 222 | ] 223 | 224 | # The name of an image file (relative to this directory) to place at the top of 225 | # the title page. 226 | #latex_logo = None 227 | 228 | # For "manual" documents, if this is true, then toplevel headings are parts, 229 | # not chapters. 230 | #latex_use_parts = False 231 | 232 | # Additional stuff for the LaTeX preamble. 233 | #latex_preamble = '' 234 | 235 | # Documents to append as an appendix to all manuals. 236 | #latex_appendices = [] 237 | 238 | # If false, no module index is generated. 239 | #latex_use_modindex = True 240 | 241 | autoclass_content = "both" 242 | 243 | intersphinx_mapping = { 244 | 'http://docs.python.org/dev': None, 245 | 'http://docs.scipy.org/doc/numpy/': None 246 | } 247 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to bdr-analytics-py's documentation! 2 | ============================================ 3 | 4 | bdr-analytics-py is a collection of tools to simplify data science. 5 | 6 | Contents 7 | -------- 8 | .. toctree:: 9 | :maxdepth: 1 10 | 11 | source/bdranalytics.images 12 | source/bdranalytics.keras 13 | source/bdranalytics.pdlearn 14 | source/bdranalytics.plot 15 | source/bdranalytics.sklearn 16 | -------------------------------------------------------------------------------- /doc/push_to_pages.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # Push HTML files to gh-pages automatically. 3 | 4 | if [[ $TRAVIS_BRANCH != "master" || $TRAVIS_PULL_REQUEST != false ]]; then exit; fi 5 | 6 | # Fill this out with the correct org/repo 7 | ORG=BigDataRepublic 8 | REPO=bdr-analytics-py 9 | # This probably should match an email for one of your users. 10 | EMAIL=info@bigdatarepublic.nl 11 | 12 | set -e 13 | 14 | # Script is located in and initiated from project root. 15 | # Clone the gh-pages branch outside of the repo and cd into it. 16 | cd .. 17 | git clone -b gh-pages "https://$GH_TOKEN@github.com/$ORG/$REPO.git" gh-pages 18 | cd gh-pages 19 | 20 | # Update git configuration so I can push. 21 | if [ "$1" != "dry" ]; then 22 | # Update git config. 23 | git config user.name "Travis Builder" 24 | git config user.email "$EMAIL" 25 | fi 26 | 27 | # Copy in the HTML. You may want to change this with your documentation path. 28 | cp -R ../$REPO/doc/_build/html/* ./ 29 | 30 | # Add and commit changes. 31 | git add -A . 32 | git commit -m "[ci skip] Autodoc commit for $COMMIT." 33 | if [ "$1" != "dry" ]; then 34 | # -q is very important, otherwise you leak your GH_TOKEN 35 | git push -q origin gh-pages 36 | fi 37 | 38 | # Move back into project root. 
39 | cd ../$REPO 40 | -------------------------------------------------------------------------------- /environment-dev.yml: -------------------------------------------------------------------------------- 1 | name: bdranalytics-dev 2 | # This environment is designed to work ON the bdranalytics module: it includes its dependencies, but not bdranalytics itself. 3 | dependencies: 4 | - python=3.6 5 | - cmake 6 | - boost 7 | - pkgconfig 8 | - jupyter 9 | - tensorflow 10 | - conda-forge::xgboost=0.6a2 11 | - pip 12 | - pip: 13 | - -rrequirements-dev.txt 14 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: bdranalytics 2 | # This environment is designed to work WITH the bdranalytics module: it includes the module including its dependencies 3 | dependencies: 4 | - python=3.6 5 | - cmake 6 | - boost 7 | - tensorflow 8 | - keras 9 | - pkgconfig 10 | - jupyter 11 | - conda-forge::xgboost=0.6a2 12 | - pip 13 | - pip: 14 | - -rrequirements.txt 15 | -------------------------------------------------------------------------------- /notebooks/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/notebooks/.gitkeep -------------------------------------------------------------------------------- /notebooks/Spark Cross Sell Frequent Pairs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Recommendations based on frequent pairs\n", 8 | "\n", 9 | "This notebook shows how to calculate frequent pairs based on shopping lists, using a scoring function that is relevant for recommendatins." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Generating artificial shopping lists" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 39, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "import numpy as np\n", 28 | "from itertools import groupby, permutations, chain, islice\n", 29 | "from operator import itemgetter, add\n", 30 | "\n", 31 | "def get_random_element(x):\n", 32 | " if x is None:\n", 33 | " return -1\n", 34 | " else:\n", 35 | " return x[np.random.randint(len(x))]\n", 36 | "\n", 37 | "def split_every(n, iterable):\n", 38 | " i = iter(iterable)\n", 39 | " piece = list(islice(i, n))\n", 40 | " while piece:\n", 41 | " yield piece\n", 42 | " piece = list(islice(i, n))\n" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "Let's generate some random shopping lists:" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 40, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "num_unique_items = 5000\n", 61 | "num_paired_items = 100\n", 62 | "num_bought_items = 250000" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "Select some random pairs which are usually bought together" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 41, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "We generate example pairs which will always be bought together. Some examples:\n" 84 | ] 85 | }, 86 | { 87 | "data": { 88 | "text/plain": [ 89 | "[(2048, [2804]), (3086, [1164]), (22, [4816]), (547, [3575]), (41, [2330])]" 90 | ] 91 | }, 92 | "execution_count": 41, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "test_givenpairs = np.random.randint(num_unique_items, size=(num_paired_items, 2))\n", 99 | "test_pair_dict = dict([(k, [itemgetter(1)(f) for f in v]) for k, v in groupby(sorted(test_givenpairs,key=itemgetter(0)), key = itemgetter(0))])\n", 100 | "print \"We generate example pairs which will always be bought together. Some examples:\"\n", 101 | "test_pair_dict.items()[0:5]" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "Determine the items that are bought. By running them modulo their index, we create an exponential distribution" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 42, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "test_purchases_bought = [j % (i+1) for i, j in enumerate(np.random.randint(num_unique_items, size=(num_bought_items)))]" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "Now map all bought items to a random item which is by our definition bought together. 
If one is available" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 43, 132 | "metadata": { 133 | "collapsed": false 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "vectorized_select_paired_items = np.vectorize(test_pair_dict.get)\n", 138 | "vectorized_select_random_item = np.vectorize(get_random_element)\n", 139 | "test_purchases_added = ( \n", 140 | " vectorized_select_random_item( # select one random item from the list of paired items\n", 141 | " vectorized_select_paired_items( # map them to the list of paired items\n", 142 | " test_purchases_bought # the items we bought\n", 143 | " )\n", 144 | " ))\n", 145 | "test_purchases_pairs = np.transpose(np.vstack((test_purchases_bought, test_purchases_added)))\n", 146 | "np.random.shuffle(test_purchases_pairs)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "Given these pairs, we unravel them to create one long list, which is then split into different shopping baskets" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 44, 159 | "metadata": { 160 | "collapsed": false 161 | }, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "Each few items are joined together to form a list. Because some items are removed, lists have variable length. Some examples:\n", 168 | "[[266, 2538, 3794, 4274], [1099, 4111, 4177], [1442, 3249, 4480], [3652, 3769, 4565], [347, 1253, 2298]]\n", 169 | "Total number of generated shopping lists = 83334\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "test_sequence = test_purchases_pairs.ravel()\n", 175 | "test_shopping_baskets = [nonempty for nonempty in [np.unique(a[a>0]).tolist() for a in [np.array(a) for a in list(split_every(6, test_sequence))]] if len(nonempty)>0]\n", 176 | "print \"Each few items are joined together to form a list. Because some items are removed, lists have variable length. Some examples:\"\n", 177 | "print test_shopping_baskets[0:5]\n", 178 | "print \"Total number of generated shopping lists = {}\".format(len(test_shopping_baskets))" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "## Step 0: Loading the data in Spark" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 45, 191 | "metadata": { 192 | "collapsed": false 193 | }, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "There are in total 83334 shopping lists\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "shopping_baskets = sc.parallelize(test_shopping_baskets)\n", 205 | "shopping_baskets_count = shopping_baskets.count()\n", 206 | "print \"There are in total {} shopping lists\".format(shopping_baskets_count)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": {}, 212 | "source": [ 213 | "## Step 1: Calculating item frequencies" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "Getting item buy frequency.
While the shopping lists do not have to fit in memory, a list of unique items should" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 46, 226 | "metadata": { 227 | "collapsed": false 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "def check_uniqueness(t):\n", 232 | " if len(set(t))!=len(t):\n", 233 | " raise ValueError(\"Items in a transaction must be unique but got {}\".format(t))\n", 234 | " return t\n", 235 | " else:\n", 236 | " return t\n", 237 | "item_freq = dict(shopping_baskets.flatMap(check_uniqueness).map(lambda v: (v, 1L)).reduceByKey(add).collect())\n" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 47, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "The most popular item is bought in 126 shopping baskets\n", 252 | "As example, the first few items with their frequencies:\n", 253 | "[(1, 60L), (2, 59L), (3, 53L), (4, 69L), (5, 50L)]\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "print \"The most popular item is bought in {} shopping baskets\".format(max(item_freq.values()))\n", 259 | "print \"As example, the first few items with their frequencies:\"\n", 260 | "print item_freq.items()[0:5]" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "## Step 2: Calculating item pair scores" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "We use the following score:\n", 275 | "$$ score = \\dfrac{\\bigg(\\dfrac{X\\ and\\ Y}{X}\\bigg)}{\\bigg(\\dfrac{(not\\ X)\\ and\\ Y}{not\\ X}\\bigg)}$$" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 48, 281 | "metadata": { 282 | "collapsed": false 283 | }, 284 | "outputs": [], 285 | "source": [ 286 | "from __future__ import division\n", 287 | "def calculate_score( xy, xy_count):\n", 288 | " \"\"\"\n", 289 | " xy is a tuple of item ids\n", 290 | " xy_count is the observation count\n", 291 | " calculates:\n", 292 | " x and y / x\n", 293 | " / \n", 294 | " not x and y / not x\"\"\"\n", 295 | " x_item, y_item = xy\n", 296 | " x = item_freq[x_item]\n", 297 | " y = item_freq[y_item]\n", 298 | " notx = shopping_baskets_count - x\n", 299 | " x_y = xy_count\n", 300 | " notx_y = y - x_y\n", 301 | " if notx_y==0:\n", 302 | " return (xy, np.Inf)\n", 303 | " else:\n", 304 | " return (xy, (notx/x) * (x_y/notx_y))\n", 305 | " \n", 306 | "def all_pairs(x):\n", 307 | " return list(permutations(x, 2)) # permutations also generates the pairs with _1 and _2 flipped\n", 308 | " \n", 309 | "def as_key_with_value(i):\n", 310 | " def as_key(x):\n", 311 | " return x, i\n", 312 | " return as_key" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 49, 318 | "metadata": { 319 | "collapsed": false 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "pairs = shopping_baskets\\\n", 324 | " .flatMap(all_pairs)\\\n", 325 | " .map(as_key_with_value(1))\\\n", 326 | " .reduceByKey(add)\\\n", 327 | " .map(lambda x:calculate_score(*x))\\\n", 328 | " .cache()" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "Now we have the score for every pair of products ever bought" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 50, 341 | "metadata": { 342 | "collapsed": false 343 | }, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 
| "There are in total 513862 pairs of bought products\n", 350 | "The first few pairs with their score:\n" 351 | ] 352 | }, 353 | { 354 | "data": { 355 | "text/plain": [ 356 | "[((3075, 3343), 41.066074950690336),\n", 357 | " ((4171, 535), 41.3125),\n", 358 | " ((2261, 2169), 26.089598997493734),\n", 359 | " ((810, 3728), 42.48877551020408),\n", 360 | " ((4785, 2165), 29.65811965811966),\n", 361 | " ((2635, 4663), 17.135802469135804),\n", 362 | " ((112, 4246), 41.647),\n", 363 | " ((4649, 717), 26.0531914893617),\n", 364 | " ((2514, 2968), 31.7472359893252),\n", 365 | " ((607, 3139), 39.06801125703565)]" 366 | ] 367 | }, 368 | "execution_count": 50, 369 | "metadata": {}, 370 | "output_type": "execute_result" 371 | } 372 | ], 373 | "source": [ 374 | "pairs_count = pairs.count()\n", 375 | "print \"There are in total {} pairs of bought products\".format(pairs_count)\n", 376 | "print \"The first few pairs with their score:\"\n", 377 | "pairs.take(10)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "## Step 3, Option 1: Selecting pairs based on score threshold" 385 | ] 386 | }, 387 | { 388 | "cell_type": "markdown", 389 | "metadata": {}, 390 | "source": [ 391 | "As context, let's get the histogram" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 61, 397 | "metadata": { 398 | "collapsed": false 399 | }, 400 | "outputs": [ 401 | { 402 | "data": { 403 | "text/plain": [ 404 | "([5.683559866129364,\n", 405 | " 250.08873329128113,\n", 406 | " 494.4939067164329,\n", 407 | " 738.8990801415846,\n", 408 | " 983.3042535667364,\n", 409 | " 1227.7094269918882,\n", 410 | " 1472.1146004170398,\n", 411 | " 1716.5197738421916,\n", 412 | " 1960.9249472673434,\n", 413 | " 2205.330120692495,\n", 414 | " 2449.735294117647],\n", 415 | " [513662, 0, 0, 4, 4, 17, 43, 24, 9, 2])" 416 | ] 417 | }, 418 | "execution_count": 61, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "pairs.map(lambda k_v:k_v[1]).filter(lambda score: not np.isinf(score)).histogram(10)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 62, 430 | "metadata": { 431 | "collapsed": false 432 | }, 433 | "outputs": [ 434 | { 435 | "name": "stdout", 436 | "output_type": "stream", 437 | "text": [ 438 | "The number of frequent pairs = 200\n" 439 | ] 440 | } 441 | ], 442 | "source": [ 443 | "frequent_pairs = pairs.filter(lambda k_v:k_v[1]>250).collect()\n", 444 | "print \"The number of frequent pairs = {}\".format(len(frequent_pairs))" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 52, 450 | "metadata": { 451 | "collapsed": false 452 | }, 453 | "outputs": [ 454 | { 455 | "data": { 456 | "text/plain": [ 457 | "[((4257, 2061), 1321.952380952381),\n", 458 | " ((3627, 479), 930.1998710509349),\n", 459 | " ((2061, 4257), inf),\n", 460 | " ((547, 3575), 1665.88),\n", 461 | " ((2926, 1436), inf)]" 462 | ] 463 | }, 464 | "execution_count": 52, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "frequent_pairs[0:5]" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "## Step 3, Option 2: Selecting top N scoring cross selling items." 
478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": {}, 483 | "source": [ 484 | "Here we define some helper functions to keep the highest N co occurring items" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 53, 490 | "metadata": { 491 | "collapsed": false 492 | }, 493 | "outputs": [], 494 | "source": [ 495 | "def aggregate_zero():\n", 496 | " return []\n", 497 | "\n", 498 | "def aggregate_seq(n):\n", 499 | " def sequenceadd(seq, item):\n", 500 | " seq.append(item)\n", 501 | " seq.sort(key=lambda x:x[1], reverse=True)\n", 502 | " return seq[0:n]\n", 503 | " return sequenceadd\n", 504 | "\n", 505 | "def aggregate_combine(n):\n", 506 | " def combine(seq1, seq2):\n", 507 | " return sorted(seq1+seq2, key=lambda x:x[1], reverse=True)[0:n]\n", 508 | " return combine" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": {}, 514 | "source": [ 515 | "Instead of hard thresholds, we can just find the most cross sellable product for each product. Some examples:" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": 54, 521 | "metadata": { 522 | "collapsed": false 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "item_with_cross_sells = pairs\\\n", 527 | " .map(lambda kv: (kv[0][0], (kv[0][1], kv[1])))\\\n", 528 | " .aggregateByKey(aggregate_zero(), aggregate_seq(5), aggregate_combine(5)).cache()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": 55, 534 | "metadata": { 535 | "collapsed": false 536 | }, 537 | "outputs": [ 538 | { 539 | "data": { 540 | "text/plain": [ 541 | "[(2048,\n", 542 | " [(2804, 2191.3947368421054),\n", 543 | " (3589, 53.53455480552877),\n", 544 | " (3468, 44.03648863035431),\n", 545 | " (2428, 44.03648863035431),\n", 546 | " (4481, 42.66034836065574)]),\n", 547 | " (3072,\n", 548 | " [(63, 57.431724137931035),\n", 549 | " (1603, 55.222811671087534),\n", 550 | " (2161, 53.17752234993614),\n", 551 | " (1396, 51.278325123152705),\n", 552 | " (3171, 41.02266009852217)]),\n", 553 | " (8,\n", 554 | " [(4082, 60.69825072886297),\n", 555 | " (897, 41.30853174603175),\n", 556 | " (4982, 38.13095238095238),\n", 557 | " (4459, 38.13095238095238),\n", 558 | " (1525, 36.27090592334495)]),\n", 559 | " (16,\n", 560 | " [(2027, 54.422222222222224),\n", 561 | " (599, 42.224137931034484),\n", 562 | " (1828, 40.81666666666666),\n", 563 | " (1460, 38.265625),\n", 564 | " (3584, 34.98571428571429)]),\n", 565 | " (344,\n", 566 | " [(636, 59.140625),\n", 567 | " (3622, 56.56929347826087),\n", 568 | " (2246, 50.042067307692314),\n", 569 | " (4331, 44.10487288135593),\n", 570 | " (3839, 37.17410714285714)])]" 571 | ] 572 | }, 573 | "execution_count": 55, 574 | "metadata": {}, 575 | "output_type": "execute_result" 576 | } 577 | ], 578 | "source": [ 579 | "item_with_cross_sells.take(5)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": {}, 585 | "source": [ 586 | "Let's find a perfect pair, one with score infinity" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": 56, 592 | "metadata": { 593 | "collapsed": false 594 | }, 595 | "outputs": [ 596 | { 597 | "data": { 598 | "text/plain": [ 599 | "((2061, 4257), inf)" 600 | ] 601 | }, 602 | "execution_count": 56, 603 | "metadata": {}, 604 | "output_type": "execute_result" 605 | } 606 | ], 607 | "source": [ 608 | "perfect_pair = pairs.filter(lambda x: np.isinf(x[1])).take(1)[0]\n", 609 | "perfect_pair" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | 
"source": [ 616 | "And show with which other items that occurs:" 617 | ] 618 | }, 619 | { 620 | "cell_type": "code", 621 | "execution_count": 57, 622 | "metadata": { 623 | "collapsed": false 624 | }, 625 | "outputs": [ 626 | { 627 | "data": { 628 | "text/plain": [ 629 | "[[(4257, inf),\n", 630 | " (1971, 39.45945945945946),\n", 631 | " (2047, 30.416666666666664),\n", 632 | " (3390, 28.07692307692308),\n", 633 | " (2232, 25.17241379310345)]]" 634 | ] 635 | }, 636 | "execution_count": 57, 637 | "metadata": {}, 638 | "output_type": "execute_result" 639 | } 640 | ], 641 | "source": [ 642 | "item_with_cross_sells.lookup(perfect_pair[0][0])" 643 | ] 644 | } 645 | ], 646 | "metadata": { 647 | "anaconda-cloud": {}, 648 | "kernelspec": { 649 | "display_name": "Python [bdranalytics]", 650 | "language": "python", 651 | "name": "Python [bdranalytics]" 652 | }, 653 | "language_info": { 654 | "codemirror_mode": { 655 | "name": "ipython", 656 | "version": 2 657 | }, 658 | "file_extension": ".py", 659 | "mimetype": "text/x-python", 660 | "name": "python", 661 | "nbconvert_exporter": "python", 662 | "pygments_lexer": "ipython2", 663 | "version": "2.7.12" 664 | } 665 | }, 666 | "nbformat": 4, 667 | "nbformat_minor": 0 668 | } 669 | -------------------------------------------------------------------------------- /notebooks/bdr-imbalanced-classification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "deletable": true, 7 | "editable": true 8 | }, 9 | "source": [ 10 | "# Classification model\n", 11 | "Here we use machine learning techniques to create and validate a model that can predict the probability of a relatively rare event (imbalanced classes problem)." 
12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "collapsed": false, 19 | "deletable": true, 20 | "editable": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "import sys\n", 25 | "sys.path.append('../')\n", 26 | "\n", 27 | "# import generic packages\n", 28 | "import numpy as np\n", 29 | "import pandas as pd\n", 30 | "# pd.options.display.max_columns = None\n", 31 | "# pd.options.display.max_colwidth = 100\n", 32 | "from IPython.display import display\n", 33 | "\n", 34 | "# visualization packages\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import matplotlib\n", 37 | "import seaborn as sns\n", 38 | "sns.set(style=\"white\")\n", 39 | "%matplotlib inline\n", 40 | "\n", 41 | "# module loading settings\n", 42 | "%load_ext autoreload\n", 43 | "%autoreload 2" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": { 50 | "collapsed": false, 51 | "deletable": true, 52 | "editable": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "# load to data frame\n", 57 | "df = pd.read_csv('')\n", 58 | "\n", 59 | "# extract and remove timestamps from data frame\n", 60 | "timestamps = df['timestamp']\n", 61 | "df.drop('timestamp', axis=1, inplace=True)\n", 62 | "\n", 63 | "# determine categoricals\n", 64 | "high_capacity = df.columns.values[~np.array(df.dtypes == np.number)].tolist()\n", 65 | "print \"high capacity categorical feature columns:\"\n", 66 | "print high_capacity\n", 67 | "\n", 68 | "# print some info\n", 69 | "print \"{:d} observations\".format(len(df))\n", 70 | "df.head()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "deletable": true, 77 | "editable": true 78 | }, 79 | "source": [ 80 | "## Model specification\n", 81 | "Here we set some specifications for the model: type, how it should be fitted, optimized and validated." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true, 89 | "deletable": true, 90 | "editable": true 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "model_type = 'rf' # the classification algorithm\n", 95 | "tune_model = False # optimize hyperparameters\n", 96 | "\n", 97 | "cross_val_method = 'temporal' # cross-validation routine\n", 98 | "\n", 99 | "cost_fp = 1000 # preferably in euros!\n", 100 | "benefit_tp = 3000\n", 101 | "class_weights = {0: cost_fp, 1: benefit_tp} # costs for fn and fp" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "deletable": true, 108 | "editable": true 109 | }, 110 | "source": [ 111 | "## Cross-validation procedure\n", 112 | "To validate whether the model makes sensible predictions, we need to perform cross-validation. The exact procedure for this is specified below. Random cross-validation (set-aside a random sample for testing) is fast, but temporal cross-validation (set-aside a time period for testing) gives the most realistic results due to the resemblence of real-world model usage." 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false, 120 | "deletable": true, 121 | "editable": true 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, train_test_split\n", 126 | "\n", 127 | "#source: https://github.com/BigDataRepublic/bdr-analytics-py\n", 128 | "#! 
pip install -e git+ssh://git@github.com/BigDataRepublic/bdr-analytics.git#egg=bdranalytics-0.1\n", 129 | "from bdranalytics.pipeline.encoders import WeightOfEvidenceEncoder\n", 130 | "from bdranalytics.model_selection.growingwindow import IntervalGrowingWindow\n", 131 | "\n", 132 | "from sklearn.metrics import average_precision_score, make_scorer, roc_auc_score\n", 133 | "\n", 134 | "if cross_val_method is 'random':\n", 135 | " \n", 136 | " # split train data into stratified random folds\n", 137 | " cv_dev = StratifiedShuffleSplit(test_size=0.1, train_size=0.1, n_splits=5, random_state=1)\n", 138 | " \n", 139 | " cv_test = StratifiedShuffleSplit(test_size=0.33, n_splits=1, random_state=2)\n", 140 | "\n", 141 | "elif cross_val_method is 'temporal':\n", 142 | " \n", 143 | " train_size = pd.Timedelta(days=365 * 4 )\n", 144 | " \n", 145 | " # create a cross-validation routine for parameter tuning\n", 146 | " cv_dev = IntervalGrowingWindow(timestamps=timestamps,\n", 147 | " test_start_date=pd.datetime(year=2015, month=1, day=1),\n", 148 | " test_end_date=pd.datetime(year=2015, month=12, day=31),\n", 149 | " test_size=pd.Timedelta(days=30), \n", 150 | " train_size=train_size)\n", 151 | " \n", 152 | " # create a cross-validation routine for model evaluation\n", 153 | " cv_test = IntervalGrowingWindow(timestamps=timestamps,\n", 154 | " test_start_date=pd.datetime(year=2016, month=1, day=1),\n", 155 | " test_end_date=pd.datetime(year=2016, month=8, day=31),\n", 156 | " test_size=pd.Timedelta(days=2*30),\n", 157 | " train_size=train_size) \n", 158 | "\n", 159 | "# number of parallel jobs for cross-validation\n", 160 | "n_jobs = 1\n", 161 | "\n", 162 | "# two functions for advanced performance evaluation metrics\n", 163 | "def roc_auc(y_true, y_pred):\n", 164 | " return roc_auc_score(pd.get_dummies(y_true), y_pred)\n", 165 | "\n", 166 | "roc_auc_scorer = make_scorer(roc_auc, needs_proba=True)\n", 167 | "\n", 168 | "def pr_auc(y_true, y_pred):\n", 169 | " return average_precision_score(pd.get_dummies(y_true), y_pred, average=\"micro\")\n", 170 | "\n", 171 | "pr_auc_scorer = make_scorer(pr_auc, needs_proba=True)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "collapsed": false, 179 | "deletable": true, 180 | "editable": true 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "from sklearn.preprocessing import StandardScaler, Imputer\n", 185 | "\n", 186 | "from sklearn.pipeline import Pipeline\n", 187 | "\n", 188 | "from sklearn.linear_model import LogisticRegression\n", 189 | "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", 190 | "from sklearn.dummy import DummyClassifier\n", 191 | "from xgboost import XGBClassifier\n", 192 | "\n", 193 | "# convert date frame to bare X and y variables for the model pipeline\n", 194 | "y_col = 'target'\n", 195 | "X = df.copy().drop(y_col, axis=1)\n", 196 | "y = np.array(df[y_col])\n", 197 | "n_features = X.shape[1]\n", 198 | "\n", 199 | "# define preprocessing steps\n", 200 | "preproc_steps = [('woe', WeightOfEvidenceEncoder(cols=high_capacity)),\n", 201 | " ('imputer', Imputer(missing_values='NaN', strategy='median', axis=0)),\n", 202 | " ('standardizer', StandardScaler(with_mean=True, with_std=True))]\n", 203 | "\n", 204 | "# specification of different model types and their defaults\n", 205 | "model_steps_dict = {'lr': [('lr', LogisticRegression(C=0.001, penalty='l2', tol=0.01,\n", 206 | " class_weight=class_weights))],\n", 207 | " 'rf': [('rf', 
RandomForestClassifier(n_estimators=400, max_features='auto',\n", 208 | " class_weight=class_weights))],\n", 209 | " 'gbc': [('gbc', GradientBoostingClassifier(n_estimators=400, max_depth=3))],\n", 210 | " 'xgb': [('xgb', XGBClassifier(scale_pos_weight=class_weights[1],\n", 211 | " n_estimators=100, max_depth=4))],\n", 212 | " 'dummy': [('dummy', DummyClassifier(strategy='prior'))]\n", 213 | " }\n", 214 | "\n", 215 | "# specification of the different model hyperparameters and tuning space\n", 216 | "model_params_grid = {'lr': {'lr__C': [1e-4, 1e-3, 1e-2, 1e-1]},\n", 217 | " 'rf': {'rf__max_features': [3, n_features, np.sqrt(n_features)],\n", 218 | " 'rf__n_estimators': [10, 100, 1000]},\n", 219 | " 'gbc': {'gbc__n_estimators': [100, 200]},\n", 220 | " 'xgb': {'xgb__max_depth': [3,6,9],\n", 221 | " 'xgb__reg_alpha': [0,5,15],\n", 222 | " 'xgb__reg_lambda': [0,5,15],\n", 223 | " 'xgb__gamma' : [0,10,50,100]},\n", 224 | " 'dummy': {}}\n", 225 | "\n", 226 | "# store the model step\n", 227 | "model_steps = model_steps_dict[model_type]\n", 228 | "\n", 229 | "# combine everything in one pipeline\n", 230 | "estimator = Pipeline(steps=(preproc_steps + model_steps))\n", 231 | "print estimator" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": { 237 | "deletable": true, 238 | "editable": true 239 | }, 240 | "source": [ 241 | "## Model parameter tuning\n", 242 | "If desired, we can optimize the model hyperparameters to get the best possible model." 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "collapsed": false, 250 | "deletable": true, 251 | "editable": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "# procedure depends on cross-validation type\n", 256 | "if cross_val_method is 'random': \n", 257 | " train_index = next(cv_test.split(X, y))[0]\n", 258 | " X_dev = X.iloc[train_index,:]\n", 259 | " y_dev = y[train_index]\n", 260 | "elif cross_val_method is 'temporal':\n", 261 | " X_dev = X\n", 262 | " y_dev = y\n", 263 | "\n", 264 | "# setting to include class weights in the gradient boosting model\n", 265 | "if model_type is 'gbc':\n", 266 | " sample_weights = np.array(map(lambda x: class_weights[x], y_dev))\n", 267 | " fit_params = {'gbc__sample_weight': sample_weights}\n", 268 | "else: \n", 269 | " fit_params = {}\n", 270 | "\n", 271 | "# tune model with a parameter grid search if desired\n", 272 | "if tune_model:\n", 273 | " \n", 274 | " grid_search = GridSearchCV(estimator, cv=cv_dev, n_jobs=n_jobs, refit=False,\n", 275 | " param_grid=model_params_grid[model_type],\n", 276 | " scoring=pr_auc_scorer, fit_params=fit_params)\n", 277 | "\n", 278 | " grid_search.fit(X_dev, y_dev)\n", 279 | " \n", 280 | " # show grid search results\n", 281 | " display(pd.DataFrame(grid_search.cv_results_))\n", 282 | " \n", 283 | " # set best parameters for estimator\n", 284 | " estimator.set_params(**grid_search.best_params_)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "markdown", 289 | "metadata": { 290 | "deletable": true, 291 | "editable": true 292 | }, 293 | "source": [ 294 | "## Model validation\n", 295 | "The final test on the holdout." 
296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": { 302 | "collapsed": false, 303 | "deletable": true, 304 | "editable": true, 305 | "scrolled": true 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "y_pred_proba = [] # initialize empty predictions array\n", 310 | "y_true = [] # initialize empty ground-truth array\n", 311 | "\n", 312 | "# loop over the test folds\n", 313 | "for i_fold, (train_index, test_index) in enumerate(cv_test.split(X, y)):\n", 314 | " \n", 315 | " print \"validation fold {:d}\".format(i_fold)\n", 316 | " \n", 317 | " X_train = X.iloc[train_index,:]\n", 318 | " y_train = y[train_index]\n", 319 | " \n", 320 | " X_test = X.iloc[test_index,:]\n", 321 | " y_test = y[test_index]\n", 322 | " \n", 323 | " if model_type is 'gbc':\n", 324 | " sample_weights = map(lambda x: class_weights[x], y_train)\n", 325 | " fit_params = {'gbc__sample_weight': sample_weights}\n", 326 | " else: \n", 327 | " fit_params = {}\n", 328 | " \n", 329 | " # fit the model\n", 330 | " estimator.fit(X_train, y_train, **fit_params)\n", 331 | "\n", 332 | " # probability outputs for class 1\n", 333 | " y_pred_proba.append(map(lambda x: x[1], estimator.predict_proba(X_test)))\n", 334 | " \n", 335 | " # store the true y labels for each fold\n", 336 | " y_true.append(np.array(y_test))\n", 337 | "\n", 338 | "# postprocess the results\n", 339 | "y_true = np.concatenate(y_true)\n", 340 | "y_pred_proba = np.concatenate(y_pred_proba) \n", 341 | "y_pred_bin = (y_pred_proba > 0.5) * 1.\n", 342 | "\n", 343 | "# print some stats\n", 344 | "n_samples_test = len(y_true)\n", 345 | "n_pos_test = sum(y_true)\n", 346 | "n_neg_test = n_samples_test - n_pos_test\n", 347 | "print \"events: {}\".format(n_pos_test)\n", 348 | "print \"p_no_event: {}\".format(n_neg_test / n_samples_test)\n", 349 | "print \"test accuracy: {}\".format((np.equal(y_pred_bin, y_true) * 1.).mean())" 350 | ] 351 | }, 352 | { 353 | "cell_type": "markdown", 354 | "metadata": { 355 | "deletable": true, 356 | "editable": true 357 | }, 358 | "source": [ 359 | "### Receiver-operator characteristics\n", 360 | "Line is constructed by applying various threshold to the model output. \n", 361 | "Y-axis: proportion of events correctly identified, hit-rate \n", 362 | "X-axis: proportion of false positives, usually results in waste of resources \n", 363 | "Dotted line is guessing (no model). Blue line above the dotted line means there is information in the features." 
364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "collapsed": false, 371 | "deletable": true, 372 | "editable": true 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "from sklearn.metrics import roc_curve, auc\n", 377 | "\n", 378 | "fpr, tpr, thresholds = roc_curve(y_true, y_pred_proba, pos_label=1)\n", 379 | "roc_auc = auc(fpr, tpr)\n", 380 | " \n", 381 | "# plot ROC curve\n", 382 | "plt.figure()\n", 383 | "plt.plot(fpr, tpr, label=\"ROC curve (area = {:.2f})\".format(roc_auc))\n", 384 | "plt.plot([0, 1], [0, 1], 'k--')\n", 385 | "plt.xlim([0.0, 1.0])\n", 386 | "plt.ylim([0.0, 1.0])\n", 387 | "plt.xlabel('False positive rate')\n", 388 | "plt.ylabel('True positive rate')\n", 389 | "plt.title('Receiver-operating characteristic')\n", 390 | "plt.legend(loc=\"lower right\")\n", 391 | "plt.show()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "deletable": true, 398 | "editable": true 399 | }, 400 | "source": [ 401 | "## Costs and benefits\n", 402 | "ROC optimization with cost matrix. Critical information: cost of FP and cost of FN (i.e. benefit of TP). Also used to train the model with `class_weights`." 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": { 409 | "collapsed": false, 410 | "deletable": true, 411 | "editable": true 412 | }, 413 | "outputs": [], 414 | "source": [ 415 | "def benefit(tpr, fpr):\n", 416 | "\n", 417 | " n_tp = tpr * n_pos_test # number of true positives (benefits)\n", 418 | " n_fp = fpr * n_neg_test # number of false positives (extra costs)\n", 419 | " \n", 420 | " fp_costs = n_fp * cost_fp\n", 421 | " tp_benefits = n_tp * benefit_tp\n", 422 | " \n", 423 | " return tp_benefits - fp_costs\n", 424 | "\n", 425 | "benefits = np.zeros_like(thresholds)\n", 426 | "for i, _ in enumerate(thresholds):\n", 427 | " benefits[i] = benefit(tpr[i], fpr[i])\n", 428 | "\n", 429 | "i_max = np.argmax(benefits)\n", 430 | "print (\"max benefits: {:.0f}k euros, tpr: {:.3f}, fpr: {:.3f}, threshold: {:.3f}\"\n", 431 | " .format(benefits[i_max]/ 1e3, benefits[i_max]/ 1e3 / 8, tpr[i_max], fpr[i_max], thresholds[i_max]))\n", 432 | "\n", 433 | "plt.plot(thresholds, benefits)\n", 434 | "plt.xlim([0,1])\n", 435 | "plt.ylim([0,np.max(benefits)])\n", 436 | "plt.show()" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": null, 442 | "metadata": { 443 | "collapsed": false, 444 | "deletable": true, 445 | "editable": true 446 | }, 447 | "outputs": [], 448 | "source": [ 449 | "# recalibrate threshold based on benefits (optional, should still be around 0.5)\n", 450 | "y_pred_bin = (y_pred_proba > thresholds[i_max]) * 1." 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "metadata": { 456 | "deletable": true, 457 | "editable": true 458 | }, 459 | "source": [ 460 | "### Precision-recall curve\n", 461 | "Another way to look at it. Note that models which perform well in PR-space are necessarily also dominating ROC-space. The opposite is not the case! Line is constructed by applying various threshold to the model output. 
\n", 462 | "Y-axis: proportion of events among all positives (precision) \n", 463 | "X-axis: proportion of events correctly identified (recall, hit rate)" 464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "execution_count": null, 469 | "metadata": { 470 | "collapsed": false, 471 | "deletable": true, 472 | "editable": true 473 | }, 474 | "outputs": [], 475 | "source": [ 476 | "from sklearn.metrics import precision_recall_curve\n", 477 | "\n", 478 | "precision, recall, thresholds = precision_recall_curve(y_true, y_pred_proba, pos_label=1)\n", 479 | "\n", 480 | "average_precision = average_precision_score(y_true, y_pred_proba, average=\"micro\")\n", 481 | "\n", 482 | "baseline = n_pos_test / n_samples_test\n", 483 | "\n", 484 | "# plot PR curve\n", 485 | "plt.figure()\n", 486 | "plt.plot(recall, precision, label=\"PR curve (area = {:.2f})\".format(average_precision))\n", 487 | "plt.plot([0, 1], [baseline, baseline], 'k--')\n", 488 | "plt.xlim([0.0, 1.0])\n", 489 | "plt.ylim([0.0, 1.0])\n", 490 | "plt.xlabel('Recall')\n", 491 | "plt.ylabel('Precision')\n", 492 | "plt.title('Precision-recall curve')\n", 493 | "plt.legend(loc=\"lower right\")\n", 494 | "plt.show()\n", 495 | "\n", 496 | "if model_type is 'dummy':\n", 497 | " print 'DummyClassifier only has endpoints in PR-curve'" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "metadata": { 503 | "deletable": true, 504 | "editable": true 505 | }, 506 | "source": [ 507 | "### Classification report" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": { 514 | "collapsed": false, 515 | "deletable": true, 516 | "editable": true 517 | }, 518 | "outputs": [], 519 | "source": [ 520 | "from sklearn.metrics import classification_report\n", 521 | "\n", 522 | "target_names = ['no event','event']\n", 523 | "\n", 524 | "print classification_report(y_true, y_pred_bin, target_names=target_names, digits=3)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "markdown", 529 | "metadata": { 530 | "deletable": true, 531 | "editable": true 532 | }, 533 | "source": [ 534 | "### Confusion matrix" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": { 541 | "collapsed": false, 542 | "deletable": true, 543 | "editable": true 544 | }, 545 | "outputs": [], 546 | "source": [ 547 | "from sklearn.metrics import confusion_matrix\n", 548 | "\n", 549 | "confusion = pd.DataFrame(confusion_matrix(y_true, y_pred_bin), index=target_names, columns=target_names)\n", 550 | "sns.heatmap(confusion, annot=True, fmt=\"d\")\n", 551 | "plt.xlabel('predicted label')\n", 552 | "plt.ylabel('true label')" 553 | ] 554 | }, 555 | { 556 | "cell_type": "markdown", 557 | "metadata": { 558 | "deletable": true, 559 | "editable": true 560 | }, 561 | "source": [ 562 | "### Accuracies at different classifier thresholds" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": null, 568 | "metadata": { 569 | "collapsed": false, 570 | "deletable": true, 571 | "editable": true 572 | }, 573 | "outputs": [], 574 | "source": [ 575 | "from sklearn.metrics import accuracy_score\n", 576 | "\n", 577 | "thresholds = (np.arange(0,100,1) / 100.)\n", 578 | "acc = map(lambda thresh: accuracy_score(y_true, map(lambda prob: prob > thresh, y_pred_proba)), thresholds)\n", 579 | "plt.hist(acc, bins=20);" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": { 585 | "deletable": true, 586 | "editable": true 587 | }, 588 | "source": [ 589 | "### Thresholds versus accuracy" 590 
| ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "metadata": { 596 | "collapsed": false, 597 | "deletable": true, 598 | "editable": true 599 | }, 600 | "outputs": [], 601 | "source": [ 602 | "plt.plot(thresholds, acc);" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": { 608 | "deletable": true, 609 | "editable": true 610 | }, 611 | "source": [ 612 | "### Feature importance\n", 613 | "Note that these models are optimized to make accurate predictions, and **not** to make solid statistical inferences." 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": null, 619 | "metadata": { 620 | "collapsed": false, 621 | "deletable": true, 622 | "editable": true 623 | }, 624 | "outputs": [], 625 | "source": [ 626 | "feature_labels = filter(lambda k: y_col not in k, df.columns.values) \n", 627 | "\n", 628 | "if model_type is 'lr':\n", 629 | " weights = estimator._final_estimator.coef_[0]\n", 630 | "elif model_type in ['rf','gbc']:\n", 631 | " weights = estimator._final_estimator.feature_importances_\n", 632 | "elif model_type is 'dummy':\n", 633 | " print 'DummyClassifier does not have weights'\n", 634 | " weights = np.zeros(len(feature_labels))\n", 635 | " \n", 636 | "feature_weights = pd.Series(weights, index=feature_labels)\n", 637 | "feature_weights.plot.barh(title='Feature importance', fontsize=8, figsize=(12,30), grid=True);" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "metadata": { 644 | "collapsed": false, 645 | "deletable": true, 646 | "editable": true 647 | }, 648 | "outputs": [], 649 | "source": [ 650 | "from sklearn.ensemble.partial_dependence import plot_partial_dependence\n", 651 | "\n", 652 | "if model_type is 'gbc':\n", 653 | " preproc_pipe = Pipeline(steps=preproc_steps)\n", 654 | " X_transformed = preproc_pipe.fit_transform(X_dev, y_dev)\n", 655 | "\n", 656 | " plot_partial_dependence(estimator._final_estimator, X_transformed,\n", 657 | " features=range(n_features), feature_names=feature_labels,\n", 658 | " figsize=(12,180), n_cols=4, percentiles=(0.2,0.8));\n", 659 | "else:\n", 660 | " print \"No partial dependence plots available for this model type.\"" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": { 667 | "collapsed": true, 668 | "deletable": true, 669 | "editable": true 670 | }, 671 | "outputs": [], 672 | "source": [] 673 | } 674 | ], 675 | "metadata": { 676 | "kernelspec": { 677 | "display_name": "Python [Root]", 678 | "language": "python", 679 | "name": "Python [Root]" 680 | }, 681 | "language_info": { 682 | "codemirror_mode": { 683 | "name": "ipython", 684 | "version": 2 685 | }, 686 | "file_extension": ".py", 687 | "mimetype": "text/x-python", 688 | "name": "python", 689 | "nbconvert_exporter": "python", 690 | "pygments_lexer": "ipython2", 691 | "version": "2.7.13" 692 | } 693 | }, 694 | "nbformat": 4, 695 | "nbformat_minor": 1 696 | } 697 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # This environment is designed to work ON the bdranalytics module: it includes its dependencies, but not bdranalytics itself. 
2 | # This allows you to easily set up an environment in which you work on / contribute to the bdranalytics module 3 | # It therefore also includes the test tools 4 | NumPy>=1.6.1 5 | SciPy>=0.9 6 | scikit-learn>=0.18 7 | pandas 8 | keras 9 | matplotlib 10 | pytest 11 | pytest-runner 12 | sphinx 13 | sphinx_rtd_theme 14 | recommonmark 15 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This environment is designed to work WITH the bdranalytics module: it includes the module itself as well as its dependencies 2 | # Use this file if you want to install the module with pip instead of building it straight from setup.py 3 | # By including the module here, the module and all of its dependencies will be installed 4 | # This makes it easy to include the module in your own environment 5 | # We use the --editable flag so that you can reinstall the package after you have modified it 6 | -e . -------------------------------------------------------------------------------- /sample/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BigDataRepublic/bdr-analytics-py/fda8f0b4fe303cc55a1a06ef64a52f81c0ccfae5/sample/__init__.py -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='bdranalytics', 5 | version='0.3', 6 | license='Apache License 2.0', 7 | author='bigdatarepublic', 8 | author_email='info@bigdatarepublic.nl', 9 | url='http://www.bigdatarepublic.nl', 10 | long_description="README.md", 11 | packages=['bdranalytics', 12 | 'bdranalytics.images', 13 | 'bdranalytics.keras', 14 | 'bdranalytics.pdlearn', 15 | 'bdranalytics.plot', 16 | 'bdranalytics.sklearn'], 17 | include_package_data=True, 18 | package_data={'bdranalytics': ['data/*.dat'], 19 | 'bdranalytics.images': ['bdr.gif']}, 20 | description="Making data science workflows easier.", 21 | python_requires='>3.5', 22 | install_requires=[ 23 | "NumPy>=1.6.1", 24 | "SciPy>=0.9", 25 | "scikit-learn>=0.18", 26 | "keras", 27 | "pandas", 28 | "matplotlib", 29 | ], 30 | setup_requires=["pytest-runner"], 31 | tests_require=["pytest"] 32 | ) 33 | --------------------------------------------------------------------------------