├── .circleci
│   └── config.yml
├── .github
│   └── workflows
│       ├── step1_test.yml
│       ├── step2_release.yml
│       ├── step3_pypi_deploy.yml
│       └── step4_conda_deploy.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.rst
├── conda
│   ├── conda_build_config.yml
│   └── meta.yaml
├── nox.ini
├── noxfile.py
├── pytest.ini
├── setup.cfg
├── setup.py
├── sklearn_pandas
│   ├── __init__.py
│   ├── cross_validation.py
│   ├── dataframe_mapper.py
│   ├── features_generator.py
│   ├── pipeline.py
│   └── transformers.py
├── test.py
└── tests
    ├── test_data
    │   └── cars.csv.gz
    ├── test_dataframe_mapper.py
    ├── test_features_generator.py
    ├── test_pipeline.py
    └── test_transformers.py

/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | jobs:
3 |   test37:
4 |     docker:
5 |       - image: circleci/python:3.7
6 |     steps:
7 |       - checkout
8 |       - run: pip install --user nox
9 |       - run: ~/.local/bin/nox
10 |   test38:
11 |     docker:
12 |       - image: circleci/python:3.8
13 |     steps:
14 |       - checkout
15 |       - run: pip install --user nox
16 |       - run: ~/.local/bin/nox
17 |   test39:
18 |     docker:
19 |       - image: cimg/python:3.9.1
20 |     steps:
21 |       - checkout
22 |       - run: pip install --user nox
23 |       - run: ~/.local/bin/nox
24 | 
25 | workflows:
26 |   version: 2
27 |   build_and_test:
28 |     jobs:
29 |       - test37
30 |       - test38
31 |       - test39
32 | 
--------------------------------------------------------------------------------
/.github/workflows/step1_test.yml:
--------------------------------------------------------------------------------
1 | # This workflow runs the test suite with nox on every supported Python version
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: 1 Test Package
5 | 
6 | on:
7 |   workflow_dispatch:
8 |     branches:
9 |       - main
10 | 
11 | jobs:
12 |   test:
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       matrix:
16 |         python-version: [3.7, 3.8, 3.9]
17 |     steps:
18 |     - uses: actions/checkout@v2
19 |     - name: Set up Python ${{ matrix.python-version }}
20 |       uses: actions/setup-python@v1
21 |       with:
22 |         python-version: ${{ matrix.python-version }}
23 |     - name: Install dependencies
24 |       run: |
25 |         python -m pip install --upgrade pip
26 |         pip install nox
27 |     - name: Test with pytest
28 |       run: nox
29 | 
--------------------------------------------------------------------------------
/.github/workflows/step2_release.yml:
--------------------------------------------------------------------------------
1 | name: 2 Release Package
2 | 
3 | on:
4 |   workflow_dispatch:
5 |     branches:
6 |       - main
7 | 
8 | jobs:
9 | 
10 |   release:
11 |     runs-on: ubuntu-latest
12 | 
13 |     steps:
14 |       - name: Checkout Code
15 |         uses: actions/checkout@v2
16 |         with:
17 |           fetch-depth: 0
18 |       - name: Changelog
19 |         uses: scottbrenner/generate-changelog-action@master
20 |         id: Changelog
21 |       - name: Create Release
22 |         id: create_release
23 |         uses: actions/create-release@latest
24 |         env:
25 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
26 |         with:
27 |           tag_name: ${{ github.ref }}
28 |           release_name: ${{ github.ref }}
29 |           body: |
30 |             ${{ steps.Changelog.outputs.changelog }}
31 |           draft: false
32 |           prerelease: false
--------------------------------------------------------------------------------
/.github/workflows/step3_pypi_deploy.yml:
--------------------------------------------------------------------------------
1 | name: 3 PyPI Deploy
2 | 
3 | on:
4 |   workflow_dispatch:
5 |     branches:
6 |       - main
7 | 
8 | jobs:
9 | 
10 |   deploy:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Checkout Code
14 |         uses: actions/checkout@v2
15 |         with:
16 |           fetch-depth: 0
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v2
19 |         with:
20 |           python-version: "3.x"
21 |       - name: Install dependencies
22 |         run: |
23 |           python -m pip install --upgrade pip
24 |           pip install setuptools wheel twine
25 |       - name: Build and publish PyPI
26 |         env:
27 |           TWINE_USERNAME: __token__
28 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |         run: |
30 |           python setup.py sdist bdist_wheel
31 |           twine upload --repository pypi dist/*
--------------------------------------------------------------------------------
/.github/workflows/step4_conda_deploy.yml:
--------------------------------------------------------------------------------
1 | name: 4 Conda Deploy
2 | 
3 | on:
4 |   workflow_dispatch:
5 |     branches:
6 |       - main
7 | 
8 | jobs:
9 | 
10 |   conda_deploy:
11 |     runs-on: ubuntu-latest
12 |     # needs: test
13 | 
14 |     steps:
15 |       - uses: actions/checkout@v2
16 |       - name: publish-to-conda
17 |         uses: fcakyon/conda-publish-action@v1.3
18 |         with:
19 |           subdir: 'conda'
20 |           anacondatoken: ${{ secrets.ANACONDA_TOKEN }}
21 |           platforms: 'win osx linux'
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info
2 | *.pyc
3 | .tox/
4 | build/
5 | dist/
6 | .cache/
7 | .idea/
8 | .pytest_cache/
9 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | ## Development environment and steps
4 | 
5 | 1. Click on the "Fork" button at the top-right of the GitHub page.
6 | 2. Clone your fork. Example: `git clone git@github.com:dukebody/sklearn-pandas.git`.
7 | 3. Create a new branch to work on the issue/feature you want.
8 | 4. Hack out your code. To run the tests and `flake8`, just run `nox`. Tests live in the `tests` subfolder.
9 | 5. Submit a new PR with your code, indicating in the PR which issue/feature it relates to.
10 | 
11 | Note: You don't need to install `sklearn-pandas` in your virtualenv to run the tests. `nox` will automatically create multiple virtual environments to run them with multiple package versions.
12 | 
13 | 
14 | ## Guidelines
15 | 
16 | - Remember that `sklearn-pandas` does not expect to do everything. Its scope is to serve as an integration layer between `scikit-learn` and `pandas` where needed. If the feature you want to implement adds a lot of complexity to the code, think twice whether it is really needed or can be worked around in a few lines.
17 | - Always write tests for any change introduced.
18 | - If the change involves new options or modifies the public interface, also update the `README` file to explain how to use it. It uses doctests to test the documentation itself.
19 | - If the change is not just cosmetic, add a line to the Changelog section and your name to the Credits section of the `README` file.
20 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | sklearn-pandas -- bridge code for cross-validation of pandas data frames
2 | with sklearn
3 | 
4 | This software is provided 'as-is', without any express or implied
5 | warranty. In no event will the authors be held liable for any damages
6 | arising from the use of this software.
7 | 
8 | Permission is granted to anyone to use this software for any purpose,
9 | including commercial applications, and to alter it and redistribute it
10 | freely, subject to the following restrictions:
11 | 
12 | 1. The origin of this software must not be misrepresented; you must not
13 |    claim that you wrote the original software. If you use this software
14 |    in a product, an acknowledgment in the product documentation would be
15 |    appreciated but is not required.
16 | 2. Altered source versions must be plainly marked as such, and must not be
17 |    misrepresented as being the original software.
18 | 3. This notice may not be removed or altered from any source distribution.
19 | 
20 | Paul Butler
21 | 
22 | The source code of DataFrameMapper is derived from code originally written by
23 | Ben Hamner and released under the following license.
24 | 
25 | Copyright (c) 2013, Ben Hamner
26 | Author: Ben Hamner (ben@benhamner.com)
27 | All rights reserved.
28 | 
29 | Redistribution and use in source and binary forms, with or without
30 | modification, are permitted provided that the following conditions are met:
31 | 
32 | 1. Redistributions of source code must retain the above copyright notice, this
33 |    list of conditions and the following disclaimer.
34 | 2. Redistributions in binary form must reproduce the above copyright notice,
35 |    this list of conditions and the following disclaimer in the documentation
36 |    and/or other materials provided with the distribution.
37 | 
38 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
39 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
40 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
42 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
43 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
44 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
47 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 | 
49 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.rst
3 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | 
2 | Sklearn-pandas
3 | ==============
4 | 
5 | .. image:: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas.svg?style=svg
6 |    :target: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas
7 | .. image:: https://img.shields.io/pypi/v/sklearn-pandas.svg
8 |    :target: https://pypi.python.org/pypi/sklearn-pandas/
9 | .. image:: https://anaconda.org/conda-forge/sklearn-pandas/badges/version.svg
10 |    :target: https://anaconda.org/conda-forge/sklearn-pandas/
11 | 
12 | .. highlight:: python
13 | 
14 | This module provides a bridge between `Scikit-Learn <https://scikit-learn.org>`__'s machine learning methods and `pandas <https://pandas.pydata.org>`__-style Data Frames.
15 | In particular, it provides a way to map ``DataFrame`` columns to transformations, which are later recombined into features.
16 | 
17 | Installation
18 | ------------
19 | 
20 | You can install ``sklearn-pandas`` with ``pip``::
21 | 
22 |     # pip install sklearn-pandas
23 | 
24 | or conda-forge::
25 | 
26 |     # conda install -c conda-forge sklearn-pandas
27 | 
28 | Tests
29 | -----
30 | 
31 | The examples in this file double as basic sanity tests. To run them, use ``doctest``, which is included with Python::
32 | 
33 |     # python -m doctest README.rst
34 | 
35 | 
36 | Usage
37 | -----
38 | 
39 | 
40 | Import
41 | ******
42 | 
43 | Import what you need from the ``sklearn_pandas`` package. The choices are:
44 | 
45 | * ``DataFrameMapper``, a class for mapping pandas data frame columns to different sklearn transformations
46 | 
47 | 
48 | For this demonstration, we will import it::
49 | 
50 |     >>> from sklearn_pandas import DataFrameMapper
51 | 
52 | For these examples, we'll also use pandas, numpy, and sklearn::
53 | 
54 |     >>> import pandas as pd
55 |     >>> import numpy as np
56 |     >>> import sklearn.preprocessing, sklearn.decomposition, \
57 |     ...     sklearn.linear_model, sklearn.pipeline, sklearn.metrics, \
58 |     ...     sklearn.compose
59 |     >>> from sklearn.feature_extraction.text import CountVectorizer
60 | 
61 | 
62 | Load some Data
63 | **************
64 | 
65 | 
66 | Normally you'll read the data from a file, but for demonstration purposes we'll create a data frame from a Python dict::
67 | 
68 |     >>> data = pd.DataFrame({'pet': ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
69 |     ...                      'children': [4., 6, 3, 3, 2, 3, 5, 4],
70 |     ...                      'salary': [90., 24, 44, 27, 32, 59, 36, 27]})
71 | 
72 | 
73 | Transformation Mapping
74 | ----------------------
75 | 
76 | 
77 | Map the Columns to Transformations
78 | **********************************
79 | 
80 | The mapper takes a list of tuples. Each tuple has three elements:
81 | 1. column name(s): The first element is a column name from the pandas DataFrame, a list containing one or multiple columns (we will see an example with multiple columns later), or a callable such as `make_column_selector <https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html>`__.
82 | 2. transformer(s): The second element is an object which will perform the transformation which will be applied to that column.
83 | 3. attributes: The third one is optional and is a dictionary containing the transformation options, if applicable (see "custom column names for transformed features" below).
84 | 
85 | Let's see an example::
86 | 
87 |     >>> mapper = DataFrameMapper([
88 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
89 |     ...     (['children'], sklearn.preprocessing.StandardScaler())
90 |     ... ])
91 | 
92 | The difference between specifying the column selector as ``'column'`` (as a simple string) and ``['column']`` (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a one-dimensional array will be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector.
93 | 
94 | This behaviour mirrors pandas' dataframe ``__getitem__`` indexing::
95 | 
96 |     >>> data['children'].shape
97 |     (8,)
98 |     >>> data[['children']].shape
99 |     (8, 1)
100 | 
101 | Be aware that some transformers expect a 1-dimensional input (the label-oriented ones) while some others, like ``OneHotEncoder`` or ``SimpleImputer``, expect 2-dimensional input, with the shape ``[n_samples, n_features]``.
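
To make the difference concrete, here is a small illustrative sketch (the ``ShapeReporter`` transformer below is hypothetical, written only for this demonstration) that prints the shape of the array each selector style hands to the transformer::

    >>> from sklearn.base import TransformerMixin
    >>> class ShapeReporter(TransformerMixin):
    ...     def fit(self, X, y=None):
    ...         print(X.shape)  # shape of the input handed to the transformer
    ...         return self
    ...     def transform(self, X):
    ...         return X
    >>> _ = DataFrameMapper([('children', ShapeReporter())]).fit(data.copy())
    (8,)
    >>> _ = DataFrameMapper([(['children'], ShapeReporter())]).fit(data.copy())
    (8, 1)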
102 | 
103 | 
104 | Test the Transformation
105 | ***********************
106 | 
107 | We can use the ``fit_transform`` shortcut to both fit the model and see what transformed data looks like. In this and the other examples, output is rounded to two digits with ``np.round`` to account for rounding errors on different hardware::
108 | 
109 |     >>> np.round(mapper.fit_transform(data.copy()), 2)
110 |     array([[ 1.  ,  0.  ,  0.  ,  0.21],
111 |            [ 0.  ,  1.  ,  0.  ,  1.88],
112 |            [ 0.  ,  1.  ,  0.  , -0.63],
113 |            [ 0.  ,  0.  ,  1.  , -0.63],
114 |            [ 1.  ,  0.  ,  0.  , -1.46],
115 |            [ 0.  ,  1.  ,  0.  , -0.63],
116 |            [ 1.  ,  0.  ,  0.  ,  1.04],
117 |            [ 0.  ,  0.  ,  1.  ,  0.21]])
118 | 
119 | Note that the first three columns are the output of the ``LabelBinarizer`` (corresponding to ``cat``, ``dog``, and ``fish`` respectively) and the fourth column is the standardized value for the number of children. In general, the columns are ordered according to the order given when the ``DataFrameMapper`` is constructed.
120 | 
121 | Now that the transformation is trained, we confirm that it works on new data::
122 | 
123 |     >>> sample = pd.DataFrame({'pet': ['cat'], 'children': [5.]})
124 |     >>> np.round(mapper.transform(sample), 2)
125 |     array([[1.  , 0.  , 0.  , 1.04]])
126 | 
127 | 
128 | Output feature names
129 | *********************
130 | 
131 | In certain cases, like when studying the feature importances for some model,
132 | we want to be able to associate the original features to the ones generated by
133 | the dataframe mapper. We can do so by inspecting the automatically generated ``transformed_names_`` attribute of the mapper after transformation::
134 | 
135 |     >>> mapper.transformed_names_
136 |     ['pet_cat', 'pet_dog', 'pet_fish', 'children']
137 | 
138 | 
139 | Custom column names for transformed features
140 | ********************************************
141 | 
142 | We can provide a custom name for the transformed features, to be used instead
143 | of the automatically generated one, by specifying it as the third argument
144 | of the feature definition::
145 | 
146 | 
147 |     >>> mapper_alias = DataFrameMapper([
148 |     ...     (['children'], sklearn.preprocessing.StandardScaler(),
149 |     ...      {'alias': 'children_scaled'})
150 |     ... ])
151 |     >>> _ = mapper_alias.fit_transform(data.copy())
152 |     >>> mapper_alias.transformed_names_
153 |     ['children_scaled']
154 | 
155 | Alternatively, you can also specify a prefix and/or suffix to add to the column name. For example::
156 | 
157 | 
158 |     >>> mapper_alias = DataFrameMapper([
159 |     ...     (['children'], sklearn.preprocessing.StandardScaler(), {'prefix': 'standard_scaled_'}),
160 |     ...     (['children'], sklearn.preprocessing.StandardScaler(), {'suffix': '_raw'})
161 |     ... ])
162 |     >>> _ = mapper_alias.fit_transform(data.copy())
163 |     >>> mapper_alias.transformed_names_
164 |     ['standard_scaled_children', 'children_raw']
165 | 
166 | 
167 | Dynamic Columns
168 | ***********************
169 | In some situations the columns are not known beforehand and we would like to select them dynamically during the fit operation. As shown below, in such situations you can provide either a custom callable or use `make_column_selector <https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html>`__.
170 | 
171 | ::
172 | 
173 |     >>> class GetColumnsStartingWith:
174 |     ...     def __init__(self, start_str):
175 |     ...         self.pattern = start_str
176 |     ...
177 |     ...     def __call__(self, X: pd.DataFrame = None):
178 |     ...         return [c for c in X.columns if c.startswith(self.pattern)]
179 |     ...
180 |     >>> df = pd.DataFrame({
181 |     ...     'sepal length (cm)': [1.0, 2.0, 3.0],
182 |     ...     'sepal width (cm)': [1.0, 2.0, 3.0],
183 |     ...     'petal length (cm)': [1.0, 2.0, 3.0],
184 |     ...     'petal width (cm)': [1.0, 2.0, 3.0]
185 |     ... })
186 |     >>> t = DataFrameMapper([
187 |     ...     (
188 |     ...         sklearn.compose.make_column_selector(dtype_include=float),
189 |     ...         sklearn.preprocessing.StandardScaler(),
190 |     ...         {'alias': 'x'}
191 |     ...     ),
192 |     ...     (
193 |     ...         GetColumnsStartingWith('petal'),
194 |     ...         None,
195 |     ...         {'alias': 'petal'}
196 |     ...     )], df_out=True, default=False)
197 |     >>> t.fit(df).transform(df).shape
198 |     (3, 6)
199 |     >>> t.transformed_names_
200 |     ['x_0', 'x_1', 'x_2', 'x_3', 'petal_0', 'petal_1']
201 | 
202 | 
203 | 
204 | Above we use ``make_column_selector`` to select all columns that are of type float, and a custom callable to select columns that start with the word 'petal'.
205 | 
206 | 
207 | Passing Series/DataFrames to the transformers
208 | *********************************************
209 | 
210 | By default the transformers are passed a numpy array of the selected columns
211 | as input. This is because ``sklearn`` transformers are historically designed to
212 | work with numpy arrays, not with pandas dataframes, even though their basic
213 | indexing interfaces are similar.
214 | 
215 | However, we can pass a dataframe/series to the transformers to handle custom
216 | cases by initializing the dataframe mapper with ``input_df=True``::
217 | 
218 |     >>> from sklearn.base import TransformerMixin
219 |     >>> class DateEncoder(TransformerMixin):
220 |     ...     def fit(self, X, y=None):
221 |     ...         return self
222 |     ...
223 |     ...     def transform(self, X):
224 |     ...         dt = X.dt
225 |     ...         return pd.concat([dt.year, dt.month, dt.day], axis=1)
226 |     >>> dates_df = pd.DataFrame(
227 |     ...     {'dates': pd.date_range('2015-10-30', '2015-11-02')})
228 |     >>> mapper_dates = DataFrameMapper([
229 |     ...     ('dates', DateEncoder())
230 |     ... ], input_df=True)
231 |     >>> mapper_dates.fit_transform(dates_df)
232 |     array([[2015,   10,   30],
233 |            [2015,   10,   31],
234 |            [2015,   11,    1],
235 |            [2015,   11,    2]])
236 | 
237 | We can also specify this option per group of columns instead of for the
238 | whole mapper::
239 | 
240 |     >>> mapper_dates = DataFrameMapper([
241 |     ...     ('dates', DateEncoder(), {'input_df': True})
242 |     ... ])
243 |     >>> mapper_dates.fit_transform(dates_df)
244 |     array([[2015,   10,   30],
245 |            [2015,   10,   31],
246 |            [2015,   11,    1],
247 |            [2015,   11,    2]])
248 | 
249 | Outputting a dataframe
250 | **********************
251 | 
252 | By default the output of the dataframe mapper is a numpy array. This is because most sklearn estimators expect a numpy array as input. If however we want the output of the mapper to be a dataframe, we can do so using the parameter ``df_out`` when creating the mapper::
253 | 
254 |     >>> mapper_df = DataFrameMapper([
255 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
256 |     ...     (['children'], sklearn.preprocessing.StandardScaler())
257 |     ... ], df_out=True)
258 |     >>> np.round(mapper_df.fit_transform(data.copy()), 2)
259 |        pet_cat  pet_dog  pet_fish  children
260 |     0        1        0         0      0.21
261 |     1        0        1         0      1.88
262 |     2        0        1         0     -0.63
263 |     3        0        0         1     -0.63
264 |     4        1        0         0     -1.46
265 |     5        0        1         0     -0.63
266 |     6        1        0         0      1.04
267 |     7        0        0         1      0.21
268 | 
269 | The names for the columns are the same ones present in the ``transformed_names_``
270 | attribute.
271 | 
272 | Note this does not work together with the ``default=True`` or ``sparse=True`` arguments to the mapper.
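
For instance, combining ``df_out=True`` with ``sparse=True`` raises a ``ValueError`` as soon as the mapper is constructed; this mirrors the check in ``DataFrameMapper.__init__`` (see ``sklearn_pandas/dataframe_mapper.py`` below)::

    >>> DataFrameMapper([('pet', sklearn.preprocessing.LabelBinarizer())],
    ...                 df_out=True, sparse=True)
    Traceback (most recent call last):
        ...
    ValueError: Can not use df_out with sparse or default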
273 | 
274 | Dropping columns explicitly
275 | *******************************
276 | 
277 | Sometimes it is required to drop a specific column or list of columns.
278 | For this purpose, the ``drop_cols`` argument of ``DataFrameMapper`` can be used.
279 | The default value is ``None``::
280 | 
281 |     >>> mapper_df = DataFrameMapper([
282 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
283 |     ...     (['children'], sklearn.preprocessing.StandardScaler())
284 |     ... ], drop_cols=['salary'])
285 | 
286 | Now running ``fit_transform`` will run the transformations on 'pet' and 'children' and drop the 'salary' column::
287 | 
288 |     >>> np.round(mapper_df.fit_transform(data.copy()), 1)
289 |     array([[ 1. ,  0. ,  0. ,  0.2],
290 |            [ 0. ,  1. ,  0. ,  1.9],
291 |            [ 0. ,  1. ,  0. , -0.6],
292 |            [ 0. ,  0. ,  1. , -0.6],
293 |            [ 1. ,  0. ,  0. , -1.5],
294 |            [ 0. ,  1. ,  0. , -0.6],
295 |            [ 1. ,  0. ,  0. ,  1. ],
296 |            [ 0. ,  0. ,  1. ,  0.2]])
297 | 
298 | 
299 | 
300 | Transform Multiple Columns
301 | **************************
302 | 
303 | Transformations may require multiple input columns. In these cases, the column names can be specified in a list::
304 | 
305 |     >>> mapper2 = DataFrameMapper([
306 |     ...     (['children', 'salary'], sklearn.decomposition.PCA(1))
307 |     ... ])
308 | 
309 | Now running ``fit_transform`` will run PCA on the ``children`` and ``salary`` columns and return the first principal component::
310 | 
311 |     >>> np.round(mapper2.fit_transform(data.copy()), 1)
312 |     array([[ 47.6],
313 |            [-18.4],
314 |            [  1.6],
315 |            [-15.4],
316 |            [-10.4],
317 |            [ 16.6],
318 |            [ -6.4],
319 |            [-15.4]])
320 | 
321 | Multiple transformers for the same column
322 | *****************************************
323 | 
324 | Multiple transformers can be applied to the same column by specifying them
325 | in a list::
326 | 
327 |     >>> from sklearn.impute import SimpleImputer
328 |     >>> mapper3 = DataFrameMapper([
329 |     ...     (['age'], [SimpleImputer(),
330 |     ...                sklearn.preprocessing.StandardScaler()])])
331 |     >>> data_3 = pd.DataFrame({'age': [1, np.nan, 3]})
332 |     >>> mapper3.fit_transform(data_3)
333 |     array([[-1.22474487],
334 |            [ 0.        ],
335 |            [ 1.22474487]])
336 | 
337 | 
338 | Columns that don't need any transformation
339 | ******************************************
340 | 
341 | Only columns that are listed in the ``DataFrameMapper`` are kept. To keep a column without applying any transformation to it, use ``None`` as the transformer::
342 | 
343 |     >>> mapper3 = DataFrameMapper([
344 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
345 |     ...     ('children', None)
346 |     ... ])
347 |     >>> np.round(mapper3.fit_transform(data.copy()))
348 |     array([[1., 0., 0., 4.],
349 |            [0., 1., 0., 6.],
350 |            [0., 1., 0., 3.],
351 |            [0., 0., 1., 3.],
352 |            [1., 0., 0., 2.],
353 |            [0., 1., 0., 3.],
354 |            [1., 0., 0., 5.],
355 |            [0., 0., 1., 4.]])
356 | 
357 | Applying a default transformer
358 | ******************************
359 | 
360 | A default transformer can be applied to columns not explicitly selected
361 | by passing it as the ``default`` argument to the mapper::
362 | 
363 |     >>> mapper4 = DataFrameMapper([
364 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
365 |     ...     ('children', None)
366 |     ... ], default=sklearn.preprocessing.StandardScaler())
367 |     >>> np.round(mapper4.fit_transform(data.copy()), 1)
368 |     array([[ 1. ,  0. ,  0. ,  4. ,  2.3],
369 |            [ 0. ,  1. ,  0. ,  6. , -0.9],
370 |            [ 0. ,  1. ,  0. ,  3. ,  0.1],
371 |            [ 0. ,  0. ,  1. ,  3. , -0.7],
372 |            [ 1. ,  0. ,  0. ,  2. , -0.5],
373 |            [ 0. ,  1. ,  0. ,  3. ,  0.8],
374 |            [ 1. ,  0. ,  0. ,  5. , -0.3],
375 |            [ 0. ,  0. ,  1. ,  4. , -0.7]])
376 | 
377 | Using ``default=False`` (the default) drops unselected columns. Using
378 | ``default=None`` passes the unselected columns through unchanged.
379 | 
380 | 
381 | Same transformer for multiple columns
382 | *****************************************
383 | 
384 | Sometimes it is required to apply the same transformation to several dataframe columns.
385 | To simplify this process, the package provides the ``gen_features`` function, which accepts a list
386 | of columns and a feature transformer class (or list of classes), and generates a feature definition
387 | acceptable by ``DataFrameMapper``.
388 | 
389 | For example, consider a dataset with three categorical columns, 'col1', 'col2', and 'col3'.
390 | To encode each of them, one could pass the column names and the ``LabelEncoder`` transformer class
391 | into the generator, and then use the returned definition as the ``features`` argument for ``DataFrameMapper``::
392 | 
393 |     >>> from sklearn_pandas import gen_features
394 |     >>> feature_def = gen_features(
395 |     ...     columns=['col1', 'col2', 'col3'],
396 |     ...     classes=[sklearn.preprocessing.LabelEncoder]
397 |     ... )
398 |     >>> feature_def
399 |     [('col1', [LabelEncoder()], {}), ('col2', [LabelEncoder()], {}), ('col3', [LabelEncoder()], {})]
400 |     >>> mapper5 = DataFrameMapper(feature_def)
401 |     >>> data5 = pd.DataFrame({
402 |     ...     'col1': ['yes', 'no', 'yes'],
403 |     ...     'col2': [True, False, False],
404 |     ...     'col3': ['one', 'two', 'three']
405 |     ... })
406 |     >>> mapper5.fit_transform(data5)
407 |     array([[1, 1, 0],
408 |            [0, 0, 2],
409 |            [1, 0, 1]])
410 | 
411 | If it is required to override some of the transformer parameters, then a dict with a 'class' key and the
412 | transformer parameters should be provided. For example, consider a dataset with missing values.
413 | Then the following code could be used to override the default imputing strategy::
414 | 
415 |     >>> from sklearn.impute import SimpleImputer
416 |     >>> import numpy as np
417 |     >>> feature_def = gen_features(
418 |     ...     columns=[['col1'], ['col2'], ['col3']],
419 |     ...     classes=[{'class': SimpleImputer, 'strategy': 'most_frequent'}]
420 |     ... )
421 |     >>> mapper6 = DataFrameMapper(feature_def)
422 |     >>> data6 = pd.DataFrame({
423 |     ...     'col1': [np.nan, 1, 1, 2, 3],
424 |     ...     'col2': [True, False, np.nan, np.nan, True],
425 |     ...     'col3': [0, 0, 0, np.nan, np.nan]
426 |     ... })
427 |     >>> mapper6.fit_transform(data6)
428 |     array([[1.0, True, 0.0],
429 |            [1.0, False, 0.0],
430 |            [1.0, True, 0.0],
431 |            [2.0, True, 0.0],
432 |            [3.0, True, 0.0]], dtype=object)
433 | 
434 | You can also specify a global prefix or suffix for the generated transformed column names using the ``prefix`` and ``suffix``
435 | parameters::
436 | 
437 |     >>> feature_def = gen_features(
438 |     ...     columns=['col1', 'col2', 'col3'],
439 |     ...     classes=[sklearn.preprocessing.LabelEncoder],
440 |     ...     prefix="lblencoder_"
441 |     ... )
442 |     >>> mapper5 = DataFrameMapper(feature_def)
443 |     >>> data5 = pd.DataFrame({
444 |     ...     'col1': ['yes', 'no', 'yes'],
445 |     ...     'col2': [True, False, False],
446 |     ...     'col3': ['one', 'two', 'three']
447 |     ... })
448 |     >>> _ = mapper5.fit_transform(data5)
449 |     >>> mapper5.transformed_names_
450 |     ['lblencoder_col1', 'lblencoder_col2', 'lblencoder_col3']
451 | 
452 | Feature selection and other supervised transformations
453 | ******************************************************
454 | 
455 | ``DataFrameMapper`` supports transformers that require both X and y arguments.
An example of this is feature selection. Treating the 'pet' column as the target, we will select the column that best predicts it.
456 | 
457 | ::
458 | 
459 |     >>> from sklearn.feature_selection import SelectKBest, chi2
460 |     >>> mapper_fs = DataFrameMapper([(['children', 'salary'], SelectKBest(chi2, k=1))])
461 |     >>> mapper_fs.fit_transform(data[['children', 'salary']], data['pet'])
462 |     array([[90.],
463 |            [24.],
464 |            [44.],
465 |            [27.],
466 |            [32.],
467 |            [59.],
468 |            [36.],
469 |            [27.]])
470 | 
471 | Working with sparse features
472 | ****************************
473 | 
474 | A ``DataFrameMapper`` will return a dense feature array by default. Setting ``sparse=True`` in the mapper will return
475 | a sparse array whenever any of the extracted features is sparse. Example::
476 | 
477 |     >>> mapper5 = DataFrameMapper([
478 |     ...     ('pet', CountVectorizer()),
479 |     ... ], sparse=True)
480 |     >>> type(mapper5.fit_transform(data))
481 |     <class 'scipy.sparse.csr.csr_matrix'>
482 | 
483 | The stacking of the sparse features is done without ever densifying them.
484 | 
485 | 
486 | Using ``NumericalTransformer``
487 | ***********************************
488 | 
489 | While you can use ``FunctionTransformer`` to generate arbitrary transformers, it can present serialization issues
490 | when pickling. Use ``NumericalTransformer`` instead, which takes the function name as a string parameter and hence
491 | can be easily serialized.
492 | 
493 | ::
494 | 
495 |     >>> from sklearn_pandas import NumericalTransformer
496 |     >>> mapper5 = DataFrameMapper([
497 |     ...     ('children', NumericalTransformer('log')),
498 |     ... ])
499 |     >>> mapper5.fit_transform(data)
500 |     array([[1.38629436],
501 |            [1.79175947],
502 |            [1.09861229],
503 |            [1.09861229],
504 |            [0.69314718],
505 |            [1.09861229],
506 |            [1.60943791],
507 |            [1.38629436]])
508 | 
509 | Changing logging level
510 | ***********************************
511 | 
512 | You can change the log level to ``INFO`` to print the time taken to fit or transform each feature. Setting it to a higher level stops printing the elapsed time.
513 | The example below shows how to change the logging level::
514 | 
515 | 
516 |     >>> import logging
517 |     >>> logging.getLogger('sklearn_pandas').setLevel(logging.INFO)
518 | 
519 | 
520 | 
521 | Changelog
522 | ---------
523 | 
524 | 
525 | 2.2.0 (2021-05-07)
526 | ******************
527 | * Added the ability to provide callable functions instead of a static column list.
528 | 
529 | 
530 | 2.1.0 (2021-02-26)
531 | ******************
532 | * Removed tests for Python 3.6 and added Python 3.9
533 | * Added deprecation warning for NumericalTransformer
534 | * Fixed pickling issue causing integration issues with Baikal.
535 | * Started publishing package to conda repo
536 | 
537 | 
538 | 2.0.4 (2020-11-06)
539 | ******************
540 | 
541 | * Explicitly handling serialization (#224)
542 | * document fixes
543 | * Making transform function thread safe (#194)
544 | * Switched to nox for unit testing (#226)
545 | 
546 | 
547 | 2.0.3 (2020-11-06)
548 | ******************
549 | 
550 | * Added elapsed time information for each feature.
551 | 
552 | 
553 | 2.0.2 (2020-10-01)
554 | ******************
555 | 
556 | * Fix ``DataFrameMapper`` ``drop_cols`` attribute naming consistency with scikit-learn and initialization.
557 | 
558 | 
559 | 2.0.1 (2020-09-07)
560 | ******************
561 | 
562 | * Added an option to explicitly drop columns.
563 | 
564 | 
565 | 2.0.0 (2020-08-01)
566 | ******************
567 | 
568 | * Deprecated support for Python < 3.6.
569 | * Deprecated support for old versions of scikit-learn, pandas and numpy.
  Please check ``setup.py`` for the minimum requirements.
570 | * Removed CategoricalImputer, cross_val_score and GridSearchCV. All of this functionality now exists
571 |   in scikit-learn; please use SimpleImputer instead of CategoricalImputer. Cross-validation in
572 |   scikit-learn now supports dataframes, so the cross-validation wrappers provided here are no
573 |   longer needed.
574 | * Added ``NumericalTransformer`` for common numerical transformations. Currently it implements log and log1p
575 |   transformation.
576 | * Added prefix and suffix options. See examples above. These are usually helpful when using gen_features.
577 | * Added ``drop_cols`` argument to ``DataFrameMapper``. This can be used to explicitly drop columns.
578 | 
579 | 
580 | 1.8.0 (2018-12-01)
581 | ******************
582 | 
583 | * Add ``FunctionTransformer`` class (#117).
584 | * Fix column names derivation for dataframes with multi-index or non-string
585 |   columns (#166).
586 | * Change behaviour of DataFrameMapper's fit_transform method to invoke each underlying transformers'
587 |   native fit_transform if implemented (#150).
588 | 
589 | 
590 | 1.7.0 (2018-08-15)
591 | ******************
592 | 
593 | * Fix issues with unicode names in ``get_names`` (#160).
594 | * Update to build using ``numpy==1.14`` and ``python==3.6`` (#154).
595 | * Add ``strategy`` and ``fill_value`` parameters to ``CategoricalImputer`` to allow imputing
596 |   with values other than the mode (#144), (#161).
597 | * Preserve input data types when no transform is supplied (#138).
598 | 
599 | 
600 | 1.6.0 (2017-10-28)
601 | ******************
602 | 
603 | * Add column name to exception during fit/transform (#110).
604 | * Add ``gen_features`` helper function to help generating the same transformation for multiple columns (#126).
605 | 
606 | 
607 | 1.5.0 (2017-06-24)
608 | ******************
609 | 
610 | * Allow inputting a dataframe/series per group of columns.
611 | * Get feature names also from ``estimator.get_feature_names()`` if present.
612 | * Attempt to derive feature names from individual transformers when applying a
613 |   list of transformers.
614 | * Do not mutate features in ``__init__`` to be compatible with
615 |   ``sklearn>=0.20`` (#76).
616 | 
617 | 
618 | 1.4.0 (2017-05-13)
619 | ******************
620 | 
621 | * Allow specifying a custom name (alias) for transformed columns (#83).
622 | * Capture output columns generated names in ``transformed_names_`` attribute (#78).
623 | * Add ``CategoricalImputer`` that replaces null-like values with the mode
624 |   for string-like columns.
625 | * Add ``input_df`` init argument to allow inputting a dataframe/series to the
626 |   transformers instead of a numpy array (#60).
627 | 
628 | 
629 | 1.3.0 (2017-01-21)
630 | ******************
631 | 
632 | * Make the mapper return dataframes when ``df_out=True`` (#70, #74).
633 | * Update imports to avoid deprecation warnings in sklearn 0.18 (#68).
634 | 
635 | 
636 | 1.2.0 (2016-10-02)
637 | ******************
638 | 
639 | * Deprecate custom cross-validation shim classes.
640 | * Require ``scikit-learn>=0.15.0``. Resolves #49.
641 | * Allow applying a default transformer to columns not selected explicitly in
642 |   the mapper. Resolves #55.
643 | * Allow specifying an optional ``y`` argument during transform for
644 |   supervised transformations. Resolves #58.
645 | 
646 | 
647 | 1.1.0 (2015-12-06)
648 | *******************
649 | 
650 | * Delete obsolete ``PassThroughTransformer``. If no transformation is desired for a given column, use ``None`` as the transformer.
651 | * Factor out code in several modules, to avoid having everything in ``__init__.py``.
652 | * Use custom ``TransformerPipeline`` class to allow transformation steps accepting only a X argument. Fixes #46.
653 | * Add compatibility shim for unpickling mappers with list of transformers created before 1.0.0. Fixes #45.
654 | 
655 | 
656 | 1.0.0 (2015-11-28)
657 | *******************
658 | 
659 | * Change version numbering scheme to SemVer.
660 | * Use ``sklearn.pipeline.Pipeline`` instead of copying its code. Resolves #43.
661 | * Raise ``KeyError`` when selecting nonexistent columns in the dataframe. Fixes #30.
662 | * Return sparse feature array if any of the features is sparse and ``sparse`` argument is ``True``. Defaults to ``False`` to avoid potential breaking of existing code. Resolves #34.
663 | * Return model and prediction in custom CV classes. Fixes #27.
664 | 
665 | 
666 | 0.0.12 (2015-11-07)
667 | ********************
668 | 
669 | * Allow specifying a list of transformers to use sequentially on the same column.
670 | 
671 | 
672 | Credits
673 | -------
674 | 
675 | The code for ``DataFrameMapper`` is based on code originally written by Ben Hamner.
676 | 
677 | Other contributors:
678 | 
679 | * Ariel Rossanigo (@arielrossanigo)
680 | * Arnau Gil Amat (@arnau126)
681 | * Assaf Ben-David (@AssafBenDavid)
682 | * Brendan Herger (@bjherger)
683 | * Cal Paterson (@calpaterson)
684 | * @defvorfu
685 | * Floris Hoogenboom (@FlorisHoogenboom)
686 | * Gustavo Sena Mafra (@gsmafra)
687 | * Israel Saeta Pérez (@dukebody)
688 | * Jeremy Howard (@jph00)
689 | * Jimmy Wan (@jimmywan)
690 | * Kristof Van Engeland (@kristofve91)
691 | * Olivier Grisel (@ogrisel)
692 | * Paul Butler (@paulgb)
693 | * Richard Miller (@rwjmiller)
694 | * Ritesh Agrawal (@ragrawal)
695 | * @SandroCasagrande
696 | * Timothy Sweetser (@hacktuarial)
697 | * Vitaley Zaretskey (@vzaretsk)
698 | * Zac Stewart (@zacstewart)
699 | * Parul Singh (@paro1234)
700 | * Vincent Heusinkveld (@VHeusinkveld)
701 | 
--------------------------------------------------------------------------------
/conda/conda_build_config.yml:
--------------------------------------------------------------------------------
1 | python:
2 |   - 3.7
3 |   - 3.8
4 |   - 3.9
5 | 
--------------------------------------------------------------------------------
/conda/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set data = load_setup_py_data() %}
2 | 
3 | package:
4 |   name: sklearn-pandas
5 |   version: {{ data['version'] }}
6 | 
7 | source:
8 |   path: ..
9 | 10 | build: 11 | number: 0 12 | script: python setup.py install --single-version-externally-managed --record=record.txt 13 | 14 | requirements: 15 | build: 16 | - python 17 | - scikit-learn>=0.23.0 18 | - scipy>=1.5.1 19 | - pandas>=1.1.4 20 | - numpy>=1.18.1 21 | 22 | run: 23 | - python 24 | - scikit-learn>=0.23.0 25 | - scipy>=1.5.1 26 | - pandas>=1.1.4 27 | - numpy>=1.18.1 28 | 29 | test: 30 | imports: 31 | - sklearn_pandas 32 | 33 | about: 34 | home: {{ data['url'] }} 35 | license: {{ data['license'] }} 36 | 37 | summary: {{ data['description'] }} -------------------------------------------------------------------------------- /nox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | .git 4 | .github 5 | __pycache__ 6 | build 7 | dist 8 | *site-packages/ 9 | *bin/ 10 | *.egg/* 11 | .eggs 12 | .tox 13 | docs -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | import nox 2 | 3 | @nox.session 4 | def lint(session): 5 | session.install('pytest>=5.3.5', 'setuptools>=45.2', 6 | 'wheel>=0.34.2', 'flake8>=3.7.9', 7 | 'numpy==1.18.1', 'pandas==1.1.4') 8 | session.install('.') 9 | session.run('flake8', 'sklearn_pandas/', 'tests') 10 | 11 | @nox.session 12 | @nox.parametrize('numpy', ['1.18.1', '1.19.4', '1.20.1']) 13 | @nox.parametrize('scipy', ['1.5.4', '1.6.0']) 14 | @nox.parametrize('pandas', ['1.1.4', '1.2.2']) 15 | def tests(session, numpy, scipy, pandas): 16 | session.install('pytest>=5.3.5', 17 | 'setuptools>=45.2', 18 | 'wheel>=0.34.2', 19 | f'numpy=={numpy}', 20 | f'scipy=={scipy}', 21 | f'pandas=={pandas}' 22 | ) 23 | session.install('.') 24 | session.run('py.test', 'README.rst', 'tests') 25 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-glob='*.rst' -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup 5 | from setuptools.command.test import test as TestCommand 6 | import re 7 | 8 | for line in open('sklearn_pandas/__init__.py'): 9 | match = re.match("__version__ *= *'(.*)'", line) 10 | if match: 11 | __version__, = match.groups() 12 | 13 | 14 | class PyTest(TestCommand): 15 | user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] 16 | 17 | def initialize_options(self): 18 | TestCommand.initialize_options(self) 19 | self.pytest_args = [] 20 | 21 | def finalize_options(self): 22 | TestCommand.finalize_options(self) 23 | self.test_args = [] 24 | self.test_suite = True 25 | 26 | def run(self): 27 | import pytest 28 | errno = pytest.main(self.pytest_args) 29 | raise SystemExit(errno) 30 | 31 | 32 | setup(name='sklearn-pandas', 33 | version=__version__, 34 | description='Pandas integration with sklearn', 35 | maintainer='Ritesh Agrawal', 36 | maintainer_email='ragrawal@gmail.com', 37 | url='https://github.com/scikit-learn-contrib/sklearn-pandas', 38 | packages=['sklearn_pandas'], 39 | keywords=['scikit', 
'sklearn', 'pandas'], 40 | install_requires=[ 41 | 'scikit-learn>=0.23.0', 42 | 'scipy>=1.5.1', 43 | 'pandas>=1.1.4', 44 | 'numpy>=1.18.1' 45 | ], 46 | tests_require=['pytest', 'mock'], 47 | cmdclass={'test': PyTest}, 48 | license='MIT License' 49 | ) 50 | -------------------------------------------------------------------------------- /sklearn_pandas/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.2.0' 2 | 3 | import logging 4 | logger = logging.getLogger(__name__) 5 | 6 | from .dataframe_mapper import DataFrameMapper # NOQA 7 | from .features_generator import gen_features # NOQA 8 | from .transformers import NumericalTransformer # NOQA 9 | -------------------------------------------------------------------------------- /sklearn_pandas/cross_validation.py: -------------------------------------------------------------------------------- 1 | class DataWrapper(object): 2 | 3 | def __init__(self, df): 4 | self.df = df 5 | 6 | def __len__(self): 7 | return len(self.df) 8 | 9 | def __getitem__(self, key): 10 | return self.df.iloc[key] 11 | -------------------------------------------------------------------------------- /sklearn_pandas/dataframe_mapper.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | from datetime import datetime 3 | import pandas as pd 4 | import numpy as np 5 | from scipy import sparse 6 | from sklearn.base import BaseEstimator, TransformerMixin 7 | from .cross_validation import DataWrapper 8 | from .pipeline import make_transformer_pipeline, _call_fit, TransformerPipeline 9 | from . import logger 10 | 11 | string_types = text_type = str 12 | 13 | 14 | def _handle_feature(fea): 15 | """ 16 | Convert 1-dimensional arrays to 2-dimensional column vectors. 17 | """ 18 | if len(fea.shape) == 1: 19 | fea = np.array([fea]).T 20 | 21 | return fea 22 | 23 | 24 | def _build_transformer(transformers): 25 | if isinstance(transformers, list): 26 | transformers = make_transformer_pipeline(*transformers) 27 | return transformers 28 | 29 | 30 | def _build_feature(columns, transformers, options={}, X=None): 31 | if X is None: 32 | return (columns, _build_transformer(transformers), options) 33 | return ( 34 | columns(X) if callable(columns) else columns, 35 | _build_transformer(transformers), 36 | options 37 | ) 38 | 39 | 40 | def _elapsed_secs(t1): 41 | return (datetime.now()-t1).total_seconds() 42 | 43 | 44 | def _get_feature_names(estimator): 45 | """ 46 | Attempt to extract feature names based on a given estimator 47 | """ 48 | if hasattr(estimator, 'classes_'): 49 | return estimator.classes_ 50 | elif hasattr(estimator, 'get_feature_names'): 51 | return estimator.get_feature_names() 52 | return None 53 | 54 | 55 | @contextlib.contextmanager 56 | def add_column_names_to_exception(column_names): 57 | # Stolen from https://stackoverflow.com/a/17677938/356729 58 | try: 59 | yield 60 | except Exception as ex: 61 | if ex.args: 62 | msg = u'{}: {}'.format(column_names, ex.args[0]) 63 | else: 64 | msg = text_type(column_names) 65 | ex.args = (msg,) + ex.args[1:] 66 | raise 67 | 68 | 69 | class DataFrameMapper(BaseEstimator, TransformerMixin): 70 | """ 71 | Map Pandas data frame column subsets to their own 72 | sklearn transformation. 73 | """ 74 | 75 | def __init__(self, features, default=False, sparse=False, df_out=False, 76 | input_df=False, drop_cols=None): 77 | """ 78 | Params: 79 | 80 | features a list of tuples with features definitions. 
81 | The first element is the pandas column selector. This can 82 | be a string (for one column) or a list of strings. 83 | The second element is an object that supports 84 | sklearn's transform interface, or a list of such objects 85 | The third element is optional and, if present, must be 86 | a dictionary with the options to apply to the 87 | transformation. Example: {'alias': 'day_of_week'} 88 | 89 | default default transformer to apply to the columns not 90 | explicitly selected in the mapper. If False (default), 91 | discard them. If None, pass them through untouched. Any 92 | other transformer will be applied to all the unselected 93 | columns as a whole, taken as a 2d-array. 94 | 95 | sparse will return sparse matrix if set True and any of the 96 | extracted features is sparse. Defaults to False. 97 | 98 | df_out return a pandas data frame, with each column named using 99 | the pandas column that created it (if there's only one 100 | input and output) or the input columns joined with '_' 101 | if there's multiple inputs, and the name concatenated with 102 | '_1', '_2' etc if there's multiple outputs. NB: does not 103 | work if *default* or *sparse* are true 104 | 105 | input_df If ``True`` pass the selected columns to the transformers 106 | as a pandas DataFrame or Series. Otherwise pass them as a 107 | numpy array. Defaults to ``False``. 108 | 109 | drop_cols List of columns to be dropped. Defaults to None. 110 | 111 | """ 112 | self.features = features 113 | self.default = default 114 | self.built_default = None 115 | self.sparse = sparse 116 | self.df_out = df_out 117 | self.input_df = input_df 118 | self.drop_cols = [] if drop_cols is None else drop_cols 119 | self.transformed_names_ = [] 120 | if (df_out and (sparse or default)): 121 | raise ValueError("Can not use df_out with sparse or default") 122 | 123 | def _build(self, X=None): 124 | """ 125 | Build attributes built_features and built_default. 126 | """ 127 | if isinstance(self.features, list): 128 | self.built_features = [ 129 | _build_feature(*f, X=X) for f in self.features 130 | ] 131 | else: 132 | self.built_features = _build_feature(*self.features, X=X) 133 | self.built_default = _build_transformer(self.default) 134 | 135 | @property 136 | def _selected_columns(self): 137 | """ 138 | Return a set of selected columns in the feature list. 139 | """ 140 | selected_columns = set() 141 | for feature in self.features: 142 | columns = feature[0] 143 | if isinstance(columns, list): 144 | selected_columns = selected_columns.union(set(columns)) 145 | else: 146 | selected_columns.add(columns) 147 | return selected_columns 148 | 149 | def _unselected_columns(self, X): 150 | """ 151 | Return list of columns present in X and not selected explicitly in the 152 | mapper. 153 | 154 | Unselected columns are returned in the order they appear in the 155 | dataframe to avoid issues with different ordering during default fit 156 | and transform steps. 
157 |         """
158 |         X_columns = list(X.columns)
159 |         return [column for column in X_columns if
160 |                 column not in self._selected_columns
161 |                 and column not in self.drop_cols]
162 | 
163 |     def __setstate__(self, state):
164 |         # compatibility for older versions of sklearn-pandas
165 |         super().__setstate__(state)
166 |         self.features = [_build_feature(*feat) for feat in state['features']]
167 |         self.sparse = state.get('sparse', False)
168 |         self.default = state.get('default', False)
169 |         self.df_out = state.get('df_out', False)
170 |         self.input_df = state.get('input_df', False)
171 |         self.drop_cols = state.get('drop_cols', [])
172 |         self.built_features = state.get('built_features', self.features)
173 |         self.built_default = state.get('built_default', self.default)
174 |         self.transformed_names_ = state.get('transformed_names_', [])
175 | 
176 |     def __getstate__(self):
177 |         state = super().__getstate__()
178 |         state['features'] = self.features
179 |         state['sparse'] = self.sparse
180 |         state['default'] = self.default
181 |         state['df_out'] = self.df_out
182 |         state['input_df'] = self.input_df
183 |         state['drop_cols'] = self.drop_cols
184 |         state['built_features'] = getattr(self, 'built_features', None)
185 |         state['built_default'] = self.built_default
186 |         state['transformed_names_'] = self.transformed_names_
187 |         return state
188 | 
189 |     def _get_col_subset(self, X, cols, input_df=False):
190 |         """
191 |         Get a subset of columns from the given table X.
192 | 
193 |         X       a Pandas dataframe; the table to select columns from
194 |         cols    a string or list of strings representing the columns to select.
195 |                 It can also be a callable that returns True or False, i.e.
196 |                 compatible with the built-in filter function.
197 | 
198 |         Returns a numpy array with the data from the selected columns
199 |         """
200 | 
201 |         if isinstance(cols, string_types):
202 |             return_vector = True
203 |             cols = [cols]
204 |         else:
205 |             return_vector = False
206 | 
207 |         # Needed when using the cross-validation compatibility
208 |         # layer for sklearn<0.16.0.
209 |         # Will be dropped on sklearn-pandas 2.0.
210 | if isinstance(X, list): 211 | X = [x[cols] for x in X] 212 | X = pd.DataFrame(X) 213 | 214 | elif isinstance(X, DataWrapper): 215 | X = X.df # fetch underlying data 216 | 217 | if return_vector: 218 | t = X[cols[0]] 219 | else: 220 | t = X[cols] 221 | 222 | # return either a DataFrame/Series or a numpy array 223 | if input_df: 224 | return t 225 | else: 226 | return t.values 227 | 228 | def fit(self, X, y=None): 229 | """ 230 | Fit a transformation from the pipeline 231 | 232 | X the data to fit 233 | 234 | y the target vector relative to X, optional 235 | 236 | """ 237 | self._build(X=X) 238 | 239 | for columns, transformers, options in self.built_features: 240 | t1 = datetime.now() 241 | input_df = options.get('input_df', self.input_df) 242 | 243 | if transformers is not None: 244 | with add_column_names_to_exception(columns): 245 | Xt = self._get_col_subset(X, columns, input_df) 246 | _call_fit(transformers.fit, Xt, y) 247 | logger.info(f"[FIT] {columns}: {_elapsed_secs(t1)} secs") 248 | 249 | # handle features not explicitly selected 250 | if self.built_default: # not False and not None 251 | unsel_cols = self._unselected_columns(X) 252 | with add_column_names_to_exception(unsel_cols): 253 | Xt = self._get_col_subset(X, unsel_cols, self.input_df) 254 | _call_fit(self.built_default.fit, Xt, y) 255 | return self 256 | 257 | def get_names(self, columns, transformer, x, alias=None, prefix='', 258 | suffix=''): 259 | """ 260 | Return verbose names for the transformed columns. 261 | 262 | columns name (or list of names) of the original column(s) 263 | transformer transformer - can be a TransformerPipeline 264 | x transformed columns (numpy.ndarray) 265 | alias base name to use for the selected columns 266 | """ 267 | if alias is not None: 268 | name = alias 269 | elif isinstance(columns, list): 270 | name = '_'.join(map(str, columns)) 271 | else: 272 | name = columns 273 | num_cols = x.shape[1] if len(x.shape) > 1 else 1 274 | 275 | output = [] 276 | 277 | if num_cols > 1: 278 | # If there are as many columns as classes in the transformer, 279 | # infer column names from classes names. 280 | 281 | # If we are dealing with multiple transformers for these columns 282 | # attempt to extract the names from each of them, starting from the 283 | # last one 284 | if isinstance(transformer, TransformerPipeline): 285 | inverse_steps = transformer.steps[::-1] 286 | estimators = (estimator for name, estimator in inverse_steps) 287 | names_steps = (_get_feature_names(e) for e in estimators) 288 | names = next((n for n in names_steps if n is not None), None) 289 | # Otherwise use the only estimator present 290 | else: 291 | names = _get_feature_names(transformer) 292 | 293 | if names is not None and len(names) == num_cols: 294 | output = [f"{name}_{o}" for o in names] 295 | # otherwise, return name concatenated with '_1', '_2', etc. 
296 | else: 297 | output = [name + '_' + str(o) for o in range(num_cols)] 298 | else: 299 | output = [name] 300 | 301 | if prefix == suffix == "": 302 | return output 303 | 304 | return ['{}{}{}'.format(prefix, x, suffix) for x in output] 305 | 306 | def get_dtypes(self, extracted): 307 | dtypes_features = [self.get_dtype(ex) for ex in extracted] 308 | return [dtype for dtype_feature in dtypes_features 309 | for dtype in dtype_feature] 310 | 311 | def get_dtype(self, ex): 312 | if isinstance(ex, np.ndarray) or sparse.issparse(ex): 313 | return [ex.dtype] * ex.shape[1] 314 | elif isinstance(ex, pd.DataFrame): 315 | return list(ex.dtypes) 316 | else: 317 | raise TypeError(type(ex)) 318 | 319 | def _transform(self, X, y=None, do_fit=False): 320 | """ 321 | Transform the given data with possibility to fit in advance. 322 | Avoids code duplication for implementation of transform and 323 | fit_transform. 324 | """ 325 | if do_fit: 326 | self._build(X=X) 327 | 328 | extracted = [] 329 | transformed_names_ = [] 330 | for columns, transformers, options in self.built_features: 331 | input_df = options.get('input_df', self.input_df) 332 | 333 | # columns could be a string or list of 334 | # strings; we don't care because pandas 335 | # will handle either. 336 | Xt = self._get_col_subset(X, columns, input_df) 337 | 338 | if transformers is not None: 339 | with add_column_names_to_exception(columns): 340 | if do_fit and hasattr(transformers, 'fit_transform'): 341 | t1 = datetime.now() 342 | Xt = _call_fit(transformers.fit_transform, Xt, y) 343 | logger.info(f"[FIT_TRANSFORM] {columns}: {_elapsed_secs(t1)} secs") # NOQA 344 | else: 345 | if do_fit: 346 | t1 = datetime.now() 347 | _call_fit(transformers.fit, Xt, y) 348 | logger.info( 349 | f"[FIT] {columns}: {_elapsed_secs(t1)} secs") 350 | 351 | t1 = datetime.now() 352 | Xt = transformers.transform(Xt) 353 | logger.info(f"[TRANSFORM] {columns}: {_elapsed_secs(t1)} secs") # NOQA 354 | 355 | extracted.append(_handle_feature(Xt)) 356 | 357 | alias = options.get('alias') 358 | 359 | prefix = options.get('prefix', '') 360 | suffix = options.get('suffix', '') 361 | 362 | transformed_names_ += self.get_names( 363 | columns, transformers, Xt, alias, prefix, suffix) 364 | 365 | # handle features not explicitly selected 366 | if self.built_default is not False: 367 | unsel_cols = self._unselected_columns(X) 368 | Xt = self._get_col_subset(X, unsel_cols, self.input_df) 369 | if self.built_default is not None: 370 | with add_column_names_to_exception(unsel_cols): 371 | if do_fit and hasattr(self.built_default, 'fit_transform'): 372 | Xt = _call_fit(self.built_default.fit_transform, Xt, y) 373 | else: 374 | if do_fit: 375 | _call_fit(self.built_default.fit, Xt, y) 376 | Xt = self.built_default.transform(Xt) 377 | transformed_names_ += self.get_names( 378 | unsel_cols, self.built_default, Xt) 379 | else: 380 | # if not applying a default transformer, 381 | # keep column names unmodified 382 | transformed_names_ += unsel_cols 383 | 384 | extracted.append(_handle_feature(Xt)) 385 | 386 | self.transformed_names_ = transformed_names_ 387 | 388 | # combine the feature outputs into one array. 389 | # at this point we lose track of which features 390 | # were created from which input columns, so it's 391 | # assumed that that doesn't matter to the model. 392 | 393 | # If any of the extracted features is sparse, combine sparsely. 394 | # Otherwise, combine as normal arrays. 
395 |         if any(sparse.issparse(fea) for fea in extracted):
396 |             stacked = sparse.hstack(extracted).tocsr()
397 |             # return a sparse matrix only if the mapper was initialized
398 |             # with sparse=True
399 |             if not self.sparse:
400 |                 stacked = stacked.toarray()
401 |         else:
402 |             stacked = np.hstack(extracted)
403 | 
404 |         if self.df_out:
405 |             # if no rows were dropped preserve the original index,
406 |             # otherwise use a new integer one
407 |             no_rows_dropped = len(X) == len(stacked)
408 |             if no_rows_dropped:
409 |                 index = X.index
410 |             else:
411 |                 index = None
412 | 
413 |             # output different data types, if appropriate
414 |             dtypes = self.get_dtypes(extracted)
415 |             df_out = pd.DataFrame(
416 |                 stacked,
417 |                 columns=self.transformed_names_,
418 |                 index=index)
419 |             # preserve types
420 |             for col, dtype in zip(self.transformed_names_, dtypes):
421 |                 df_out[col] = df_out[col].astype(dtype)
422 |             return df_out
423 |         else:
424 |             return stacked
425 | 
426 |     def transform(self, X):
427 |         """
428 |         Transform the given data. Assumes that fit has already been called.
429 | 
430 |         X       the data to transform
431 |         """
432 |         return self._transform(X)
433 | 
434 |     def fit_transform(self, X, y=None):
435 |         """
436 |         Fit a transformation from the pipeline and directly apply
437 |         it to the given data.
438 | 
439 |         X       the data to fit
440 | 
441 |         y       the target vector relative to X, optional
442 |         """
443 |         return self._transform(X, y, True)
444 | 
--------------------------------------------------------------------------------
/sklearn_pandas/features_generator.py:
--------------------------------------------------------------------------------
1 | def gen_features(columns, classes=None, prefix='', suffix=''):
2 |     """Generates a feature definition list which can be passed
3 |     into DataFrameMapper
4 | 
5 |     Params:
6 | 
7 |     columns     a list of column names to generate features for.
8 | 
9 |     classes     a list of classes for each feature, a list of dictionaries with
10 |                 transformer class and init parameters, or None.
11 | 
12 |                 If a list of classes is provided, then each of them is
13 |                 instantiated with default arguments. Example:
14 | 
15 |                     classes = [StandardScaler, LabelBinarizer]
16 | 
17 |                 If a list of dictionaries is provided, then each of them should
18 |                 have a 'class' key with the transformer class. All other keys are
19 |                 passed into the 'class' value constructor. Example:
20 | 
21 |                     classes = [
22 |                         {'class': StandardScaler, 'with_mean': False},
23 |                         {'class': LabelBinarizer}
24 |                     ]
25 | 
26 |                 If None is selected, then each feature is left as is.
27 | 
28 |     prefix      add prefix to transformed column names
29 | 
30 |     suffix      add suffix to transformed column names.
31 | 32 | """ 33 | if classes is None: 34 | return [(column, None) for column in columns] 35 | 36 | feature_defs = [] 37 | 38 | for column in columns: 39 | feature_transformers = [] 40 | 41 | arguments = {} 42 | if prefix and prefix != "": 43 | arguments['prefix'] = prefix 44 | if suffix and suffix != "": 45 | arguments['suffix'] = suffix 46 | 47 | classes = [cls for cls in classes if cls is not None] 48 | if not classes: 49 | feature_defs.append((column, None, arguments)) 50 | 51 | else: 52 | for definition in classes: 53 | if isinstance(definition, dict): 54 | params = definition.copy() 55 | klass = params.pop('class') 56 | feature_transformers.append(klass(**params)) 57 | else: 58 | feature_transformers.append(definition()) 59 | 60 | if not feature_transformers: 61 | feature_transformers = None 62 | 63 | feature_defs.append((column, feature_transformers, arguments)) 64 | 65 | return feature_defs 66 | -------------------------------------------------------------------------------- /sklearn_pandas/pipeline.py: -------------------------------------------------------------------------------- 1 | import six 2 | from sklearn.pipeline import _name_estimators, Pipeline 3 | from sklearn.utils import tosequence 4 | 5 | 6 | def _call_fit(fit_method, X, y=None, **kwargs): 7 | """ 8 | helper function, calls the fit or fit_transform method with the correct 9 | number of parameters 10 | 11 | fit_method: fit or fit_transform method of the transformer 12 | X: the data to fit 13 | y: the target vector relative to X, optional 14 | kwargs: any keyword arguments to the fit method 15 | 16 | return: the result of the fit or fit_transform method 17 | 18 | WARNING: if this function raises a TypeError exception, test the fit 19 | or fit_transform method passed to it in isolation as _call_fit will not 20 | distinguish TypeError due to incorrect number of arguments from 21 | other TypeError 22 | """ 23 | try: 24 | return fit_method(X, y, **kwargs) 25 | except TypeError: 26 | # fit takes only one argument 27 | return fit_method(X, **kwargs) 28 | 29 | 30 | class TransformerPipeline(Pipeline): 31 | """ 32 | Pipeline that expects all steps to be transformers taking a single X 33 | argument, an optional y argument, and having fit and transform methods. 
34 | 35 | Code is adapted from sklearn's Pipeline. 36 | """ 37 | 38 | def __init__(self, steps): 39 | names, estimators = zip(*steps) 40 | if len(dict(steps)) != len(steps): 41 | raise ValueError( 42 | "Provided step names are not unique: %s" % (names,)) 43 | 44 | # shallow copy of steps 45 | self.steps = tosequence(steps) 46 | estimator = estimators[-1] 47 | 48 | for e in estimators: 49 | if (not (hasattr(e, "fit") or hasattr(e, "fit_transform")) or not 50 | hasattr(e, "transform")): 51 | raise TypeError("All steps of the chain should " 52 | "be transforms and implement fit and transform;" 53 | " '%s' (type %s) doesn't" % (e, type(e))) 54 | 55 | if not hasattr(estimator, "fit"): 56 | raise TypeError("Last step of chain should implement fit; " 57 | "'%s' (type %s) doesn't" 58 | % (estimator, type(estimator))) 59 | 60 | def _pre_transform(self, X, y=None, **fit_params): 61 | fit_params_steps = dict((step, {}) for step, _ in self.steps) 62 | for pname, pval in six.iteritems(fit_params): 63 | step, param = pname.split('__', 1) 64 | fit_params_steps[step][param] = pval 65 | Xt = X 66 | for name, transform in self.steps[:-1]: 67 | if hasattr(transform, "fit_transform"): 68 | Xt = _call_fit(transform.fit_transform, 69 | Xt, y, **fit_params_steps[name]) 70 | else: 71 | Xt = _call_fit(transform.fit, 72 | Xt, y, **fit_params_steps[name]).transform(Xt) 73 | return Xt, fit_params_steps[self.steps[-1][0]] 74 | 75 | def fit(self, X, y=None, **fit_params): 76 | Xt, fit_params = self._pre_transform(X, y, **fit_params) 77 | _call_fit(self.steps[-1][-1].fit, Xt, y, **fit_params) 78 | return self 79 | 80 | def fit_transform(self, X, y=None, **fit_params): 81 | Xt, fit_params = self._pre_transform(X, y, **fit_params) 82 | if hasattr(self.steps[-1][-1], 'fit_transform'): 83 | return _call_fit(self.steps[-1][-1].fit_transform, 84 | Xt, y, **fit_params) 85 | else: 86 | return _call_fit(self.steps[-1][-1].fit, 87 | Xt, y, **fit_params).transform(Xt) 88 | 89 | 90 | def make_transformer_pipeline(*steps): 91 | """Construct a TransformerPipeline from the given estimators. 92 | """ 93 | return TransformerPipeline(_name_estimators(steps)) 94 | -------------------------------------------------------------------------------- /sklearn_pandas/transformers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import TransformerMixin 4 | import warnings 5 | 6 | 7 | def _get_mask(X, value): 8 | """ 9 | Compute the boolean mask X == value. 10 | """ 11 | if value == "NaN" or \ 12 | value is None or \ 13 | (isinstance(value, float) and np.isnan(value)): 14 | return pd.isnull(X) 15 | else: 16 | return X == value 17 | 18 | 19 | class NumericalTransformer(TransformerMixin): 20 | """ 21 | Provides commonly used numerical transformers. 22 | """ 23 | SUPPORTED_FUNCTIONS = ['log', 'log1p'] 24 | 25 | def __init__(self, func): 26 | """ 27 | Params 28 | 29 | func function to apply to the input columns. The function is 30 | applied to each value. Supported functions are defined 31 | in the SUPPORTED_FUNCTIONS variable. Raises an 32 | AssertionError if the function is not supported. 33 | """ 34 | 35 | warnings.warn(""" 36 | NumericalTransformer will be deprecated in version 3.0.
37 | Please use sklearn.base.TransformerMixin to write 38 | custom transformers 39 | """, DeprecationWarning) 40 | 41 | assert func in self.SUPPORTED_FUNCTIONS, \ 42 | f"Only the following funcs are supported: {self.SUPPORTED_FUNCTIONS}" 43 | super(NumericalTransformer, self).__init__() 44 | self.__func = func 45 | 46 | def fit(self, X, y=None): 47 | return self 48 | 49 | def transform(self, X, y=None): 50 | if self.__func == 'log1p': 51 | return np.vectorize(np.log1p)(X) 52 | elif self.__func == 'log': 53 | return np.vectorize(np.log)(X) 54 | 55 | raise ValueError(f"Invalid function name: {self.__func}") 56 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import Mock 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn_pandas import DataFrameMapper 6 | from sklearn.compose import make_column_selector 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | 10 | class GetStartWith: 11 | def __init__(self, start_str): 12 | self.start_str = start_str 13 | 14 | def __call__(self, X: pd.DataFrame) -> list: 15 | return [c for c in X.columns if c.startswith(self.start_str)] 16 | 17 | 18 | df = pd.DataFrame({ 19 | 'sepal length (cm)': [1.0, 2.0, 3.0], 20 | 'sepal width (cm)': [1.0, 2.0, 3.0], 21 | 'petal length (cm)': [1.0, 2.0, 3.0], 22 | 'petal width (cm)': [1.0, 2.0, 3.0] 23 | }) 24 | t = DataFrameMapper([ 25 | (make_column_selector(dtype_include=float), StandardScaler(), {'alias': 'x'}), 26 | (GetStartWith('petal'), None, {'alias': 'petal'}) 27 | ], df_out=True, default=False) 28 | 29 | t.fit(df) 30 | print(t.transform(df).shape) 31 | -------------------------------------------------------------------------------- /tests/test_data/cars.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/sklearn-pandas/c9db2d6dcbf515eade751073f43318e43cae5177/tests/test_data/cars.csv.gz -------------------------------------------------------------------------------- /tests/test_dataframe_mapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | 3 | import pytest 4 | from unittest.mock import Mock 5 | from pandas import DataFrame 6 | import pandas as pd 7 | from scipy import sparse 8 | from sklearn.datasets import load_iris 9 | from sklearn.pipeline import Pipeline 10 | from sklearn.model_selection import cross_val_score 11 | from sklearn.svm import SVC 12 | from sklearn.feature_extraction.text import CountVectorizer 13 | from sklearn.feature_extraction import DictVectorizer 14 | from sklearn.preprocessing import ( 15 | StandardScaler, OneHotEncoder, LabelBinarizer) 16 | from sklearn.impute import SimpleImputer as Imputer 17 | from sklearn.feature_selection import SelectKBest, chi2 18 | from sklearn.base import BaseEstimator, TransformerMixin 19 | import sklearn.decomposition 20 | import numpy as np 21 | from numpy.testing import assert_array_equal 22 | import pickle 23 | from sklearn.compose import make_column_selector 24 | 25 | from sklearn_pandas import DataFrameMapper 26 | from sklearn_pandas.dataframe_mapper import _handle_feature, _build_transformer 27 | from sklearn_pandas.pipeline import TransformerPipeline 28 | 29 | 30 | class MockXTransformer(object): 31 | """ 32 | Mock transformer that accepts no y argument.
33 | """ 34 | def fit(self, X): 35 | return self 36 | 37 | def transform(self, X): 38 | return X 39 | 40 | 41 | class MockTClassifier(object): 42 | """ 43 | Mock transformer/classifier. 44 | """ 45 | def fit(self, X, y=None): 46 | return self 47 | 48 | def transform(self, X): 49 | return X 50 | 51 | def predict(self, X): 52 | return True 53 | 54 | 55 | class DateEncoder(): 56 | def fit(self, X, y=None): 57 | return self 58 | 59 | def transform(self, X): 60 | dt = X.dt 61 | return pd.concat([dt.year, dt.month, dt.day], axis=1) 62 | 63 | 64 | class ToSparseTransformer(BaseEstimator, TransformerMixin): 65 | """ 66 | Transforms numpy matrix to sparse format. 67 | """ 68 | def fit(self, X): 69 | return self 70 | 71 | def transform(self, X): 72 | return sparse.csr_matrix(X) 73 | 74 | 75 | class CustomTransformer(BaseEstimator, TransformerMixin): 76 | """ 77 | Example of transformer in which the number of classes 78 | is not equals to the number of output columns. 79 | """ 80 | def fit(self, X, y=None): 81 | self.min = X.min() 82 | self.classes_ = np.unique(X) 83 | return self 84 | 85 | def transform(self, X): 86 | classes = np.unique(X) 87 | if len(np.setdiff1d(classes, self.classes_)) > 0: 88 | raise ValueError('Unknown values found.') 89 | return X - self.min 90 | 91 | 92 | class MockImageTransformer(BaseEstimator, TransformerMixin): 93 | """ 94 | Example transformer that takes the max of a 2d vector 95 | then scales the result. 96 | """ 97 | def __init__(self, multiplier=10.0): 98 | self.multiplier = multiplier 99 | 100 | def fit(self, X, y=None): 101 | return self 102 | 103 | def transform(self, X): 104 | assert isinstance(X, pd.DataFrame) 105 | for col in X.columns: 106 | X[col] = X[col].map(lambda img: np.max(img)) 107 | return X * self.multiplier 108 | 109 | 110 | @pytest.fixture 111 | def simple_dataframe(): 112 | return pd.DataFrame({'a': [1, 2, 3]}) 113 | 114 | 115 | @pytest.fixture 116 | def complex_dataframe(): 117 | return pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'c'], 118 | 'feat1': [1, 2, 3, 4, 5, 6], 119 | 'feat2': [1, 2, 3, 2, 3, 4]}) 120 | 121 | 122 | @pytest.fixture 123 | def complex_object_dataframe(): 124 | return pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'c'], 125 | 'feat1': [1, 2, 3, 4, 5, 6], 126 | 'feat2': [1, 2, 3, 2, 3, 4], 127 | 'img2d': [1*np.eye(2), 2*np.eye(2), 3*np.eye(2), 128 | 4*np.eye(2), 5*np.eye(2), 6*np.eye(2)]}) 129 | 130 | 131 | @pytest.fixture 132 | def multiindex_dataframe(): 133 | """Example MultiIndex DataFrame, taken from pandas documentation 134 | """ 135 | iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] 136 | index = pd.MultiIndex.from_product(iterables, names=['first', 'second']) 137 | df = pd.DataFrame(np.random.randn(10, 8), columns=index) 138 | return df 139 | 140 | 141 | @pytest.fixture 142 | def multiindex_dataframe_incomplete(multiindex_dataframe): 143 | """Example MultiIndex DataFrame with missing entries 144 | """ 145 | df = multiindex_dataframe 146 | mask_array = np.zeros(df.size) 147 | mask_array[:20] = 1 148 | np.random.shuffle(mask_array) 149 | mask = mask_array.reshape(df.shape).astype(bool) 150 | df.mask(mask, inplace=True) 151 | return df 152 | 153 | 154 | def test_transformed_names_simple(simple_dataframe): 155 | """ 156 | Get transformed names of features in `transformed_names` attribute 157 | for simple transformation 158 | """ 159 | df = simple_dataframe 160 | mapper = DataFrameMapper([('a', None)]) 161 | mapper.fit_transform(df) 162 | assert mapper.transformed_names_ == ['a'] 163 | 164 | 165 | def 
test_transformed_names_binarizer(complex_dataframe): 166 | """ 167 | Get transformed names of features in `transformed_names` attribute 168 | for a transformation that multiplies the number of columns 169 | """ 170 | df = complex_dataframe 171 | mapper = DataFrameMapper([('target', LabelBinarizer())]) 172 | mapper.fit_transform(df) 173 | assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c'] 174 | 175 | 176 | def test_logging(caplog, complex_dataframe): 177 | """ 178 | Get transformed names of features in `transformed_names` attribute 179 | for a transformation that multiplies the number of columns 180 | """ 181 | import logging 182 | logger = logging.getLogger('sklearn_pandas') 183 | logger.setLevel(logging.INFO) 184 | df = complex_dataframe 185 | mapper = DataFrameMapper([('target', LabelBinarizer())]) 186 | mapper.fit_transform(df) 187 | assert '[FIT_TRANSFORM] target:' in caplog.text 188 | 189 | 190 | def test_transformed_names_binarizer_unicode(): 191 | df = pd.DataFrame({'target': [u'ñ', u'á', u'é']}) 192 | mapper = DataFrameMapper([('target', LabelBinarizer())]) 193 | mapper.fit_transform(df) 194 | expected_names = {u'target_ñ', u'target_á', u'target_é'} 195 | assert set(mapper.transformed_names_) == expected_names 196 | 197 | 198 | def test_transformed_names_transformers_list(complex_dataframe): 199 | """ 200 | When using a list of transformers, use them in inverse order to get the 201 | transformed names 202 | """ 203 | df = complex_dataframe 204 | mapper = DataFrameMapper([ 205 | ('target', [LabelBinarizer(), MockXTransformer()]) 206 | ]) 207 | mapper.fit_transform(df) 208 | assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c'] 209 | 210 | 211 | def test_transformed_names_simple_alias(simple_dataframe): 212 | """ 213 | If we specify an alias for a single output column, it is used for the 214 | output 215 | """ 216 | df = simple_dataframe 217 | mapper = DataFrameMapper([('a', None, {'alias': 'new_name'})]) 218 | mapper.fit_transform(df) 219 | assert mapper.transformed_names_ == ['new_name'] 220 | 221 | 222 | def test_transformed_names_complex_alias(complex_dataframe): 223 | """ 224 | If we specify an alias for a multiple output column, it is used for the 225 | output 226 | """ 227 | df = complex_dataframe 228 | mapper = DataFrameMapper([('target', LabelBinarizer(), {'alias': 'new'})]) 229 | mapper.fit_transform(df) 230 | assert mapper.transformed_names_ == ['new_a', 'new_b', 'new_c'] 231 | 232 | 233 | def test_exception_column_context_transform(simple_dataframe): 234 | """ 235 | If an exception is raised when transforming a column, 236 | the exception includes the name of the column being transformed 237 | """ 238 | class FailingTransformer(object): 239 | def fit(self, X): 240 | pass 241 | 242 | def transform(self, X): 243 | raise Exception('Some exception') 244 | 245 | df = simple_dataframe 246 | mapper = DataFrameMapper([('a', FailingTransformer())]) 247 | mapper.fit(df) 248 | 249 | with pytest.raises(Exception, match='a: Some exception'): 250 | mapper.transform(df) 251 | 252 | 253 | def test_exception_column_context_fit(simple_dataframe): 254 | """ 255 | If an exception is raised when fit a column, 256 | the exception includes the name of the column being fitted 257 | """ 258 | class FailingFitter(object): 259 | def fit(self, X): 260 | raise Exception('Some exception') 261 | 262 | df = simple_dataframe 263 | mapper = DataFrameMapper([('a', FailingFitter())]) 264 | 265 | with pytest.raises(Exception, match='a: Some exception'): 266 | 
mapper.fit(df) 267 | 268 | 269 | def test_simple_df(simple_dataframe): 270 | """ 271 | Get a dataframe from a simple mapped dataframe 272 | """ 273 | df = simple_dataframe 274 | mapper = DataFrameMapper([('a', None)], df_out=True) 275 | transformed = mapper.fit_transform(df) 276 | assert type(transformed) == pd.DataFrame 277 | assert len(transformed["a"]) == len(simple_dataframe["a"]) 278 | 279 | 280 | def test_complex_df(complex_dataframe): 281 | """ 282 | Get a dataframe from a complex mapped dataframe 283 | """ 284 | df = complex_dataframe 285 | mapper = DataFrameMapper( 286 | [('target', None), ('feat1', None), ('feat2', None)], 287 | df_out=True) 288 | transformed = mapper.fit_transform(df) 289 | assert len(transformed) == len(complex_dataframe) 290 | for c in df.columns: 291 | assert len(transformed[c]) == len(df[c]) 292 | 293 | 294 | def test_complex_object_df(complex_object_dataframe): 295 | """ 296 | Get a dataframe from a complex dataframe with 2d features 297 | """ 298 | df = complex_object_dataframe 299 | img_scale = 10 300 | mapper = DataFrameMapper( 301 | [('target', None), ('feat1', None), 302 | (make_column_selector('feat2'), StandardScaler()), 303 | (make_column_selector('img2d'), MockImageTransformer(img_scale))], 304 | df_out=True, input_df=True) 305 | transformed = mapper.fit_transform(df) 306 | assert len(transformed) == len(complex_object_dataframe) 307 | assert np.isclose( 308 | np.sum(transformed['img2d']), 309 | np.max(np.sum(df['img2d'])) * img_scale, atol=1e-12) 310 | 311 | 312 | def test_numeric_column_names(complex_dataframe): 313 | """ 314 | Get a dataframe from a complex mapped dataframe with numeric column names 315 | """ 316 | df = complex_dataframe 317 | df.columns = [0, 1, 2] 318 | mapper = DataFrameMapper( 319 | [(0, None), (1, None), (2, None)], df_out=True) 320 | transformed = mapper.fit_transform(df) 321 | assert len(transformed) == len(complex_dataframe) 322 | for c in df.columns: 323 | assert len(transformed[c]) == len(df[c]) 324 | 325 | 326 | def test_multiindex_df(multiindex_dataframe_incomplete): 327 | """ 328 | Get a dataframe from a multiindex dataframe with missing data 329 | """ 330 | df = multiindex_dataframe_incomplete 331 | mapper = DataFrameMapper([([c], Imputer()) for c in df.columns], 332 | df_out=True) 333 | transformed = mapper.fit_transform(df) 334 | assert len(transformed) == len(multiindex_dataframe_incomplete) 335 | for c in df.columns: 336 | assert len(transformed[str(c)]) == len(df[c]) 337 | 338 | 339 | def test_binarizer_df(): 340 | """ 341 | Check level names from LabelBinarizer 342 | """ 343 | df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'a']}) 344 | mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True) 345 | transformed = mapper.fit_transform(df) 346 | cols = transformed.columns 347 | assert len(cols) == 3 348 | assert cols[0] == 'target_a' 349 | assert cols[1] == 'target_b' 350 | assert cols[2] == 'target_c' 351 | 352 | 353 | def test_binarizer_int_df(): 354 | """ 355 | Check level names from LabelBinarizer for a numeric array. 
356 | """ 357 | df = pd.DataFrame({'target': [5, 5, 6, 6, 7, 5]}) 358 | mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True) 359 | transformed = mapper.fit_transform(df) 360 | cols = transformed.columns 361 | assert len(cols) == 3 362 | assert cols[0] == 'target_5' 363 | assert cols[1] == 'target_6' 364 | assert cols[2] == 'target_7' 365 | 366 | 367 | def test_binarizer2_df(): 368 | """ 369 | Check level names from LabelBinarizer with just one output column 370 | """ 371 | df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']}) 372 | mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True) 373 | transformed = mapper.fit_transform(df) 374 | cols = transformed.columns 375 | assert len(cols) == 1 376 | assert cols[0] == 'target' 377 | 378 | 379 | def test_onehot_df(): 380 | """ 381 | Check level ids from one-hot 382 | """ 383 | df = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]}) 384 | mapper = DataFrameMapper([(['target'], OneHotEncoder())], df_out=True) 385 | transformed = mapper.fit_transform(df) 386 | cols = transformed.columns 387 | assert len(cols) == 4 388 | assert cols[0] == 'target_x0_0' 389 | assert cols[3] == 'target_x0_3' 390 | 391 | 392 | def test_customtransform_df(): 393 | """ 394 | Check level ids from a transformer in which 395 | the number of classes is not equals to the number of output columns. 396 | """ 397 | df = pd.DataFrame({'target': [6, 5, 7, 5, 4, 8, 8]}) 398 | mapper = DataFrameMapper([(['target'], CustomTransformer())], df_out=True) 399 | transformed = mapper.fit_transform(df) 400 | cols = transformed.columns 401 | assert len(mapper.features[0][1].classes_) == 5 402 | assert len(cols) == 1 403 | assert cols[0] == 'target' 404 | 405 | 406 | def test_preserve_df_index(): 407 | """ 408 | The index is preserved when df_out=True 409 | """ 410 | df = pd.DataFrame({'target': [1, 2, 3]}, 411 | index=['a', 'b', 'c']) 412 | mapper = DataFrameMapper([('target', None)], 413 | df_out=True) 414 | 415 | transformed = mapper.fit_transform(df) 416 | 417 | assert_array_equal(transformed.index, df.index) 418 | 419 | 420 | def test_preserve_df_index_rows_dropped(): 421 | """ 422 | If df_out=True but the original df index length doesn't 423 | match the number of final rows, use a numeric index 424 | """ 425 | class DropLastRowTransformer(object): 426 | def fit(self, X): 427 | return self 428 | 429 | def transform(self, X): 430 | return X[:-1] 431 | 432 | df = pd.DataFrame({'target': [1, 2, 3]}, 433 | index=['a', 'b', 'c']) 434 | mapper = DataFrameMapper([('target', DropLastRowTransformer())], 435 | df_out=True) 436 | 437 | transformed = mapper.fit_transform(df) 438 | 439 | assert_array_equal(transformed.index, np.array([0, 1])) 440 | 441 | 442 | def test_pca(complex_dataframe): 443 | """ 444 | Check multi in and out with PCA 445 | """ 446 | df = complex_dataframe 447 | mapper = DataFrameMapper( 448 | [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))], 449 | df_out=True) 450 | transformed = mapper.fit_transform(df) 451 | cols = transformed.columns 452 | assert len(cols) == 2 453 | assert cols[0] == 'feat1_feat2_0' 454 | assert cols[1] == 'feat1_feat2_1' 455 | 456 | 457 | def test_fit_transform(simple_dataframe): 458 | """ 459 | Check that custom fit_transform methods of the transformers are invoked. 
460 | """ 461 | df = simple_dataframe 462 | mock_transformer = Mock() 463 | # return something of measurable length but does nothing 464 | mock_transformer.fit_transform.return_value = np.array([1, 2, 3]) 465 | mapper = DataFrameMapper([("a", mock_transformer)]) 466 | mapper.fit_transform(df) 467 | assert mock_transformer.fit_transform.called 468 | 469 | 470 | def test_fit_transform_equiv_mock(simple_dataframe): 471 | """ 472 | Check for equivalent results for code paths fit_transform 473 | versus fit and transform in DataFrameMapper using the mock 474 | transformer which does not implement a custom fit_transform. 475 | """ 476 | df = simple_dataframe 477 | mapper = DataFrameMapper([('a', MockXTransformer())]) 478 | transformed_combined = mapper.fit_transform(df) 479 | transformed_separate = mapper.fit(df).transform(df) 480 | assert np.all(transformed_combined == transformed_separate) 481 | 482 | 483 | def test_fit_transform_equiv_pca(complex_dataframe): 484 | """ 485 | Check for equivalent results for code paths fit_transform 486 | versus fit and transform in DataFrameMapper and transformer 487 | using PCA which implements a custom fit_transform. The 488 | equivalence of both paths in the transformer only can be 489 | asserted since this is tested in the sklearn tests 490 | scikit-learn/sklearn/decomposition/tests/test_pca.py 491 | """ 492 | df = complex_dataframe 493 | mapper = DataFrameMapper( 494 | [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))], 495 | df_out=True) 496 | transformed_combined = mapper.fit_transform(df) 497 | transformed_separate = mapper.fit(df).transform(df) 498 | assert np.allclose(transformed_combined, transformed_separate) 499 | 500 | 501 | def test_input_df_true_first_transformer(simple_dataframe, monkeypatch): 502 | """ 503 | If input_df is True, the first transformer is passed 504 | a pd.Series instead of an np.array 505 | """ 506 | df = simple_dataframe 507 | monkeypatch.setattr(MockXTransformer, 'fit', Mock()) 508 | monkeypatch.setattr(MockXTransformer, 'transform', 509 | Mock(return_value=np.array([1, 2, 3]))) 510 | mapper = DataFrameMapper([ 511 | ('a', MockXTransformer()) 512 | ], input_df=True) 513 | out = mapper.fit_transform(df) 514 | 515 | args, _ = MockXTransformer().fit.call_args 516 | assert isinstance(args[0], pd.Series) 517 | 518 | args, _ = MockXTransformer().transform.call_args 519 | assert isinstance(args[0], pd.Series) 520 | 521 | assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1)) 522 | 523 | 524 | def test_input_df_true_next_transformers(simple_dataframe, monkeypatch): 525 | """ 526 | If input_df is True, the subsequent transformers get passed pandas 527 | objects instead of numpy arrays (given the previous transformers 528 | output pandas objects as well) 529 | """ 530 | df = simple_dataframe 531 | monkeypatch.setattr(MockTClassifier, 'fit', Mock()) 532 | monkeypatch.setattr(MockTClassifier, 'transform', 533 | Mock(return_value=pd.Series([1, 2, 3]))) 534 | mapper = DataFrameMapper([ 535 | ('a', [MockXTransformer(), MockTClassifier()]) 536 | ], input_df=True) 537 | mapper.fit(df) 538 | out = mapper.transform(df) 539 | 540 | args, _ = MockTClassifier().fit.call_args 541 | assert isinstance(args[0], pd.Series) 542 | 543 | assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1)) 544 | 545 | 546 | def test_input_df_true_multiple_cols(complex_dataframe): 547 | """ 548 | When input_df is True, applying transformers to multiple columns 549 | works as expected 550 | """ 551 | df = complex_dataframe 552 | 553 | mapper = 
DataFrameMapper([ 554 | ('target', MockXTransformer()), 555 | ('feat1', MockXTransformer()), 556 | ], input_df=True) 557 | out = mapper.fit_transform(df) 558 | 559 | assert_array_equal(out[:, 0], df['target'].values) 560 | assert_array_equal(out[:, 1], df['feat1'].values) 561 | 562 | 563 | def test_input_df_date_encoder(): 564 | """ 565 | When input_df is True we can apply a transformer that only works 566 | with pandas dataframes like a DateEncoder 567 | """ 568 | df = pd.DataFrame( 569 | {'dates': pd.date_range('2015-10-30', '2015-11-02')}) 570 | mapper = DataFrameMapper([ 571 | ('dates', DateEncoder()) 572 | ], input_df=True) 573 | out = mapper.fit_transform(df) 574 | expected = np.array([ 575 | [2015, 10, 30], 576 | [2015, 10, 31], 577 | [2015, 11, 1], 578 | [2015, 11, 2] 579 | ]) 580 | assert_array_equal(out, expected) 581 | 582 | 583 | def test_local_input_df_date_encoder(): 584 | """ 585 | When input_df is True we can apply a transformer that only works 586 | with pandas dataframes like a DateEncoder 587 | """ 588 | df = pd.DataFrame( 589 | {'dates': pd.date_range('2015-10-30', '2015-11-02')}) 590 | mapper = DataFrameMapper([ 591 | ('dates', DateEncoder(), {'input_df': True}) 592 | ], input_df=False) 593 | out = mapper.fit_transform(df) 594 | expected = np.array([ 595 | [2015, 10, 30], 596 | [2015, 10, 31], 597 | [2015, 11, 1], 598 | [2015, 11, 2] 599 | ]) 600 | assert_array_equal(out, expected) 601 | 602 | 603 | def test_nonexistent_columns_explicit_fail(simple_dataframe): 604 | """ 605 | If a nonexistent column is selected, KeyError is raised. 606 | """ 607 | mapper = DataFrameMapper(None) 608 | with pytest.raises(KeyError): 609 | mapper._get_col_subset(simple_dataframe, ['nonexistent_feature']) 610 | 611 | 612 | def test_get_col_subset_single_column_array(simple_dataframe): 613 | """ 614 | Selecting a single column should return a 1-dimensional numpy array. 615 | """ 616 | mapper = DataFrameMapper(None) 617 | array = mapper._get_col_subset(simple_dataframe, "a") 618 | 619 | assert type(array) == np.ndarray 620 | assert array.shape == (len(simple_dataframe["a"]),) 621 | 622 | 623 | def test_get_col_subset_single_column_list(simple_dataframe): 624 | """ 625 | Selecting a list of columns (even if the list contains a single element) 626 | should return a 2-dimensional numpy array. 627 | """ 628 | mapper = DataFrameMapper(None) 629 | array = mapper._get_col_subset(simple_dataframe, ["a"]) 630 | 631 | assert type(array) == np.ndarray 632 | assert array.shape == (len(simple_dataframe["a"]), 1) 633 | 634 | 635 | def test_cols_string_array(simple_dataframe): 636 | """ 637 | If a string is specified as the columns, the transformer 638 | is called with a 1-d array as input. 639 | """ 640 | df = simple_dataframe 641 | mock_transformer = Mock() 642 | mapper = DataFrameMapper([("a", mock_transformer)]) 643 | 644 | mapper.fit(df) 645 | args, kwargs = mock_transformer.fit.call_args 646 | assert args[0].shape == (3,) 647 | 648 | 649 | def test_cols_list_column_vector(simple_dataframe): 650 | """ 651 | If a one-element list is specified as the columns, the transformer 652 | is called with a column vector as input. 653 | """ 654 | df = simple_dataframe 655 | mock_transformer = Mock() 656 | mapper = DataFrameMapper([(["a"], mock_transformer)]) 657 | 658 | mapper.fit(df) 659 | args, kwargs = mock_transformer.fit.call_args 660 | assert args[0].shape == (3, 1) 661 | 662 | 663 | def test_handle_feature_2dim(): 664 | """ 665 | 2-dimensional arrays are returned unchanged. 
666 | """ 667 | array = np.array([[1, 2], [3, 4]]) 668 | assert_array_equal(_handle_feature(array), array) 669 | 670 | 671 | def test_handle_feature_1dim(): 672 | """ 673 | 1-dimensional arrays are converted to 2-dimensional column vectors. 674 | """ 675 | array = np.array([1, 2]) 676 | assert_array_equal(_handle_feature(array), np.array([[1], [2]])) 677 | 678 | 679 | def test_build_transformers(): 680 | """ 681 | When a list of transformers is passed, return a pipeline with 682 | each element of the iterable as a step of the pipeline. 683 | """ 684 | transformers = [MockTClassifier(), MockTClassifier()] 685 | pipeline = _build_transformer(transformers) 686 | assert isinstance(pipeline, Pipeline) 687 | for ix, transformer in enumerate(transformers): 688 | assert pipeline.steps[ix][1] == transformer 689 | 690 | 691 | def test_selected_columns(): 692 | """ 693 | selected_columns returns a set of the columns appearing in the features 694 | of the mapper. 695 | """ 696 | mapper = DataFrameMapper([ 697 | ('a', None), 698 | (['a', 'b'], None) 699 | ]) 700 | assert mapper._selected_columns == {'a', 'b'} 701 | 702 | 703 | def test_unselected_columns(): 704 | """ 705 | unselected_columns returns a list of the columns not appearing in the 706 | features of the mapper but present in the given dataframe. 707 | """ 708 | df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]}) 709 | mapper = DataFrameMapper([ 710 | ('a', None), 711 | (['a', 'b'], None) 712 | ]) 713 | assert 'c' in mapper._unselected_columns(df) 714 | 715 | 716 | def test_drop_and_default_false(): 717 | """ 718 | If default=False, non explicitly selected columns and drop columns 719 | are discarded. 720 | """ 721 | df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]}) 722 | mapper = DataFrameMapper([ 723 | ('a', None) 724 | ], drop_cols=['c'], default=False) 725 | transformed = mapper.fit_transform(df) 726 | assert transformed.shape == (1, 1) 727 | assert mapper.transformed_names_ == ['a'] 728 | 729 | 730 | def test_drop_and_default_none(): 731 | """ 732 | If default=None, drop columns are discarded and 733 | remaining non explicitly selected columns are passed through untransformed 734 | """ 735 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 736 | mapper = DataFrameMapper([ 737 | ('a', None) 738 | ], drop_cols=['c'], default=None) 739 | 740 | transformed = mapper.fit_transform(df) 741 | assert transformed.shape == (3, 2) 742 | assert mapper.transformed_names_ == ['a', 'b'] 743 | 744 | 745 | def test_conflicting_drop(): 746 | """ 747 | Drop column name shouldn't get confused with transformed columns. 748 | """ 749 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 750 | mapper = DataFrameMapper([ 751 | ('a', None) 752 | ], drop_cols=['a'], default=False) 753 | 754 | transformed = mapper.fit_transform(df) 755 | assert transformed.shape == (3, 1) 756 | assert mapper.transformed_names_ == ['a'] 757 | 758 | 759 | def test_default_false(): 760 | """ 761 | If default=False, non explicitly selected columns are discarded. 762 | """ 763 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 764 | mapper = DataFrameMapper([ 765 | ('b', None) 766 | ], default=False) 767 | 768 | transformed = mapper.fit_transform(df) 769 | assert transformed.shape == (3, 1) 770 | 771 | 772 | def test_default_none(): 773 | """ 774 | If default=None, non explicitly selected columns are passed through 775 | untransformed. 
776 | """ 777 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 778 | mapper = DataFrameMapper([ 779 | (['a'], OneHotEncoder()) 780 | ], default=None) 781 | 782 | transformed = mapper.fit_transform(df) 783 | assert (transformed[:, 3] == np.array([3, 5, 7]).T).all() 784 | 785 | 786 | def test_default_none_names(): 787 | """ 788 | If default=None, column names are returned unmodified. 789 | """ 790 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 791 | mapper = DataFrameMapper([], default=None) 792 | 793 | mapper.fit_transform(df) 794 | assert mapper.transformed_names_ == ['a', 'b'] 795 | 796 | 797 | def test_default_transformer(): 798 | """ 799 | If default=Transformer, non explicitly selected columns are applied this 800 | transformer. 801 | """ 802 | df = pd.DataFrame({'a': [1, np.nan, 3], }) 803 | mapper = DataFrameMapper([], default=Imputer()) 804 | 805 | transformed = mapper.fit_transform(df) 806 | assert (transformed[: 0] == np.array([1., 2., 3.])).all() 807 | 808 | 809 | def test_list_transformers_single_arg(simple_dataframe): 810 | """ 811 | Multiple transformers can be specified in a list even if some of them 812 | only accept one X argument instead of two (X, y). 813 | """ 814 | mapper = DataFrameMapper([ 815 | ('a', [MockXTransformer()]) 816 | ]) 817 | # doesn't fail 818 | mapper.fit_transform(simple_dataframe) 819 | 820 | 821 | def test_list_transformers(): 822 | """ 823 | Specifying a list of transformers applies them sequentially to the 824 | selected column. 825 | """ 826 | dataframe = pd.DataFrame({"a": [1, np.nan, 3], "b": [1, 5, 7]}, 827 | dtype=np.float64) 828 | 829 | mapper = DataFrameMapper([ 830 | (["a"], [Imputer(), StandardScaler()]), 831 | (["b"], StandardScaler()), 832 | ]) 833 | dmatrix = mapper.fit_transform(dataframe) 834 | 835 | assert pd.isnull(dmatrix).sum() == 0 # no null values 836 | 837 | # all features have mean 0 and std deviation 1 (standardized) 838 | assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all() 839 | assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all() 840 | 841 | 842 | def test_list_transformers_old_unpickle(simple_dataframe): 843 | mapper = DataFrameMapper(None) 844 | # simulate the mapper was created with < 1.0.0 code 845 | mapper.features = [('a', [MockXTransformer()])] 846 | mapper_pickled = pickle.dumps(mapper) 847 | 848 | loaded_mapper = pickle.loads(mapper_pickled) 849 | transformer = loaded_mapper.features[0][1] 850 | assert isinstance(transformer, TransformerPipeline) 851 | assert isinstance(transformer.steps[0][1], MockXTransformer) 852 | 853 | 854 | def test_sparse_features(simple_dataframe): 855 | """ 856 | If any of the extracted features is sparse and "sparse" argument 857 | is true, the hstacked result is also sparse. 858 | """ 859 | df = simple_dataframe 860 | mapper = DataFrameMapper([ 861 | ("a", ToSparseTransformer()) 862 | ], sparse=True) 863 | dmatrix = mapper.fit_transform(df) 864 | 865 | assert type(dmatrix) == sparse.csr.csr_matrix 866 | 867 | 868 | def test_sparse_off(simple_dataframe): 869 | """ 870 | If the resulting features are sparse but the "sparse" argument 871 | of the mapper is False, return a non-sparse matrix. 
872 | """ 873 | df = simple_dataframe 874 | mapper = DataFrameMapper([ 875 | ("a", ToSparseTransformer()) 876 | ], sparse=False) 877 | 878 | dmatrix = mapper.fit_transform(df) 879 | assert type(dmatrix) != sparse.csr.csr_matrix 880 | 881 | 882 | def test_fit_with_optional_y_arg(complex_dataframe): 883 | """ 884 | Transformers with an optional y argument in the fit method 885 | are handled correctly 886 | """ 887 | df = complex_dataframe 888 | mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())]) 889 | # doesn't fail 890 | mapper.fit(df[['feat1', 'feat2']], df['target']) 891 | 892 | 893 | def test_fit_with_required_y_arg(complex_dataframe): 894 | """ 895 | Transformers with a required y argument in the fit method 896 | are handled and perform correctly 897 | """ 898 | df = complex_dataframe 899 | mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))]) 900 | 901 | # fit, doesn't fail 902 | ft_arr = mapper.fit(df[['feat1', 'feat2']], df['target']) 903 | 904 | # fit_transform 905 | ft_arr = mapper.fit_transform(df[['feat1', 'feat2']], df['target']) 906 | assert_array_equal(ft_arr, df[['feat1']].values) 907 | 908 | # transform 909 | t_arr = mapper.transform(df[['feat1', 'feat2']]) 910 | assert_array_equal(t_arr, df[['feat1']].values) 911 | 912 | 913 | # Integration tests with real dataframes 914 | 915 | @pytest.fixture 916 | def iris_dataframe(): 917 | iris = load_iris() 918 | return DataFrame( 919 | data={ 920 | iris.feature_names[0]: iris.data[:, 0], 921 | iris.feature_names[1]: iris.data[:, 1], 922 | iris.feature_names[2]: iris.data[:, 2], 923 | iris.feature_names[3]: iris.data[:, 3], 924 | "species": np.array([iris.target_names[e] for e in iris.target]) 925 | } 926 | ) 927 | 928 | 929 | @pytest.fixture 930 | def cars_dataframe(): 931 | return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip') 932 | 933 | 934 | def test_with_iris_dataframe(iris_dataframe): 935 | pipeline = Pipeline([ 936 | ("preprocess", DataFrameMapper([ 937 | ("petal length (cm)", None), 938 | ("petal width (cm)", None), 939 | ("sepal length (cm)", None), 940 | ("sepal width (cm)", None), 941 | ])), 942 | ("classify", SVC(kernel='linear')) 943 | ]) 944 | data = iris_dataframe.drop("species", axis=1) 945 | labels = iris_dataframe["species"] 946 | scores = cross_val_score(pipeline, data, labels) 947 | assert scores.mean() > 0.96 948 | assert (scores.std() * 2) < 0.04 949 | 950 | 951 | def test_dict_vectorizer(): 952 | df = pd.DataFrame( 953 | [[{'a': 1, 'b': 2}], [{'a': 3}]], 954 | columns=['colA'] 955 | ) 956 | 957 | outdf = DataFrameMapper( 958 | [('colA', DictVectorizer())], 959 | df_out=True, 960 | default=False 961 | ).fit_transform(df) 962 | 963 | columns = sorted(list(outdf.columns)) 964 | assert len(columns) == 2 965 | assert columns[0] == 'colA_a' 966 | assert columns[1] == 'colA_b' 967 | 968 | 969 | def test_with_car_dataframe(cars_dataframe): 970 | pipeline = Pipeline([ 971 | ("preprocess", DataFrameMapper([ 972 | ("description", CountVectorizer()), 973 | ])), 974 | ("classify", SVC(kernel='linear')) 975 | ]) 976 | data = cars_dataframe.drop("model", axis=1) 977 | labels = cars_dataframe["model"] 978 | scores = cross_val_score(pipeline, data, labels) 979 | assert scores.mean() > 0.30 980 | 981 | 982 | def test_direct_cross_validation(iris_dataframe): 983 | """ 984 | Starting with sklearn>=0.16.0 we no longer need CV wrappers for dataframes. 
985 | See https://github.com/paulgb/sklearn-pandas/issues/11 986 | """ 987 | pipeline = Pipeline([ 988 | ("preprocess", DataFrameMapper([ 989 | ("petal length (cm)", None), 990 | ("petal width (cm)", None), 991 | ("sepal length (cm)", None), 992 | ("sepal width (cm)", None), 993 | ])), 994 | ("classify", SVC(kernel='linear')) 995 | ]) 996 | data = iris_dataframe.drop("species", axis=1) 997 | labels = iris_dataframe["species"] 998 | scores = cross_val_score(pipeline, data, labels) 999 | assert scores.mean() > 0.96 1000 | assert (scores.std() * 2) < 0.04 1001 | 1002 | 1003 | def test_heterogeneous_output_types_input_df(): 1004 | """ 1005 | Modify feat2, but pass feat1 through unmodified. 1006 | This fails if input_df == False 1007 | """ 1008 | df = pd.DataFrame({ 1009 | 'feat1': [1, 2, 3, 4, 5, 6], 1010 | 'feat2': [1.0, 2.0, 3.0, 2.0, 3.0, 4.0] 1011 | }) 1012 | M = DataFrameMapper([ 1013 | (['feat2'], StandardScaler()) 1014 | ], input_df=True, df_out=True, default=None) 1015 | dft = M.fit_transform(df) 1016 | assert dft['feat1'].dtype == np.dtype('int64') 1017 | assert dft['feat2'].dtype == np.dtype('float64') 1018 | 1019 | 1020 | def test_make_column_selector(iris_dataframe): 1021 | t = DataFrameMapper([ 1022 | (make_column_selector(dtype_include=float), None, {'alias': 'x'}), 1023 | ('sepal length (cm)', None), 1024 | ], df_out=True, default=False) 1025 | 1026 | xt = t.fit(iris_dataframe).transform(iris_dataframe) 1027 | expected = ['x_0', 'x_1', 'x_2', 'x_3', 'sepal length (cm)'] 1028 | assert list(xt.columns) == expected 1029 | 1030 | pickled = pickle.dumps(t) 1031 | t2 = pickle.loads(pickled) 1032 | xt2 = t2.transform(iris_dataframe) 1033 | assert np.array_equal(xt.values, xt2.values) 1034 | -------------------------------------------------------------------------------- /tests/test_features_generator.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import pytest 4 | import numpy as np 5 | from pandas import DataFrame 6 | from numpy.testing import assert_array_equal 7 | 8 | from sklearn_pandas import DataFrameMapper 9 | from sklearn_pandas.features_generator import gen_features 10 | 11 | 12 | class MockClass(object): 13 | 14 | def __init__(self, value=1, name='class'): 15 | self.value = value 16 | self.name = name 17 | 18 | 19 | class MockTransformer(object): 20 | 21 | def __init__(self): 22 | self.most_common_ = None 23 | 24 | def fit(self, X, y=None): 25 | [(value, _)] = Counter(X).most_common(1) 26 | self.most_common_ = value 27 | return self 28 | 29 | def transform(self, X, y=None): 30 | return np.asarray([self.most_common_] * len(X)) 31 | 32 | 33 | @pytest.fixture 34 | def simple_dataset(): 35 | return DataFrame({ 36 | 'feat1': [1, 2, 1, 3, 1], 37 | 'feat2': [1, 2, 2, 2, 3], 38 | 'feat3': [1, 2, 3, 4, 5], 39 | }) 40 | 41 | 42 | def test_generate_features_with_default_parameters(): 43 | """ 44 | Tests generating features from classes with default init arguments. 45 | """ 46 | columns = ['colA', 'colB', 'colC'] 47 | feature_defs = gen_features(columns=columns, classes=[MockClass]) 48 | assert len(feature_defs) == len(columns) 49 | 50 | for feature in feature_defs: 51 | assert feature[2] == {} 52 | 53 | feature_dict = dict([_[0:2] for _ in feature_defs]) 54 | assert columns == sorted(feature_dict.keys()) 55 | 56 | # default init arguments for MockClass for clarification. 
57 | expected = {'value': 1, 'name': 'class'} 58 | for column, transformers in feature_dict.items(): 59 | for obj in transformers: 60 | assert_attributes(obj, **expected) 61 | 62 | 63 | def test_generate_features_with_several_classes(): 64 | """ 65 | Tests generating features pipeline with different transformers parameters. 66 | """ 67 | feature_defs = gen_features( 68 | columns=['colA', 'colB', 'colC'], 69 | classes=[ 70 | {'class': MockClass}, 71 | {'class': MockClass, 'name': 'mockA'}, 72 | {'class': MockClass, 'name': 'mockB', 'value': None} 73 | ] 74 | ) 75 | 76 | for col, transformers, params in feature_defs: 77 | assert_attributes(transformers[0], name='class', value=1) 78 | assert_attributes(transformers[1], name='mockA', value=1) 79 | assert_attributes(transformers[2], name='mockB', value=None) 80 | 81 | 82 | def test_generate_features_with_none_only_transformers(): 83 | """ 84 | Tests generating "dummy" feature definition which doesn't apply any 85 | transformation. 86 | """ 87 | feature_defs = gen_features( 88 | columns=['colA', 'colB', 'colC'], classes=[None]) 89 | 90 | expected = [('colA', None, {}), 91 | ('colB', None, {}), 92 | ('colC', None, {})] 93 | 94 | assert feature_defs == expected 95 | 96 | 97 | def test_compatibility_with_data_frame_mapper(simple_dataset): 98 | """ 99 | Tests compatibility of generated feature definition with DataFrameMapper. 100 | """ 101 | features_defs = gen_features( 102 | columns=['feat1', 'feat2'], 103 | classes=[MockTransformer]) 104 | features_defs.append(('feat3', None)) 105 | 106 | mapper = DataFrameMapper(features_defs) 107 | X = mapper.fit_transform(simple_dataset) 108 | expected = np.asarray([ 109 | [1, 2, 1], 110 | [1, 2, 2], 111 | [1, 2, 3], 112 | [1, 2, 4], 113 | [1, 2, 5] 114 | ]) 115 | 116 | assert_array_equal(X, expected) 117 | 118 | 119 | def assert_attributes(obj, **attrs): 120 | for attr, value in attrs.items(): 121 | assert getattr(obj, attr) == value 122 | -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn_pandas.pipeline import TransformerPipeline, _call_fit 3 | 4 | # In py3, mock is included with the unittest standard library 5 | # In py2, it's a separate package 6 | try: 7 | from unittest.mock import patch 8 | except ImportError: 9 | from mock import patch 10 | 11 | 12 | class NoTransformT(object): 13 | """Transformer without transform method. 14 | """ 15 | def fit(self, x): 16 | return self 17 | 18 | 19 | class NoFitT(object): 20 | """Transformer without fit method. 21 | """ 22 | def transform(self, x): 23 | return self 24 | 25 | 26 | class Trans(object): 27 | """ 28 | Transformer with fit and transform methods 29 | """ 30 | def fit(self, x, y=None): 31 | return self 32 | 33 | def transform(self, x): 34 | return self 35 | 36 | 37 | def func_x_y(x, y, kwarg='kwarg'): 38 | """ 39 | Function with required x and y arguments 40 | """ 41 | return 42 | 43 | 44 | def func_x(x, kwarg='kwarg'): 45 | """ 46 | Function with required x argument 47 | """ 48 | return 49 | 50 | 51 | def func_raise_type_err(x, y, kwarg='kwarg'): 52 | """ 53 | Function with required x and y arguments, 54 | raises TypeError 55 | """ 56 | raise TypeError 57 | 58 | 59 | def test_all_steps_fit_transform(): 60 | """ 61 | All steps must implement fit and transform. Otherwise, raise TypeError. 
62 | """ 63 | with pytest.raises(TypeError): 64 | TransformerPipeline([('svc', NoTransformT())]) 65 | 66 | with pytest.raises(TypeError): 67 | TransformerPipeline([('svc', NoFitT())]) 68 | 69 | 70 | @patch.object(Trans, 'fit', side_effect=func_x_y) 71 | def test_called_with_x_and_y(mock_fit): 72 | """ 73 | Fit method with required X and y arguments is called with both and with 74 | any additional keywords 75 | """ 76 | _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg') 77 | mock_fit.assert_called_with('X', 'y', kwarg='kwarg') 78 | 79 | 80 | @patch.object(Trans, 'fit', side_effect=func_x) 81 | def test_called_with_x(mock_fit): 82 | """ 83 | Fit method with a required X arguments is called with it and with 84 | any additional keywords 85 | """ 86 | _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg') 87 | mock_fit.assert_called_with('X', kwarg='kwarg') 88 | 89 | _call_fit(Trans().fit, 'X', kwarg='kwarg') 90 | mock_fit.assert_called_with('X', kwarg='kwarg') 91 | 92 | 93 | @patch.object(Trans, 'fit', side_effect=func_raise_type_err) 94 | def test_raises_type_error(mock_fit): 95 | """ 96 | If a fit method with required X and y arguments raises a TypeError, it's 97 | re-raised (for a different reason) when it's called with one argument 98 | """ 99 | with pytest.raises(TypeError): 100 | _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg') 101 | -------------------------------------------------------------------------------- /tests/test_transformers.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import pytest 3 | import numpy as np 4 | from pandas import DataFrame 5 | import joblib 6 | 7 | from sklearn_pandas import DataFrameMapper 8 | from sklearn_pandas import NumericalTransformer 9 | 10 | 11 | @pytest.fixture 12 | def simple_dataset(): 13 | return DataFrame({ 14 | 'feat1': [1, 2, 1, 3, 1], 15 | 'feat2': [1, 2, 2, 2, 3], 16 | 'feat3': [1, 2, 3, 4, 5], 17 | }) 18 | 19 | 20 | def test_common_numerical_transformer(simple_dataset): 21 | """ 22 | Test log transformation 23 | """ 24 | transfomer = DataFrameMapper([ 25 | ('feat1', NumericalTransformer('log')) 26 | ], df_out=True) 27 | df = simple_dataset 28 | outDF = transfomer.fit_transform(df) 29 | assert list(outDF.columns) == ['feat1'] 30 | assert np.array_equal(df['feat1'].apply(np.log).values, outDF.feat1.values) 31 | 32 | 33 | def test_numerical_transformer_serialization(simple_dataset): 34 | """ 35 | Test if you can serialize transformer 36 | """ 37 | transfomer = DataFrameMapper([ 38 | ('feat1', NumericalTransformer('log')) 39 | ]) 40 | 41 | df = simple_dataset 42 | transfomer.fit(df) 43 | f = tempfile.NamedTemporaryFile(delete=True) 44 | joblib.dump(transfomer, f.name) 45 | transfomer2 = joblib.load(f.name) 46 | np.array_equal(transfomer.transform(df), transfomer2.transform(df)) 47 | f.close() 48 | --------------------------------------------------------------------------------