├── .circleci
│   └── config.yml
├── .github
│   └── workflows
│       ├── step1_test.yml
│       ├── step2_release.yml
│       ├── step3_pypi_deploy.yml
│       └── step4_conda_deploy.yml
├── .gitignore
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.rst
├── conda
│   ├── conda_build_config.yml
│   └── meta.yaml
├── nox.ini
├── noxfile.py
├── pytest.ini
├── setup.cfg
├── setup.py
├── sklearn_pandas
│   ├── __init__.py
│   ├── cross_validation.py
│   ├── dataframe_mapper.py
│   ├── features_generator.py
│   ├── pipeline.py
│   └── transformers.py
├── test.py
└── tests
    ├── test_data
    │   └── cars.csv.gz
    ├── test_dataframe_mapper.py
    ├── test_features_generator.py
    ├── test_pipeline.py
    └── test_transformers.py

/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | jobs:
3 |   test37:
4 |     docker:
5 |       - image: circleci/python:3.7
6 |     steps:
7 |       - checkout
8 |       - run: pip install --user nox
9 |       - run: ~/.local/bin/nox
10 |   test38:
11 |     docker:
12 |       - image: circleci/python:3.8
13 |     steps:
14 |       - checkout
15 |       - run: pip install --user nox
16 |       - run: ~/.local/bin/nox
17 |   test39:
18 |     docker:
19 |       - image: cimg/python:3.9.1
20 |     steps:
21 |       - checkout
22 |       - run: pip install --user nox
23 |       - run: ~/.local/bin/nox
24 | 
25 | workflows:
26 |   version: 2
27 |   build_and_test:
28 |     jobs:
29 |       - test37
30 |       - test38
31 |       - test39
32 | 
--------------------------------------------------------------------------------
/.github/workflows/step1_test.yml:
--------------------------------------------------------------------------------
1 | # This workflow runs the test suite with nox on every supported Python version
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: 1 Test Package
5 | 
6 | on:
7 |   workflow_dispatch:
8 |     branches:
9 |       - main
10 | 
11 | jobs:
12 |   test:
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       matrix:
16 |         python-version: [3.7, 3.8, 3.9]
17 |     steps:
18 |     - uses: actions/checkout@v2
19 |     - name: Set up Python ${{ matrix.python-version }}
20 |       uses: actions/setup-python@v1
21 |       with:
22 |         python-version: ${{ matrix.python-version }}
23 |     - name: Install dependencies
24 |       run: |
25 |         python -m pip install --upgrade pip
26 |         pip install nox
27 |     - name: Test with pytest
28 |       run: nox
29 | 
--------------------------------------------------------------------------------
/.github/workflows/step2_release.yml:
--------------------------------------------------------------------------------
1 | name: 2 Release Package
2 | 
3 | on:
4 |   workflow_dispatch:
5 |     branches:
6 |       - main
7 | 
8 | jobs:
9 | 
10 |   release:
11 |     runs-on: ubuntu-latest
12 | 
13 |     steps:
14 |       - name: Checkout Code
15 |         uses: actions/checkout@v2
16 |         with:
17 |           fetch-depth: 0
18 |       - name: Changelog
19 |         uses: scottbrenner/generate-changelog-action@master
20 |         id: Changelog
21 |       - name: Create Release
22 |         id: create_release
23 |         uses: actions/create-release@latest
24 |         env:
25 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # This token is provided by Actions, you do not need to create your own token
26 |         with:
27 |           tag_name: ${{ github.ref }}
28 |           release_name: ${{ github.ref }}
29 |           body: |
30 |             ${{ steps.Changelog.outputs.changelog }}
31 |           draft: false
32 |           prerelease: false
--------------------------------------------------------------------------------
/.github/workflows/step3_pypi_deploy.yml:
--------------------------------------------------------------------------------
1 | name: 3 PyPI Deploy
2 | 
3 | on:
4 |   workflow_dispatch:
5 |     branches:
6 |       - main
7 | 
8 | jobs:
9 | 
10 |   deploy:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Checkout Code
14 |         uses: actions/checkout@v2
15 |         with:
16 |           fetch-depth: 0
17 |       - name: Set up Python
18 |         uses: actions/setup-python@v2
19 |         with:
20 |           python-version: "3.x"
21 |       - name: Install dependencies
22 |         run: |
23 |           python -m pip install --upgrade pip
24 |           pip install setuptools wheel twine
25 |       - name: Build and publish PyPI
26 |         env:
27 |           TWINE_USERNAME: __token__
28 |           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |         run: |
30 |           python setup.py sdist bdist_wheel
31 |           twine upload --repository pypi dist/*
--------------------------------------------------------------------------------
/.github/workflows/step4_conda_deploy.yml:
--------------------------------------------------------------------------------
1 | name: 4 Conda Deploy
2 | 
3 | on:
4 |   workflow_dispatch:
5 |     branches:
6 |       - main
7 | 
8 | jobs:
9 | 
10 |   conda_deploy:
11 |     runs-on: ubuntu-latest
12 |     # needs: test
13 | 
14 |     steps:
15 |       - uses: actions/checkout@v2
16 |       - name: publish-to-conda
17 |         uses: fcakyon/conda-publish-action@v1.3
18 |         with:
19 |           subdir: 'conda'
20 |           anacondatoken: ${{ secrets.ANACONDA_TOKEN }}
21 |           platforms: 'win osx linux'
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.egg-info
2 | *.pyc
3 | .tox/
4 | build/
5 | dist/
6 | .cache/
7 | .idea/
8 | .pytest_cache/
9 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | 
3 | ## Development environment and steps
4 | 
5 | 1. Click on the "Fork" button at the top-right of the GitHub page.
6 | 2. Clone your fork. Example: `git clone git@github.com:dukebody/sklearn-pandas.git`.
7 | 3. Create a new branch to work on the issue/feature you want.
8 | 4. Hack out your code. To run the tests and `flake8`, just run `nox`. Tests live in the `tests` subfolder.
9 | 5. Submit a new PR with your code, indicating in the PR which issue/feature it relates to.
10 | 
11 | Note: You don't need to install `sklearn-pandas` in your virtualenv to run the tests. `nox` will automatically create multiple virtual environments to run them with multiple package versions.
12 | 
13 | 
14 | ## Guidelines
15 | 
16 | - Remember that `sklearn-pandas` does not expect to do everything. Its scope is to serve as an integration layer between `scikit-learn` and `pandas` where needed. If the feature you want to implement adds a lot of complexity to the code, think twice whether it is really needed or can be worked around in a few lines.
17 | - Always write tests for any change introduced.
18 | - If the change involves new options or modifies the public interface, also update the `README` file to explain how to use it. It uses doctests to test the documentation itself.
19 | - If the change is not just cosmetic, add a line to the Changelog section and your name to the Credits section of the `README` file.
20 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | sklearn-pandas -- bridge code for cross-validation of pandas data frames
2 | with sklearn
3 | 
4 | This software is provided 'as-is', without any express or implied
5 | warranty. In no event will the authors be held liable for any damages
6 | arising from the use of this software.
7 | 
8 | Permission is granted to anyone to use this software for any purpose,
9 | including commercial applications, and to alter it and redistribute it
10 | freely, subject to the following restrictions:
11 | 
12 | 1. The origin of this software must not be misrepresented; you must not
13 |    claim that you wrote the original software. If you use this software
14 |    in a product, an acknowledgment in the product documentation would be
15 |    appreciated but is not required.
16 | 2. Altered source versions must be plainly marked as such, and must not be
17 |    misrepresented as being the original software.
18 | 3. This notice may not be removed or altered from any source distribution.
19 | 
20 | Paul Butler
21 | 
22 | The source code of DataFrameMapper is derived from code originally written by
23 | Ben Hamner and released under the following license.
24 | 
25 | Copyright (c) 2013, Ben Hamner
26 | Author: Ben Hamner (ben@benhamner.com)
27 | All rights reserved.
28 | 
29 | Redistribution and use in source and binary forms, with or without
30 | modification, are permitted provided that the following conditions are met:
31 | 
32 | 1. Redistributions of source code must retain the above copyright notice, this
33 |    list of conditions and the following disclaimer.
34 | 2. Redistributions in binary form must reproduce the above copyright notice,
35 |    this list of conditions and the following disclaimer in the documentation
36 |    and/or other materials provided with the distribution.
37 | 
38 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
39 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
40 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
41 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
42 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
43 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
44 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
45 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
46 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
47 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
48 | 
49 | 
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.rst
3 | 
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | 
2 | Sklearn-pandas
3 | ==============
4 | 
5 | .. image:: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas.svg?style=svg
6 |    :target: https://circleci.com/gh/scikit-learn-contrib/sklearn-pandas
7 | .. image:: https://img.shields.io/pypi/v/sklearn-pandas.svg
8 |    :target: https://pypi.python.org/pypi/sklearn-pandas/
9 | .. image:: https://anaconda.org/conda-forge/sklearn-pandas/badges/version.svg
10 |    :target: https://anaconda.org/conda-forge/sklearn-pandas/
11 | 
12 | .. highlight:: python
13 | 
14 | This module provides a bridge between `Scikit-Learn <https://scikit-learn.org>`__'s machine learning methods and `pandas <https://pandas.pydata.org>`__-style Data Frames.
15 | In particular, it provides a way to map ``DataFrame`` columns to transformations, which are later recombined into features.
16 | 
17 | Installation
18 | ------------
19 | 
20 | You can install ``sklearn-pandas`` with ``pip``::
21 | 
22 |     # pip install sklearn-pandas
23 | 
24 | or conda-forge::
25 | 
26 |     # conda install -c conda-forge sklearn-pandas
27 | 
28 | Tests
29 | -----
30 | 
31 | The examples in this file double as basic sanity tests. To run them, use ``doctest``, which is included with Python::
32 | 
33 |     # python -m doctest README.rst
34 | 
35 | 
36 | Usage
37 | -----
38 | 
39 | 
40 | Import
41 | ******
42 | 
43 | Import what you need from the ``sklearn_pandas`` package. The choices are:
44 | 
45 | * ``DataFrameMapper``, a class for mapping pandas data frame columns to different sklearn transformations
46 | 
47 | 
48 | For this demonstration, we will import it::
49 | 
50 |     >>> from sklearn_pandas import DataFrameMapper
51 | 
52 | For these examples, we'll also use pandas, numpy, and sklearn::
53 | 
54 |     >>> import pandas as pd
55 |     >>> import numpy as np
56 |     >>> import sklearn.preprocessing, sklearn.decomposition, \
57 |     ...     sklearn.linear_model, sklearn.pipeline, sklearn.metrics, \
58 |     ...     sklearn.compose
59 |     >>> from sklearn.feature_extraction.text import CountVectorizer
60 | 
61 | 
62 | Load some Data
63 | **************
64 | 
65 | 
66 | Normally you'll read the data from a file, but for demonstration purposes we'll create a data frame from a Python dict::
67 | 
68 |     >>> data = pd.DataFrame({'pet': ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
69 |     ...                      'children': [4., 6, 3, 3, 2, 3, 5, 4],
70 |     ...                      'salary': [90., 24, 44, 27, 32, 59, 36, 27]})
71 | 
72 | 
73 | Transformation Mapping
74 | ----------------------
75 | 
76 | 
77 | Map the Columns to Transformations
78 | **********************************
79 | 
80 | The mapper takes a list of tuples. Each tuple has three elements:
81 | 1. column name(s): The first element is a column name from the pandas DataFrame, a list containing one or multiple columns (we will see an example with multiple columns later), or a callable such as `make_column_selector <https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html>`__.
82 | 2. transformer(s): The second element is an object which will perform the transformation which will be applied to that column.
83 | 3. attributes: The third one is optional and is a dictionary containing the transformation options, if applicable (see "custom column names for transformed features" below).
84 | 
85 | Let's see an example::
86 | 
87 |     >>> mapper = DataFrameMapper([
88 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
89 |     ...     (['children'], sklearn.preprocessing.StandardScaler())
90 |     ... ])
91 | 
92 | The difference between specifying the column selector as ``'column'`` (as a simple string) and ``['column']`` (as a list with one element) is the shape of the array that is passed to the transformer. In the first case, a one-dimensional array will be passed, while in the second case it will be a 2-dimensional array with one column, i.e. a column vector.
93 | 
94 | This behaviour mirrors pandas' dataframe ``__getitem__`` indexing::
95 | 
96 |     >>> data['children'].shape
97 |     (8,)
98 |     >>> data[['children']].shape
99 |     (8, 1)
100 | 
101 | Be aware that some transformers expect a 1-dimensional input (the label-oriented ones) while some others, like ``OneHotEncoder`` or ``SimpleImputer``, expect 2-dimensional input, with the shape ``[n_samples, n_features]``.
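
To make the difference concrete, here is a small illustrative sketch (the ``ShapeReporter`` transformer below is hypothetical, written only for this demonstration) that prints the shape of the array each selector style hands to the transformer::

    >>> from sklearn.base import TransformerMixin
    >>> class ShapeReporter(TransformerMixin):
    ...     def fit(self, X, y=None):
    ...         print(X.shape)  # shape of the input handed to the transformer
    ...         return self
    ...     def transform(self, X):
    ...         return X
    >>> _ = DataFrameMapper([('children', ShapeReporter())]).fit(data.copy())
    (8,)
    >>> _ = DataFrameMapper([(['children'], ShapeReporter())]).fit(data.copy())
    (8, 1)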
102 | 
103 | 
104 | Test the Transformation
105 | ***********************
106 | 
107 | We can use the ``fit_transform`` shortcut to both fit the model and see what transformed data looks like. In this and the other examples, output is rounded to two digits with ``np.round`` to account for rounding errors on different hardware::
108 | 
109 |     >>> np.round(mapper.fit_transform(data.copy()), 2)
110 |     array([[ 1.  ,  0.  ,  0.  ,  0.21],
111 |            [ 0.  ,  1.  ,  0.  ,  1.88],
112 |            [ 0.  ,  1.  ,  0.  , -0.63],
113 |            [ 0.  ,  0.  ,  1.  , -0.63],
114 |            [ 1.  ,  0.  ,  0.  , -1.46],
115 |            [ 0.  ,  1.  ,  0.  , -0.63],
116 |            [ 1.  ,  0.  ,  0.  ,  1.04],
117 |            [ 0.  ,  0.  ,  1.  ,  0.21]])
118 | 
119 | Note that the first three columns are the output of the ``LabelBinarizer`` (corresponding to ``cat``, ``dog``, and ``fish`` respectively) and the fourth column is the standardized value for the number of children. In general, the columns are ordered according to the order given when the ``DataFrameMapper`` is constructed.
120 | 
121 | Now that the transformation is trained, we confirm that it works on new data::
122 | 
123 |     >>> sample = pd.DataFrame({'pet': ['cat'], 'children': [5.]})
124 |     >>> np.round(mapper.transform(sample), 2)
125 |     array([[1.  , 0.  , 0.  , 1.04]])
126 | 
127 | 
128 | Output feature names
129 | *********************
130 | 
131 | In certain cases, like when studying the feature importances for some model,
132 | we want to be able to associate the original features to the ones generated by
133 | the dataframe mapper. We can do so by inspecting the automatically generated ``transformed_names_`` attribute of the mapper after transformation::
134 | 
135 |     >>> mapper.transformed_names_
136 |     ['pet_cat', 'pet_dog', 'pet_fish', 'children']
137 | 
138 | 
139 | Custom column names for transformed features
140 | ********************************************
141 | 
142 | We can provide a custom name for the transformed features, to be used instead
143 | of the automatically generated one, by specifying it as the third argument
144 | of the feature definition::
145 | 
146 | 
147 |     >>> mapper_alias = DataFrameMapper([
148 |     ...     (['children'], sklearn.preprocessing.StandardScaler(),
149 |     ...      {'alias': 'children_scaled'})
150 |     ... ])
151 |     >>> _ = mapper_alias.fit_transform(data.copy())
152 |     >>> mapper_alias.transformed_names_
153 |     ['children_scaled']
154 | 
155 | Alternatively, you can also specify a prefix and/or suffix to add to the column name. For example::
156 | 
157 | 
158 |     >>> mapper_alias = DataFrameMapper([
159 |     ...     (['children'], sklearn.preprocessing.StandardScaler(), {'prefix': 'standard_scaled_'}),
160 |     ...     (['children'], sklearn.preprocessing.StandardScaler(), {'suffix': '_raw'})
161 |     ... ])
162 |     >>> _ = mapper_alias.fit_transform(data.copy())
163 |     >>> mapper_alias.transformed_names_
164 |     ['standard_scaled_children', 'children_raw']
165 | 
166 | 
167 | Dynamic Columns
168 | ***********************
169 | In some situations the columns are not known beforehand and we would like to select them dynamically during the fit operation. As shown below, in such situations you can provide either a custom callable or use `make_column_selector <https://scikit-learn.org/stable/modules/generated/sklearn.compose.make_column_selector.html>`__.
170 | 
171 | ::
172 | 
173 |     >>> class GetColumnsStartingWith:
174 |     ...     def __init__(self, start_str):
175 |     ...         self.pattern = start_str
176 |     ...
177 |     ...     def __call__(self, X: pd.DataFrame = None):
178 |     ...         return [c for c in X.columns if c.startswith(self.pattern)]
179 |     ...
180 |     >>> df = pd.DataFrame({
181 |     ...     'sepal length (cm)': [1.0, 2.0, 3.0],
182 |     ...     'sepal width (cm)': [1.0, 2.0, 3.0],
183 |     ...     'petal length (cm)': [1.0, 2.0, 3.0],
184 |     ...     'petal width (cm)': [1.0, 2.0, 3.0]
185 |     ... })
186 |     >>> t = DataFrameMapper([
187 |     ...     (
188 |     ...         sklearn.compose.make_column_selector(dtype_include=float),
189 |     ...         sklearn.preprocessing.StandardScaler(),
190 |     ...         {'alias': 'x'}
191 |     ...     ),
192 |     ...     (
193 |     ...         GetColumnsStartingWith('petal'),
194 |     ...         None,
195 |     ...         {'alias': 'petal'}
196 |     ...     )], df_out=True, default=False)
197 |     >>> t.fit(df).transform(df).shape
198 |     (3, 6)
199 |     >>> t.transformed_names_
200 |     ['x_0', 'x_1', 'x_2', 'x_3', 'petal_0', 'petal_1']
201 | 
202 | 
203 | 
204 | Above we use ``make_column_selector`` to select all columns that are of type float, and a custom callable to select columns that start with the word 'petal'.
205 | 
206 | 
207 | Passing Series/DataFrames to the transformers
208 | *********************************************
209 | 
210 | By default the transformers are passed a numpy array of the selected columns
211 | as input. This is because ``sklearn`` transformers are historically designed to
212 | work with numpy arrays, not with pandas dataframes, even though their basic
213 | indexing interfaces are similar.
214 | 
215 | However, we can pass a dataframe/series to the transformers to handle custom
216 | cases by initializing the dataframe mapper with ``input_df=True``::
217 | 
218 |     >>> from sklearn.base import TransformerMixin
219 |     >>> class DateEncoder(TransformerMixin):
220 |     ...     def fit(self, X, y=None):
221 |     ...         return self
222 |     ...
223 |     ...     def transform(self, X):
224 |     ...         dt = X.dt
225 |     ...         return pd.concat([dt.year, dt.month, dt.day], axis=1)
226 |     >>> dates_df = pd.DataFrame(
227 |     ...     {'dates': pd.date_range('2015-10-30', '2015-11-02')})
228 |     >>> mapper_dates = DataFrameMapper([
229 |     ...     ('dates', DateEncoder())
230 |     ... ], input_df=True)
231 |     >>> mapper_dates.fit_transform(dates_df)
232 |     array([[2015,   10,   30],
233 |            [2015,   10,   31],
234 |            [2015,   11,    1],
235 |            [2015,   11,    2]])
236 | 
237 | We can also specify this option per group of columns instead of for the
238 | whole mapper::
239 | 
240 |     >>> mapper_dates = DataFrameMapper([
241 |     ...     ('dates', DateEncoder(), {'input_df': True})
242 |     ... ])
243 |     >>> mapper_dates.fit_transform(dates_df)
244 |     array([[2015,   10,   30],
245 |            [2015,   10,   31],
246 |            [2015,   11,    1],
247 |            [2015,   11,    2]])
248 | 
249 | Outputting a dataframe
250 | **********************
251 | 
252 | By default the output of the dataframe mapper is a numpy array. This is because most sklearn estimators expect a numpy array as input. If however we want the output of the mapper to be a dataframe, we can do so using the parameter ``df_out`` when creating the mapper::
253 | 
254 |     >>> mapper_df = DataFrameMapper([
255 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
256 |     ...     (['children'], sklearn.preprocessing.StandardScaler())
257 |     ... ], df_out=True)
258 |     >>> np.round(mapper_df.fit_transform(data.copy()), 2)
259 |        pet_cat  pet_dog  pet_fish  children
260 |     0        1        0         0      0.21
261 |     1        0        1         0      1.88
262 |     2        0        1         0     -0.63
263 |     3        0        0         1     -0.63
264 |     4        1        0         0     -1.46
265 |     5        0        1         0     -0.63
266 |     6        1        0         0      1.04
267 |     7        0        0         1      0.21
268 | 
269 | The names for the columns are the same ones present in the ``transformed_names_``
270 | attribute.
271 | 
272 | Note this does not work together with the ``default=True`` or ``sparse=True`` arguments to the mapper.
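
For instance, combining ``df_out=True`` with ``sparse=True`` raises a ``ValueError`` as soon as the mapper is constructed; this mirrors the check in ``DataFrameMapper.__init__`` (see ``sklearn_pandas/dataframe_mapper.py`` below)::

    >>> DataFrameMapper([('pet', sklearn.preprocessing.LabelBinarizer())],
    ...                 df_out=True, sparse=True)
    Traceback (most recent call last):
        ...
    ValueError: Can not use df_out with sparse or default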
273 | 
274 | Dropping columns explicitly
275 | *******************************
276 | 
277 | Sometimes it is required to drop a specific column or list of columns.
278 | For this purpose, the ``drop_cols`` argument of ``DataFrameMapper`` can be used.
279 | The default value is ``None``::
280 | 
281 |     >>> mapper_df = DataFrameMapper([
282 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
283 |     ...     (['children'], sklearn.preprocessing.StandardScaler())
284 |     ... ], drop_cols=['salary'])
285 | 
286 | Now running ``fit_transform`` will run the transformations on 'pet' and 'children' and drop the 'salary' column::
287 | 
288 |     >>> np.round(mapper_df.fit_transform(data.copy()), 1)
289 |     array([[ 1. ,  0. ,  0. ,  0.2],
290 |            [ 0. ,  1. ,  0. ,  1.9],
291 |            [ 0. ,  1. ,  0. , -0.6],
292 |            [ 0. ,  0. ,  1. , -0.6],
293 |            [ 1. ,  0. ,  0. , -1.5],
294 |            [ 0. ,  1. ,  0. , -0.6],
295 |            [ 1. ,  0. ,  0. ,  1. ],
296 |            [ 0. ,  0. ,  1. ,  0.2]])
297 | 
298 | 
299 | 
300 | Transform Multiple Columns
301 | **************************
302 | 
303 | Transformations may require multiple input columns. In these cases, the column names can be specified in a list::
304 | 
305 |     >>> mapper2 = DataFrameMapper([
306 |     ...     (['children', 'salary'], sklearn.decomposition.PCA(1))
307 |     ... ])
308 | 
309 | Now running ``fit_transform`` will run PCA on the ``children`` and ``salary`` columns and return the first principal component::
310 | 
311 |     >>> np.round(mapper2.fit_transform(data.copy()), 1)
312 |     array([[ 47.6],
313 |            [-18.4],
314 |            [  1.6],
315 |            [-15.4],
316 |            [-10.4],
317 |            [ 16.6],
318 |            [ -6.4],
319 |            [-15.4]])
320 | 
321 | Multiple transformers for the same column
322 | *****************************************
323 | 
324 | Multiple transformers can be applied to the same column by specifying them
325 | in a list::
326 | 
327 |     >>> from sklearn.impute import SimpleImputer
328 |     >>> mapper3 = DataFrameMapper([
329 |     ...     (['age'], [SimpleImputer(),
330 |     ...                sklearn.preprocessing.StandardScaler()])])
331 |     >>> data_3 = pd.DataFrame({'age': [1, np.nan, 3]})
332 |     >>> mapper3.fit_transform(data_3)
333 |     array([[-1.22474487],
334 |            [ 0.        ],
335 |            [ 1.22474487]])
336 | 
337 | 
338 | Columns that don't need any transformation
339 | ******************************************
340 | 
341 | Only columns that are listed in the ``DataFrameMapper`` are kept. To keep a column without applying any transformation to it, use ``None`` as the transformer::
342 | 
343 |     >>> mapper3 = DataFrameMapper([
344 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
345 |     ...     ('children', None)
346 |     ... ])
347 |     >>> np.round(mapper3.fit_transform(data.copy()))
348 |     array([[1., 0., 0., 4.],
349 |            [0., 1., 0., 6.],
350 |            [0., 1., 0., 3.],
351 |            [0., 0., 1., 3.],
352 |            [1., 0., 0., 2.],
353 |            [0., 1., 0., 3.],
354 |            [1., 0., 0., 5.],
355 |            [0., 0., 1., 4.]])
356 | 
357 | Applying a default transformer
358 | ******************************
359 | 
360 | A default transformer can be applied to columns not explicitly selected
361 | by passing it as the ``default`` argument to the mapper::
362 | 
363 |     >>> mapper4 = DataFrameMapper([
364 |     ...     ('pet', sklearn.preprocessing.LabelBinarizer()),
365 |     ...     ('children', None)
366 |     ... ], default=sklearn.preprocessing.StandardScaler())
367 |     >>> np.round(mapper4.fit_transform(data.copy()), 1)
368 |     array([[ 1. ,  0. ,  0. ,  4. ,  2.3],
369 |            [ 0. ,  1. ,  0. ,  6. , -0.9],
370 |            [ 0. ,  1. ,  0. ,  3. ,  0.1],
371 |            [ 0. ,  0. ,  1. ,  3. , -0.7],
372 |            [ 1. ,  0. ,  0. ,  2. , -0.5],
373 |            [ 0. ,  1. ,  0. ,  3. ,  0.8],
374 |            [ 1. ,  0. ,  0. ,  5. , -0.3],
375 |            [ 0. ,  0. ,  1. ,  4. , -0.7]])
376 | 
377 | Using ``default=False`` (the default) drops unselected columns. Using
378 | ``default=None`` passes the unselected columns through unchanged.
379 | 
380 | 
381 | Same transformer for multiple columns
382 | *****************************************
383 | 
384 | Sometimes it is required to apply the same transformation to several dataframe columns.
385 | To simplify this process, the package provides the ``gen_features`` function, which accepts a list
386 | of columns and a feature transformer class (or list of classes), and generates a feature definition
387 | acceptable by ``DataFrameMapper``.
388 | 
389 | For example, consider a dataset with three categorical columns, 'col1', 'col2', and 'col3'.
390 | To encode each of them, one could pass the column names and the ``LabelEncoder`` transformer class
391 | into the generator, and then use the returned definition as the ``features`` argument for ``DataFrameMapper``::
392 | 
393 |     >>> from sklearn_pandas import gen_features
394 |     >>> feature_def = gen_features(
395 |     ...     columns=['col1', 'col2', 'col3'],
396 |     ...     classes=[sklearn.preprocessing.LabelEncoder]
397 |     ... )
398 |     >>> feature_def
399 |     [('col1', [LabelEncoder()], {}), ('col2', [LabelEncoder()], {}), ('col3', [LabelEncoder()], {})]
400 |     >>> mapper5 = DataFrameMapper(feature_def)
401 |     >>> data5 = pd.DataFrame({
402 |     ...     'col1': ['yes', 'no', 'yes'],
403 |     ...     'col2': [True, False, False],
404 |     ...     'col3': ['one', 'two', 'three']
405 |     ... })
406 |     >>> mapper5.fit_transform(data5)
407 |     array([[1, 1, 0],
408 |            [0, 0, 2],
409 |            [1, 0, 1]])
410 | 
411 | If it is required to override some of the transformer parameters, then a dict with a 'class' key and the
412 | transformer parameters should be provided. For example, consider a dataset with missing values.
413 | Then the following code could be used to override the default imputing strategy::
414 | 
415 |     >>> from sklearn.impute import SimpleImputer
416 |     >>> import numpy as np
417 |     >>> feature_def = gen_features(
418 |     ...     columns=[['col1'], ['col2'], ['col3']],
419 |     ...     classes=[{'class': SimpleImputer, 'strategy': 'most_frequent'}]
420 |     ... )
421 |     >>> mapper6 = DataFrameMapper(feature_def)
422 |     >>> data6 = pd.DataFrame({
423 |     ...     'col1': [np.nan, 1, 1, 2, 3],
424 |     ...     'col2': [True, False, np.nan, np.nan, True],
425 |     ...     'col3': [0, 0, 0, np.nan, np.nan]
426 |     ... })
427 |     >>> mapper6.fit_transform(data6)
428 |     array([[1.0, True, 0.0],
429 |            [1.0, False, 0.0],
430 |            [1.0, True, 0.0],
431 |            [2.0, True, 0.0],
432 |            [3.0, True, 0.0]], dtype=object)
433 | 
434 | You can also specify a global prefix or suffix for the generated transformed column names using the ``prefix`` and ``suffix``
435 | parameters::
436 | 
437 |     >>> feature_def = gen_features(
438 |     ...     columns=['col1', 'col2', 'col3'],
439 |     ...     classes=[sklearn.preprocessing.LabelEncoder],
440 |     ...     prefix="lblencoder_"
441 |     ... )
442 |     >>> mapper5 = DataFrameMapper(feature_def)
443 |     >>> data5 = pd.DataFrame({
444 |     ...     'col1': ['yes', 'no', 'yes'],
445 |     ...     'col2': [True, False, False],
446 |     ...     'col3': ['one', 'two', 'three']
447 |     ... })
448 |     >>> _ = mapper5.fit_transform(data5)
449 |     >>> mapper5.transformed_names_
450 |     ['lblencoder_col1', 'lblencoder_col2', 'lblencoder_col3']
451 | 
452 | Feature selection and other supervised transformations
453 | ******************************************************
454 | 
455 | ``DataFrameMapper`` supports transformers that require both X and y arguments.
An example of this is feature selection. Treating the 'pet' column as the target, we will select the column that best predicts it.
456 | 
457 | ::
458 | 
459 |     >>> from sklearn.feature_selection import SelectKBest, chi2
460 |     >>> mapper_fs = DataFrameMapper([(['children', 'salary'], SelectKBest(chi2, k=1))])
461 |     >>> mapper_fs.fit_transform(data[['children', 'salary']], data['pet'])
462 |     array([[90.],
463 |            [24.],
464 |            [44.],
465 |            [27.],
466 |            [32.],
467 |            [59.],
468 |            [36.],
469 |            [27.]])
470 | 
471 | Working with sparse features
472 | ****************************
473 | 
474 | A ``DataFrameMapper`` will return a dense feature array by default. Setting ``sparse=True`` in the mapper will return
475 | a sparse array whenever any of the extracted features is sparse. Example::
476 | 
477 |     >>> mapper5 = DataFrameMapper([
478 |     ...     ('pet', CountVectorizer()),
479 |     ... ], sparse=True)
480 |     >>> type(mapper5.fit_transform(data))
481 |     <class 'scipy.sparse.csr.csr_matrix'>
482 | 
483 | The stacking of the sparse features is done without ever densifying them.
484 | 
485 | 
486 | Using ``NumericalTransformer``
487 | ***********************************
488 | 
489 | While you can use ``FunctionTransformer`` to generate arbitrary transformers, it can present serialization issues
490 | when pickling. Use ``NumericalTransformer`` instead, which takes the function name as a string parameter and hence
491 | can be easily serialized.
492 | 
493 | ::
494 | 
495 |     >>> from sklearn_pandas import NumericalTransformer
496 |     >>> mapper5 = DataFrameMapper([
497 |     ...     ('children', NumericalTransformer('log')),
498 |     ... ])
499 |     >>> mapper5.fit_transform(data)
500 |     array([[1.38629436],
501 |            [1.79175947],
502 |            [1.09861229],
503 |            [1.09861229],
504 |            [0.69314718],
505 |            [1.09861229],
506 |            [1.60943791],
507 |            [1.38629436]])
508 | 
509 | Changing logging level
510 | ***********************************
511 | 
512 | You can change the log level to ``INFO`` to print the time taken to fit or transform each feature. Setting it to a higher level stops printing the elapsed time.
513 | The example below shows how to change the logging level::
514 | 
515 | 
516 |     >>> import logging
517 |     >>> logging.getLogger('sklearn_pandas').setLevel(logging.INFO)
518 | 
519 | 
520 | 
521 | Changelog
522 | ---------
523 | 
524 | 
525 | 2.2.0 (2021-05-07)
526 | ******************
527 | * Added the ability to provide callable functions instead of a static column list.
528 | 
529 | 
530 | 2.1.0 (2021-02-26)
531 | ******************
532 | * Removed tests for Python 3.6 and added Python 3.9
533 | * Added deprecation warning for NumericalTransformer
534 | * Fixed pickling issue causing integration issues with Baikal.
535 | * Started publishing package to conda repo
536 | 
537 | 
538 | 2.0.4 (2020-11-06)
539 | ******************
540 | 
541 | * Explicitly handling serialization (#224)
542 | * document fixes
543 | * Making transform function thread safe (#194)
544 | * Switched to nox for unit testing (#226)
545 | 
546 | 
547 | 2.0.3 (2020-11-06)
548 | ******************
549 | 
550 | * Added elapsed time information for each feature.
551 | 
552 | 
553 | 2.0.2 (2020-10-01)
554 | ******************
555 | 
556 | * Fix ``DataFrameMapper`` ``drop_cols`` attribute naming consistency with scikit-learn and initialization.
557 | 
558 | 
559 | 2.0.1 (2020-09-07)
560 | ******************
561 | 
562 | * Added an option to explicitly drop columns.
563 | 
564 | 
565 | 2.0.0 (2020-08-01)
566 | ******************
567 | 
568 | * Deprecated support for Python < 3.6.
569 | * Deprecated support for old versions of scikit-learn, pandas and numpy.
  Please check ``setup.py`` for the minimum requirements.
570 | * Removed CategoricalImputer, cross_val_score and GridSearchCV. All of this functionality now exists
571 |   in scikit-learn; please use SimpleImputer instead of CategoricalImputer. Cross-validation in
572 |   scikit-learn now supports dataframes, so the cross-validation wrappers provided here are no
573 |   longer needed.
574 | * Added ``NumericalTransformer`` for common numerical transformations. Currently it implements log and log1p
575 |   transformation.
576 | * Added prefix and suffix options. See examples above. These are usually helpful when using gen_features.
577 | * Added ``drop_cols`` argument to ``DataFrameMapper``. This can be used to explicitly drop columns.
578 | 
579 | 
580 | 1.8.0 (2018-12-01)
581 | ******************
582 | 
583 | * Add ``FunctionTransformer`` class (#117).
584 | * Fix column names derivation for dataframes with multi-index or non-string
585 |   columns (#166).
586 | * Change behaviour of DataFrameMapper's fit_transform method to invoke each underlying transformers'
587 |   native fit_transform if implemented (#150).
588 | 
589 | 
590 | 1.7.0 (2018-08-15)
591 | ******************
592 | 
593 | * Fix issues with unicode names in ``get_names`` (#160).
594 | * Update to build using ``numpy==1.14`` and ``python==3.6`` (#154).
595 | * Add ``strategy`` and ``fill_value`` parameters to ``CategoricalImputer`` to allow imputing
596 |   with values other than the mode (#144), (#161).
597 | * Preserve input data types when no transform is supplied (#138).
598 | 
599 | 
600 | 1.6.0 (2017-10-28)
601 | ******************
602 | 
603 | * Add column name to exception during fit/transform (#110).
604 | * Add ``gen_features`` helper function to help generating the same transformation for multiple columns (#126).
605 | 
606 | 
607 | 1.5.0 (2017-06-24)
608 | ******************
609 | 
610 | * Allow inputting a dataframe/series per group of columns.
611 | * Get feature names also from ``estimator.get_feature_names()`` if present.
612 | * Attempt to derive feature names from individual transformers when applying a
613 |   list of transformers.
614 | * Do not mutate features in ``__init__`` to be compatible with
615 |   ``sklearn>=0.20`` (#76).
616 | 
617 | 
618 | 1.4.0 (2017-05-13)
619 | ******************
620 | 
621 | * Allow specifying a custom name (alias) for transformed columns (#83).
622 | * Capture output columns generated names in ``transformed_names_`` attribute (#78).
623 | * Add ``CategoricalImputer`` that replaces null-like values with the mode
624 |   for string-like columns.
625 | * Add ``input_df`` init argument to allow inputting a dataframe/series to the
626 |   transformers instead of a numpy array (#60).
627 | 
628 | 
629 | 1.3.0 (2017-01-21)
630 | ******************
631 | 
632 | * Make the mapper return dataframes when ``df_out=True`` (#70, #74).
633 | * Update imports to avoid deprecation warnings in sklearn 0.18 (#68).
634 | 
635 | 
636 | 1.2.0 (2016-10-02)
637 | ******************
638 | 
639 | * Deprecate custom cross-validation shim classes.
640 | * Require ``scikit-learn>=0.15.0``. Resolves #49.
641 | * Allow applying a default transformer to columns not selected explicitly in
642 |   the mapper. Resolves #55.
643 | * Allow specifying an optional ``y`` argument during transform for
644 |   supervised transformations. Resolves #58.
645 | 
646 | 
647 | 1.1.0 (2015-12-06)
648 | *******************
649 | 
650 | * Delete obsolete ``PassThroughTransformer``. If no transformation is desired for a given column, use ``None`` as the transformer.
651 | * Factor out code in several modules, to avoid having everything in ``__init__.py``.
652 | * Use custom ``TransformerPipeline`` class to allow transformation steps accepting only a X argument. Fixes #46.
653 | * Add compatibility shim for unpickling mappers with list of transformers created before 1.0.0. Fixes #45.
654 | 
655 | 
656 | 1.0.0 (2015-11-28)
657 | *******************
658 | 
659 | * Change version numbering scheme to SemVer.
660 | * Use ``sklearn.pipeline.Pipeline`` instead of copying its code. Resolves #43.
661 | * Raise ``KeyError`` when selecting nonexistent columns in the dataframe. Fixes #30.
662 | * Return sparse feature array if any of the features is sparse and ``sparse`` argument is ``True``. Defaults to ``False`` to avoid potential breaking of existing code. Resolves #34.
663 | * Return model and prediction in custom CV classes. Fixes #27.
664 | 
665 | 
666 | 0.0.12 (2015-11-07)
667 | ********************
668 | 
669 | * Allow specifying a list of transformers to use sequentially on the same column.
670 | 
671 | 
672 | Credits
673 | -------
674 | 
675 | The code for ``DataFrameMapper`` is based on code originally written by Ben Hamner.
676 | 
677 | Other contributors:
678 | 
679 | * Ariel Rossanigo (@arielrossanigo)
680 | * Arnau Gil Amat (@arnau126)
681 | * Assaf Ben-David (@AssafBenDavid)
682 | * Brendan Herger (@bjherger)
683 | * Cal Paterson (@calpaterson)
684 | * @defvorfu
685 | * Floris Hoogenboom (@FlorisHoogenboom)
686 | * Gustavo Sena Mafra (@gsmafra)
687 | * Israel Saeta Pérez (@dukebody)
688 | * Jeremy Howard (@jph00)
689 | * Jimmy Wan (@jimmywan)
690 | * Kristof Van Engeland (@kristofve91)
691 | * Olivier Grisel (@ogrisel)
692 | * Paul Butler (@paulgb)
693 | * Richard Miller (@rwjmiller)
694 | * Ritesh Agrawal (@ragrawal)
695 | * @SandroCasagrande
696 | * Timothy Sweetser (@hacktuarial)
697 | * Vitaley Zaretskey (@vzaretsk)
698 | * Zac Stewart (@zacstewart)
699 | * Parul Singh (@paro1234)
700 | * Vincent Heusinkveld (@VHeusinkveld)
701 | 
--------------------------------------------------------------------------------
/conda/conda_build_config.yml:
--------------------------------------------------------------------------------
1 | python:
2 |   - 3.7
3 |   - 3.8
4 |   - 3.9
5 | 
--------------------------------------------------------------------------------
/conda/meta.yaml:
--------------------------------------------------------------------------------
1 | {% set data = load_setup_py_data() %}
2 | 
3 | package:
4 |   name: sklearn-pandas
5 |   version: {{ data['version'] }}
6 | 
7 | source:
8 |   path: ..
9 | 10 | build: 11 | number: 0 12 | script: python setup.py install --single-version-externally-managed --record=record.txt 13 | 14 | requirements: 15 | build: 16 | - python 17 | - scikit-learn>=0.23.0 18 | - scipy>=1.5.1 19 | - pandas>=1.1.4 20 | - numpy>=1.18.1 21 | 22 | run: 23 | - python 24 | - scikit-learn>=0.23.0 25 | - scipy>=1.5.1 26 | - pandas>=1.1.4 27 | - numpy>=1.18.1 28 | 29 | test: 30 | imports: 31 | - sklearn_pandas 32 | 33 | about: 34 | home: {{ data['url'] }} 35 | license: {{ data['license'] }} 36 | 37 | summary: {{ data['description'] }} -------------------------------------------------------------------------------- /nox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | .git 4 | .github 5 | __pycache__ 6 | build 7 | dist 8 | *site-packages/ 9 | *bin/ 10 | *.egg/* 11 | .eggs 12 | .tox 13 | docs -------------------------------------------------------------------------------- /noxfile.py: -------------------------------------------------------------------------------- 1 | import nox 2 | 3 | @nox.session 4 | def lint(session): 5 | session.install('pytest>=5.3.5', 'setuptools>=45.2', 6 | 'wheel>=0.34.2', 'flake8>=3.7.9', 7 | 'numpy==1.18.1', 'pandas==1.1.4') 8 | session.install('.') 9 | session.run('flake8', 'sklearn_pandas/', 'tests') 10 | 11 | @nox.session 12 | @nox.parametrize('numpy', ['1.18.1', '1.19.4', '1.20.1']) 13 | @nox.parametrize('scipy', ['1.5.4', '1.6.0']) 14 | @nox.parametrize('pandas', ['1.1.4', '1.2.2']) 15 | def tests(session, numpy, scipy, pandas): 16 | session.install('pytest>=5.3.5', 17 | 'setuptools>=45.2', 18 | 'wheel>=0.34.2', 19 | f'numpy=={numpy}', 20 | f'scipy=={scipy}', 21 | f'pandas=={pandas}' 22 | ) 23 | session.install('.') 24 | session.run('py.test', 'README.rst', 'tests') 25 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --doctest-glob='*.rst' -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [wheel] 2 | universal = 1 3 | 4 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from setuptools import setup 5 | from setuptools.command.test import test as TestCommand 6 | import re 7 | 8 | for line in open('sklearn_pandas/__init__.py'): 9 | match = re.match("__version__ *= *'(.*)'", line) 10 | if match: 11 | __version__, = match.groups() 12 | 13 | 14 | class PyTest(TestCommand): 15 | user_options = [('pytest-args=', 'a', "Arguments to pass to py.test")] 16 | 17 | def initialize_options(self): 18 | TestCommand.initialize_options(self) 19 | self.pytest_args = [] 20 | 21 | def finalize_options(self): 22 | TestCommand.finalize_options(self) 23 | self.test_args = [] 24 | self.test_suite = True 25 | 26 | def run(self): 27 | import pytest 28 | errno = pytest.main(self.pytest_args) 29 | raise SystemExit(errno) 30 | 31 | 32 | setup(name='sklearn-pandas', 33 | version=__version__, 34 | description='Pandas integration with sklearn', 35 | maintainer='Ritesh Agrawal', 36 | maintainer_email='ragrawal@gmail.com', 37 | url='https://github.com/scikit-learn-contrib/sklearn-pandas', 38 | packages=['sklearn_pandas'], 39 | keywords=['scikit', 
'sklearn', 'pandas'], 40 | install_requires=[ 41 | 'scikit-learn>=0.23.0', 42 | 'scipy>=1.5.1', 43 | 'pandas>=1.1.4', 44 | 'numpy>=1.18.1' 45 | ], 46 | tests_require=['pytest', 'mock'], 47 | cmdclass={'test': PyTest}, 48 | license='MIT License' 49 | ) 50 | -------------------------------------------------------------------------------- /sklearn_pandas/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '2.2.0' 2 | 3 | import logging 4 | logger = logging.getLogger(__name__) 5 | 6 | from .dataframe_mapper import DataFrameMapper # NOQA 7 | from .features_generator import gen_features # NOQA 8 | from .transformers import NumericalTransformer # NOQA 9 | -------------------------------------------------------------------------------- /sklearn_pandas/cross_validation.py: -------------------------------------------------------------------------------- 1 | class DataWrapper(object): 2 | 3 | def __init__(self, df): 4 | self.df = df 5 | 6 | def __len__(self): 7 | return len(self.df) 8 | 9 | def __getitem__(self, key): 10 | return self.df.iloc[key] 11 | -------------------------------------------------------------------------------- /sklearn_pandas/dataframe_mapper.py: -------------------------------------------------------------------------------- 1 | import contextlib 2 | from datetime import datetime 3 | import pandas as pd 4 | import numpy as np 5 | from scipy import sparse 6 | from sklearn.base import BaseEstimator, TransformerMixin 7 | from .cross_validation import DataWrapper 8 | from .pipeline import make_transformer_pipeline, _call_fit, TransformerPipeline 9 | from . import logger 10 | 11 | string_types = text_type = str 12 | 13 | 14 | def _handle_feature(fea): 15 | """ 16 | Convert 1-dimensional arrays to 2-dimensional column vectors. 17 | """ 18 | if len(fea.shape) == 1: 19 | fea = np.array([fea]).T 20 | 21 | return fea 22 | 23 | 24 | def _build_transformer(transformers): 25 | if isinstance(transformers, list): 26 | transformers = make_transformer_pipeline(*transformers) 27 | return transformers 28 | 29 | 30 | def _build_feature(columns, transformers, options={}, X=None): 31 | if X is None: 32 | return (columns, _build_transformer(transformers), options) 33 | return ( 34 | columns(X) if callable(columns) else columns, 35 | _build_transformer(transformers), 36 | options 37 | ) 38 | 39 | 40 | def _elapsed_secs(t1): 41 | return (datetime.now()-t1).total_seconds() 42 | 43 | 44 | def _get_feature_names(estimator): 45 | """ 46 | Attempt to extract feature names based on a given estimator 47 | """ 48 | if hasattr(estimator, 'classes_'): 49 | return estimator.classes_ 50 | elif hasattr(estimator, 'get_feature_names'): 51 | return estimator.get_feature_names() 52 | return None 53 | 54 | 55 | @contextlib.contextmanager 56 | def add_column_names_to_exception(column_names): 57 | # Stolen from https://stackoverflow.com/a/17677938/356729 58 | try: 59 | yield 60 | except Exception as ex: 61 | if ex.args: 62 | msg = u'{}: {}'.format(column_names, ex.args[0]) 63 | else: 64 | msg = text_type(column_names) 65 | ex.args = (msg,) + ex.args[1:] 66 | raise 67 | 68 | 69 | class DataFrameMapper(BaseEstimator, TransformerMixin): 70 | """ 71 | Map Pandas data frame column subsets to their own 72 | sklearn transformation. 73 | """ 74 | 75 | def __init__(self, features, default=False, sparse=False, df_out=False, 76 | input_df=False, drop_cols=None): 77 | """ 78 | Params: 79 | 80 | features a list of tuples with features definitions. 
81 | The first element is the pandas column selector. This can 82 | be a string (for one column) or a list of strings. 83 | The second element is an object that supports 84 | sklearn's transform interface, or a list of such objects 85 | The third element is optional and, if present, must be 86 | a dictionary with the options to apply to the 87 | transformation. Example: {'alias': 'day_of_week'} 88 | 89 | default default transformer to apply to the columns not 90 | explicitly selected in the mapper. If False (default), 91 | discard them. If None, pass them through untouched. Any 92 | other transformer will be applied to all the unselected 93 | columns as a whole, taken as a 2d-array. 94 | 95 | sparse will return sparse matrix if set True and any of the 96 | extracted features is sparse. Defaults to False. 97 | 98 | df_out return a pandas data frame, with each column named using 99 | the pandas column that created it (if there's only one 100 | input and output) or the input columns joined with '_' 101 | if there's multiple inputs, and the name concatenated with 102 | '_1', '_2' etc if there's multiple outputs. NB: does not 103 | work if *default* or *sparse* are true 104 | 105 | input_df If ``True`` pass the selected columns to the transformers 106 | as a pandas DataFrame or Series. Otherwise pass them as a 107 | numpy array. Defaults to ``False``. 108 | 109 | drop_cols List of columns to be dropped. Defaults to None. 110 | 111 | """ 112 | self.features = features 113 | self.default = default 114 | self.built_default = None 115 | self.sparse = sparse 116 | self.df_out = df_out 117 | self.input_df = input_df 118 | self.drop_cols = [] if drop_cols is None else drop_cols 119 | self.transformed_names_ = [] 120 | if (df_out and (sparse or default)): 121 | raise ValueError("Can not use df_out with sparse or default") 122 | 123 | def _build(self, X=None): 124 | """ 125 | Build attributes built_features and built_default. 126 | """ 127 | if isinstance(self.features, list): 128 | self.built_features = [ 129 | _build_feature(*f, X=X) for f in self.features 130 | ] 131 | else: 132 | self.built_features = _build_feature(*self.features, X=X) 133 | self.built_default = _build_transformer(self.default) 134 | 135 | @property 136 | def _selected_columns(self): 137 | """ 138 | Return a set of selected columns in the feature list. 139 | """ 140 | selected_columns = set() 141 | for feature in self.features: 142 | columns = feature[0] 143 | if isinstance(columns, list): 144 | selected_columns = selected_columns.union(set(columns)) 145 | else: 146 | selected_columns.add(columns) 147 | return selected_columns 148 | 149 | def _unselected_columns(self, X): 150 | """ 151 | Return list of columns present in X and not selected explicitly in the 152 | mapper. 153 | 154 | Unselected columns are returned in the order they appear in the 155 | dataframe to avoid issues with different ordering during default fit 156 | and transform steps. 
157 |         """
158 |         X_columns = list(X.columns)
159 |         return [column for column in X_columns if
160 |                 column not in self._selected_columns
161 |                 and column not in self.drop_cols]
162 | 
163 |     def __setstate__(self, state):
164 |         # compatibility for older versions of sklearn-pandas
165 |         super().__setstate__(state)
166 |         self.features = [_build_feature(*feat) for feat in state['features']]
167 |         self.sparse = state.get('sparse', False)
168 |         self.default = state.get('default', False)
169 |         self.df_out = state.get('df_out', False)
170 |         self.input_df = state.get('input_df', False)
171 |         self.drop_cols = state.get('drop_cols', [])
172 |         self.built_features = state.get('built_features', self.features)
173 |         self.built_default = state.get('built_default', self.default)
174 |         self.transformed_names_ = state.get('transformed_names_', [])
175 | 
176 |     def __getstate__(self):
177 |         state = super().__getstate__()
178 |         state['features'] = self.features
179 |         state['sparse'] = self.sparse
180 |         state['default'] = self.default
181 |         state['df_out'] = self.df_out
182 |         state['input_df'] = self.input_df
183 |         state['drop_cols'] = self.drop_cols
184 |         state['built_features'] = getattr(self, 'built_features', None)
185 |         state['built_default'] = self.built_default
186 |         state['transformed_names_'] = self.transformed_names_
187 |         return state
188 | 
189 |     def _get_col_subset(self, X, cols, input_df=False):
190 |         """
191 |         Get a subset of columns from the given table X.
192 | 
193 |         X       a Pandas dataframe; the table to select columns from
194 |         cols    a string or list of strings representing the columns to select.
195 |                 It can also be a callable that returns True or False, i.e.
196 |                 compatible with the built-in filter function.
197 | 
198 |         Returns a numpy array with the data from the selected columns
199 |         """
200 | 
201 |         if isinstance(cols, string_types):
202 |             return_vector = True
203 |             cols = [cols]
204 |         else:
205 |             return_vector = False
206 | 
207 |         # Needed when using the cross-validation compatibility
208 |         # layer for sklearn<0.16.0.
209 |         # Will be dropped on sklearn-pandas 2.0.
210 | if isinstance(X, list): 211 | X = [x[cols] for x in X] 212 | X = pd.DataFrame(X) 213 | 214 | elif isinstance(X, DataWrapper): 215 | X = X.df # fetch underlying data 216 | 217 | if return_vector: 218 | t = X[cols[0]] 219 | else: 220 | t = X[cols] 221 | 222 | # return either a DataFrame/Series or a numpy array 223 | if input_df: 224 | return t 225 | else: 226 | return t.values 227 | 228 | def fit(self, X, y=None): 229 | """ 230 | Fit a transformation from the pipeline 231 | 232 | X the data to fit 233 | 234 | y the target vector relative to X, optional 235 | 236 | """ 237 | self._build(X=X) 238 | 239 | for columns, transformers, options in self.built_features: 240 | t1 = datetime.now() 241 | input_df = options.get('input_df', self.input_df) 242 | 243 | if transformers is not None: 244 | with add_column_names_to_exception(columns): 245 | Xt = self._get_col_subset(X, columns, input_df) 246 | _call_fit(transformers.fit, Xt, y) 247 | logger.info(f"[FIT] {columns}: {_elapsed_secs(t1)} secs") 248 | 249 | # handle features not explicitly selected 250 | if self.built_default: # not False and not None 251 | unsel_cols = self._unselected_columns(X) 252 | with add_column_names_to_exception(unsel_cols): 253 | Xt = self._get_col_subset(X, unsel_cols, self.input_df) 254 | _call_fit(self.built_default.fit, Xt, y) 255 | return self 256 | 257 | def get_names(self, columns, transformer, x, alias=None, prefix='', 258 | suffix=''): 259 | """ 260 | Return verbose names for the transformed columns. 261 | 262 | columns name (or list of names) of the original column(s) 263 | transformer transformer - can be a TransformerPipeline 264 | x transformed columns (numpy.ndarray) 265 | alias base name to use for the selected columns 266 | """ 267 | if alias is not None: 268 | name = alias 269 | elif isinstance(columns, list): 270 | name = '_'.join(map(str, columns)) 271 | else: 272 | name = columns 273 | num_cols = x.shape[1] if len(x.shape) > 1 else 1 274 | 275 | output = [] 276 | 277 | if num_cols > 1: 278 | # If there are as many columns as classes in the transformer, 279 | # infer column names from classes names. 280 | 281 | # If we are dealing with multiple transformers for these columns 282 | # attempt to extract the names from each of them, starting from the 283 | # last one 284 | if isinstance(transformer, TransformerPipeline): 285 | inverse_steps = transformer.steps[::-1] 286 | estimators = (estimator for name, estimator in inverse_steps) 287 | names_steps = (_get_feature_names(e) for e in estimators) 288 | names = next((n for n in names_steps if n is not None), None) 289 | # Otherwise use the only estimator present 290 | else: 291 | names = _get_feature_names(transformer) 292 | 293 | if names is not None and len(names) == num_cols: 294 | output = [f"{name}_{o}" for o in names] 295 | # otherwise, return name concatenated with '_1', '_2', etc. 
296 | else: 297 | output = [name + '_' + str(o) for o in range(num_cols)] 298 | else: 299 | output = [name] 300 | 301 | if prefix == suffix == "": 302 | return output 303 | 304 | return ['{}{}{}'.format(prefix, x, suffix) for x in output] 305 | 306 | def get_dtypes(self, extracted): 307 | dtypes_features = [self.get_dtype(ex) for ex in extracted] 308 | return [dtype for dtype_feature in dtypes_features 309 | for dtype in dtype_feature] 310 | 311 | def get_dtype(self, ex): 312 | if isinstance(ex, np.ndarray) or sparse.issparse(ex): 313 | return [ex.dtype] * ex.shape[1] 314 | elif isinstance(ex, pd.DataFrame): 315 | return list(ex.dtypes) 316 | else: 317 | raise TypeError(type(ex)) 318 | 319 | def _transform(self, X, y=None, do_fit=False): 320 | """ 321 | Transform the given data with possibility to fit in advance. 322 | Avoids code duplication for implementation of transform and 323 | fit_transform. 324 | """ 325 | if do_fit: 326 | self._build(X=X) 327 | 328 | extracted = [] 329 | transformed_names_ = [] 330 | for columns, transformers, options in self.built_features: 331 | input_df = options.get('input_df', self.input_df) 332 | 333 | # columns could be a string or list of 334 | # strings; we don't care because pandas 335 | # will handle either. 336 | Xt = self._get_col_subset(X, columns, input_df) 337 | 338 | if transformers is not None: 339 | with add_column_names_to_exception(columns): 340 | if do_fit and hasattr(transformers, 'fit_transform'): 341 | t1 = datetime.now() 342 | Xt = _call_fit(transformers.fit_transform, Xt, y) 343 | logger.info(f"[FIT_TRANSFORM] {columns}: {_elapsed_secs(t1)} secs") # NOQA 344 | else: 345 | if do_fit: 346 | t1 = datetime.now() 347 | _call_fit(transformers.fit, Xt, y) 348 | logger.info( 349 | f"[FIT] {columns}: {_elapsed_secs(t1)} secs") 350 | 351 | t1 = datetime.now() 352 | Xt = transformers.transform(Xt) 353 | logger.info(f"[TRANSFORM] {columns}: {_elapsed_secs(t1)} secs") # NOQA 354 | 355 | extracted.append(_handle_feature(Xt)) 356 | 357 | alias = options.get('alias') 358 | 359 | prefix = options.get('prefix', '') 360 | suffix = options.get('suffix', '') 361 | 362 | transformed_names_ += self.get_names( 363 | columns, transformers, Xt, alias, prefix, suffix) 364 | 365 | # handle features not explicitly selected 366 | if self.built_default is not False: 367 | unsel_cols = self._unselected_columns(X) 368 | Xt = self._get_col_subset(X, unsel_cols, self.input_df) 369 | if self.built_default is not None: 370 | with add_column_names_to_exception(unsel_cols): 371 | if do_fit and hasattr(self.built_default, 'fit_transform'): 372 | Xt = _call_fit(self.built_default.fit_transform, Xt, y) 373 | else: 374 | if do_fit: 375 | _call_fit(self.built_default.fit, Xt, y) 376 | Xt = self.built_default.transform(Xt) 377 | transformed_names_ += self.get_names( 378 | unsel_cols, self.built_default, Xt) 379 | else: 380 | # if not applying a default transformer, 381 | # keep column names unmodified 382 | transformed_names_ += unsel_cols 383 | 384 | extracted.append(_handle_feature(Xt)) 385 | 386 | self.transformed_names_ = transformed_names_ 387 | 388 | # combine the feature outputs into one array. 389 | # at this point we lose track of which features 390 | # were created from which input columns, so it's 391 | # assumed that that doesn't matter to the model. 392 | 393 | # If any of the extracted features is sparse, combine sparsely. 394 | # Otherwise, combine as normal arrays. 
395 |         if any(sparse.issparse(fea) for fea in extracted):
396 |             stacked = sparse.hstack(extracted).tocsr()
397 |             # return a sparse matrix only if the mapper was initialized
398 |             # with sparse=True
399 |             if not self.sparse:
400 |                 stacked = stacked.toarray()
401 |         else:
402 |             stacked = np.hstack(extracted)
403 | 
404 |         if self.df_out:
405 |             # if no rows were dropped preserve the original index,
406 |             # otherwise use a new integer one
407 |             no_rows_dropped = len(X) == len(stacked)
408 |             if no_rows_dropped:
409 |                 index = X.index
410 |             else:
411 |                 index = None
412 | 
413 |             # output different data types, if appropriate
414 |             dtypes = self.get_dtypes(extracted)
415 |             df_out = pd.DataFrame(
416 |                 stacked,
417 |                 columns=self.transformed_names_,
418 |                 index=index)
419 |             # preserve types
420 |             for col, dtype in zip(self.transformed_names_, dtypes):
421 |                 df_out[col] = df_out[col].astype(dtype)
422 |             return df_out
423 |         else:
424 |             return stacked
425 | 
426 |     def transform(self, X):
427 |         """
428 |         Transform the given data. Assumes that fit has already been called.
429 | 
430 |         X       the data to transform
431 |         """
432 |         return self._transform(X)
433 | 
434 |     def fit_transform(self, X, y=None):
435 |         """
436 |         Fit a transformation from the pipeline and directly apply
437 |         it to the given data.
438 | 
439 |         X       the data to fit
440 | 
441 |         y       the target vector relative to X, optional
442 |         """
443 |         return self._transform(X, y, True)
444 | 
--------------------------------------------------------------------------------
/sklearn_pandas/features_generator.py:
--------------------------------------------------------------------------------
1 | def gen_features(columns, classes=None, prefix='', suffix=''):
2 |     """Generates a feature definition list which can be passed
3 |     into DataFrameMapper
4 | 
5 |     Params:
6 | 
7 |     columns     a list of column names to generate features for.
8 | 
9 |     classes     a list of classes for each feature, a list of dictionaries with
10 |                 transformer class and init parameters, or None.
11 | 
12 |                 If a list of classes is provided, then each of them is
13 |                 instantiated with default arguments. Example:
14 | 
15 |                     classes = [StandardScaler, LabelBinarizer]
16 | 
17 |                 If a list of dictionaries is provided, then each of them should
18 |                 have a 'class' key with the transformer class. All other keys are
19 |                 passed into the 'class' value constructor. Example:
20 | 
21 |                     classes = [
22 |                         {'class': StandardScaler, 'with_mean': False},
23 |                         {'class': LabelBinarizer}
24 |                     ]
25 | 
26 |                 If None is selected, then each feature is left as is.
27 | 
28 |     prefix      add prefix to transformed column names
29 | 
30 |     suffix      add suffix to transformed column names.
31 | 32 | """ 33 | if classes is None: 34 | return [(column, None) for column in columns] 35 | 36 | feature_defs = [] 37 | 38 | for column in columns: 39 | feature_transformers = [] 40 | 41 | arguments = {} 42 | if prefix and prefix != "": 43 | arguments['prefix'] = prefix 44 | if suffix and suffix != "": 45 | arguments['suffix'] = suffix 46 | 47 | classes = [cls for cls in classes if cls is not None] 48 | if not classes: 49 | feature_defs.append((column, None, arguments)) 50 | 51 | else: 52 | for definition in classes: 53 | if isinstance(definition, dict): 54 | params = definition.copy() 55 | klass = params.pop('class') 56 | feature_transformers.append(klass(**params)) 57 | else: 58 | feature_transformers.append(definition()) 59 | 60 | if not feature_transformers: 61 | feature_transformers = None 62 | 63 | feature_defs.append((column, feature_transformers, arguments)) 64 | 65 | return feature_defs 66 | -------------------------------------------------------------------------------- /sklearn_pandas/pipeline.py: -------------------------------------------------------------------------------- 1 | import six 2 | from sklearn.pipeline import _name_estimators, Pipeline 3 | from sklearn.utils import tosequence 4 | 5 | 6 | def _call_fit(fit_method, X, y=None, **kwargs): 7 | """ 8 | helper function, calls the fit or fit_transform method with the correct 9 | number of parameters 10 | 11 | fit_method: fit or fit_transform method of the transformer 12 | X: the data to fit 13 | y: the target vector relative to X, optional 14 | kwargs: any keyword arguments to the fit method 15 | 16 | return: the result of the fit or fit_transform method 17 | 18 | WARNING: if this function raises a TypeError exception, test the fit 19 | or fit_transform method passed to it in isolation as _call_fit will not 20 | distinguish TypeError due to incorrect number of arguments from 21 | other TypeError 22 | """ 23 | try: 24 | return fit_method(X, y, **kwargs) 25 | except TypeError: 26 | # fit takes only one argument 27 | return fit_method(X, **kwargs) 28 | 29 | 30 | class TransformerPipeline(Pipeline): 31 | """ 32 | Pipeline that expects all steps to be transformers taking a single X 33 | argument, an optional y argument, and having fit and transform methods. 
34 | 35 | Code is adapted from sklearn's Pipeline. 36 | """ 37 | 38 | def __init__(self, steps): 39 | names, estimators = zip(*steps) 40 | if len(dict(steps)) != len(steps): 41 | raise ValueError( 42 | "Provided step names are not unique: %s" % (names,)) 43 | 44 | # shallow copy of steps 45 | self.steps = tosequence(steps) 46 | estimator = estimators[-1] 47 | 48 | for e in estimators: 49 | if (not (hasattr(e, "fit") or hasattr(e, "fit_transform")) or not 50 | hasattr(e, "transform")): 51 | raise TypeError("All steps of the chain should " 52 | "be transforms and implement fit and transform;" 53 | " '%s' (type %s) doesn't" % (e, type(e))) 54 | 55 | if not hasattr(estimator, "fit"): 56 | raise TypeError("Last step of chain should implement fit; " 57 | "'%s' (type %s) doesn't" 58 | % (estimator, type(estimator))) 59 | 60 | def _pre_transform(self, X, y=None, **fit_params): 61 | fit_params_steps = dict((step, {}) for step, _ in self.steps) 62 | for pname, pval in six.iteritems(fit_params): 63 | step, param = pname.split('__', 1) 64 | fit_params_steps[step][param] = pval 65 | Xt = X 66 | for name, transform in self.steps[:-1]: 67 | if hasattr(transform, "fit_transform"): 68 | Xt = _call_fit(transform.fit_transform, 69 | Xt, y, **fit_params_steps[name]) 70 | else: 71 | Xt = _call_fit(transform.fit, 72 | Xt, y, **fit_params_steps[name]).transform(Xt) 73 | return Xt, fit_params_steps[self.steps[-1][0]] 74 | 75 | def fit(self, X, y=None, **fit_params): 76 | Xt, fit_params = self._pre_transform(X, y, **fit_params) 77 | _call_fit(self.steps[-1][-1].fit, Xt, y, **fit_params) 78 | return self 79 | 80 | def fit_transform(self, X, y=None, **fit_params): 81 | Xt, fit_params = self._pre_transform(X, y, **fit_params) 82 | if hasattr(self.steps[-1][-1], 'fit_transform'): 83 | return _call_fit(self.steps[-1][-1].fit_transform, 84 | Xt, y, **fit_params) 85 | else: 86 | return _call_fit(self.steps[-1][-1].fit, 87 | Xt, y, **fit_params).transform(Xt) 88 | 89 | 90 | def make_transformer_pipeline(*steps): 91 | """Construct a TransformerPipeline from the given estimators. 92 | """ 93 | return TransformerPipeline(_name_estimators(steps)) 94 | -------------------------------------------------------------------------------- /sklearn_pandas/transformers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.base import TransformerMixin 4 | import warnings 5 | 6 | 7 | def _get_mask(X, value): 8 | """ 9 | Compute the boolean mask X == value. 10 | """ 11 | if value == "NaN" or \ 12 | value is None or \ 13 | (isinstance(value, float) and np.isnan(value)): 14 | return pd.isnull(X) 15 | else: 16 | return X == value 17 | 18 | 19 | class NumericalTransformer(TransformerMixin): 20 | """ 21 | Provides commonly used numerical transformers. 22 | """ 23 | SUPPORTED_FUNCTIONS = ['log', 'log1p'] 24 | 25 | def __init__(self, func): 26 | """ 27 | Params 28 | 29 | func function to apply to the input columns. The function is 30 | applied to each value. Supported functions are defined 31 | in the SUPPORTED_FUNCTIONS variable. Raises an 32 | AssertionError if the function is not supported. 33 | """ 34 | 35 | warnings.warn(""" 36 | NumericalTransformer will be deprecated in version 3.0.
37 | Please use sklearn.base.TransformerMixin to write 38 | custom transformers 39 | """, DeprecationWarning) 40 | 41 | assert func in self.SUPPORTED_FUNCTIONS, \ 42 | f"Only the following funcs are supported: {self.SUPPORTED_FUNCTIONS}" 43 | super(NumericalTransformer, self).__init__() 44 | self.__func = func 45 | 46 | def fit(self, X, y=None): 47 | return self 48 | 49 | def transform(self, X, y=None): 50 | if self.__func == 'log1p': 51 | return np.vectorize(np.log1p)(X) 52 | elif self.__func == 'log': 53 | return np.vectorize(np.log)(X) 54 | 55 | raise ValueError(f"Invalid function name: {self.__func}") 56 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from unittest.mock import Mock 3 | import numpy as np 4 | import pandas as pd 5 | from sklearn_pandas import DataFrameMapper 6 | from sklearn.compose import make_column_selector 7 | from sklearn.preprocessing import StandardScaler 8 | 9 | 10 | class GetStartWith: 11 | def __init__(self, start_str): 12 | self.start_str = start_str 13 | 14 | def __call__(self, X: pd.DataFrame) -> list: 15 | return [c for c in X.columns if c.startswith(self.start_str)] 16 | 17 | 18 | df = pd.DataFrame({ 19 | 'sepal length (cm)': [1.0, 2.0, 3.0], 20 | 'sepal width (cm)': [1.0, 2.0, 3.0], 21 | 'petal length (cm)': [1.0, 2.0, 3.0], 22 | 'petal width (cm)': [1.0, 2.0, 3.0] 23 | }) 24 | t = DataFrameMapper([ 25 | (make_column_selector(dtype_include=float), StandardScaler(), {'alias': 'x'}), 26 | (GetStartWith('petal'), None, {'alias': 'petal'}) 27 | ], df_out=True, default=False) 28 | 29 | t.fit(df) 30 | print(t.transform(df).shape) 31 | -------------------------------------------------------------------------------- /tests/test_data/cars.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scikit-learn-contrib/sklearn-pandas/c9db2d6dcbf515eade751073f43318e43cae5177/tests/test_data/cars.csv.gz -------------------------------------------------------------------------------- /tests/test_dataframe_mapper.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf8 -*- 2 | 3 | import pytest 4 | from unittest.mock import Mock 5 | from pandas import DataFrame 6 | import pandas as pd 7 | from scipy import sparse 8 | from sklearn.datasets import load_iris 9 | from sklearn.pipeline import Pipeline 10 | from sklearn.model_selection import cross_val_score 11 | from sklearn.svm import SVC 12 | from sklearn.feature_extraction.text import CountVectorizer 13 | from sklearn.feature_extraction import DictVectorizer 14 | from sklearn.preprocessing import ( 15 | StandardScaler, OneHotEncoder, LabelBinarizer) 16 | from sklearn.impute import SimpleImputer as Imputer 17 | from sklearn.feature_selection import SelectKBest, chi2 18 | from sklearn.base import BaseEstimator, TransformerMixin 19 | import sklearn.decomposition 20 | import numpy as np 21 | from numpy.testing import assert_array_equal 22 | import pickle 23 | from sklearn.compose import make_column_selector 24 | 25 | from sklearn_pandas import DataFrameMapper 26 | from sklearn_pandas.dataframe_mapper import _handle_feature, _build_transformer 27 | from sklearn_pandas.pipeline import TransformerPipeline 28 | 29 | 30 | class MockXTransformer(object): 31 | """ 32 | Mock transformer that accepts no y argument.
33 | """ 34 | def fit(self, X): 35 | return self 36 | 37 | def transform(self, X): 38 | return X 39 | 40 | 41 | class MockTClassifier(object): 42 | """ 43 | Mock transformer/classifier. 44 | """ 45 | def fit(self, X, y=None): 46 | return self 47 | 48 | def transform(self, X): 49 | return X 50 | 51 | def predict(self, X): 52 | return True 53 | 54 | 55 | class DateEncoder(): 56 | def fit(self, X, y=None): 57 | return self 58 | 59 | def transform(self, X): 60 | dt = X.dt 61 | return pd.concat([dt.year, dt.month, dt.day], axis=1) 62 | 63 | 64 | class ToSparseTransformer(BaseEstimator, TransformerMixin): 65 | """ 66 | Transforms numpy matrix to sparse format. 67 | """ 68 | def fit(self, X): 69 | return self 70 | 71 | def transform(self, X): 72 | return sparse.csr_matrix(X) 73 | 74 | 75 | class CustomTransformer(BaseEstimator, TransformerMixin): 76 | """ 77 | Example of transformer in which the number of classes 78 | is not equals to the number of output columns. 79 | """ 80 | def fit(self, X, y=None): 81 | self.min = X.min() 82 | self.classes_ = np.unique(X) 83 | return self 84 | 85 | def transform(self, X): 86 | classes = np.unique(X) 87 | if len(np.setdiff1d(classes, self.classes_)) > 0: 88 | raise ValueError('Unknown values found.') 89 | return X - self.min 90 | 91 | 92 | class MockImageTransformer(BaseEstimator, TransformerMixin): 93 | """ 94 | Example transformer that takes the max of a 2d vector 95 | then scales the result. 96 | """ 97 | def __init__(self, multiplier=10.0): 98 | self.multiplier = multiplier 99 | 100 | def fit(self, X, y=None): 101 | return self 102 | 103 | def transform(self, X): 104 | assert isinstance(X, pd.DataFrame) 105 | for col in X.columns: 106 | X[col] = X[col].map(lambda img: np.max(img)) 107 | return X * self.multiplier 108 | 109 | 110 | @pytest.fixture 111 | def simple_dataframe(): 112 | return pd.DataFrame({'a': [1, 2, 3]}) 113 | 114 | 115 | @pytest.fixture 116 | def complex_dataframe(): 117 | return pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'c'], 118 | 'feat1': [1, 2, 3, 4, 5, 6], 119 | 'feat2': [1, 2, 3, 2, 3, 4]}) 120 | 121 | 122 | @pytest.fixture 123 | def complex_object_dataframe(): 124 | return pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'c'], 125 | 'feat1': [1, 2, 3, 4, 5, 6], 126 | 'feat2': [1, 2, 3, 2, 3, 4], 127 | 'img2d': [1*np.eye(2), 2*np.eye(2), 3*np.eye(2), 128 | 4*np.eye(2), 5*np.eye(2), 6*np.eye(2)]}) 129 | 130 | 131 | @pytest.fixture 132 | def multiindex_dataframe(): 133 | """Example MultiIndex DataFrame, taken from pandas documentation 134 | """ 135 | iterables = [['bar', 'baz', 'foo', 'qux'], ['one', 'two']] 136 | index = pd.MultiIndex.from_product(iterables, names=['first', 'second']) 137 | df = pd.DataFrame(np.random.randn(10, 8), columns=index) 138 | return df 139 | 140 | 141 | @pytest.fixture 142 | def multiindex_dataframe_incomplete(multiindex_dataframe): 143 | """Example MultiIndex DataFrame with missing entries 144 | """ 145 | df = multiindex_dataframe 146 | mask_array = np.zeros(df.size) 147 | mask_array[:20] = 1 148 | np.random.shuffle(mask_array) 149 | mask = mask_array.reshape(df.shape).astype(bool) 150 | df.mask(mask, inplace=True) 151 | return df 152 | 153 | 154 | def test_transformed_names_simple(simple_dataframe): 155 | """ 156 | Get transformed names of features in `transformed_names` attribute 157 | for simple transformation 158 | """ 159 | df = simple_dataframe 160 | mapper = DataFrameMapper([('a', None)]) 161 | mapper.fit_transform(df) 162 | assert mapper.transformed_names_ == ['a'] 163 | 164 | 165 | def 
test_transformed_names_binarizer(complex_dataframe): 166 | """ 167 | Get transformed names of features in `transformed_names` attribute 168 | for a transformation that multiplies the number of columns 169 | """ 170 | df = complex_dataframe 171 | mapper = DataFrameMapper([('target', LabelBinarizer())]) 172 | mapper.fit_transform(df) 173 | assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c'] 174 | 175 | 176 | def test_logging(caplog, complex_dataframe): 177 | """ 178 | Get transformed names of features in `transformed_names` attribute 179 | for a transformation that multiplies the number of columns 180 | """ 181 | import logging 182 | logger = logging.getLogger('sklearn_pandas') 183 | logger.setLevel(logging.INFO) 184 | df = complex_dataframe 185 | mapper = DataFrameMapper([('target', LabelBinarizer())]) 186 | mapper.fit_transform(df) 187 | assert '[FIT_TRANSFORM] target:' in caplog.text 188 | 189 | 190 | def test_transformed_names_binarizer_unicode(): 191 | df = pd.DataFrame({'target': [u'ñ', u'á', u'é']}) 192 | mapper = DataFrameMapper([('target', LabelBinarizer())]) 193 | mapper.fit_transform(df) 194 | expected_names = {u'target_ñ', u'target_á', u'target_é'} 195 | assert set(mapper.transformed_names_) == expected_names 196 | 197 | 198 | def test_transformed_names_transformers_list(complex_dataframe): 199 | """ 200 | When using a list of transformers, use them in inverse order to get the 201 | transformed names 202 | """ 203 | df = complex_dataframe 204 | mapper = DataFrameMapper([ 205 | ('target', [LabelBinarizer(), MockXTransformer()]) 206 | ]) 207 | mapper.fit_transform(df) 208 | assert mapper.transformed_names_ == ['target_a', 'target_b', 'target_c'] 209 | 210 | 211 | def test_transformed_names_simple_alias(simple_dataframe): 212 | """ 213 | If we specify an alias for a single output column, it is used for the 214 | output 215 | """ 216 | df = simple_dataframe 217 | mapper = DataFrameMapper([('a', None, {'alias': 'new_name'})]) 218 | mapper.fit_transform(df) 219 | assert mapper.transformed_names_ == ['new_name'] 220 | 221 | 222 | def test_transformed_names_complex_alias(complex_dataframe): 223 | """ 224 | If we specify an alias for a multiple output column, it is used for the 225 | output 226 | """ 227 | df = complex_dataframe 228 | mapper = DataFrameMapper([('target', LabelBinarizer(), {'alias': 'new'})]) 229 | mapper.fit_transform(df) 230 | assert mapper.transformed_names_ == ['new_a', 'new_b', 'new_c'] 231 | 232 | 233 | def test_exception_column_context_transform(simple_dataframe): 234 | """ 235 | If an exception is raised when transforming a column, 236 | the exception includes the name of the column being transformed 237 | """ 238 | class FailingTransformer(object): 239 | def fit(self, X): 240 | pass 241 | 242 | def transform(self, X): 243 | raise Exception('Some exception') 244 | 245 | df = simple_dataframe 246 | mapper = DataFrameMapper([('a', FailingTransformer())]) 247 | mapper.fit(df) 248 | 249 | with pytest.raises(Exception, match='a: Some exception'): 250 | mapper.transform(df) 251 | 252 | 253 | def test_exception_column_context_fit(simple_dataframe): 254 | """ 255 | If an exception is raised when fit a column, 256 | the exception includes the name of the column being fitted 257 | """ 258 | class FailingFitter(object): 259 | def fit(self, X): 260 | raise Exception('Some exception') 261 | 262 | df = simple_dataframe 263 | mapper = DataFrameMapper([('a', FailingFitter())]) 264 | 265 | with pytest.raises(Exception, match='a: Some exception'): 266 | 
mapper.fit(df) 267 | 268 | 269 | def test_simple_df(simple_dataframe): 270 | """ 271 | Get a dataframe from a simple mapped dataframe 272 | """ 273 | df = simple_dataframe 274 | mapper = DataFrameMapper([('a', None)], df_out=True) 275 | transformed = mapper.fit_transform(df) 276 | assert type(transformed) == pd.DataFrame 277 | assert len(transformed["a"]) == len(simple_dataframe["a"]) 278 | 279 | 280 | def test_complex_df(complex_dataframe): 281 | """ 282 | Get a dataframe from a complex mapped dataframe 283 | """ 284 | df = complex_dataframe 285 | mapper = DataFrameMapper( 286 | [('target', None), ('feat1', None), ('feat2', None)], 287 | df_out=True) 288 | transformed = mapper.fit_transform(df) 289 | assert len(transformed) == len(complex_dataframe) 290 | for c in df.columns: 291 | assert len(transformed[c]) == len(df[c]) 292 | 293 | 294 | def test_complex_object_df(complex_object_dataframe): 295 | """ 296 | Get a dataframe from a complex dataframe with 2d features 297 | """ 298 | df = complex_object_dataframe 299 | img_scale = 10 300 | mapper = DataFrameMapper( 301 | [('target', None), ('feat1', None), 302 | (make_column_selector('feat2'), StandardScaler()), 303 | (make_column_selector('img2d'), MockImageTransformer(img_scale))], 304 | df_out=True, input_df=True) 305 | transformed = mapper.fit_transform(df) 306 | assert len(transformed) == len(complex_object_dataframe) 307 | assert np.isclose( 308 | np.sum(transformed['img2d']), 309 | np.max(np.sum(df['img2d'])) * img_scale, atol=1e-12) 310 | 311 | 312 | def test_numeric_column_names(complex_dataframe): 313 | """ 314 | Get a dataframe from a complex mapped dataframe with numeric column names 315 | """ 316 | df = complex_dataframe 317 | df.columns = [0, 1, 2] 318 | mapper = DataFrameMapper( 319 | [(0, None), (1, None), (2, None)], df_out=True) 320 | transformed = mapper.fit_transform(df) 321 | assert len(transformed) == len(complex_dataframe) 322 | for c in df.columns: 323 | assert len(transformed[c]) == len(df[c]) 324 | 325 | 326 | def test_multiindex_df(multiindex_dataframe_incomplete): 327 | """ 328 | Get a dataframe from a multiindex dataframe with missing data 329 | """ 330 | df = multiindex_dataframe_incomplete 331 | mapper = DataFrameMapper([([c], Imputer()) for c in df.columns], 332 | df_out=True) 333 | transformed = mapper.fit_transform(df) 334 | assert len(transformed) == len(multiindex_dataframe_incomplete) 335 | for c in df.columns: 336 | assert len(transformed[str(c)]) == len(df[c]) 337 | 338 | 339 | def test_binarizer_df(): 340 | """ 341 | Check level names from LabelBinarizer 342 | """ 343 | df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'c', 'a']}) 344 | mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True) 345 | transformed = mapper.fit_transform(df) 346 | cols = transformed.columns 347 | assert len(cols) == 3 348 | assert cols[0] == 'target_a' 349 | assert cols[1] == 'target_b' 350 | assert cols[2] == 'target_c' 351 | 352 | 353 | def test_binarizer_int_df(): 354 | """ 355 | Check level names from LabelBinarizer for a numeric array. 
356 | """ 357 | df = pd.DataFrame({'target': [5, 5, 6, 6, 7, 5]}) 358 | mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True) 359 | transformed = mapper.fit_transform(df) 360 | cols = transformed.columns 361 | assert len(cols) == 3 362 | assert cols[0] == 'target_5' 363 | assert cols[1] == 'target_6' 364 | assert cols[2] == 'target_7' 365 | 366 | 367 | def test_binarizer2_df(): 368 | """ 369 | Check level names from LabelBinarizer with just one output column 370 | """ 371 | df = pd.DataFrame({'target': ['a', 'a', 'b', 'b', 'a']}) 372 | mapper = DataFrameMapper([('target', LabelBinarizer())], df_out=True) 373 | transformed = mapper.fit_transform(df) 374 | cols = transformed.columns 375 | assert len(cols) == 1 376 | assert cols[0] == 'target' 377 | 378 | 379 | def test_onehot_df(): 380 | """ 381 | Check level ids from one-hot 382 | """ 383 | df = pd.DataFrame({'target': [0, 0, 1, 1, 2, 3, 0]}) 384 | mapper = DataFrameMapper([(['target'], OneHotEncoder())], df_out=True) 385 | transformed = mapper.fit_transform(df) 386 | cols = transformed.columns 387 | assert len(cols) == 4 388 | assert cols[0] == 'target_x0_0' 389 | assert cols[3] == 'target_x0_3' 390 | 391 | 392 | def test_customtransform_df(): 393 | """ 394 | Check level ids from a transformer in which 395 | the number of classes is not equals to the number of output columns. 396 | """ 397 | df = pd.DataFrame({'target': [6, 5, 7, 5, 4, 8, 8]}) 398 | mapper = DataFrameMapper([(['target'], CustomTransformer())], df_out=True) 399 | transformed = mapper.fit_transform(df) 400 | cols = transformed.columns 401 | assert len(mapper.features[0][1].classes_) == 5 402 | assert len(cols) == 1 403 | assert cols[0] == 'target' 404 | 405 | 406 | def test_preserve_df_index(): 407 | """ 408 | The index is preserved when df_out=True 409 | """ 410 | df = pd.DataFrame({'target': [1, 2, 3]}, 411 | index=['a', 'b', 'c']) 412 | mapper = DataFrameMapper([('target', None)], 413 | df_out=True) 414 | 415 | transformed = mapper.fit_transform(df) 416 | 417 | assert_array_equal(transformed.index, df.index) 418 | 419 | 420 | def test_preserve_df_index_rows_dropped(): 421 | """ 422 | If df_out=True but the original df index length doesn't 423 | match the number of final rows, use a numeric index 424 | """ 425 | class DropLastRowTransformer(object): 426 | def fit(self, X): 427 | return self 428 | 429 | def transform(self, X): 430 | return X[:-1] 431 | 432 | df = pd.DataFrame({'target': [1, 2, 3]}, 433 | index=['a', 'b', 'c']) 434 | mapper = DataFrameMapper([('target', DropLastRowTransformer())], 435 | df_out=True) 436 | 437 | transformed = mapper.fit_transform(df) 438 | 439 | assert_array_equal(transformed.index, np.array([0, 1])) 440 | 441 | 442 | def test_pca(complex_dataframe): 443 | """ 444 | Check multi in and out with PCA 445 | """ 446 | df = complex_dataframe 447 | mapper = DataFrameMapper( 448 | [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))], 449 | df_out=True) 450 | transformed = mapper.fit_transform(df) 451 | cols = transformed.columns 452 | assert len(cols) == 2 453 | assert cols[0] == 'feat1_feat2_0' 454 | assert cols[1] == 'feat1_feat2_1' 455 | 456 | 457 | def test_fit_transform(simple_dataframe): 458 | """ 459 | Check that custom fit_transform methods of the transformers are invoked. 
460 | """ 461 | df = simple_dataframe 462 | mock_transformer = Mock() 463 | # return something of measurable length but does nothing 464 | mock_transformer.fit_transform.return_value = np.array([1, 2, 3]) 465 | mapper = DataFrameMapper([("a", mock_transformer)]) 466 | mapper.fit_transform(df) 467 | assert mock_transformer.fit_transform.called 468 | 469 | 470 | def test_fit_transform_equiv_mock(simple_dataframe): 471 | """ 472 | Check for equivalent results for code paths fit_transform 473 | versus fit and transform in DataFrameMapper using the mock 474 | transformer which does not implement a custom fit_transform. 475 | """ 476 | df = simple_dataframe 477 | mapper = DataFrameMapper([('a', MockXTransformer())]) 478 | transformed_combined = mapper.fit_transform(df) 479 | transformed_separate = mapper.fit(df).transform(df) 480 | assert np.all(transformed_combined == transformed_separate) 481 | 482 | 483 | def test_fit_transform_equiv_pca(complex_dataframe): 484 | """ 485 | Check for equivalent results for code paths fit_transform 486 | versus fit and transform in DataFrameMapper and transformer 487 | using PCA which implements a custom fit_transform. The 488 | equivalence of both paths in the transformer only can be 489 | asserted since this is tested in the sklearn tests 490 | scikit-learn/sklearn/decomposition/tests/test_pca.py 491 | """ 492 | df = complex_dataframe 493 | mapper = DataFrameMapper( 494 | [(['feat1', 'feat2'], sklearn.decomposition.PCA(2))], 495 | df_out=True) 496 | transformed_combined = mapper.fit_transform(df) 497 | transformed_separate = mapper.fit(df).transform(df) 498 | assert np.allclose(transformed_combined, transformed_separate) 499 | 500 | 501 | def test_input_df_true_first_transformer(simple_dataframe, monkeypatch): 502 | """ 503 | If input_df is True, the first transformer is passed 504 | a pd.Series instead of an np.array 505 | """ 506 | df = simple_dataframe 507 | monkeypatch.setattr(MockXTransformer, 'fit', Mock()) 508 | monkeypatch.setattr(MockXTransformer, 'transform', 509 | Mock(return_value=np.array([1, 2, 3]))) 510 | mapper = DataFrameMapper([ 511 | ('a', MockXTransformer()) 512 | ], input_df=True) 513 | out = mapper.fit_transform(df) 514 | 515 | args, _ = MockXTransformer().fit.call_args 516 | assert isinstance(args[0], pd.Series) 517 | 518 | args, _ = MockXTransformer().transform.call_args 519 | assert isinstance(args[0], pd.Series) 520 | 521 | assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1)) 522 | 523 | 524 | def test_input_df_true_next_transformers(simple_dataframe, monkeypatch): 525 | """ 526 | If input_df is True, the subsequent transformers get passed pandas 527 | objects instead of numpy arrays (given the previous transformers 528 | output pandas objects as well) 529 | """ 530 | df = simple_dataframe 531 | monkeypatch.setattr(MockTClassifier, 'fit', Mock()) 532 | monkeypatch.setattr(MockTClassifier, 'transform', 533 | Mock(return_value=pd.Series([1, 2, 3]))) 534 | mapper = DataFrameMapper([ 535 | ('a', [MockXTransformer(), MockTClassifier()]) 536 | ], input_df=True) 537 | mapper.fit(df) 538 | out = mapper.transform(df) 539 | 540 | args, _ = MockTClassifier().fit.call_args 541 | assert isinstance(args[0], pd.Series) 542 | 543 | assert_array_equal(out, np.array([1, 2, 3]).reshape(-1, 1)) 544 | 545 | 546 | def test_input_df_true_multiple_cols(complex_dataframe): 547 | """ 548 | When input_df is True, applying transformers to multiple columns 549 | works as expected 550 | """ 551 | df = complex_dataframe 552 | 553 | mapper = 
DataFrameMapper([ 554 | ('target', MockXTransformer()), 555 | ('feat1', MockXTransformer()), 556 | ], input_df=True) 557 | out = mapper.fit_transform(df) 558 | 559 | assert_array_equal(out[:, 0], df['target'].values) 560 | assert_array_equal(out[:, 1], df['feat1'].values) 561 | 562 | 563 | def test_input_df_date_encoder(): 564 | """ 565 | When input_df is True we can apply a transformer that only works 566 | with pandas dataframes like a DateEncoder 567 | """ 568 | df = pd.DataFrame( 569 | {'dates': pd.date_range('2015-10-30', '2015-11-02')}) 570 | mapper = DataFrameMapper([ 571 | ('dates', DateEncoder()) 572 | ], input_df=True) 573 | out = mapper.fit_transform(df) 574 | expected = np.array([ 575 | [2015, 10, 30], 576 | [2015, 10, 31], 577 | [2015, 11, 1], 578 | [2015, 11, 2] 579 | ]) 580 | assert_array_equal(out, expected) 581 | 582 | 583 | def test_local_input_df_date_encoder(): 584 | """ 585 | When input_df is True we can apply a transformer that only works 586 | with pandas dataframes like a DateEncoder 587 | """ 588 | df = pd.DataFrame( 589 | {'dates': pd.date_range('2015-10-30', '2015-11-02')}) 590 | mapper = DataFrameMapper([ 591 | ('dates', DateEncoder(), {'input_df': True}) 592 | ], input_df=False) 593 | out = mapper.fit_transform(df) 594 | expected = np.array([ 595 | [2015, 10, 30], 596 | [2015, 10, 31], 597 | [2015, 11, 1], 598 | [2015, 11, 2] 599 | ]) 600 | assert_array_equal(out, expected) 601 | 602 | 603 | def test_nonexistent_columns_explicit_fail(simple_dataframe): 604 | """ 605 | If a nonexistent column is selected, KeyError is raised. 606 | """ 607 | mapper = DataFrameMapper(None) 608 | with pytest.raises(KeyError): 609 | mapper._get_col_subset(simple_dataframe, ['nonexistent_feature']) 610 | 611 | 612 | def test_get_col_subset_single_column_array(simple_dataframe): 613 | """ 614 | Selecting a single column should return a 1-dimensional numpy array. 615 | """ 616 | mapper = DataFrameMapper(None) 617 | array = mapper._get_col_subset(simple_dataframe, "a") 618 | 619 | assert type(array) == np.ndarray 620 | assert array.shape == (len(simple_dataframe["a"]),) 621 | 622 | 623 | def test_get_col_subset_single_column_list(simple_dataframe): 624 | """ 625 | Selecting a list of columns (even if the list contains a single element) 626 | should return a 2-dimensional numpy array. 627 | """ 628 | mapper = DataFrameMapper(None) 629 | array = mapper._get_col_subset(simple_dataframe, ["a"]) 630 | 631 | assert type(array) == np.ndarray 632 | assert array.shape == (len(simple_dataframe["a"]), 1) 633 | 634 | 635 | def test_cols_string_array(simple_dataframe): 636 | """ 637 | If a string is specified as the columns, the transformer 638 | is called with a 1-d array as input. 639 | """ 640 | df = simple_dataframe 641 | mock_transformer = Mock() 642 | mapper = DataFrameMapper([("a", mock_transformer)]) 643 | 644 | mapper.fit(df) 645 | args, kwargs = mock_transformer.fit.call_args 646 | assert args[0].shape == (3,) 647 | 648 | 649 | def test_cols_list_column_vector(simple_dataframe): 650 | """ 651 | If a one-element list is specified as the columns, the transformer 652 | is called with a column vector as input. 653 | """ 654 | df = simple_dataframe 655 | mock_transformer = Mock() 656 | mapper = DataFrameMapper([(["a"], mock_transformer)]) 657 | 658 | mapper.fit(df) 659 | args, kwargs = mock_transformer.fit.call_args 660 | assert args[0].shape == (3, 1) 661 | 662 | 663 | def test_handle_feature_2dim(): 664 | """ 665 | 2-dimensional arrays are returned unchanged. 
666 | """ 667 | array = np.array([[1, 2], [3, 4]]) 668 | assert_array_equal(_handle_feature(array), array) 669 | 670 | 671 | def test_handle_feature_1dim(): 672 | """ 673 | 1-dimensional arrays are converted to 2-dimensional column vectors. 674 | """ 675 | array = np.array([1, 2]) 676 | assert_array_equal(_handle_feature(array), np.array([[1], [2]])) 677 | 678 | 679 | def test_build_transformers(): 680 | """ 681 | When a list of transformers is passed, return a pipeline with 682 | each element of the iterable as a step of the pipeline. 683 | """ 684 | transformers = [MockTClassifier(), MockTClassifier()] 685 | pipeline = _build_transformer(transformers) 686 | assert isinstance(pipeline, Pipeline) 687 | for ix, transformer in enumerate(transformers): 688 | assert pipeline.steps[ix][1] == transformer 689 | 690 | 691 | def test_selected_columns(): 692 | """ 693 | selected_columns returns a set of the columns appearing in the features 694 | of the mapper. 695 | """ 696 | mapper = DataFrameMapper([ 697 | ('a', None), 698 | (['a', 'b'], None) 699 | ]) 700 | assert mapper._selected_columns == {'a', 'b'} 701 | 702 | 703 | def test_unselected_columns(): 704 | """ 705 | unselected_columns returns a list of the columns not appearing in the 706 | features of the mapper but present in the given dataframe. 707 | """ 708 | df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]}) 709 | mapper = DataFrameMapper([ 710 | ('a', None), 711 | (['a', 'b'], None) 712 | ]) 713 | assert 'c' in mapper._unselected_columns(df) 714 | 715 | 716 | def test_drop_and_default_false(): 717 | """ 718 | If default=False, non explicitly selected columns and drop columns 719 | are discarded. 720 | """ 721 | df = pd.DataFrame({'a': [1], 'b': [2], 'c': [3]}) 722 | mapper = DataFrameMapper([ 723 | ('a', None) 724 | ], drop_cols=['c'], default=False) 725 | transformed = mapper.fit_transform(df) 726 | assert transformed.shape == (1, 1) 727 | assert mapper.transformed_names_ == ['a'] 728 | 729 | 730 | def test_drop_and_default_none(): 731 | """ 732 | If default=None, drop columns are discarded and 733 | remaining non explicitly selected columns are passed through untransformed 734 | """ 735 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 736 | mapper = DataFrameMapper([ 737 | ('a', None) 738 | ], drop_cols=['c'], default=None) 739 | 740 | transformed = mapper.fit_transform(df) 741 | assert transformed.shape == (3, 2) 742 | assert mapper.transformed_names_ == ['a', 'b'] 743 | 744 | 745 | def test_conflicting_drop(): 746 | """ 747 | Drop column name shouldn't get confused with transformed columns. 748 | """ 749 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 750 | mapper = DataFrameMapper([ 751 | ('a', None) 752 | ], drop_cols=['a'], default=False) 753 | 754 | transformed = mapper.fit_transform(df) 755 | assert transformed.shape == (3, 1) 756 | assert mapper.transformed_names_ == ['a'] 757 | 758 | 759 | def test_default_false(): 760 | """ 761 | If default=False, non explicitly selected columns are discarded. 762 | """ 763 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 764 | mapper = DataFrameMapper([ 765 | ('b', None) 766 | ], default=False) 767 | 768 | transformed = mapper.fit_transform(df) 769 | assert transformed.shape == (3, 1) 770 | 771 | 772 | def test_default_none(): 773 | """ 774 | If default=None, non explicitly selected columns are passed through 775 | untransformed. 
776 | """ 777 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 778 | mapper = DataFrameMapper([ 779 | (['a'], OneHotEncoder()) 780 | ], default=None) 781 | 782 | transformed = mapper.fit_transform(df) 783 | assert (transformed[:, 3] == np.array([3, 5, 7]).T).all() 784 | 785 | 786 | def test_default_none_names(): 787 | """ 788 | If default=None, column names are returned unmodified. 789 | """ 790 | df = pd.DataFrame({'a': [1, 2, 3], 'b': [3, 5, 7]}) 791 | mapper = DataFrameMapper([], default=None) 792 | 793 | mapper.fit_transform(df) 794 | assert mapper.transformed_names_ == ['a', 'b'] 795 | 796 | 797 | def test_default_transformer(): 798 | """ 799 | If default=Transformer, non explicitly selected columns are applied this 800 | transformer. 801 | """ 802 | df = pd.DataFrame({'a': [1, np.nan, 3], }) 803 | mapper = DataFrameMapper([], default=Imputer()) 804 | 805 | transformed = mapper.fit_transform(df) 806 | assert (transformed[: 0] == np.array([1., 2., 3.])).all() 807 | 808 | 809 | def test_list_transformers_single_arg(simple_dataframe): 810 | """ 811 | Multiple transformers can be specified in a list even if some of them 812 | only accept one X argument instead of two (X, y). 813 | """ 814 | mapper = DataFrameMapper([ 815 | ('a', [MockXTransformer()]) 816 | ]) 817 | # doesn't fail 818 | mapper.fit_transform(simple_dataframe) 819 | 820 | 821 | def test_list_transformers(): 822 | """ 823 | Specifying a list of transformers applies them sequentially to the 824 | selected column. 825 | """ 826 | dataframe = pd.DataFrame({"a": [1, np.nan, 3], "b": [1, 5, 7]}, 827 | dtype=np.float64) 828 | 829 | mapper = DataFrameMapper([ 830 | (["a"], [Imputer(), StandardScaler()]), 831 | (["b"], StandardScaler()), 832 | ]) 833 | dmatrix = mapper.fit_transform(dataframe) 834 | 835 | assert pd.isnull(dmatrix).sum() == 0 # no null values 836 | 837 | # all features have mean 0 and std deviation 1 (standardized) 838 | assert (abs(dmatrix.mean(axis=0) - 0) <= 1e-6).all() 839 | assert (abs(dmatrix.std(axis=0) - 1) <= 1e-6).all() 840 | 841 | 842 | def test_list_transformers_old_unpickle(simple_dataframe): 843 | mapper = DataFrameMapper(None) 844 | # simulate the mapper was created with < 1.0.0 code 845 | mapper.features = [('a', [MockXTransformer()])] 846 | mapper_pickled = pickle.dumps(mapper) 847 | 848 | loaded_mapper = pickle.loads(mapper_pickled) 849 | transformer = loaded_mapper.features[0][1] 850 | assert isinstance(transformer, TransformerPipeline) 851 | assert isinstance(transformer.steps[0][1], MockXTransformer) 852 | 853 | 854 | def test_sparse_features(simple_dataframe): 855 | """ 856 | If any of the extracted features is sparse and "sparse" argument 857 | is true, the hstacked result is also sparse. 858 | """ 859 | df = simple_dataframe 860 | mapper = DataFrameMapper([ 861 | ("a", ToSparseTransformer()) 862 | ], sparse=True) 863 | dmatrix = mapper.fit_transform(df) 864 | 865 | assert type(dmatrix) == sparse.csr.csr_matrix 866 | 867 | 868 | def test_sparse_off(simple_dataframe): 869 | """ 870 | If the resulting features are sparse but the "sparse" argument 871 | of the mapper is False, return a non-sparse matrix. 
872 | """ 873 | df = simple_dataframe 874 | mapper = DataFrameMapper([ 875 | ("a", ToSparseTransformer()) 876 | ], sparse=False) 877 | 878 | dmatrix = mapper.fit_transform(df) 879 | assert type(dmatrix) != sparse.csr.csr_matrix 880 | 881 | 882 | def test_fit_with_optional_y_arg(complex_dataframe): 883 | """ 884 | Transformers with an optional y argument in the fit method 885 | are handled correctly 886 | """ 887 | df = complex_dataframe 888 | mapper = DataFrameMapper([(['feat1', 'feat2'], MockTClassifier())]) 889 | # doesn't fail 890 | mapper.fit(df[['feat1', 'feat2']], df['target']) 891 | 892 | 893 | def test_fit_with_required_y_arg(complex_dataframe): 894 | """ 895 | Transformers with a required y argument in the fit method 896 | are handled and perform correctly 897 | """ 898 | df = complex_dataframe 899 | mapper = DataFrameMapper([(['feat1', 'feat2'], SelectKBest(chi2, k=1))]) 900 | 901 | # fit, doesn't fail 902 | ft_arr = mapper.fit(df[['feat1', 'feat2']], df['target']) 903 | 904 | # fit_transform 905 | ft_arr = mapper.fit_transform(df[['feat1', 'feat2']], df['target']) 906 | assert_array_equal(ft_arr, df[['feat1']].values) 907 | 908 | # transform 909 | t_arr = mapper.transform(df[['feat1', 'feat2']]) 910 | assert_array_equal(t_arr, df[['feat1']].values) 911 | 912 | 913 | # Integration tests with real dataframes 914 | 915 | @pytest.fixture 916 | def iris_dataframe(): 917 | iris = load_iris() 918 | return DataFrame( 919 | data={ 920 | iris.feature_names[0]: iris.data[:, 0], 921 | iris.feature_names[1]: iris.data[:, 1], 922 | iris.feature_names[2]: iris.data[:, 2], 923 | iris.feature_names[3]: iris.data[:, 3], 924 | "species": np.array([iris.target_names[e] for e in iris.target]) 925 | } 926 | ) 927 | 928 | 929 | @pytest.fixture 930 | def cars_dataframe(): 931 | return pd.read_csv("tests/test_data/cars.csv.gz", compression='gzip') 932 | 933 | 934 | def test_with_iris_dataframe(iris_dataframe): 935 | pipeline = Pipeline([ 936 | ("preprocess", DataFrameMapper([ 937 | ("petal length (cm)", None), 938 | ("petal width (cm)", None), 939 | ("sepal length (cm)", None), 940 | ("sepal width (cm)", None), 941 | ])), 942 | ("classify", SVC(kernel='linear')) 943 | ]) 944 | data = iris_dataframe.drop("species", axis=1) 945 | labels = iris_dataframe["species"] 946 | scores = cross_val_score(pipeline, data, labels) 947 | assert scores.mean() > 0.96 948 | assert (scores.std() * 2) < 0.04 949 | 950 | 951 | def test_dict_vectorizer(): 952 | df = pd.DataFrame( 953 | [[{'a': 1, 'b': 2}], [{'a': 3}]], 954 | columns=['colA'] 955 | ) 956 | 957 | outdf = DataFrameMapper( 958 | [('colA', DictVectorizer())], 959 | df_out=True, 960 | default=False 961 | ).fit_transform(df) 962 | 963 | columns = sorted(list(outdf.columns)) 964 | assert len(columns) == 2 965 | assert columns[0] == 'colA_a' 966 | assert columns[1] == 'colA_b' 967 | 968 | 969 | def test_with_car_dataframe(cars_dataframe): 970 | pipeline = Pipeline([ 971 | ("preprocess", DataFrameMapper([ 972 | ("description", CountVectorizer()), 973 | ])), 974 | ("classify", SVC(kernel='linear')) 975 | ]) 976 | data = cars_dataframe.drop("model", axis=1) 977 | labels = cars_dataframe["model"] 978 | scores = cross_val_score(pipeline, data, labels) 979 | assert scores.mean() > 0.30 980 | 981 | 982 | def test_direct_cross_validation(iris_dataframe): 983 | """ 984 | Starting with sklearn>=0.16.0 we no longer need CV wrappers for dataframes. 
985 | See https://github.com/paulgb/sklearn-pandas/issues/11 986 | """ 987 | pipeline = Pipeline([ 988 | ("preprocess", DataFrameMapper([ 989 | ("petal length (cm)", None), 990 | ("petal width (cm)", None), 991 | ("sepal length (cm)", None), 992 | ("sepal width (cm)", None), 993 | ])), 994 | ("classify", SVC(kernel='linear')) 995 | ]) 996 | data = iris_dataframe.drop("species", axis=1) 997 | labels = iris_dataframe["species"] 998 | scores = cross_val_score(pipeline, data, labels) 999 | assert scores.mean() > 0.96 1000 | assert (scores.std() * 2) < 0.04 1001 | 1002 | 1003 | def test_heterogeneous_output_types_input_df(): 1004 | """ 1005 | Modify feat2, but pass feat1 through unmodified. 1006 | This fails if input_df == False 1007 | """ 1008 | df = pd.DataFrame({ 1009 | 'feat1': [1, 2, 3, 4, 5, 6], 1010 | 'feat2': [1.0, 2.0, 3.0, 2.0, 3.0, 4.0] 1011 | }) 1012 | M = DataFrameMapper([ 1013 | (['feat2'], StandardScaler()) 1014 | ], input_df=True, df_out=True, default=None) 1015 | dft = M.fit_transform(df) 1016 | assert dft['feat1'].dtype == np.dtype('int64') 1017 | assert dft['feat2'].dtype == np.dtype('float64') 1018 | 1019 | 1020 | def test_make_column_selector(iris_dataframe): 1021 | t = DataFrameMapper([ 1022 | (make_column_selector(dtype_include=float), None, {'alias': 'x'}), 1023 | ('sepal length (cm)', None), 1024 | ], df_out=True, default=False) 1025 | 1026 | xt = t.fit(iris_dataframe).transform(iris_dataframe) 1027 | expected = ['x_0', 'x_1', 'x_2', 'x_3', 'sepal length (cm)'] 1028 | assert list(xt.columns) == expected 1029 | 1030 | pickled = pickle.dumps(t) 1031 | t2 = pickle.loads(pickled) 1032 | xt2 = t2.transform(iris_dataframe) 1033 | assert np.array_equal(xt.values, xt2.values) 1034 | -------------------------------------------------------------------------------- /tests/test_features_generator.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import pytest 4 | import numpy as np 5 | from pandas import DataFrame 6 | from numpy.testing import assert_array_equal 7 | 8 | from sklearn_pandas import DataFrameMapper 9 | from sklearn_pandas.features_generator import gen_features 10 | 11 | 12 | class MockClass(object): 13 | 14 | def __init__(self, value=1, name='class'): 15 | self.value = value 16 | self.name = name 17 | 18 | 19 | class MockTransformer(object): 20 | 21 | def __init__(self): 22 | self.most_common_ = None 23 | 24 | def fit(self, X, y=None): 25 | [(value, _)] = Counter(X).most_common(1) 26 | self.most_common_ = value 27 | return self 28 | 29 | def transform(self, X, y=None): 30 | return np.asarray([self.most_common_] * len(X)) 31 | 32 | 33 | @pytest.fixture 34 | def simple_dataset(): 35 | return DataFrame({ 36 | 'feat1': [1, 2, 1, 3, 1], 37 | 'feat2': [1, 2, 2, 2, 3], 38 | 'feat3': [1, 2, 3, 4, 5], 39 | }) 40 | 41 | 42 | def test_generate_features_with_default_parameters(): 43 | """ 44 | Tests generating features from classes with default init arguments. 45 | """ 46 | columns = ['colA', 'colB', 'colC'] 47 | feature_defs = gen_features(columns=columns, classes=[MockClass]) 48 | assert len(feature_defs) == len(columns) 49 | 50 | for feature in feature_defs: 51 | assert feature[2] == {} 52 | 53 | feature_dict = dict([_[0:2] for _ in feature_defs]) 54 | assert columns == sorted(feature_dict.keys()) 55 | 56 | # default init arguments for MockClass for clarification. 
57 | expected = {'value': 1, 'name': 'class'} 58 | for column, transformers in feature_dict.items(): 59 | for obj in transformers: 60 | assert_attributes(obj, **expected) 61 | 62 | 63 | def test_generate_features_with_several_classes(): 64 | """ 65 | Tests generating features pipeline with different transformers parameters. 66 | """ 67 | feature_defs = gen_features( 68 | columns=['colA', 'colB', 'colC'], 69 | classes=[ 70 | {'class': MockClass}, 71 | {'class': MockClass, 'name': 'mockA'}, 72 | {'class': MockClass, 'name': 'mockB', 'value': None} 73 | ] 74 | ) 75 | 76 | for col, transformers, params in feature_defs: 77 | assert_attributes(transformers[0], name='class', value=1) 78 | assert_attributes(transformers[1], name='mockA', value=1) 79 | assert_attributes(transformers[2], name='mockB', value=None) 80 | 81 | 82 | def test_generate_features_with_none_only_transformers(): 83 | """ 84 | Tests generating "dummy" feature definition which doesn't apply any 85 | transformation. 86 | """ 87 | feature_defs = gen_features( 88 | columns=['colA', 'colB', 'colC'], classes=[None]) 89 | 90 | expected = [('colA', None, {}), 91 | ('colB', None, {}), 92 | ('colC', None, {})] 93 | 94 | assert feature_defs == expected 95 | 96 | 97 | def test_compatibility_with_data_frame_mapper(simple_dataset): 98 | """ 99 | Tests compatibility of generated feature definition with DataFrameMapper. 100 | """ 101 | features_defs = gen_features( 102 | columns=['feat1', 'feat2'], 103 | classes=[MockTransformer]) 104 | features_defs.append(('feat3', None)) 105 | 106 | mapper = DataFrameMapper(features_defs) 107 | X = mapper.fit_transform(simple_dataset) 108 | expected = np.asarray([ 109 | [1, 2, 1], 110 | [1, 2, 2], 111 | [1, 2, 3], 112 | [1, 2, 4], 113 | [1, 2, 5] 114 | ]) 115 | 116 | assert_array_equal(X, expected) 117 | 118 | 119 | def assert_attributes(obj, **attrs): 120 | for attr, value in attrs.items(): 121 | assert getattr(obj, attr) == value 122 | -------------------------------------------------------------------------------- /tests/test_pipeline.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from sklearn_pandas.pipeline import TransformerPipeline, _call_fit 3 | 4 | # In py3, mock is included with the unittest standard library 5 | # In py2, it's a separate package 6 | try: 7 | from unittest.mock import patch 8 | except ImportError: 9 | from mock import patch 10 | 11 | 12 | class NoTransformT(object): 13 | """Transformer without transform method. 14 | """ 15 | def fit(self, x): 16 | return self 17 | 18 | 19 | class NoFitT(object): 20 | """Transformer without fit method. 21 | """ 22 | def transform(self, x): 23 | return self 24 | 25 | 26 | class Trans(object): 27 | """ 28 | Transformer with fit and transform methods 29 | """ 30 | def fit(self, x, y=None): 31 | return self 32 | 33 | def transform(self, x): 34 | return self 35 | 36 | 37 | def func_x_y(x, y, kwarg='kwarg'): 38 | """ 39 | Function with required x and y arguments 40 | """ 41 | return 42 | 43 | 44 | def func_x(x, kwarg='kwarg'): 45 | """ 46 | Function with required x argument 47 | """ 48 | return 49 | 50 | 51 | def func_raise_type_err(x, y, kwarg='kwarg'): 52 | """ 53 | Function with required x and y arguments, 54 | raises TypeError 55 | """ 56 | raise TypeError 57 | 58 | 59 | def test_all_steps_fit_transform(): 60 | """ 61 | All steps must implement fit and transform. Otherwise, raise TypeError. 
62 | """ 63 | with pytest.raises(TypeError): 64 | TransformerPipeline([('svc', NoTransformT())]) 65 | 66 | with pytest.raises(TypeError): 67 | TransformerPipeline([('svc', NoFitT())]) 68 | 69 | 70 | @patch.object(Trans, 'fit', side_effect=func_x_y) 71 | def test_called_with_x_and_y(mock_fit): 72 | """ 73 | Fit method with required X and y arguments is called with both and with 74 | any additional keywords 75 | """ 76 | _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg') 77 | mock_fit.assert_called_with('X', 'y', kwarg='kwarg') 78 | 79 | 80 | @patch.object(Trans, 'fit', side_effect=func_x) 81 | def test_called_with_x(mock_fit): 82 | """ 83 | Fit method with a required X arguments is called with it and with 84 | any additional keywords 85 | """ 86 | _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg') 87 | mock_fit.assert_called_with('X', kwarg='kwarg') 88 | 89 | _call_fit(Trans().fit, 'X', kwarg='kwarg') 90 | mock_fit.assert_called_with('X', kwarg='kwarg') 91 | 92 | 93 | @patch.object(Trans, 'fit', side_effect=func_raise_type_err) 94 | def test_raises_type_error(mock_fit): 95 | """ 96 | If a fit method with required X and y arguments raises a TypeError, it's 97 | re-raised (for a different reason) when it's called with one argument 98 | """ 99 | with pytest.raises(TypeError): 100 | _call_fit(Trans().fit, 'X', 'y', kwarg='kwarg') 101 | -------------------------------------------------------------------------------- /tests/test_transformers.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | import pytest 3 | import numpy as np 4 | from pandas import DataFrame 5 | import joblib 6 | 7 | from sklearn_pandas import DataFrameMapper 8 | from sklearn_pandas import NumericalTransformer 9 | 10 | 11 | @pytest.fixture 12 | def simple_dataset(): 13 | return DataFrame({ 14 | 'feat1': [1, 2, 1, 3, 1], 15 | 'feat2': [1, 2, 2, 2, 3], 16 | 'feat3': [1, 2, 3, 4, 5], 17 | }) 18 | 19 | 20 | def test_common_numerical_transformer(simple_dataset): 21 | """ 22 | Test log transformation 23 | """ 24 | transfomer = DataFrameMapper([ 25 | ('feat1', NumericalTransformer('log')) 26 | ], df_out=True) 27 | df = simple_dataset 28 | outDF = transfomer.fit_transform(df) 29 | assert list(outDF.columns) == ['feat1'] 30 | assert np.array_equal(df['feat1'].apply(np.log).values, outDF.feat1.values) 31 | 32 | 33 | def test_numerical_transformer_serialization(simple_dataset): 34 | """ 35 | Test if you can serialize transformer 36 | """ 37 | transfomer = DataFrameMapper([ 38 | ('feat1', NumericalTransformer('log')) 39 | ]) 40 | 41 | df = simple_dataset 42 | transfomer.fit(df) 43 | f = tempfile.NamedTemporaryFile(delete=True) 44 | joblib.dump(transfomer, f.name) 45 | transfomer2 = joblib.load(f.name) 46 | np.array_equal(transfomer.transform(df), transfomer2.transform(df)) 47 | f.close() 48 | --------------------------------------------------------------------------------