├── __init__.py
├── tests
    ├── __init__.py
    ├── test_image_transformers.py
    ├── test_dataframe_transformer.py
    ├── test_ts_fresh.py
    ├── test_bag_of_features.py
    ├── test_data_type.py
    └── test_transformer.py
├── xpandas
    ├── __init__.py
    ├── transformers
    │   ├── text_transformer
    │   │   ├── __init__.py
    │   │   └── text_transformer.py
    │   ├── image_transformer
    │   │   ├── __init__.py
    │   │   └── image_transformer.py
    │   ├── identity_transformer
    │   │   ├── __init__.py
    │   │   └── identity_transformer.py
    │   ├── pipeline_transformer
    │   │   ├── __init__.py
    │   │   └── pipeline_transformer.py
    │   ├── bag_of_features_transformer
    │   │   ├── __init__.py
    │   │   └── bag_of_features_transformer.py
    │   ├── series_transformers
    │   │   ├── __init__.py
    │   │   └── series_transformer.py
    │   ├── __init__.py
    │   └── transformer.py
    └── data_container
    │   ├── __init__.py
    │   └── data_container.py
├── docs
    ├── _static
    │   ├── .gitignore
    │   ├── Logo.png
    │   └── GitHub-Mark-32px.png
    ├── authors.rst
    ├── changes.rst
    ├── license.rst
    ├── docsapi.rst
    ├── contributing.rst
    ├── example.rst
    ├── index.rst
    ├── installation.rst
    ├── conf.py
    ├── introduction.rst
    └── Makefile
├── requirements-docs.txt
├── examples
    ├── imgs
    │   ├── Logo.png
    │   ├── XSeries.png
    │   ├── logo.sketch
    │   ├── Transformer.png
    │   └── XDataFrame.png
    ├── container_example.py
    ├── transformer_example.py
    └── ExampleUsage.ipynb
├── CHANGES.rst
├── requirements.txt
├── Pipfile
├── setup.py
├── AUTHORS.rst
├── .travis.yml
├── README.md
├── LICENSE.txt
├── .gitignore
├── CONTRIBUTING.md
├── CODE_OF_CONDUCT.md
└── github_deploy_key.enc


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/xpandas/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/_static/.gitignore:
--------------------------------------------------------------------------------
1 | # Empty directory
2 | 


--------------------------------------------------------------------------------
/xpandas/transformers/text_transformer/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/authors.rst:
--------------------------------------------------------------------------------
1 | .. _authors:
2 | .. include:: ../AUTHORS.rst
3 | 


--------------------------------------------------------------------------------
/docs/changes.rst:
--------------------------------------------------------------------------------
1 | .. _changes:
2 | .. include:: ../CHANGES.rst
3 | 


--------------------------------------------------------------------------------
/xpandas/transformers/text_transformer/text_transformer.py:
--------------------------------------------------------------------------------
1 | # TODO
2 | 


--------------------------------------------------------------------------------
/requirements-docs.txt:
--------------------------------------------------------------------------------
1 | ipython==7.0.1
2 | Sphinx==1.4.3
3 | nbsphinx==0.3.5


--------------------------------------------------------------------------------
/xpandas/data_container/__init__.py:
--------------------------------------------------------------------------------
1 | from .data_container import XDataFrame, XSeries
2 | 


--------------------------------------------------------------------------------
/xpandas/transformers/image_transformer/__init__.py:
--------------------------------------------------------------------------------
1 | from .image_transformer import ImageTransformer
2 | 


--------------------------------------------------------------------------------
/docs/_static/Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/docs/_static/Logo.png


--------------------------------------------------------------------------------
/xpandas/transformers/identity_transformer/__init__.py:
--------------------------------------------------------------------------------
1 | from .identity_transformer import IdentityTransformer


--------------------------------------------------------------------------------
/xpandas/transformers/pipeline_transformer/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_transformer import PipeLineChain
2 | 


--------------------------------------------------------------------------------
/examples/imgs/Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/Logo.png


--------------------------------------------------------------------------------
/docs/license.rst:
--------------------------------------------------------------------------------
1 | .. _license:
2 | 
3 | =======
4 | License
5 | =======
6 | 
7 | .. literalinclude:: ../LICENSE.txt
8 | 


--------------------------------------------------------------------------------
/examples/imgs/XSeries.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/XSeries.png


--------------------------------------------------------------------------------
/examples/imgs/logo.sketch:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/logo.sketch


--------------------------------------------------------------------------------
/examples/imgs/Transformer.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/Transformer.png


--------------------------------------------------------------------------------
/examples/imgs/XDataFrame.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/XDataFrame.png


--------------------------------------------------------------------------------
/xpandas/transformers/bag_of_features_transformer/__init__.py:
--------------------------------------------------------------------------------
1 | from .bag_of_features_transformer import BagOfWordsTransformer
2 | 


--------------------------------------------------------------------------------
/docs/_static/GitHub-Mark-32px.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/docs/_static/GitHub-Mark-32px.png


--------------------------------------------------------------------------------
/CHANGES.rst:
--------------------------------------------------------------------------------
1 | =========
2 | Changelog
3 | =========
4 | 
5 | Version 1.0b
6 | ============
7 | 
8 | - First public release (beta) of XPandas


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.15.2
2 | scikit-image==0.14.1
3 | scikit-learn==0.20.0
4 | scipy==1.1.0
5 | pytest==3.8.2
6 | pandas==0.23.4
7 | tsfresh==0.11.1
8 | 


--------------------------------------------------------------------------------
/docs/docsapi.rst:
--------------------------------------------------------------------------------
 1 | API Documentation
 2 | =================
 3 | 
 4 | Information on specific functions, classes, and methods.
 5 | 
 6 | .. toctree::
 7 |    :glob:
 8 | 
 9 |    api/*
10 | 


--------------------------------------------------------------------------------
/xpandas/transformers/series_transformers/__init__.py:
--------------------------------------------------------------------------------
1 | from .series_transformer import TimeSeriesWindowTransformer, TsFreshSeriesTransformer, \
2 |     TimeSeriesTransformer, MeanSeriesTransformer
3 | 


--------------------------------------------------------------------------------
/docs/contributing.rst:
--------------------------------------------------------------------------------
1 | Contributing
2 | ++++++++++++
3 | 
4 | We welcome any contribution to the XPandas project.
5 | 
6 | Please read our `contribution guide <https://github.com/alan-turing-institute/xpandas/blob/master/CONTRIBUTING.md>`_ on Github.


--------------------------------------------------------------------------------
/docs/example.rst:
--------------------------------------------------------------------------------
 1 | Example
 2 | =======
 3 | 
 4 | `Read as jupyter notebook on Github <https://github.com/alan-turing-institute/xpandas/blob/master/examples/ExampleUsage.ipynb>`_
 5 | 
 6 | .. toctree::
 7 |     :maxdepth: 4
 8 | 
 9 |     ExampleUsage
10 | 


--------------------------------------------------------------------------------
/xpandas/transformers/__init__.py:
--------------------------------------------------------------------------------
1 | from .pipeline_transformer import PipeLineChain
2 | from .series_transformers import TimeSeriesWindowTransformer, MeanSeriesTransformer, TimeSeriesTransformer, TsFreshSeriesTransformer
3 | from .transformer import XSeriesTransformer, XDataFrameTransformer
4 | from .identity_transformer import IdentityTransformer


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | 
 3 | url = "https://pypi.python.org/simple"
 4 | verify_ssl = true
 5 | name = "pypi"
 6 | 
 7 | 
 8 | [packages]
 9 | 
10 | numpy = "*"
11 | scipy = "*"
12 | pandas = "*"
13 | scikit-learn = "*"
14 | tsfresh = "*"
15 | pytest = "*"
16 | scikit-image = "*"
17 | Sphinx = "*"
18 | nbsphinx = "*"
19 | 
20 | 
21 | [dev-packages]
22 | 


--------------------------------------------------------------------------------
/xpandas/transformers/identity_transformer/identity_transformer.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | from ..transformer import XSeriesTransformer
 4 | 
 5 | 
 6 | class IdentityTransformer(XSeriesTransformer):
 7 |     '''
 8 |     Performs identity transformer X -> X
 9 |     '''
10 |     def __init__(self):
11 |         super(IdentityTransformer, self).__init__(transform_function=lambda x: x)
12 | 


--------------------------------------------------------------------------------
/examples/container_example.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from xpandas.data_container import XSeries, XDataFrame
 4 | 
 5 | n = 1000
 6 | 
 7 | xseries = XSeries([
 8 |         pd.Series(np.random.normal(size=500))
 9 |     ] * n)
10 | 
11 | xdataframe = XDataFrame({
12 |     'gender': XSeries(np.random.binomial(1, 0.7, n)),
13 |     'age': XSeries(np.random.poisson(25, n)),
14 |     'series': xseries
15 | })


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | 
# Distribution metadata handed to setuptools.
# packages=find_packages() lets setuptools discover the packages to ship instead
# of listing them by hand.
# NOTE(review): no install_requires is passed here, so the call itself declares
# no runtime dependencies -- confirm that this is intended.
setup(name='XPandas',
      version='1.0.2',
      description='1d/2d data container with map-reduce transformers',
      url='https://github.com/alan-turing-institute/xpandas',
      author='Vitaly Davydov (@iwitaly)',
      author_email='1061040@gmail.com',
      license='BSD',
      keywords='data container sklearn pandas map reduce transformer',
      packages=find_packages(),
      zip_safe=False)


--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
 1 | =================================
 2 | Welcome to XPandas documentation!
 3 | =================================
 4 | 
 5 | .. _xpandasgit: https://github.com/alan-turing-institute/xpandas
 6 | 
 7 | 
 8 | .. |git label image| image:: _static/GitHub-Mark-32px.png
 9 |     :target: xpandasgit_
10 | 
11 | Welcome to the documentation of **XPandas** -- data containers and transformations for storing 1D and 2D data of any type. The source code is available on |git label image|
12 | 
13 | 
14 | tl;dr
15 | #####
16 | 
17 | Install :code:`pip install xpandas` and check out the :ref:`Example`.
18 | 
19 | 
20 | Table of Contents
21 | #################
22 | 
23 | .. toctree::
24 |    :maxdepth: 2
25 | 
26 |    introduction
27 |    installation
28 |    example
29 |    docsapi
30 |    contributing
31 |    changes
32 |    authors
33 |    license


--------------------------------------------------------------------------------
/xpandas/transformers/pipeline_transformer/pipeline_transformer.py:
--------------------------------------------------------------------------------
 1 | from sklearn.pipeline import Pipeline
 2 | 
 3 | from ...data_container import XDataFrame, XSeries
 4 | 
 5 | 
 6 | class PipeLineChain(Pipeline):
 7 |     '''
 8 |     PipeLine transformer. Can chain multiple transformers and estimator from scikit-learn.
 9 |     Based on scikit-learn Pipeline
10 |     http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline
11 |     '''
12 | 
13 |     def transform(self, X, **kwargs):
14 |         transformed_object = super(PipeLineChain, self).transform(X, **kwargs)
15 |         if type(transformed_object) != XSeries and type(transformed_object) != XDataFrame:
16 |             transformed_object = XDataFrame(transformed_object)
17 |         return transformed_object
18 | 


--------------------------------------------------------------------------------
/AUTHORS.rst:
--------------------------------------------------------------------------------
 1 | ==========
 2 | Developers
 3 | ==========
 4 | 
 5 | **XPandas** is developed open source under the direction of `Dr Franz Király`_. Check out the `public code repository`_ to learn more.
 6 | 
 7 | **Active**
 8 | 
 9 | * Vitaly Davydov `iwitaly <https://github.com/iwitaly>`_: principal developer and curator
10 | * Franz Kiraly `fkiraly <https://github.com/fkiraly>`_: project manager and designated point of contact
11 | * Frithjof Gressmann `frthjf <https://github.com/frthjf>`_: contributor
12 | 
13 | **Former/inactive**
14 | 
15 | None
16 | 
17 | If you would like to contribute, read our `contribution guide <https://github.com/alan-turing-institute/xpandas/blob/master/CONTRIBUTING.md>`_.
18 | 
19 | .. _public code repository: https://github.com/alan-turing-institute/xpandas
20 | .. _Dr Franz Király: https://www.ucl.ac.uk/statistics/people/franz-kiraly


--------------------------------------------------------------------------------
/tests/test_image_transformers.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import skimage.transform as skimage_transform
 3 | 
 4 | from ..xpandas.data_container import XSeries
 5 | from ..xpandas.transformers.image_transformer import ImageTransformer
 6 | 
# Module-level constants. NOTE(review): neither generate_image nor
# test_image_transformation below reads them -- confirm whether they are still needed.
n = 20
m = 20
colours_n = 255
10 | 
11 | 
def generate_image(is_3d=True):
    '''
    Create a random 30x30 uint8 image.

    :param is_3d: when true the array gets a trailing axis of size 3
    :return: numpy array of dtype uint8 with shape (30, 30, 3) or (30, 30)
    '''
    dims = (30, 30, 3) if is_3d else (30, 30)
    noise = np.random.rand(*dims)
    return (noise * 255).astype('uint8')
16 | 
17 | 
def test_image_transformation():
    '''
    ImageTransformer must refuse a missing function and otherwise yield ndarray elements.
    '''
    s = XSeries([generate_image(False) for _ in range(100)])

    # Building a transformer without a skimage function has to fail. The failure for
    # the "did not raise" case lives in the else branch, so its AssertionError can
    # not be swallowed by the except clause.
    try:
        ImageTransformer().fit()
    except Exception:
        pass
    else:
        raise AssertionError('ImageTransformer() without a function should raise')

    image_transformer = ImageTransformer(skimage_transform.hough_circle, radius=5).fit()
    s_transformed = image_transformer.transform(s)

    assert s_transformed.data_type == np.ndarray

    image_transformer = ImageTransformer(skimage_transform.resize, output_shape=(10, 10)).fit()
    s_transformed = image_transformer.transform(s)

    assert s_transformed.data_type == np.ndarray
36 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: python
 2 | python:
 3 |     - 3.6
 4 | 
 5 | addons:
 6 |   apt:
 7 |     packages:
 8 |     - pandoc
 9 | 
10 | install:
11 |   - pip install -r requirements.txt
12 |   - pip install -r requirements-docs.txt
13 | 
14 | script:
15 |   - pytest
16 |   - set -e
17 |   - pip install -e .
18 |   - cd docs
19 |   - make buildapi
20 |   - make html
21 |   - pip install doctr
22 |   - cd ..
23 |   - doctr deploy . --built-docs docs/_build/html/
24 | 
25 | env:
26 |   global:
27 |     - secure: "VbInh8C4nFX7sDvyLPIs4YQnOOSORzPP85PsWqWQet5gx2AqWPmdkuYbSthXTKIl/V0HPWrwOMFyki9FpeC+DJrzyiftAyNoLmBO1Yha3y5NcwYovZ4u4kVsfORpQAm5pSCKKQxt17w6PgBRyZ4T2LnTF5k9Ig9AkxDllR01PItw4mLN2u8M8/hx4eKhyD4dTy596Dtd2AiQuoK7LhkLaMNUBx5Q26yLhw9muyLDBS55kHp4xXlB2ggZnr7S4FJp0WV+pzKPFVEs6l1kK1U+1OyB1CszjC/mLjYPzn/QoNtKXnSnRM1GxZp+/z7guldoEG81YSx2/F3xTHGJ28g3larhoL+YjbVq+FAce9g/0i7Ee6I2Gaa4Jwj2IcK02KguIPJp+Aj6wj6hgmvPaKYyaF7EPRiZMOKyKA3kpyXdOfJj2oUaPevHonPsxdkNXnB8UuUU4ulKPCjeo8L/+9O5gM9zDngFNFXbnJ2cVZtta0L7Uug00IwKS1FGunl7bGh9V3jP4OLrzKyykDYf/lRMB5YXPExlGi9+TWQ+MmXVbqVjR02YRkWAvPnD096z62eEidn8DFGXRFFndRJcBD3Z56S0RPkd7oIF0+sKujLMqXYzRV6NttuOuvnICd1nTsjQV3h9GcAL2cOfNGEE9MEaDnyvBVoEP3a0CDDcVChETIk="
28 | 


--------------------------------------------------------------------------------
/xpandas/transformers/image_transformer/image_transformer.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | from ..transformer import XSeriesTransformer
 4 | 
 5 | 
 6 | class ImageTransformer(XSeriesTransformer):
 7 |     '''
 8 |     Performs image transformation based on skimage transformation function
 9 |     http://scikit-image.org/docs/dev/api/skimage.transform.html
10 |     '''
11 |     def __init__(self, skimage_function=None, **function_params):
12 |         '''
13 |         :param skimage_function: transformation function from skimage
14 |         '''
15 |         accepted_types = [
16 |             list, np.ndarray, np.array
17 |         ]
18 | 
19 |         if skimage_function is None:
20 |             raise Exception('Please specify transform function from scikit-image'
21 |                             ' http://scikit-image.org/docs/dev/api/skimage.transform.html')
22 | 
23 |         def image_transform_function(img):
24 |             return skimage_function(img, **function_params)
25 | 
26 |         super(ImageTransformer, self).__init__(data_types=accepted_types,
27 |                                                columns=None,
28 |                                                transform_function=image_transform_function)
29 | 


--------------------------------------------------------------------------------
/docs/installation.rst:
--------------------------------------------------------------------------------
 1 | Installation
 2 | ************
 3 | 
 4 | Stable
 5 | ^^^^^^^^^^^^^
 6 | 
 7 | The installation of the latest stable version is easy using the python package manager `pip`_. ::
 8 | 
 9 |     pip install xpandas
10 | 
11 | That's it. You are now ready to go. We recommend reading the :doc:`examples <ExampleUsage>` to get started.
12 | 
13 | 
14 | Bleeding edge
15 | ^^^^^^^^^^^^^
16 | 
17 | To test or develop new features you may want to install the latest package version
18 | from the master branch (bleeding edge installation). You can install directly from Git repository ::
19 | 
20 |     pip install git+https://github.com/alan-turing-institute/xpandas.git
21 | 
22 | 
23 | Or clone the source from our `public code repository`_ on GitHub and change into the XPandas directory.
24 | Make sure that all dependencies are installed ::
25 | 
26 |     pip install -r requirements.txt
27 | 
28 | Then run ::
29 | 
30 |     python setup.py develop
31 | 
32 | to install the package into the activated Python environment.
33 | If you would like to contribute to documentation please refer to :ref:`Contributing`.
34 | 
35 | Note that bleeding edge installations are likely to contain bugs and are not recommended for production environments.
36 | 
37 | 
38 | .. _pip: http://www.pip-installer.org/
39 | .. _public code repository: https://github.com/alan-turing-institute/xpandas


--------------------------------------------------------------------------------
/examples/transformer_example.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | from xpandas.data_container import XSeries, XDataFrame
 4 | 
 5 | from xpandas.transformers import XDataFrameTransformer
 6 | from xpandas.transformers import PipeLineChain
 7 | from xpandas.transformers import XSeriesTransformer
 8 | from xpandas.transformers import TimeSeriesWindowTransformer
 9 | 
10 | from sklearn.linear_model import LogisticRegression
11 | from sklearn.decomposition import PCA
12 | 
13 | 
# Number of rows in the example containers.
n = 1000

xseries = XSeries([
    pd.Series(np.random.normal(size=500))
] * n)
# Element-wise transformer that calls .std() on every element it is given.
my_awesome_transfomer = XSeriesTransformer(transform_function=lambda x: x.std())
my_awesome_transfomer.fit(xseries)
print(my_awesome_transfomer.transform(xseries).head())


xdataframe = XDataFrame({
    'gender': XSeries(np.random.binomial(1, 0.7, n)),
    'age': XSeries(np.random.poisson(25, n)),
    'series': xseries
})
# Map column names to the transformer applied to that column.
df_transformer = XDataFrameTransformer({
    'series': TimeSeriesWindowTransformer(windows_size=4),
    'age': my_awesome_transfomer
})
df_transformer.fit(xdataframe)
transformed_df = df_transformer.transform(xdataframe)


# NOTE(review): the last step is a LogisticRegression, which presumably needs labels
# to fit and offers no transform -- confirm that this chain runs as intended.
chain = PipeLineChain([
    ('moving average trans', TimeSeriesWindowTransformer(windows_size=5)),
    ('extract features', my_awesome_transfomer),
    ('pca', PCA(n_components=5)),
    ('logit_regression', LogisticRegression())
])
chain.fit(xseries)
# Call get_params so the parameters are printed rather than the bound method.
print(chain.get_params())
transformed_X = chain.transform(xseries)
46 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ![Logo](/examples/imgs/Logo.png)
 2 | 
 3 | [![Build Status](https://travis-ci.org/alan-turing-institute/xpandas.svg?branch=master)](https://travis-ci.org/alan-turing-institute/xpandas)
 4 | [![PyPI version](https://badge.fury.io/py/XPandas.svg)](https://badge.fury.io/py/XPandas)
 5 | 
 6 | **XPandas** (extended [`Pandas`](https://pandas.pydata.org/)) implements 1D and 2D data containers for storing type-heterogeneous tabular data of any type, 
 7 | and encapsulates feature extraction and transformation modelling in an sklearn-compatible transformer interface.
 8 | 
 9 | ## Quickstart
10 | 
11 | Install the latest version
12 | 
13 |     $ pip install xpandas
14 |     
15 | and run the example jupyter notebook
16 |     
17 |     $ jupyter notebook examples/ExampleUsage.ipynb
18 | 
19 | ## Documentation
20 | 
21 | The full documentation is available at [https://alan-turing-institute.github.io/xpandas/](https://alan-turing-institute.github.io/xpandas/).
22 | 
23 | ## Acknowledgements
24 | 
25 | - **Bernd Bischl (@berndbischl)**, who mentioned the idea of a general data container with transformers attached to columns in personal discussion with Franz Kiraly during a London visit in 2016.
26 | - **Franz Kiraly (@fkiraly)**, who initiated and funded the project up to release, and who substantially contributed to the API design.
27 | - **Haoran Xue (@HaoranXue)**, who, under the supervision of Franz Kiraly, earlier completed a thesis for a degree at UCL on the topic, and who wrote a similar package as part of it. No code was re-used in the creation of the XPandas package.
28 | 
29 | 
30 | List of [developers and contributors](AUTHORS.rst)
31 | 
32 | 
33 | 


--------------------------------------------------------------------------------
/xpandas/transformers/bag_of_features_transformer/bag_of_features_transformer.py:
--------------------------------------------------------------------------------
 1 | from collections import Counter
 2 | 
 3 | import numpy as np
 4 | import pandas as pd
 5 | 
 6 | from ..transformer import XSeriesTransformer
 7 | 
 8 | 
 9 | class BagOfWordsTransformer(XSeriesTransformer):
10 |     '''
11 |     Performs bag-of-features transformer for strings of any categorical data.
12 |     '''
13 |     def __init__(self, dictionary=None, **kwargs):
14 |         '''
15 |         :param dictionary: custom dictionary to count against. if None, calculate dictionary from dataset
16 |         '''
17 |         self.dictionary = dictionary
18 | 
19 |         accepted_types = [
20 |             pd.Series, list, np.array, tuple
21 |         ]
22 | 
23 |         def bag_of_words_transform_function(corpus):
24 |             counter = Counter(corpus)
25 |             for el in self.dictionary:
26 |                 if counter.get(el) is None:
27 |                     counter[el] = 0
28 |             return counter
29 | 
30 |         super(BagOfWordsTransformer, self).__init__(data_types=accepted_types,
31 |                                                     columns=None,
32 |                                                     transform_function=bag_of_words_transform_function)
33 | 
34 |     def __calculate_dictionary(self, X):
35 |         dictionary = set()
36 |         for el in X:
37 |             dictionary = dictionary.union(el)
38 |         return dictionary
39 | 
40 |     def fit(self, X=None, y=None, **kwargs):
41 |         super(BagOfWordsTransformer, self).fit(X, y, **kwargs)
42 |         if self.dictionary is not None:
43 |             return self
44 |         self.dictionary = self.__calculate_dictionary(X)
45 |         return self
46 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017, shared by: 
 2 | The Alan Turing institute, UK; 
 3 | University College London, UK; 
 4 | Vitaly Davydov; 
 5 | Franz J Kiraly.
 6 | All rights reserved.
 7 | 
 8 | Redistribution and use in source and binary forms, with or without modification,
 9 | are permitted provided that the following conditions are met:
10 | 
11 | * Redistributions of source code must retain the above copyright notice, this
12 |   list of conditions and the following disclaimer.
13 | 
14 | * Redistributions in binary form must reproduce the above copyright notice, this
15 |   list of conditions and the following disclaimer in the documentation and/or
16 |   other materials provided with the distribution.
17 | 
18 | * Neither the name of the copyright holders nor the names of the project's
19 |   contributors may be used to endorse or promote products derived from
20 |   this software without specific prior written permission of all copyright holders.
21 | 
22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
23 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
24 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
25 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR
26 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
27 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
29 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


--------------------------------------------------------------------------------
/tests/test_dataframe_transformer.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
# Directory that contains this test file (symlinks resolved).
here = os.path.dirname(os.path.realpath(__file__))
# Append the parent of that directory to the module search path.
sys.path.append(os.path.join(here, ".."))
 6 | 
 7 | from ..xpandas.data_container import XDataFrame, XSeries
 8 | from ..xpandas.transformers import XSeriesTransformer, TimeSeriesTransformer, \
 9 |     TimeSeriesWindowTransformer, MeanSeriesTransformer, IdentityTransformer, \
10 |     XDataFrameTransformer, PipeLineChain
11 | import pandas as pd
12 | import numpy as np
13 | 
14 | 
def test_naming():
    '''
    Every output column name starts with the input column and transformer class name.
    '''
    series = pd.Series(np.random.normal(0, 1, 100), name='X')
    X = XSeries([series])
    df = XDataFrame({'X': X})

    column_transformers = {'X': [TimeSeriesTransformer()]}
    dataframe_transformer = XDataFrameTransformer(column_transformers)

    dataframe_transformer.fit(df)
    transformed_df = dataframe_transformer.transform(df)

    expected_prefix = 'X_TimeSeriesTransformer'
    for col_name in transformed_df.columns:
        assert col_name.startswith(expected_prefix)
32 | 
33 | 
def test_multiple_transformers_for_one_column():
    '''
    With several transformers on one column, each output column carries one of their names.
    '''
    series = pd.Series(np.random.normal(0, 1, 100), name='X')
    X = XSeries([series])
    df = XDataFrame({'X': X})

    transformers = [TimeSeriesTransformer(), IdentityTransformer(), MeanSeriesTransformer()]
    dataframe_transformer = XDataFrameTransformer({'X': transformers})

    dataframe_transformer.fit(df)
    transformed_df = dataframe_transformer.transform(df)

    # str.startswith accepts a tuple and is true if any of the prefixes matches.
    allowed_prefixes = (
        'X_TimeSeriesTransformer',
        'X_IdentityTransformer',
        'X_MeanSeriesTransformer',
    )
    for col_name in transformed_df.columns:
        assert col_name.startswith(allowed_prefixes)


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | /docs/api/*
  2 | 
  3 | # Created by .ignore support plugin (hsz.mobi)
  4 | ### Python template
  5 | # Byte-compiled / optimized / DLL files
  6 | __pycache__/
  7 | *.py[cod]
  8 | *$py.class
  9 | 
 10 | # C extensions
 11 | *.so
 12 | 
 13 | # Distribution / packaging
 14 | .Python
 15 | env/
 16 | build/
 17 | develop-eggs/
 18 | dist/
 19 | downloads/
 20 | eggs/
 21 | .eggs/
 22 | lib/
 23 | lib64/
 24 | parts/
 25 | sdist/
 26 | var/
 27 | wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .coverage
 46 | .coverage.*
 47 | .cache
 48 | nosetests.xml
 49 | coverage.xml
 50 | *,cover
 51 | .hypothesis/
 52 | 
 53 | # Translations
 54 | *.mo
 55 | *.pot
 56 | 
 57 | # Django stuff:
 58 | *.log
 59 | local_settings.py
 60 | 
 61 | # Flask stuff:
 62 | instance/
 63 | .webassets-cache
 64 | 
 65 | # Scrapy stuff:
 66 | .scrapy
 67 | 
 68 | # Sphinx documentation
 69 | docs/_build/
 70 | 
 71 | # PyBuilder
 72 | target/
 73 | 
 74 | # Jupyter Notebook
 75 | .ipynb_checkpoints
 76 | 
 77 | # pyenv
 78 | .python-version
 79 | 
 80 | # celery beat schedule file
 81 | celerybeat-schedule
 82 | 
 83 | # SageMath parsed files
 84 | *.sage.py
 85 | 
 86 | # dotenv
 87 | .env
 88 | 
 89 | # virtualenv
 90 | .venv
 91 | venv/
 92 | ENV/
 93 | 
 94 | # Spyder project settings
 95 | .spyderproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | .idea/
101 | *.csv
102 | ExampleUsage.rst
103 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | import alabaster
 3 | 
 4 | needs_sphinx = '1.4.3'
 5 | 
 6 | html_theme_path = [alabaster.get_path()]
 7 | 
 8 | sys.path.insert(0, os.path.abspath('../xpandas'))
 9 | 
10 | extensions = ['alabaster', 'sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo',
11 |               'sphinx.ext.autosummary', 'sphinx.ext.viewcode', 'sphinx.ext.coverage',
12 |               'sphinx.ext.doctest', 'sphinx.ext.ifconfig', 'sphinx.ext.pngmath',
13 |               'sphinx.ext.napoleon', 'nbsphinx', 'IPython.sphinxext.ipython_console_highlighting',
14 |               'sphinx.ext.autosectionlabel']
15 | 
16 | templates_path = ['_templates']
17 | 
18 | source_suffix = '.rst'
19 | 
20 | master_doc = 'index'
21 | 
22 | project = u'XPandas'
23 | copyright = u'2017, UCL'
24 | 
25 | version = ''  # Is set by calling `setup.py docs`
26 | release = ''  # Is set by calling `setup.py docs`
27 | 
28 | exclude_patterns = ['_build', '**.ipynb_checkpoints']
29 | 
30 | # pygments_style = 'sphinx'
31 | 
32 | html_theme = 'alabaster'
33 | 
34 | html_theme_options = {
35 |     'logo': 'Logo.png',
36 |     'github_user': 'alan-turing-institute',
37 |     'github_repo': 'xpandas',
38 |     'travis_button': True,
39 |     'analytics_id': 'UA-108477151-1'
40 | }
41 | 
42 | 
43 | try:
44 |     from xpandas import __version__ as version
45 | except ImportError:
46 |     pass
47 | else:
48 |     release = version
49 | 
50 | html_static_path = ['_static']
51 | 
52 | html_sidebars = {
53 |     '**': [
54 |         'about.html',
55 |         'navigation.html',
56 |         'searchbox.html',
57 |         'donate.html',
58 |     ]
59 | }
60 | 
61 | html_show_sourcelink = False
62 | 
63 | html_show_sphinx = False
64 | 
65 | htmlhelp_basename = 'XPandas-doc'
66 | 
67 | 
68 | python_version = '.'.join(map(str, sys.version_info[0:2]))
69 | intersphinx_mapping = {
70 |     'sphinx': ('http://sphinx.pocoo.org', None),
71 |     'python': ('http://docs.python.org/' + python_version, None),
72 |     'matplotlib': ('http://matplotlib.sourceforge.net', None),
73 |     'numpy': ('http://docs.scipy.org/doc/numpy', None),
74 |     'sklearn': ('http://scikit-learn.org/stable', None),
75 |     'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None),
76 |     'scipy': ('http://docs.scipy.org/doc/scipy/reference/', None),
77 | }
78 | 


--------------------------------------------------------------------------------
/tests/test_ts_fresh.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | 
 4 | from ..xpandas.data_container import XSeries, XDataFrame
 5 | from ..xpandas.transformers.pipeline_transformer import PipeLineChain
 6 | from ..xpandas.transformers.series_transformers import TsFreshSeriesTransformer, TimeSeriesWindowTransformer
 7 | from ..xpandas.transformers.transformer import XDataFrameTransformer
 8 | 
 9 | 
def test_ts_fresh_series():
    """TsFreshSeriesTransformer maps an XSeries of pandas.Series to an XDataFrame."""
    series = XSeries(
        [pd.Series(np.random.uniform(0, 100, 100)) for _ in range(5)],
        name='Y'
    )

    transformer = TsFreshSeriesTransformer()
    transformer.fit(series)
    result = transformer.transform(series)

    # Exact type check: the result must be the extended container itself.
    assert type(result) is XDataFrame
29 | 
30 | 
def test_ts_fresh_df():
    """Applying TsFreshSeriesTransformer per column keeps the result an XDataFrame."""
    def random_column(name):
        # Three random series of ten uniform samples each.
        return XSeries(
            [pd.Series(np.random.uniform(0, 100, 10)) for _ in range(3)],
            name=name
        )

    frame = XDataFrame({
        'X': random_column('X'),
        'Y': random_column('Y')
    })

    frame_transformer = XDataFrameTransformer(transformations={
        'X': TsFreshSeriesTransformer(),
        'Y': TsFreshSeriesTransformer()
    })
    frame_transformer.fit(frame)
    result = frame_transformer.transform(frame)

    assert type(result) is XDataFrame
57 | 
58 | 
def test_ts_fresh_chain():
    """A rolling-mean step followed by tsfresh extraction yields an XDataFrame."""
    samples = []
    for _ in range(10):
        samples.append(pd.Series(np.random.normal(0, 1, 20)))
    column = XSeries(samples, name='X')

    steps = [
        ('mean shift', TimeSeriesWindowTransformer()),
        ('ts fresh step', TsFreshSeriesTransformer())
    ]
    chain = PipeLineChain(steps)

    chain.fit(column)
    result = chain.transform(column)

    assert type(result) is XDataFrame
76 | 


--------------------------------------------------------------------------------
/tests/test_bag_of_features.py:
--------------------------------------------------------------------------------
 1 | import string
 2 | 
 3 | import numpy as np
 4 | from sklearn.datasets import fetch_20newsgroups
 5 | from sklearn.decomposition import PCA
 6 | 
 7 | from ..xpandas.data_container import XSeries, XDataFrame
 8 | from ..xpandas.transformers import XSeriesTransformer
 9 | from ..xpandas.transformers.bag_of_features_transformer import BagOfWordsTransformer
10 | from ..xpandas.transformers.pipeline_transformer import PipeLineChain
11 | 
12 | 
def test_bag_of_words_for_series():
    """Tokenised newsgroup posts are turned into an XDataFrame by BagOfWordsTransformer."""
    newsgroups = fetch_20newsgroups(shuffle=True, random_state=1,
                                    remove=('headers', 'footers', 'quotes'))

    posts = XSeries(newsgroups.data[:10])
    assert posts.data_type == str

    # Translation table that deletes every punctuation character.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    tokenizer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(strip_punctuation).strip().split()
    )
    tokens = tokenizer.fit_transform(posts)

    bag_of_words = BagOfWordsTransformer().fit_transform(tokens)

    assert type(bag_of_words) is XDataFrame
35 | 
36 | 
def test_bag_of_words_for_series_pipeline():
    """Run tokenising -> bag-of-words -> PCA through a single PipeLineChain."""
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    n = 100
    series = XSeries(dataset.data[:n])
    assert series.data_type == str

    # Translation table that deletes every punctuation character.
    translator = str.maketrans('', '', string.punctuation)

    pipeline = PipeLineChain([
        ('preprocessing', XSeriesTransformer(
            transform_function=lambda text: text.lower().translate(translator).strip().split()
        )),
        ('extractor', BagOfWordsTransformer()),
        ('pca', PCA(n_components=10)),
    ])

    pipeline = pipeline.fit(series)
    transformed_series = pipeline.transform(series)

    # The test used to end without checking anything about the output.
    # NOTE(review): assumes the chain returns a row-wise container with one
    # row per input document (as sklearn's Pipeline/PCA would) -- confirm.
    assert transformed_series is not None
    assert len(transformed_series) == n
66 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | ## How to contribute to XPandas
 2 | 
 3 | #### **Did you find a bug?**
 4 | 
 5 | * **Ensure the bug was not already reported** by searching on GitHub under [Issues](https://github.com/alan-turing-institute/xpandas/issues).
 6 | 
 7 | * If you're unable to find an open issue addressing the problem,
 8 | [open a new one](https://github.com/alan-turing-institute/xpandas/issues/new).
 9 | Be sure to include a **title and clear description**,
10 | as much relevant information as possible, and a **code sample** or an **executable test case**
11 | demonstrating the expected behavior that is not occurring.
12 | 
13 | * Please follow the further discussion in case more information is needed or questions arise.
14 | 
15 | #### **Did you write a patch that fixes a bug?**
16 | 
17 | * Open a new GitHub pull request with the patch.
18 | 
19 | * Ensure the PR description clearly describes the problem and solution. Include the relevant issue number if applicable.
20 | 
21 | #### **Do you intend to add a new feature or change an existing one?**
22 | 
23 | * Suggest your change in an issue and offer to implement the feature. 
24 | 
25 | * Wait for positive feedback in order to avoid double work (maybe your idea is already in development).
26 | 
27 | * Implement and send a PR
28 | 
29 | #### **Do you want to contribute to the xpandas documentation?**
30 | 
31 | * Understand how the documentation is built (see below)
32 | * Send a PR that proposes changes to the docs directory
33 | 
34 | We use a custom [Makefile](docs/Makefile) that configures an automatic generation of the `*.rst` API documentation of each entity (class or module) in the [docs/api](docs/api) directory:
35 | 
36 |     buildapi:
37 |         sphinx-apidoc -fMeT ../xpandas -o api
38 |         @echo "Auto-generation of API documentation finished. " \
39 |               "The generated files are in 'api/'"
40 | 
41 | Before building the documentation the entity files have therefore to be regenerated from the source using the `make buildapi` command. Then, `make html` creates the HTML documentation which includes a conversion of the [examples notebook](examples/ExampleUsage.ipynb) into *ExampleUsage.rst* before building.
42 | 
43 | **Deployment of the documentation**
44 | 
45 | This documentation is hosted on GitHub Pages instead of [ReadTheDocs](https://readthedocs.org/) to avoid adverts and keep all things together on GitHub.
46 | 
47 | As GitHub Pages does not support Sphinx we make use of the [Doctr](https://drdoctr.github.io/doctr/) package that automatically updates our docs
48 | on the GH Pages branch using Travis CI; the build process is triggered by commits to the master branch that pass the tests. Please take a look at the [.travis.yml](.travis.yml) file for more details.
49 |  
50 | XPandas is a team effort. We encourage you to pitch in and join us!
51 | 
52 | Thanks! :heart: :heart: :heart:
53 | 
54 | **Xpandas Team**
55 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
 6 | 
 7 | ## Our Standards
 8 | 
 9 | Examples of behavior that contributes to creating a positive environment include:
10 | 
11 | * Using welcoming and inclusive language
12 | * Being respectful of differing viewpoints and experiences
13 | * Gracefully accepting constructive criticism
14 | * Focusing on what is best for the community
15 | * Showing empathy towards other community members
16 | 
17 | Examples of unacceptable behavior by participants include:
18 | 
19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | * Trolling, insulting/derogatory comments, and personal or political attacks
21 | * Public or private harassment
22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | * Other conduct which could reasonably be considered inappropriate in a professional setting
24 | 
25 | ## Our Responsibilities
26 | 
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 | 
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 | 
31 | ## Scope
32 | 
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 | 
35 | ## Enforcement
36 | 
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at f.kiraly@ucl.ac.uk. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 | 
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 | 
41 | ## Attribution
42 | 
43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
44 | 
45 | [homepage]: http://contributor-covenant.org
46 | [version]: http://contributor-covenant.org/version/1/4/
47 | 


--------------------------------------------------------------------------------
/docs/introduction.rst:
--------------------------------------------------------------------------------
 1 | Introduction
 2 | ************
 3 | 
 4 | XPandas (extended `Pandas <https://pandas.pydata.org/>`_) implements 1D and 2D data containers for storing type-heterogeneous tabular data of any type, and encapsulates feature extraction and transformation modelling in an sklearn-compatible transformer interface.
 5 | 
 6 | Description
 7 | +++++++++++
 8 | 
 9 | **XPandas** provides universal 1D typed list (`XSeries`) and 2D type-heterogeneous data-frame (`XDataFrame`) containers and provides extended sklearn-like transformer classes interfacing with said containers. Transformers can be used for automated feature extraction and map-reduce style transformations but are not limited to these.
10 | 
11 | `XSeries` is based on `pandas.Series` that can store objects of any type. Example would be a series of image containers, or a series of `pandas.Series` objects stored as `XSeries`. `XSeries` can be visualised according to a schema.
12 | 
13 | 
14 | .. image:: ../examples/imgs/XSeries.png
15 | 
16 | `XDataFrame` extends `pandas.DataFrame` by allowing arbitrary object types per column. It provides the same convenient sub-setting interface and extended abstract access methods. Each column is internally stored as an `XSeries` container, all of same length.
17 | 
18 | One example could be a medical data set where each row is a different patient, say, in a hospital. The columns would correspond to a type-heterogeneous set of features like numbers (age, height, weight, etc.), categorical (gender, hair color, etc.), images (x-ray pictures), time series (heartbeat, lab history), and other parts of a medical record.
19 | 
20 | With `XDataFrame` one can store all this information in a single 2D data container instead of a tedious collection of custom nested lists or arrays.
21 | 
22 | .. image:: ../examples/imgs/XDataFrame.png
23 | 
24 | Another advantage of XPandas is the clean interface it provides to ready-to-go machine learning algorithms in `scikit-learn <http://scikit-learn.org>`_. The transformers interface can be used to easily convert the types in a `XDataFrame` to the primitive types with which sklearn can interface, as part of a modelling pipeline. In the example with patients data, one may want to extract summary features from each `pandas.Series`, or extract features from each image, say in a deep learning model.
25 | 
26 | More technically, the implemented `XSeriesTransformer` class allows for the implementation of transformation defaults for `XSeries`; similarly `XDataFrameTransformer` implements a transformation for `XDataFrame` type objects. From a mathematical point of view `XSeriesTransformer` encapsulates abstract functions of the type `XSeries -> XSeries or XDataFrame` whereas
27 | `XDataFrameTransformer` represents mappings from `XDataFrame -> XDataFrame`. Each of the transformers follows the familiar fit/transform/parameters API of sklearn.
28 | 
29 | .. image:: ../examples/imgs/Transformer.png
30 | 
31 | Data types
32 | ++++++++++
33 | 
34 | Notably, XPandas comes with several pre-implemented transformers for the most common non-primitive data types.
35 | 
36 | Time series
37 | 
38 | 
39 | -  ``TimeSeriesTransformer(features)`` — extract ``features`` from each
40 |    series. ``features`` is a subset of [ 'mean', 'std', 'max', 'min',
41 |    'median', 'quantile\_25', 'quantile\_75', 'quantile\_90',
42 |    'quantile\_95']
43 | 
44 | -  ``TimeSeriesWindowTransformer(windows_size)`` — calculate rolling
45 |    mean with given ``windows_size``
46 | -  ``TsFreshSeriesTransformer`` — extract features using the
47 |    `tsfresh <https://tsfresh.readthedocs.io>`__ package
48 | 
49 | Image
50 | 
51 | 
52 | -  ``ImageTransformer`` — Performs image transformation based on skimage
53 |    `transformation
54 |    function <http://scikit-image.org/docs/dev/api/skimage.transform.html>`__
55 | 
56 | Categorical data
57 | 
58 | 
59 | -  ``BagOfWordsTransformer(dictionary)`` — Performs bag-of-features
60 |    transformer for strings of any categorical data
61 | 
62 | XPandas also allows for pipelining, via the ``PipeLineChain``
63 | transformer, which can chain multiple transformers and ``scikit-learn``
64 | predictor into a single pipeline. ``PipeLineChain`` is based on the
65 | ``scikit-learn``
66 | `Pipeline <http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline>`__.
67 | 


--------------------------------------------------------------------------------
/github_deploy_key.enc:
--------------------------------------------------------------------------------
1 | gAAAAABZ6lhCyRariN85qLf0ABv3ReKJYlFI7dWpM4yfrbUQ-sTpD0YzjL61mAhOZGx606cINu-RetLgTUcs6m1J75AwiOQl2HuXMvpinOJ5bIqH5xdyFQqC_5HiHGwyn0M4PIFdiAPsT9V7mJf39_7wGnSHHRTB6tJbfIoIv5iV_rrC__T-UpkA2s5RCl1J8KmF3CFU7OvPvtpnSlL8JJfTs3yCRwDjI5Y7gs2jwkkAvl75Ry2uRzEOLQq6pY8GZmZJx-eqi6gFzEhZOzWwoSAgoRov-Bp1ZIwfAKAMLoAZfuxQiBeDz1mwqiTYY4lEoG_yng2lrUXx1o7ktz_hujoj39ZR2NDahOvBdsauJB-g_4iRHazl1IhY3vxrh9V7gQYK-fLZot967rJ5FIno4GyQIVgchzuvNVrmamAHkJuyc54PmdDk52E88cLVcKblyfTd3j4xWjDRai5md7jAUuSwF5B6LvdSnLCUgOPIrgNCHfehtsGLDPCUEbuAeEIOLPuirFJn_GTMjO5xu77YIIekesWHebG8OuMCm_cUFIqWavLWjCO3RLXKH6_fyxaoynwCwY4bCTbBjkSvTg8M_V8g5rWHjTL8qzu4eoQZ8EDqycxvr5ZTw-Lwdp-czT6uOatH88t_ucZhffHT0FEpMOQPWddPxCQV1ewetXQk-x5pZphwLl4dJq7cor7AUE-SxDCiALPv_Tdi0yfxC-qMRawcBLADPPAgiVal-vd0arplCFnccoFrgPR-t4DT53n8yCRxzXH-HrlkeZAH29pYcsCKiKgR5qOdHN5_QIAG5T4ovVTf-AKtOZFIsnoug-e08k4afYWcNXx3NE2ourDfJYHIMd34w1FfZG9Rf5xDFfJlfSPSubCWtkfB4UPqtVFwWWBv5DHCfkbNnevib3rwwDENKO0cvB85qvOBSGHRVbUDMf6LZ6qMr-dWFNb2iGm_faQ9NPdHRa5_fQ9DfT5TNmIcbBPj0oPHwQvYFDBosnNaQZpjwphVjhN1j-EIr2l4n1MnoPAbF9rW6L9oYFHOindytsGS39LumUK_ON3n7ZoN_dqUFFQH5qOcQn3JLzjUVCxd_9HsFtz1inw1XzZ-SrLZl2bhp1uztNZumma5utqzr1UYhElRkQdxKWFQwhzwe9LQ22F_26x0THTXrmI7-8tY1eh6K9mj5ogYCX_fYgj8MwK6bgXlun8yjph1kAnYOqDWvw-PaQGWRojEUjyq6Fv5tetawrCApiVRTD5LOKGFxQCQ4lIThs5DiqcLTkVUrfefPfjahqcXtsvCd5FYF4u-bpJVwt2nZj0Ux_U5x20PC9dp1l-FhgkoLh6SrPEaGZQUKiOfRhkU2ia9HbcRUODlpIPc5VsqILYF5K9V-mmyaablu0_51KZcWS5-tcldYznHMq7U-eYcun7HgutFNMQ_SUWZMMHSyXA_0epQJyieMjggqUbRCm4FmEo-TVNgj_Yv8eG9gPEcy86W_K8k60CG9lX02AVKqdfUUpZTXSjM5iokIqM2PTmuGtZGW0dVE2WwuCpASvEivb9pN2jn_Zydak6JlL84Dcd3gS_AQ-Fw3OjrNVnXxk_iRL7Pho01EH7bTCiCuzhevbnX2gomH6drDZtm8X8OuQW0SeUixlPW2535QQL_H8ZCQs85Gm5Mv1R73UD8iwIr31PNIoYsIDwLSvGtlM70KrJW1bP9nYv_ye-AmJPXO2WAKF9fxV5K_HC3Fbs323KTVrlougq93Go_Mt867nhu59JxPm2cnp_TNYMvH42enTbKixNZxjCQm9quwydLM88NEeFVhL9gSO9B8YZjVxnLv7Yos4SkNIOEGt9mfnBtvm6UMv-Rd8mGcgAbzrOSMDNHnqGJpjbWlSeXOyGPSDx6Fzq0OONDCZaQbznCtTUGDZkKaFn2699ttY2cqo5BqtwGoGKWynrlkiN-XuJNgoT1VJ5kp07jRK2sxjiaqe0n_IjCtOwKq25G4ee-ZuVEvKys
vvqqClfcbrNyN9cw-eIhCtZdyG-6ZkgZqwBwmSN0MjDJH6Oj9Chs40sz4GvXDIRJLk99ggr2S4NvTneqdoYvxCCH1J1r2XEfUr-e5OeP0fzTpeTzwXF_re41Od0Je8-A2Ug7lF2LPGPRhH4Ee-cBN0bAi5MC2SPzLAAqRLy7mZHNQmyWNaniShBV-Z3WTcfS07Mt5eMwT1AOADVAHrCJwYTIOi7l1UpAyT2FWNIq0z3_Ho0i9xwZHEG74IsXHeyHAnWWTffe836mDkcxlnRDobR4ZHsbDFmmxAg0Drni8S__OWs5mOrgq0DV2962AOqDqgbOmSRYIPluxkVNcrE46Y6rTHi0XLXpRYCUBiFhIzrk9N9l8z3QZPxHgqrF6S9xDbCCz04jRIZX-HUYNPYR8N754ah9lMU4kbYGrY76EfbmXj_Ek5rsXSvLssS_VrYMe8iXIBye9zu46enie0qYC6G0J9K9rdppJ7iFWrGGhbt0-VzSPqSNQ8ajgrrxeK1oH5kB0R1YDQGZDLr0ABgWbwwnsFcBwfFX9C13yxCsqJLfn6xqMGYfx44jOavI-JWlRIDXWAToK6Y2cDOAcuISi8eRcbzdh1JRFGU3z0bch-6-mCe48K5TnsMpAcOFFBQJ7PnkouzZbpVbtDJBKtC9DnfVDDyfa38jO28qDN2Ne3mOJ-H3fdae8wT5mKgSOeUcn028r9CJFb8wt9R5dbWgB32H3_8clqggkHQkgUmdQiTAehEUc_TiFee20s-W3Ux_ljKjCqpZOzjgnLTfhP7VwRQs9U9lBg7PA-7C-5jHdI5yhs6rPgTihPD14bl2cmflq3ogU8g0ZZApEyN47OdsOgAwsdnoe6m0MMYyXCgfVwDdmP0bwkHOuhjHaOtZt0XevWbKtsHE83fsrCi3KiIxSo8Ii5qH7C1Ud2eE6VSkog6LVTL8rc2W2yy9avcEtW9ihs3zHavQN5KbP8tm8dP8_BgkutEq5dqxsPgu3sBJa11y4F6kk794196cDdWkesFwx-C7rThB91KKK_69fo90UUonNfO4oopioOLKqkjCoaLrJdSfJSyhE4M_7lLXrNYs70ndMM3WtyRQX2_ueiTBLa5BQdRfjdzTsX29jxRSjA-7I2sGZjXncsqyCsAASeveHk0Bv6tDVBpPzAf6pHF--uBpbmN7RDyb1FnxBt4bZnG-pTSknMIVxuCrUqUwlgjFIXL5WLy8johIxLLeE3KxvOQl1b1xX7VsRA7ViRgOetY5-_4Cmzey3EbLE23IFRjZdoUZ7-EajTXQFOXZChZh_Mx7yVlrfK4uOjf_uMizM7kqZFgJ5HCk0szXHlv9k8C64-AJw-zTpaKAWmAUXtMbbdigMqYyQJzdS2l-3i72WOqbfFBlnNbkDvJmYXWvIPEaqMb7LyuI6k6risdyasF2wpHhFIXVNcgwv6vNWJop6y3Mmt-IsHZHoz6IoU_0mxi1hyGziAKcSnm9TiPWBwSyV10oDSWWT0aamm9XiszrFVcyZECwSzIU_bKCH-oqysA5YQKWKe9RKLK2EnwDZ33qJf7h6PspfcUYCCeQQBlIgMkEONm4z74IqZ4Gbm4Pbh-DeTVYd7kqPk-aR_VpJM6vLtyM-s2_J3wuKy36YMMKGlAFLvuQTEChj1h14nHpkOSTUrTbqzZUnqY9BKRuN--usMZoCPcO-lnBB7cIKmsonV9ePYNnHE6wCkqg_f7mInVi7qA-GCMTxqz9kdYGLcISJf6XeCyXvfYMV1yqqOyxv9ykp41pXe6Jf-IdFzlJRQ5Ffrzhh7hblydf1PufzAybvR4ILe0B8vJEkEggGmtpz4VtzHwp37h9w83CIlQ92jwlsoylVzEMS9ZV3GRhADdIb_KZeFuFsox-AK1_GBhszco_5Wpjc2KpR-njOxLioCbn7wQibjHuKPQV1UgKeb4ikyZHO6iEzmd6hqn5xjGhCM7eXdI30j4YRPyxP6ez15b1llKgxXI9Sz8hkxGO
hBDU6sPQZj_4Cd2fcZUSWlfZful0Mbe07pajtfWd__79P_UG_cv7MEevQGJQreGaywRq6XpXdIDoH9g94T_NMFv2rSIuRvN_45mAwTEjQqAmf9mhzdDAXFe9xXE_FKCXzT3GHLp2JNTU8m9OtDuBV3WRfVn6nGOVGIE4vUdPKp2EaK3r34PB388wjABOOFjTOMsTLqjYYJ9bTlD86ZJR2z1M-f5ZZwwuQ34zqPVoGNIHl5SwyYwl_iTGDOCH9pSIxhUKPr__5ZyXOasI8KzReAKf0cBUqmfAlAwjNL_Fql6k8ZF27mHy_4_Q6SQi8s7jM5KFoFMzC7c1EFGKnxaaB6mpKGRx9mq0BPDM08DJ0kN5SOTZ6ayTnOltWRCg_vxp-gA1hKJwDoiazHiATo5IgdXF9yA=


--------------------------------------------------------------------------------
/xpandas/transformers/series_transformers/series_transformer.py:
--------------------------------------------------------------------------------
  1 | from functools import partial
  2 | 
  3 | import pandas as pd
  4 | from tsfresh.feature_extraction.extraction import _do_extraction_on_chunk
  5 | from tsfresh.feature_extraction.settings import ComprehensiveFCParameters
  6 | 
  7 | from ..transformer import XSeriesTransformer
  8 | 
  9 | 
 10 | class TimeSeriesTransformer(XSeriesTransformer):
 11 |     '''
 12 |     Extract common features 'mean', 'std', 'max', 'min',
 13 |         'median', 'quantile_25', 'quantile_75',
 14 |         'quantile_90', 'quantile_95' from pandas.Series.
 15 |     Transform XSeries to XDataFrame.
 16 |     '''
 17 |     FEATURES = [
 18 |         'mean', 'std', 'max', 'min',
 19 |         'median', 'quantile_25', 'quantile_75',
 20 |         'quantile_90', 'quantile_95'
 21 |     ]
 22 | 
 23 |     def __init__(self, features=None, **kwargs):
 24 |         '''
 25 |         :param features: list of features from FEATURES property
 26 |         '''
 27 |         accepted_types = [
 28 |             pd.Series
 29 |         ]
 30 | 
 31 |         if features is None:
 32 |             features = self.FEATURES
 33 |         else:
 34 |             for f in features:
 35 |                 if f not in self.FEATURES:
 36 |                     raise ValueError('Unrecognized feature {}. Available features {}'.format(f, self.FEATURES))
 37 | 
 38 |         def series_transform(series):
 39 |             transformed_series = {}
 40 | 
 41 |             for f in features:
 42 |                 if f.startswith('quantile_'):
 43 |                     quant_rate = int(f.split('_')[1]) / 100.
 44 |                     transformed_series[f] = series.quantile(quant_rate)
 45 |                 else:
 46 |                     method_to_call = getattr(series, f)
 47 |                     result = method_to_call()
 48 |                     transformed_series[f] = result
 49 | 
 50 |             return transformed_series
 51 | 
 52 |         super(TimeSeriesTransformer, self).__init__(data_types=accepted_types,
 53 |                                                     transform_function=series_transform)
 54 | 
 55 | 
 56 | class TimeSeriesWindowTransformer(XSeriesTransformer):
 57 |     '''
 58 |     Calculate rolling mean over XSeries of pandas.Series.
 59 |     '''
 60 |     def __init__(self, windows_size=3, **kwargs):
 61 |         '''
 62 |         :param windows_size: size of window for rolling mean
 63 |         '''
 64 |         accepted_types = [
 65 |             pd.Series
 66 |         ]
 67 | 
 68 |         self.windows_size = windows_size
 69 | 
 70 |         def series_transform(series, **params):
 71 |             return series.rolling(window=self.windows_size).mean().dropna()
 72 | 
 73 |         super(TimeSeriesWindowTransformer, self).__init__(data_types=accepted_types,
 74 |                                                           transform_function=series_transform)
 75 | 
 76 | 
 77 | class MeanSeriesTransformer(XSeriesTransformer):
 78 |     '''
 79 |     Example transformer
 80 |     '''
 81 |     def __init__(self, **kwargs):
 82 |         self.total_mean = None
 83 | 
 84 |         def mean_minus_mean_function(s, total_mean=None):
 85 |             if total_mean is None:
 86 |                 total_mean = self.total_mean
 87 |             return s.mean() - total_mean
 88 | 
 89 |         accepted_types = [
 90 |             pd.Series
 91 |         ]
 92 | 
 93 |         super(MeanSeriesTransformer, self).__init__(data_types=accepted_types,
 94 |                                                     transform_function=mean_minus_mean_function)
 95 | 
 96 |     def fit(self, X, y=None, **kwargs):
 97 |         super(MeanSeriesTransformer, self).fit(X, **kwargs)
 98 |         sum_and_size = X.apply(lambda s: (s.sum(), len(s)))
 99 |         sum_total = sum([x[0] for x in sum_and_size])
100 |         total_size = sum([x[1] for x in sum_and_size])
101 |         self.total_mean = sum_total / total_size
102 | 
103 |         return self
104 | 
105 | 
class TsFreshSeriesTransformer(XSeriesTransformer):
    '''
    Extract features from every pandas.Series of an XSeries with the
    tsfresh package (http://tsfresh.readthedocs.io/en/latest/).
    '''
    def __init__(self, **kwargs):
        '''
        :param kwargs: accepted but not used by this constructor
        '''
        # Pre-bind the tsfresh settings: the comprehensive feature set, with
        # no per-kind overrides.
        extract_chunk = partial(_do_extraction_on_chunk,
                                default_fc_parameters=ComprehensiveFCParameters(),
                                kind_to_fc_parameters=None)

        def series_transform(series):
            # Unnamed series fall back to the name stored by transform().
            kind = self.name if series.name is None else series.name

            # NOTE(review): the chunk is presumably (sample id, kind, data) --
            # confirm against the tsfresh version in use.
            rows = extract_chunk((1, kind, series))
            return dict((row['variable'], row['value']) for row in rows)

        super(TsFreshSeriesTransformer, self).__init__(data_types=[pd.Series],
                                                       columns=None,
                                                       transform_function=series_transform)

    def transform(self, X):
        '''
        Remember the name of X, then delegate to the parent transform.

        :param X: XSeries of pandas.Series
        :return: whatever the parent transform returns
        '''
        self.name = X.name
        return super(TsFreshSeriesTransformer, self).transform(X)
143 | 


--------------------------------------------------------------------------------
/tests/test_data_type.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | 
  4 | here = os.path.dirname(os.path.realpath(__file__))
  5 | sys.path.append(os.path.join(here, ".."))
  6 | 
  7 | from ..xpandas.data_container import XSeries, XDataFrame
  8 | import pandas as pd
  9 | import numpy as np
 10 | 
 11 | 
 12 | def test_series_type_series():
 13 |     s = XSeries([
 14 |         pd.Series([1, 2, 3], index=['a', 'b', 'c']),
 15 |         pd.Series([4, 5, 6], index=['d', 'e', 'g'])
 16 |     ])
 17 | 
 18 |     assert s.data_type == pd.Series
 19 | 
 20 | 
 21 | def test_series_type_primiteves():
 22 |     s1 = XSeries([
 23 |         1, 2, 3
 24 |     ])
 25 | 
 26 |     assert s1.data_type == np.int64
 27 | 
 28 |     s2 = XSeries([
 29 |         'a', 'b', 'c'
 30 |     ])
 31 | 
 32 |     assert s2.data_type == str
 33 | 
 34 | 
 35 | def test_series_different_data_type_exception():
 36 |     try:
 37 |         s1 = XSeries([
 38 |             pd.Series([1, 2, 3], index=['a', 'b', 'c']),
 39 |             pd.DataFrame({})
 40 |         ])
 41 | 
 42 |         s2 = XSeries([
 43 |             1, 2, 'abs'
 44 |         ])
 45 |     except ValueError:
 46 |         assert True
 47 |         return
 48 | 
 49 |     assert False
 50 | 
 51 | 
 52 | def test_series_type_data_frame():
 53 |     s = XSeries([
 54 |         pd.DataFrame({
 55 |             'a': [1, 2, 3],
 56 |             'b': [4, 5, 6]
 57 |         }),
 58 |         pd.DataFrame({
 59 |             'c': [7, 8, 9],
 60 |             'd': [10, 11, 12]
 61 |         })
 62 |     ])
 63 | 
 64 |     assert s.data_type == pd.DataFrame
 65 | 
 66 | 
 67 | def test_series_slise_type():
 68 |     s = XSeries([
 69 |         pd.Series([1, 2, 3], index=['a', 'b', 'c']),
 70 |         pd.Series([4, 5, 6], index=['d', 'e', 'g']),
 71 |         pd.Series([7, 8, 9])
 72 |     ])
 73 | 
 74 |     sub_s = s[:2]
 75 | 
 76 |     assert sub_s.data_type == pd.Series
 77 | 
 78 | 
 79 | def test_series_custom_class_type():
 80 |     class MyClass(object):
 81 |         a = 1
 82 |         b = 2
 83 | 
 84 |         def __init__(self, a, b):
 85 |             self.a = a
 86 |             self.b = b
 87 | 
 88 |     class MySubClass(MyClass):
 89 |         pass
 90 | 
 91 |     s = XSeries([
 92 |         MyClass(1, 2),
 93 |         MyClass(3, 4),
 94 |         MyClass(5, 6)
 95 |     ])
 96 | 
 97 |     assert s.data_type == MyClass
 98 | 
 99 |     sub_s = XSeries([
100 |         MySubClass(1, 2),
101 |         MySubClass(3, 4),
102 |         MySubClass(5, 6)
103 |     ])
104 | 
105 |     assert sub_s.data_type == MySubClass
106 | 
107 | 
def test_dataframe_data_types():
    '''Columns of an XDataFrame keep their data_type; list selection yields XDataFrame.'''
    series_column = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                             pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    int_column = XSeries([1, 2, 3])
    dict_column = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    str_column = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': series_column,
        'second_col': int_column,
        'third_col': dict_column,
        'fourth_col': str_column,
    })

    expected_types = (
        ('first_col', pd.Series),
        ('second_col', np.int64),
        ('third_col', dict),
        ('fourth_col', str),
    )
    for column_name, expected_type in expected_types:
        assert df[column_name].data_type == expected_type

    for selection in (['first_col'], ['first_col', 'second_col']):
        assert type(df[selection]) == XDataFrame
129 | 
130 | 
def test_dataframe_sub_frame_data_types():
    '''A ``.loc`` slice of an XDataFrame is an XDataFrame whose columns keep their data_type.'''
    series_column = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                             pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    int_column = XSeries([1, 2, 3])
    dict_column = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    str_column = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': series_column,
        'second_col': int_column,
        'third_col': dict_column,
        'fourth_col': str_column,
    })

    sub_df = df.loc[:2]
    assert type(sub_df) == XDataFrame

    expected_types = (
        ('first_col', pd.Series),
        ('second_col', np.int64),
        ('third_col', dict),
        ('fourth_col', str),
    )
    for column_name, expected_type in expected_types:
        assert sub_df[column_name].data_type == expected_type

    for selection in (['first_col'], ['first_col', 'second_col']):
        assert type(sub_df[selection]) == XDataFrame
155 | 
156 | 
def test_series_map_transformer():
    '''map yields pd.Series data_type for series results and np.float64 for means.'''
    s = XSeries([
        pd.Series([1, 2, 3], index=['a', 'b', 'c']),
        pd.Series([4, 5, 6], index=['d', 'e', 'g']),
    ])

    def add_one(series):
        return series + 1

    incremented = s.map(add_one)
    assert incremented.data_type == pd.Series
    assert incremented[0].equals(pd.Series([2, 3, 4], index=['a', 'b', 'c']))

    def mean_of(series):
        return series.mean()

    averaged = s.map(mean_of)
    assert averaged.data_type == np.float64
171 | 
172 | 
def test_series_extract_features_with_apply_func():
    '''apply with a function returning a dict produces an XDataFrame.'''
    named_series = XSeries(
        [
            pd.Series([1, 2, 3], index=['a', 'b', 'c']),
            pd.Series([4, 5, 6], index=['d', 'e', 'g']),
        ],
        name='MySuperSeries',
    )

    def describe(series):
        return {'mean': series.mean(), 'std': series.std()}

    features = named_series.apply(describe)
    assert type(features) == XDataFrame
182 | 
183 | 
def test_series_replace_element():
    '''
    An element of an XSeries of pandas.Series may only be replaced by
    another pandas.Series.
    '''
    s = XSeries([
        pd.Series([1, 2, 3], index=['a', 'b', 'c']),
        pd.Series([4, 5, 6], index=['d', 'e', 'g'])
    ], name='MySuperSeries')

    # Assigning an int must be rejected.  The failure is raised from the
    # ``else`` branch so that the except clause cannot swallow it.
    try:
        s[0] = 111
    except Exception:
        pass
    else:
        raise AssertionError('assigning an int to an XSeries of pd.Series must fail')

    # Assigning a pandas.Series must succeed; any exception fails the test.
    s[0] = pd.Series(np.random.normal(size=100))
201 | 
202 | 
def test_series_to_pandas_series():
    '''to_pandas_series turns an XSeries of strings into a plain pandas.Series.'''
    # NOTE(review): this series of pandas.Series is only constructed, never
    # converted -- confirm whether it was meant to be checked as well.
    XSeries(
        [
            pd.Series([1, 2, 3], index=['a', 'b', 'c']),
            pd.Series([4, 5, 6], index=['d', 'e', 'g']),
        ],
        name='MySuperSeries',
    )

    strings = XSeries(['a', 'b', 'c'], name='MySuperSeries')
    converted = strings.to_pandas_series()

    assert type(converted) == pd.Series
212 | 
213 | 
def test_dataframe_to_pandas_dataframe():
    '''
    to_pandas_dataframe must fail for a frame that holds pandas.Series and
    dict columns, and must return a plain pandas.DataFrame for a frame that
    holds only integer columns.
    '''
    s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                  pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    s2 = XSeries([1, 2, 3])
    s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    s4 = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
        'third_col': s3,
        'fourth_col': s4
    })

    # The failure is raised from the ``else`` branch so that the except
    # clause cannot swallow it.
    try:
        df.to_pandas_dataframe()
    except Exception:
        pass
    else:
        raise AssertionError('conversion of non-primitive columns must fail')

    s1 = XSeries([4, 5, 6])
    s2 = XSeries([1, 2, 3])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
    })

    # Any exception raised here fails the test on its own.
    converted = df.to_pandas_dataframe()

    assert type(converted) == pd.DataFrame
249 | 


--------------------------------------------------------------------------------
/tests/test_transformer.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | 
  4 | here = os.path.dirname(os.path.realpath(__file__))
  5 | sys.path.append(os.path.join(here, ".."))
  6 | 
  7 | from ..xpandas.data_container import XDataFrame, XSeries
  8 | from ..xpandas.transformers import XSeriesTransformer, TimeSeriesTransformer, \
  9 |     TimeSeriesWindowTransformer, MeanSeriesTransformer, XDataFrameTransformer, PipeLineChain
 10 | import pandas as pd
 11 | import numpy as np
 12 | 
 13 | 
 14 | def test_transformer_custom():
 15 |     s = XSeries([
 16 |         pd.Series([1, 2, 3], index=['a', 'b', 'c']),
 17 |         pd.Series([4, 5, 6], index=['d', 'e', 'g'])
 18 |     ])
 19 | 
 20 |     series_transformer = XSeriesTransformer(transform_function=lambda series: series.mean())
 21 |     series_transformer = series_transformer.fit()
 22 | 
 23 |     s_transformed = series_transformer.transform(s)
 24 | 
 25 |     assert type(s_transformed) == XSeries
 26 |     assert s_transformed.data_type == np.float64
 27 | 
 28 | 
 29 | def test_transformer_custom_to_data_frame():
 30 |     s = XSeries([
 31 |         pd.Series([1, 2, 3], index=['a', 'b', 'c']),
 32 |         pd.Series([4, 5, 6], index=['d', 'e', 'g'])
 33 |     ])
 34 | 
 35 |     series_transformer = XSeriesTransformer(transform_function=lambda series: {'mean': series.mean()})
 36 |     series_transformer = series_transformer.fit()
 37 | 
 38 |     s_transformed = series_transformer.transform(s)
 39 | 
 40 |     assert type(s_transformed) == XDataFrame
 41 | 
 42 | 
 43 | def test_transformer_custom_series_to_series():
 44 |     s = XSeries([
 45 |         pd.Series([1, 2, 3], index=['a', 'b', 'c']),
 46 |         pd.Series([4, 5, 6], index=['d', 'e', 'g'])
 47 |     ])
 48 | 
 49 |     series_transformer = XSeriesTransformer(transform_function=lambda series: series + 1)
 50 |     series_transformer = series_transformer.fit()
 51 | 
 52 |     s_transformed = series_transformer.transform(s)
 53 | 
 54 |     assert type(s_transformed) == XSeries
 55 |     assert s_transformed.data_type == pd.Series
 56 | 
 57 | 
 58 | def test_transformer_series_transformer():
 59 |     s = XSeries([
 60 |         pd.Series([1, 2, 3], index=['a', 'b', 'c']),
 61 |         pd.Series([4, 5, 6], index=['d', 'e', 'g'])
 62 |     ])
 63 | 
 64 |     series_transformer = TimeSeriesTransformer()
 65 |     series_transformer = series_transformer.fit()
 66 | 
 67 |     transformed_series = series_transformer.transform(s)
 68 | 
 69 |     assert type(transformed_series) == XDataFrame
 70 | 
 71 | 
 72 | def test_transformer_series_to_series_transformer():
 73 |     s = XSeries([
 74 |         pd.Series(np.random.normal(0, 10, 100)),
 75 |         pd.Series(np.random.uniform(-100, 100, 150)),
 76 |         pd.Series(np.random.random_integers(0, 500, 200))
 77 |     ])
 78 | 
 79 |     series_to_series_transformer = TimeSeriesWindowTransformer(windows_size=5)
 80 |     series_to_series_transformer.set_params(windows_size=3)
 81 |     series_to_series_transformer.fit()
 82 |     transformed_series = series_to_series_transformer.transform(s)
 83 | 
 84 |     assert series_to_series_transformer.transform_function(s[0]).equals(transformed_series[0])
 85 |     assert transformed_series.data_type == pd.Series
 86 |     assert type(transformed_series) == XSeries
 87 | 
 88 | 
 89 | def test_transformer_data_frame():
 90 |     s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
 91 |                       pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
 92 |     s2 = XSeries([1, 2, 3])
 93 |     s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
 94 |     s4 = XSeries(['f', 's', 't'])
 95 | 
 96 |     df = XDataFrame({
 97 |         'second_col': s2,
 98 |         'third_col': s3,
 99 |         'fourth_col': s4
100 |     })
101 | 
102 |     data_frame_transformer = TimeSeriesTransformer().fit()
103 |     try:
104 |         data_frame_transformer.transform(df)
105 |         assert False
106 |     except:
107 |         assert True
108 | 
109 |     s1 = XSeries([
110 |         pd.Series(np.random.normal(size=10)),
111 |         pd.Series(np.random.normal(size=15))
112 |     ])
113 |     s2 = XSeries([
114 |         pd.Series(np.random.normal(size=10)),
115 |         pd.Series(np.random.normal(size=10))
116 |     ])
117 |     s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
118 |     s4 = XSeries(['f', 's'])
119 |     df = XDataFrame({
120 |         'first_col': s1,
121 |         'second_col': s2,
122 |         'third_col': s3,
123 |         'fourth_col': s4
124 |     })
125 | 
126 |     # print(
127 |     #     df['first_col'].shape
128 |     # )
129 | 
130 |     data_frame_transformer = XDataFrameTransformer(transformations={
131 |         'first_col': TimeSeriesTransformer(),
132 |         'second_col': TimeSeriesTransformer()
133 |     })
134 | 
135 |     data_frame_transformer.fit(df)
136 |     transformers_df = data_frame_transformer.transform(df)
137 |     # print(transformers_df.head())
138 | 
139 | 
def test_pipeline_transformer_for_series():
    '''
    Smoke test: a PipeLineChain fits and transforms an XSeries, both with
    series transformers only and with a scikit-learn PCA as the last step.
    '''
    from sklearn.decomposition import PCA

    s1 = XSeries([pd.Series(np.random.normal(size=15)) for _ in range(7)])

    series_only_steps = [
        ('first_transformer', TimeSeriesWindowTransformer()),
        ('mean_transformer', TimeSeriesTransformer()),
    ]
    pipeline = PipeLineChain(series_only_steps).fit(s1)
    transformed_ts = pipeline.transform(s1)

    steps_with_pca = [
        ('first_transformer', TimeSeriesWindowTransformer()),
        ('mean_transformer', TimeSeriesTransformer()),
        ('pca', PCA(n_components=4)),
    ]
    pipeline = PipeLineChain(steps_with_pca)
    pipeline.fit(s1)
    transformed_ts = pipeline.transform(s1)
171 | 
172 | 
def test_mean_transformer():
    '''
    MeanSeriesTransformer fitted on one XSeries transforms another one into
    an XSeries with one entry per input element.
    '''
    fit_lengths = (10, 15)
    transform_lengths = (10, 15, 100)

    fit_data = XSeries([pd.Series(np.random.normal(size=n)) for n in fit_lengths])
    new_data = XSeries([pd.Series(np.random.normal(size=n)) for n in transform_lengths])

    fitted = MeanSeriesTransformer().fit(fit_data)
    result = fitted.transform(new_data)

    assert result.shape[0] == 3
    assert type(result) == XSeries
191 | 
192 | 
def test_mean_transformer_data_frame():
    '''MeanSeriesTransformer must refuse to be fitted on an XDataFrame.'''
    s1 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15))
    ])
    s2 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15))
    ])

    df = XDataFrame({
        's1': s1,
        's2': s2
    })

    tr = MeanSeriesTransformer()
    # The failure is raised from the ``else`` branch so that the except
    # clause cannot swallow it.
    try:
        tr.fit(df)
    except Exception:
        pass
    else:
        raise AssertionError('fitting MeanSeriesTransformer on an XDataFrame must fail')
214 | 


--------------------------------------------------------------------------------
/xpandas/transformers/transformer.py:
--------------------------------------------------------------------------------
  1 | from sklearn.base import BaseEstimator, TransformerMixin
  2 | 
  3 | from ..data_container import XDataFrame, XSeries
  4 | 
  5 | 
  6 | class XSeriesTransformer(BaseEstimator, TransformerMixin):
  7 |     '''
  8 |     XSeriesTransformer is a base class for all custom transformers.
  9 |     XSeriesTransformer is a high level abstraction to transform XSeries of
 10 |     specific data_types to an another XSeries or XDataFrame.
 11 |     XSeriesTransformer encapsulates transformation and based on scikit-learn BaseEstimator
 12 |     http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
 13 |     '''
 14 |     _TRANSFORM_ARG_FUNCTION_NAME = 'transform_function'
 15 | 
 16 |     def __init__(self, transform_function=None, data_types=None, name=None, **kwargs):
 17 |         '''
 18 |         :param transform_function: Callable that performs actual transform
 19 |         :param data_types: list of data_type that this transformer can work with. if None,
 20 |                             error might be raised at run time
 21 |         :param name: name for transformer. if none, class name is default
 22 |         :param kwargs: additional arguments
 23 |         '''
 24 |         if transform_function is not None and not callable(transform_function):
 25 |             raise ValueError('transform_function must be callable')
 26 | 
 27 |         self.transform_function = transform_function
 28 |         self.data_types = data_types
 29 | 
 30 |         if name is None:
 31 |             self.name = self.__class__.__name__
 32 |         else:
 33 |             self.name = name
 34 | 
 35 |     def _check_input(self, input_data):
 36 |         '''
 37 |         Check that input valid: input_data is XSeries and transformer
 38 |         "knows" how to work with input_data.data_type.
 39 |         In error raise exception.
 40 |         '''
 41 |         if type(input_data) != XSeries:
 42 |             raise ValueError('X must be XSeries type')
 43 |         elif type(input_data) == XSeries and self.data_types is not None \
 44 |                 and input_data.data_type not in self.data_types:
 45 |             raise ValueError('Estimator does not support {} type'.format(input_data.data_type))
 46 | 
 47 |     def fit(self, X=None, y=None, **kwargs):
 48 |         '''
 49 |         Fit transformer for giver data.
 50 |         Must be overwritten in child classes
 51 |         :param X: XSeries to fit transformer on
 52 |         :param y: Labels column for X
 53 |         :param kwargs: additional arguments for transformer
 54 |         :return: fitted self object
 55 |         '''
 56 |         if X is not None:
 57 |             self._check_input(X)
 58 | 
 59 |         return self
 60 | 
 61 |     def _transform_series(self, custom_series):
 62 |         '''
 63 |         Helper method to transform XSeries
 64 |         :param custom_series: XSeries object
 65 |         :return: transformed XSeries.
 66 |                  it could be XSeries or XDataFrame object
 67 |         '''
 68 |         return custom_series.apply(func=self.transform_function, prefix=self.name)
 69 | 
 70 |     def transform(self, X):
 71 |         '''
 72 |         Apply transformation to X with current transformer
 73 |         :param X: input XSeries
 74 |         :param columns: deprecated
 75 |         :return: transformed XSeries.
 76 |                  it could be XSeries or XDataFrame object
 77 | 
 78 |         '''
 79 |         if not hasattr(self, self._TRANSFORM_ARG_FUNCTION_NAME):
 80 |             raise ValueError('You mast pass transform_function argument with a function')
 81 | 
 82 |         self._check_input(X)
 83 | 
 84 |         transform_series = self._transform_series(X)
 85 |         transform_series.index = X.index
 86 | 
 87 |         return transform_series
 88 | 
 89 | 
 90 | class XDataFrameTransformer(BaseEstimator, TransformerMixin):
 91 |     '''
 92 |     XDataFrameTransformer is a set of XSeriesTransformer instances.
 93 |     XDataFrameTransformer can transform XDataFrame object to another XDataFrame
 94 |     based on set of XSeriesTransformer transformers.
 95 |     '''
 96 | 
 97 |     def _validate_transformations(self, transformations):
 98 |         for k, v in transformations.items():
 99 |             if not isinstance(k, str):
100 |                 raise TypeError('Key must be a string {}'.format(k))
101 | 
102 |             if isinstance(v, list):
103 |                 for t in v:
104 |                     if not isinstance(t, XSeriesTransformer):
105 |                         raise TypeError('All objects of {} must be a Transformer object. Issue with {}'.format(v, t))
106 |             elif not isinstance(v, XSeriesTransformer):
107 |                 raise TypeError('Value must be a Transformer object {}'.format(v))
108 | 
109 |     def _wrap_transformers_in_list(self, transformations):
110 |         new_transformers = {}
111 |         for k, v in transformations.items():
112 |             if isinstance(v, list):
113 |                 new_transformers[k] = v
114 |             else:
115 |                 new_transformers[k] = [v]
116 |         return new_transformers
117 | 
118 |     def __init__(self, transformations):
119 |         '''
120 |         Init XDataFrameTransformer with a dict of transformations.
121 |         Each transformation specify column and transformer object
122 |         :param transformations: dict {column_name: Transformer object or [Transformer object]}
123 |         '''
124 |         self._validate_transformations(transformations)
125 |         self.transformations = self._wrap_transformers_in_list(transformations)
126 | 
127 |     def fit(self, X=None, y=None, **kwargs):
128 |         '''
129 |         Fit each transformer at self.transformations dictionary
130 |         '''
131 |         if not isinstance(X, XDataFrame):
132 |             raise TypeError('X must be a XDataFrame type. Not {}'.format(type(X)))
133 | 
134 |         for col_name, transformations in self.transformations.items():
135 |             for t in transformations:
136 |                 t.fit(X[col_name])
137 | 
138 |         return self
139 | 
140 |     def transform(self, X, columns_mapping=None):
141 |         '''
142 |         Transform X with fitted dictionary self.transformations.
143 |         :param columns_mapping: {old_col: new_col} mapping between columns in fit data set and current X
144 |         :return:
145 |         '''
146 |         if columns_mapping is None:
147 |             columns_mapping = {}
148 | 
149 |         transformers_df = X.copy()
150 | 
151 |         for col_name, transformations in self.transformations.items():
152 |             for t in transformations:
153 |                 new_col_name = columns_mapping.get(col_name, col_name)
154 |                 transformed_column = t.transform(X[new_col_name])
155 | 
156 |                 if type(transformed_column) == XSeries:
157 |                     transformers_df.rename(columns={
158 |                         new_col_name: transformed_column.name
159 |                     }, inplace=True)
160 |                     transformers_df[transformed_column.name] = transformed_column
161 |                 else:
162 |                     transformers_df.drop(new_col_name, inplace=True, axis=1)
163 | 
164 |                     transformers_df = XDataFrame.concat_dataframes(
165 |                         [transformers_df, transformed_column]
166 |                     )
167 | 
168 |         return transformers_df
169 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
  1 | # Makefile for Sphinx documentation
  2 | #
  3 | 
  4 | # You can set these variables from the command line.
  5 | SPHINXOPTS    =
  6 | SPHINXBUILD   = sphinx-build
  7 | PAPER         =
  8 | BUILDDIR      = _build
  9 | 
 10 | # User-friendly check for sphinx-build
 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $?), 1)
 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
 13 | endif
 14 | 
 15 | # Internal variables.
 16 | PAPEROPT_a4     = -D latex_paper_size=a4
 17 | PAPEROPT_letter = -D latex_paper_size=letter
 18 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 19 | # the i18n builder cannot share the environment and doctrees with the others
 20 | I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
 21 | 
 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
 23 | 
 24 | help:
 25 | 	@echo "Please use \`make <target>' where <target> is one of"
 26 | 	@echo "  html       to make standalone HTML files"
 27 | 	@echo "  dirhtml    to make HTML files named index.html in directories"
 28 | 	@echo "  singlehtml to make a single large HTML file"
 29 | 	@echo "  pickle     to make pickle files"
 30 | 	@echo "  json       to make JSON files"
 31 | 	@echo "  htmlhelp   to make HTML files and a HTML help project"
 32 | 	@echo "  qthelp     to make HTML files and a qthelp project"
 33 | 	@echo "  devhelp    to make HTML files and a Devhelp project"
 34 | 	@echo "  epub       to make an epub"
 35 | 	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
 36 | 	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
 37 | 	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
 38 | 	@echo "  text       to make text files"
 39 | 	@echo "  man        to make manual pages"
 40 | 	@echo "  texinfo    to make Texinfo files"
 41 | 	@echo "  info       to make Texinfo files and run them through makeinfo"
 42 | 	@echo "  gettext    to make PO message catalogs"
 43 | 	@echo "  changes    to make an overview of all changed/added/deprecated items"
 44 | 	@echo "  xml        to make Docutils-native XML files"
 45 | 	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
 46 | 	@echo "  linkcheck  to check all external links for integrity"
 47 | 	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
 48 | 
 49 | clean:
 50 | 	rm -rf $(BUILDDIR)/*
 51 | 
 52 | html:
 53 | 	ipython nbconvert ../examples/ExampleUsage.ipynb --to rst
 54 | 	mv ../examples/ExampleUsage.rst .
 55 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 56 | 	@echo
 57 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 58 | 
 59 | livehtml:
 60 | 	sphinx-autobuild -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 61 | 
 62 | dirhtml:
 63 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
 64 | 	@echo
 65 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
 66 | 
 67 | singlehtml:
 68 | 	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
 69 | 	@echo
 70 | 	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
 71 | 
 72 | pickle:
 73 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
 74 | 	@echo
 75 | 	@echo "Build finished; now you can process the pickle files."
 76 | 
 77 | json:
 78 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
 79 | 	@echo
 80 | 	@echo "Build finished; now you can process the JSON files."
 81 | 
 82 | htmlhelp:
 83 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
 84 | 	@echo
 85 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
 86 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
 87 | 
 88 | qthelp:
 89 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
 90 | 	@echo
 91 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
 92 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
 93 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/XPandas.qhcp"
 94 | 	@echo "To view the help file:"
 95 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/XPandas.qhc"
 96 | 
 97 | devhelp:
 98 | 	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
 99 | 	@echo
100 | 	@echo "Build finished."
101 | 	@echo "To view the help file:"
102 | 	@echo "# mkdir -p $HOME/.local/share/devhelp/XPandas"
103 | 	@echo "# ln -s $(BUILDDIR)/devhelp $HOME/.local/share/devhelp/XPandas"
104 | 	@echo "# devhelp"
105 | 
106 | epub:
107 | 	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
108 | 	@echo
109 | 	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
110 | 
111 | latex:
112 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
113 | 	@echo
114 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
115 | 	@echo "Run \`make' in that directory to run these through (pdf)latex" \
116 | 	      "(use \`make latexpdf' here to do that automatically)."
117 | 
118 | latexpdf:
119 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
120 | 	@echo "Running LaTeX files through pdflatex..."
121 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf
122 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
123 | 
124 | latexpdfja:
125 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
126 | 	@echo "Running LaTeX files through platex and dvipdfmx..."
127 | 	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
128 | 	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
129 | 
130 | text:
131 | 	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
132 | 	@echo
133 | 	@echo "Build finished. The text files are in $(BUILDDIR)/text."
134 | 
135 | man:
136 | 	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
137 | 	@echo
138 | 	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
139 | 
140 | texinfo:
141 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
142 | 	@echo
143 | 	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
144 | 	@echo "Run \`make' in that directory to run these through makeinfo" \
145 | 	      "(use \`make info' here to do that automatically)."
146 | 
147 | info:
148 | 	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
149 | 	@echo "Running Texinfo files through makeinfo..."
150 | 	make -C $(BUILDDIR)/texinfo info
151 | 	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
152 | 
153 | gettext:
154 | 	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
155 | 	@echo
156 | 	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
157 | 
158 | changes:
159 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
160 | 	@echo
161 | 	@echo "The overview file is in $(BUILDDIR)/changes."
162 | 
163 | linkcheck:
164 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
165 | 	@echo
166 | 	@echo "Link check complete; look for any errors in the above output " \
167 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
168 | 
169 | doctest:
170 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
171 | 	@echo "Testing of doctests in the sources finished, look at the " \
172 | 	      "results in $(BUILDDIR)/doctest/output.txt."
173 | 
174 | xml:
175 | 	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
176 | 	@echo
177 | 	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
178 | 
179 | pseudoxml:
180 | 	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
181 | 	@echo
182 | 	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
183 | 
184 | buildapi:
185 | 	sphinx-apidoc -fMT ../xpandas -o api
186 | 	@echo "Auto-generation of API documentation finished. " \
187 |           "The generated files are in 'api/'"
188 | 


--------------------------------------------------------------------------------
/xpandas/data_container/data_container.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | 
  4 | 
  5 | def _check_all_elements_have_the_same_property(array, func):
  6 |     '''
  7 |     Helper function that checks if all elements have the same func(element) value.
  8 |     :param array: input values
  9 |     :param func: any callable object
 10 |     :return: tuple. the first element indicates is all elements are have the same func(element) value,
 11 |              second element is a value of func(element)
 12 |     '''
 13 |     if len(array) == 0:
 14 |         return True, None
 15 |     try:
 16 |         first_element_type = func(array[0])
 17 |     except:
 18 |         return True, None
 19 |     do_all_have_property = all(func(x) == first_element_type
 20 |                                for x in array)
 21 | 
 22 |     return do_all_have_property, first_element_type
 23 | 
 24 | 
 25 | def _is_class_a_primitive(cls):
 26 |     '''
 27 |     Check if class is a number or string including numpy numbers
 28 |     :param cls: any class
 29 |     :return: True if class is a primitive class, else False
 30 |     '''
 31 |     primitives = [
 32 |         np.float16, np.float32, np.float64, np.float128,
 33 |         np.int8, np.int16, np.int32, np.int64,
 34 |         bool, str, np.uint8, np.uint16, np.uint32, np.uint64,
 35 |         int, float
 36 |     ]
 37 |     return cls in primitives
 38 | 
 39 | 
 40 | class XSeries(pd.Series):
 41 |     '''
 42 |     XSeries is an homogeneous abstract 1d container that encapsulates any data type inside.
 43 |     It is an extension of pandas.Series class.
 44 |     XSeries has a property data_type that is a type ot objects that are inside XSeries.
 45 |     '''
 46 |     _metadata = ['data_type']
 47 | 
    @property
    def _constructor(self):
        # Class used to build new one-dimensional objects from this one.
        # NOTE(review): relies on the pandas subclassing hook of the same
        # name so that derived results stay XSeries -- confirm against the
        # pandas version in use.
        return XSeries
 51 | 
    @property
    def _constructor_expanddim(self):
        # Class used when this series is expanded to two dimensions.
        # NOTE(review): relies on the pandas subclassing hook of the same
        # name -- confirm against the pandas version in use.
        return XDataFrame
 55 | 
    def __init__(self, *args, **kwargs):
        '''
        The same arguments as for pandas.Series
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

        In order to create XSeries of any data_type, the data argument must be a Python list.
        For example, to create XSeries of pandas.Series, data should be
        data = [s_1, s_2, ..., s_n] where s_i is an instance of pandas.Series.

        :raises ValueError: if the elements of data do not all have the same type
        '''
        super(XSeries, self).__init__(*args, **kwargs)

        # data is taken from the keyword argument, else from the first
        # positional argument.
        data = kwargs.get('data')
        if data is None:
            # NOTE(review): raises IndexError when no positional argument
            # was passed either.
            data = args[0]

        check_result, data_type = _check_all_elements_have_the_same_property(data, type)
        if not check_result:
            raise ValueError('Not all elements the same type')

        if data_type is not None:
            self._data_type = data_type
        else:
            # The helper returned no type (empty data, or data[0] could not
            # be read): fall back to the type of the first stored value.
            # NOTE(review): assumes data exposes ``_values`` (e.g. a pandas
            # object) and is not empty -- a plain empty list fails here.
            self._data_type = type(data._values[0])
 79 | 
 80 |     def apply(self, *args, **kwargs):
 81 |         '''
 82 |         Overwrite standart pandas.Series method.
 83 |         Apply transform function to all elements in self.
 84 |         *If transform function return dict like object,
 85 |         transform XSeries to XDataFrame see XDataFrame constructor*
 86 | 
 87 |         :param func: function to apply
 88 |         :param prefix: prefix for columns if needs to return XDataFrame object
 89 |         :return: XSeries of XDataFrame depending on transformation
 90 |         '''
 91 |         func = kwargs.get('func')
 92 |         if func is None:
 93 |             func = args[0]
 94 | 
 95 |         # TODO
 96 |         # Possibly change to handle NaN
 97 |         mapped_series = self.dropna()
 98 |         mapped_series = mapped_series.map(func, na_action='ignore')
 99 |         mapped_data_type = mapped_series.data_type
100 | 
101 |         custom_prefix = kwargs.get('prefix')
102 |         if custom_prefix is None:
103 |             custom_prefix = self.name
104 |         else:
105 |             custom_prefix = '{}_{}'.format(self.name, custom_prefix)
106 | 
107 |         if mapped_series.__is_data_type_dict_like():
108 |             custom_df = XDataFrame.from_records(mapped_series.values)
109 | 
110 |             if custom_prefix is not None:
111 |                 custom_df.columns = custom_df.columns.map(lambda x: '{}_{}'.format(custom_prefix, x))
112 |             return custom_df
113 |         elif mapped_data_type == pd.DataFrame:
114 |             return pd.concat(mapped_series.values, ignore_index=True)
115 |         else:
116 |             mapped_series.name = custom_prefix
117 | 
118 |         return mapped_series
119 | 
120 |     def __is_data_type_dict_like(self):
121 |         '''
122 |         Check if data encapsulated by self is instance of dict
123 |         '''
124 |         return isinstance(self.iloc[0], dict)
125 | 
126 |     @property
127 |     def data_type(self):
128 |         '''
129 |         Getter for a data_type property
130 |         data_type is a data type that self encapsulates
131 |         For example, if self is contains images, that data_type would be Image
132 |         '''
133 |         first_element_data_type = type(self.iloc[0])
134 |         self._data_type = first_element_data_type
135 |         return self._data_type
136 | 
137 |     @data_type.setter
138 |     def data_type(self, data_type):
139 |         '''
140 |         Setter for a data_type property
141 |         data_type is a data type that self encapsulates
142 |         For example, if self is contains images, that data_type would be Image
143 |         '''
144 | 
145 |         self._data_type = data_type
146 | 
147 |     def to_pandas_series(self):
148 |         '''
149 |         Convert self to pandas.Series if data_type is a primitive type
150 |         etc. number of string
151 |         :return: Pandas Series or raise exception if data_type is not a primitive type
152 |         '''
153 |         is_primitive = _is_class_a_primitive(self.data_type)
154 |         if is_primitive:
155 |             self.__class__ = pd.Series
156 |         else:
157 |             raise ValueError('Unable to cast to pd.Series. {} is not a primitive type.'.format(self.data_type))
158 |         return self
159 | 
160 |     def __str__(self):
161 |         s = super(XSeries, self).__str__()
162 |         return '{}\ndata_type: {}'.format(s, self.data_type)
163 | 
164 |     def __getitem__(self, key):
165 |         return super(XSeries, self).__getitem__(key)
166 | 
167 |     def __setitem__(self, key, value):
168 |         value_type = type(value)
169 |         if value_type != self.data_type:
170 |             raise ValueError('Can not assign key {} with {} wrong data_type {} correct is {}'.format(
171 |                 key, value, value_type, self.data_type
172 |             ))
173 | 
174 |         return super(XSeries, self).__setitem__(key, value)
175 | 
176 | 
class XDataFrame(pd.DataFrame):
    '''
    XDataFrame is 2d container that stores XSeries objects
    XDataFrame is an extension of pandas.DataFrame object
    '''

    @property
    def _constructor(self):
        # pandas uses this to build results of the same dimensionality
        return XDataFrame

    @property
    def _constructor_sliced(self):
        # pandas uses this to build a single column / row of the frame
        return XSeries

    def __init__(self, *args, **kwargs):
        '''
        The same arguments as for pandas.DataFrame
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html

        data argument should be a list of XSeries objects or dict of XSeries objects.
        If a dict is passed, each key must be a string and it indicates the appropriate column name.
        For example, to create XDataFrame data should look like
        data = {'col_1': s_1, 'col_2': s_2, ..., 'col_n': s_n} where s_i is a XSeries

        Data of any other kind (or no data at all) is passed to pandas.DataFrame unchecked.

        :raises ValueError: if data is a list or dict containing anything but XSeries
        '''
        data = kwargs.get('data')
        if data is None and args:
            data = args[0]

        if isinstance(data, list):
            data_to_check = data
        elif isinstance(data, dict):
            data_to_check = data.values()
        else:
            data_to_check = []

        for d in data_to_check:
            if not isinstance(d, XSeries):
                raise ValueError('All data must be XSeries instances')
        super(XDataFrame, self).__init__(*args, **kwargs)

    def get_columns_of_type(self, column_type):
        '''
        Get all columns from XDataFrame with given column_type
        :param column_type: list (or tuple / set) of types or a single type
        :return: tuple. the first element is a sub XDataFrame and second is a list of columns of a given column_type
        '''
        if not isinstance(column_type, (list, tuple, set, frozenset)):
            column_type = [column_type]

        columns_to_select = [
            col_name
            for col_name in self
            if self[col_name].data_type in column_type
        ]

        return self[columns_to_select], columns_to_select

    def get_data_types(self):
        '''
        Get a list of data_types of each XSeries inside XDataFrame
        :return: list of data_type, in column order
        '''
        data_types = [
            self[column].data_type
            for column in self
        ]
        return data_types

    def to_pandas_dataframe(self):
        '''
        Convert self to pandas.DataFrame if all columns are primitive types.
        The conversion is done in place by rebinding the class of self,
        and self is returned.
        See more at XSeries.to_pandas_series
        :return: self as a pandas.DataFrame
        :raises ValueError: if at least one column is not of a primitive type
        '''
        data_types = self.get_data_types()
        is_all_columns_are_primitive = all(
            _is_class_a_primitive(dt)
            for dt in data_types
        )
        if is_all_columns_are_primitive:
            self.__class__ = pd.DataFrame
        else:
            # report the computed list; the frame has no `data_types` attribute
            raise ValueError('Unable to cast to pd.DataFrame. {} is not all primitives.'.format(data_types))
        return self

    @classmethod
    def concat_dataframes(cls, data_frames):
        '''
        Concatenate XDataFrame using pandas.concat method
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html
        over columns
        :param data_frames: list of XDataFrame instances
        :return: the result of pandas.concat(data_frames, axis=1)
        '''
        return pd.concat(data_frames, axis=1)
271 | 


--------------------------------------------------------------------------------
/examples/ExampleUsage.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "This notebook presents basic usage examples of the XPandas package."
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "markdown",
  12 |    "metadata": {},
  13 |    "source": [
  14 |     "### Example dataset"
  15 |    ]
  16 |   },
  17 |   {
  18 |    "cell_type": "code",
  19 |    "execution_count": 1,
  20 |    "metadata": {},
  21 |    "outputs": [
  22 |     {
  23 |      "name": "stderr",
  24 |      "output_type": "stream",
  25 |      "text": [
  26 |       "/Users/iwitaly/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.\n",
  27 |       "  from pandas.core import datetools\n"
  28 |      ]
  29 |     }
  30 |    ],
  31 |    "source": [
  32 |     "from io import BytesIO\n",
  33 |     "from zipfile import ZipFile\n",
  34 |     "from urllib.request import urlopen\n",
  35 |     "\n",
  36 |     "import numpy as np\n",
  37 |     "import pandas as pd\n",
  38 |     "import os, sys\n",
  39 |     "import requests\n",
  40 |     "\n",
  41 |     "sys.path.insert(0, '..')\n",
  42 |     "\n",
  43 |     "from xpandas.data_container import *\n",
  44 |     "from xpandas.transformers import TimeSeriesTransformer, TimeSeriesWindowTransformer"
  45 |    ]
  46 |   },
  47 |   {
  48 |    "cell_type": "markdown",
  49 |    "metadata": {},
  50 |    "source": [
  51 |     "The usage example shown is based on open source time series [data set](http://timeseriesclassification.com/Downloads/FordA.zip).\n",
  52 |     "\n",
  53 |     "The first thing we need to do is to read the data. Here, we use the `urlopen` function from Python's built-in urllib to download the data set and limit the length of each data series."
  54 |    ]
  55 |   },
  56 |   {
  57 |    "cell_type": "code",
  58 |    "execution_count": 2,
  59 |    "metadata": {
  60 |     "collapsed": true
  61 |    },
  62 |    "outputs": [],
  63 |    "source": [
  64 |     "url = \"http://timeseriesclassification.com/Downloads/FordA.zip\"\n",
  65 |     "series_offset = 505"
  66 |    ]
  67 |   },
  68 |   {
  69 |    "cell_type": "code",
  70 |    "execution_count": 3,
  71 |    "metadata": {
  72 |     "collapsed": true
  73 |    },
  74 |    "outputs": [],
  75 |    "source": [
  76 |     "url = urlopen(url)\n",
  77 |     "zipfile = ZipFile(BytesIO(url.read()))\n",
  78 |     "lines = zipfile.open('FordA/FordA.csv').readlines()\n",
  79 |     "lines = [l.decode('utf-8') for l in lines]\n",
  80 |     "lines = lines[series_offset:]"
  81 |    ]
  82 |   },
  83 |   {
  84 |    "cell_type": "markdown",
  85 |    "metadata": {},
  86 |    "source": [
  87 |     "``lines`` is now a list of strings representing timeseries in a comma separated format that we can convert into floats"
  88 |    ]
  89 |   },
  90 |   {
  91 |    "cell_type": "code",
  92 |    "execution_count": 4,
  93 |    "metadata": {
  94 |     "collapsed": true
  95 |    },
  96 |    "outputs": [],
  97 |    "source": [
  98 |     "lines = [list(map(float, l.split(','))) for l in lines]"
  99 |    ]
 100 |   },
 101 |   {
 102 |    "cell_type": "code",
 103 |    "execution_count": 5,
 104 |    "metadata": {},
 105 |    "outputs": [
 106 |     {
 107 |      "data": {
 108 |       "text/plain": [
 109 |        "[1.1871,\n",
 110 |        " 0.4096,\n",
 111 |        " -0.43154,\n",
 112 |        " -1.231,\n",
 113 |        " -1.9055,\n",
 114 |        " -2.3824,\n",
 115 |        " -2.588,\n",
 116 |        " -2.5018,\n",
 117 |        " -2.1353,\n",
 118 |        " -1.574]"
 119 |       ]
 120 |      },
 121 |      "execution_count": 5,
 122 |      "metadata": {},
 123 |      "output_type": "execute_result"
 124 |     }
 125 |    ],
 126 |    "source": [
 127 |     "lines[0][:10]"
 128 |    ]
 129 |   },
 130 |   {
 131 |    "cell_type": "markdown",
 132 |    "metadata": {},
 133 |    "source": [
 134 |     "Let's convert each embedded list into more convenient ``pandas.Series`` object."
 135 |    ]
 136 |   },
 137 |   {
 138 |    "cell_type": "code",
 139 |    "execution_count": 6,
 140 |    "metadata": {
 141 |     "collapsed": true
 142 |    },
 143 |    "outputs": [],
 144 |    "source": [
 145 |     "lines = [pd.Series(l) for l in lines]"
 146 |    ]
 147 |   },
 148 |   {
 149 |    "cell_type": "code",
 150 |    "execution_count": 7,
 151 |    "metadata": {},
 152 |    "outputs": [
 153 |     {
 154 |      "data": {
 155 |       "text/plain": [
 156 |        "0    1.18710\n",
 157 |        "1    0.40960\n",
 158 |        "2   -0.43154\n",
 159 |        "3   -1.23100\n",
 160 |        "4   -1.90550\n",
 161 |        "5   -2.38240\n",
 162 |        "6   -2.58800\n",
 163 |        "7   -2.50180\n",
 164 |        "8   -2.13530\n",
 165 |        "9   -1.57400\n",
 166 |        "dtype: float64"
 167 |       ]
 168 |      },
 169 |      "execution_count": 7,
 170 |      "metadata": {},
 171 |      "output_type": "execute_result"
 172 |     }
 173 |    ],
 174 |    "source": [
 175 |     "lines[0][:10]"
 176 |    ]
 177 |   },
 178 |   {
 179 |    "cell_type": "markdown",
 180 |    "metadata": {},
 181 |    "source": [
 182 |     "# XPandas: Data structures"
 183 |    ]
 184 |   },
 185 |   {
 186 |    "cell_type": "markdown",
 187 |    "metadata": {},
 188 |    "source": [
 189 |     "### XSeries"
 190 |    ]
 191 |   },
 192 |   {
 193 |    "cell_type": "markdown",
 194 |    "metadata": {},
 195 |    "source": [
 196 |     "``XSeries`` is a 1d data container that can store any objects inside.\n",
 197 |     "\n",
 198 |     "Using the ``pandas.Series`` objects we can encapsulate the list ``lines`` into an ``XSeries`` object. The object has a global index of series and a sub-index for each ``pandas.Series``."
 199 |    ]
 200 |   },
 201 |   {
 202 |    "cell_type": "code",
 203 |    "execution_count": 8,
 204 |    "metadata": {
 205 |     "collapsed": true
 206 |    },
 207 |    "outputs": [],
 208 |    "source": [
 209 |     "X = XSeries(lines)"
 210 |    ]
 211 |   },
 212 |   {
 213 |    "cell_type": "code",
 214 |    "execution_count": 9,
 215 |    "metadata": {},
 216 |    "outputs": [
 217 |     {
 218 |      "data": {
 219 |       "text/plain": [
 220 |        "0    0      1.187100\n",
 221 |        "1      0.409600\n",
 222 |        "2     -0.43154...\n",
 223 |        "1    0      0.094261\n",
 224 |        "1      0.310310\n",
 225 |        "2      0.53060...\n",
 226 |        "2    0     -1.157000\n",
 227 |        "1     -1.592600\n",
 228 |        "2     -1.50960...\n",
 229 |        "3    0      0.356960\n",
 230 |        "1      0.300850\n",
 231 |        "2      0.24314...\n",
 232 |        "4    0      0.307980\n",
 233 |        "1      0.370350\n",
 234 |        "2      0.26015...\n",
 235 |        "dtype: object\n",
 236 |        "data_type: <class 'pandas.core.series.Series'>"
 237 |       ]
 238 |      },
 239 |      "execution_count": 9,
 240 |      "metadata": {},
 241 |      "output_type": "execute_result"
 242 |     }
 243 |    ],
 244 |    "source": [
 245 |     "X.head()"
 246 |    ]
 247 |   },
 248 |   {
 249 |    "cell_type": "markdown",
 250 |    "metadata": {},
 251 |    "source": [
 252 |     "The output reveals the ``data_type`` property of the ``XSeries`` object which contains the type of the contained objects, in this case, ``pandas.Series``. The ``XSeries`` is thus built up of ``pandas.Series``. Specifically, ``X`` supports all methods of its containing object ``pandas.Series``."
 253 |    ]
 254 |   },
 255 |   {
 256 |    "cell_type": "markdown",
 257 |    "metadata": {},
 258 |    "source": [
 259 |     "### XDataFrame"
 260 |    ]
 261 |   },
 262 |   {
 263 |    "cell_type": "markdown",
 264 |    "metadata": {},
 265 |    "source": [
 266 |     "``XDataFrame`` is an abstract 2d container that is based on ``pandas.DataFrame`` and stores ``XSeries`` objects.\n",
 267 |     "\n",
 268 |     "The main feature of the ``XDataFrame`` are columns of ``XSeries`` that can contain and manage any **data_type**. For example, one may have a data set consisting of series, images, texts, plain numbers, or even custom objects. Ideally, we would want to handle such different data types in a unified 2d data container, e.g. a chain of transformers to create a simple 2d matrix of training data.\n",
 269 |     "\n",
 270 |     "The following example illustrates such an ``XDataFrame`` workflow.\n",
 271 |     "\n",
 272 |     "Let ``Y`` be a vector of labels for each row."
 273 |    ]
 274 |   },
 275 |   {
 276 |    "cell_type": "code",
 277 |    "execution_count": 10,
 278 |    "metadata": {
 279 |     "collapsed": true
 280 |    },
 281 |    "outputs": [],
 282 |    "source": [
 283 |     "Y = np.random.binomial(1, 0.5, X.shape[0])\n",
 284 |     "Y = XSeries(Y)"
 285 |    ]
 286 |   },
 287 |   {
 288 |    "cell_type": "code",
 289 |    "execution_count": 11,
 290 |    "metadata": {
 291 |     "collapsed": true
 292 |    },
 293 |    "outputs": [],
 294 |    "source": [
 295 |     "df = XDataFrame({\n",
 296 |     "    'X': X,\n",
 297 |     "    'Y': Y\n",
 298 |     "})"
 299 |    ]
 300 |   },
 301 |   {
 302 |    "cell_type": "code",
 303 |    "execution_count": 12,
 304 |    "metadata": {},
 305 |    "outputs": [
 306 |     {
 307 |      "data": {
 308 |       "text/html": [
 309 |        "<div>\n",
 310 |        "<style>\n",
 311 |        "    .dataframe thead tr:only-child th {\n",
 312 |        "        text-align: right;\n",
 313 |        "    }\n",
 314 |        "\n",
 315 |        "    .dataframe thead th {\n",
 316 |        "        text-align: left;\n",
 317 |        "    }\n",
 318 |        "\n",
 319 |        "    .dataframe tbody tr th {\n",
 320 |        "        vertical-align: top;\n",
 321 |        "    }\n",
 322 |        "</style>\n",
 323 |        "<table border=\"1\" class=\"dataframe\">\n",
 324 |        "  <thead>\n",
 325 |        "    <tr style=\"text-align: right;\">\n",
 326 |        "      <th></th>\n",
 327 |        "      <th>X</th>\n",
 328 |        "      <th>Y</th>\n",
 329 |        "    </tr>\n",
 330 |        "  </thead>\n",
 331 |        "  <tbody>\n",
 332 |        "    <tr>\n",
 333 |        "      <th>0</th>\n",
 334 |        "      <td>0      1.187100\n",
 335 |        "1      0.409600\n",
 336 |        "2     -0.43154...</td>\n",
 337 |        "      <td>1</td>\n",
 338 |        "    </tr>\n",
 339 |        "    <tr>\n",
 340 |        "      <th>1</th>\n",
 341 |        "      <td>0      0.094261\n",
 342 |        "1      0.310310\n",
 343 |        "2      0.53060...</td>\n",
 344 |        "      <td>1</td>\n",
 345 |        "    </tr>\n",
 346 |        "    <tr>\n",
 347 |        "      <th>2</th>\n",
 348 |        "      <td>0     -1.157000\n",
 349 |        "1     -1.592600\n",
 350 |        "2     -1.50960...</td>\n",
 351 |        "      <td>0</td>\n",
 352 |        "    </tr>\n",
 353 |        "    <tr>\n",
 354 |        "      <th>3</th>\n",
 355 |        "      <td>0      0.356960\n",
 356 |        "1      0.300850\n",
 357 |        "2      0.24314...</td>\n",
 358 |        "      <td>1</td>\n",
 359 |        "    </tr>\n",
 360 |        "    <tr>\n",
 361 |        "      <th>4</th>\n",
 362 |        "      <td>0      0.307980\n",
 363 |        "1      0.370350\n",
 364 |        "2      0.26015...</td>\n",
 365 |        "      <td>1</td>\n",
 366 |        "    </tr>\n",
 367 |        "  </tbody>\n",
 368 |        "</table>\n",
 369 |        "</div>"
 370 |       ],
 371 |       "text/plain": [
 372 |        "                                                   X  Y\n",
 373 |        "0  0      1.187100\n",
 374 |        "1      0.409600\n",
 375 |        "2     -0.43154...  1\n",
 376 |        "1  0      0.094261\n",
 377 |        "1      0.310310\n",
 378 |        "2      0.53060...  1\n",
 379 |        "2  0     -1.157000\n",
 380 |        "1     -1.592600\n",
 381 |        "2     -1.50960...  0\n",
 382 |        "3  0      0.356960\n",
 383 |        "1      0.300850\n",
 384 |        "2      0.24314...  1\n",
 385 |        "4  0      0.307980\n",
 386 |        "1      0.370350\n",
 387 |        "2      0.26015...  1"
 388 |       ]
 389 |      },
 390 |      "execution_count": 12,
 391 |      "metadata": {},
 392 |      "output_type": "execute_result"
 393 |     }
 394 |    ],
 395 |    "source": [
 396 |     "df.head()"
 397 |    ]
 398 |   },
 399 |   {
 400 |    "cell_type": "markdown",
 401 |    "metadata": {},
 402 |    "source": [
 403 |     "Add new column to XDataFrame:"
 404 |    ]
 405 |   },
 406 |   {
 407 |    "cell_type": "code",
 408 |    "execution_count": 13,
 409 |    "metadata": {
 410 |     "collapsed": true
 411 |    },
 412 |    "outputs": [],
 413 |    "source": [
 414 |     "df['X_1'] = XSeries([\n",
 415 |     "    pd.Series(np.random.normal(size=100))\n",
 416 |     "    for _ in range(X.shape[0])\n",
 417 |     "])"
 418 |    ]
 419 |   },
 420 |   {
 421 |    "cell_type": "markdown",
 422 |    "metadata": {},
 423 |    "source": [
 424 |     "# XPandas: Transformers"
 425 |    ]
 426 |   },
 427 |   {
 428 |    "cell_type": "markdown",
 429 |    "metadata": {},
 430 |    "source": [
 431 |     "A major motivation for this project is the common data science task of extracting features from some complex objects (for example series) before proceeding with the machine learning.\n",
 432 |     "\n",
 433 |     "Given a ``XSeries`` of ``pandas.Series`` one would, for instance, like to extract features from each series. That's where *Transformers* play a vital role.\n",
 434 |     "\n",
 435 |     "Each ``Transformer`` object supports ``fit, transform`` methods just like [scikit-learn transformers](http://scikit-learn.org/stable/data_transforms.html).\n",
 436 |     "\n",
 437 |     "Let's explore some examples."
 438 |    ]
 439 |   },
 440 |   {
 441 |    "cell_type": "markdown",
 442 |    "metadata": {},
 443 |    "source": [
 444 |     "### TimeSeriesWindowTransformer\n",
 445 |     "\n",
 446 |     "This transformer calculates moving average with given window size."
 447 |    ]
 448 |   },
 449 |   {
 450 |    "cell_type": "code",
 451 |    "execution_count": 14,
 452 |    "metadata": {
 453 |     "collapsed": true
 454 |    },
 455 |    "outputs": [],
 456 |    "source": [
 457 |     "tr = TimeSeriesWindowTransformer(windows_size=5)\n",
 458 |     "tr.fit(X)\n",
 459 |     "transformed_series = tr.transform(X)"
 460 |    ]
 461 |   },
 462 |   {
 463 |    "cell_type": "code",
 464 |    "execution_count": 15,
 465 |    "metadata": {},
 466 |    "outputs": [
 467 |     {
 468 |      "data": {
 469 |       "text/plain": [
 470 |        "0    4     -0.394268\n",
 471 |        "5     -1.108168\n",
 472 |        "6     -1.70768...\n",
 473 |        "1    4      0.509686\n",
 474 |        "5      0.680500\n",
 475 |        "6      0.80574...\n",
 476 |        "2    4     -1.098344\n",
 477 |        "5     -0.755320\n",
 478 |        "6     -0.21608...\n",
 479 |        "3    4      0.234223\n",
 480 |        "5      0.165730\n",
 481 |        "6      0.09269...\n",
 482 |        "4    4      0.202701\n",
 483 |        "5      0.154336\n",
 484 |        "6      0.14082...\n",
 485 |        "dtype: object\n",
 486 |        "data_type: <class 'pandas.core.series.Series'>"
 487 |       ]
 488 |      },
 489 |      "execution_count": 15,
 490 |      "metadata": {},
 491 |      "output_type": "execute_result"
 492 |     }
 493 |    ],
 494 |    "source": [
 495 |     "transformed_series.head()"
 496 |    ]
 497 |   },
 498 |   {
 499 |    "cell_type": "markdown",
 500 |    "metadata": {},
 501 |    "source": [
 502 |     "Of course, with a windows_size = 5 first 4 elements are NaN."
 503 |    ]
 504 |   },
 505 |   {
 506 |    "cell_type": "code",
 507 |    "execution_count": 16,
 508 |    "metadata": {},
 509 |    "outputs": [
 510 |     {
 511 |      "data": {
 512 |       "text/plain": [
 513 |        "4    -0.394268\n",
 514 |        "5    -1.108168\n",
 515 |        "6    -1.707688\n",
 516 |        "7    -2.121740\n",
 517 |        "8    -2.302600\n",
 518 |        "9    -2.236300\n",
 519 |        "10   -1.942152\n",
 520 |        "11   -1.469980\n",
 521 |        "12   -0.891442\n",
 522 |        "13   -0.287676\n",
 523 |        "dtype: float64"
 524 |       ]
 525 |      },
 526 |      "execution_count": 16,
 527 |      "metadata": {},
 528 |      "output_type": "execute_result"
 529 |     }
 530 |    ],
 531 |    "source": [
 532 |     "transformed_series[0].head(10)"
 533 |    ]
 534 |   },
 535 |   {
 536 |    "cell_type": "markdown",
 537 |    "metadata": {},
 538 |    "source": [
 539 |     "### TimeSeriesTransformer\n",
 540 |     "\n",
 541 |     "Let's try another transformer, probably the most common one. It extracts several quantitative features from each pandas.Series like mean, std, quantiles. You can also pass your own list of features. As a result we retrieve an ``XDataFrame`` object."
 542 |    ]
 543 |   },
 544 |   {
 545 |    "cell_type": "code",
 546 |    "execution_count": 17,
 547 |    "metadata": {
 548 |     "collapsed": true
 549 |    },
 550 |    "outputs": [],
 551 |    "source": [
 552 |     "tr = TimeSeriesTransformer()\n",
 553 |     "tr.fit(X)\n",
 554 |     "transformed_series = tr.transform(X)"
 555 |    ]
 556 |   },
 557 |   {
 558 |    "cell_type": "code",
 559 |    "execution_count": 18,
 560 |    "metadata": {},
 561 |    "outputs": [
 562 |     {
 563 |      "data": {
 564 |       "text/plain": [
 565 |        "xpandas.data_container.data_container.XDataFrame"
 566 |       ]
 567 |      },
 568 |      "execution_count": 18,
 569 |      "metadata": {},
 570 |      "output_type": "execute_result"
 571 |     }
 572 |    ],
 573 |    "source": [
 574 |     "type(transformed_series)"
 575 |    ]
 576 |   },
 577 |   {
 578 |    "cell_type": "code",
 579 |    "execution_count": 19,
 580 |    "metadata": {},
 581 |    "outputs": [
 582 |     {
 583 |      "data": {
 584 |       "text/html": [
 585 |        "<div>\n",
 586 |        "<style>\n",
 587 |        "    .dataframe thead tr:only-child th {\n",
 588 |        "        text-align: right;\n",
 589 |        "    }\n",
 590 |        "\n",
 591 |        "    .dataframe thead th {\n",
 592 |        "        text-align: left;\n",
 593 |        "    }\n",
 594 |        "\n",
 595 |        "    .dataframe tbody tr th {\n",
 596 |        "        vertical-align: top;\n",
 597 |        "    }\n",
 598 |        "</style>\n",
 599 |        "<table border=\"1\" class=\"dataframe\">\n",
 600 |        "  <thead>\n",
 601 |        "    <tr style=\"text-align: right;\">\n",
 602 |        "      <th></th>\n",
 603 |        "      <th>None_TimeSeriesTransformer_max</th>\n",
 604 |        "      <th>None_TimeSeriesTransformer_mean</th>\n",
 605 |        "      <th>None_TimeSeriesTransformer_median</th>\n",
 606 |        "    </tr>\n",
 607 |        "  </thead>\n",
 608 |        "  <tbody>\n",
 609 |        "    <tr>\n",
 610 |        "      <th>0</th>\n",
 611 |        "      <td>2.5263</td>\n",
 612 |        "      <td>0.001995</td>\n",
 613 |        "      <td>0.011186</td>\n",
 614 |        "    </tr>\n",
 615 |        "    <tr>\n",
 616 |        "      <th>1</th>\n",
 617 |        "      <td>2.6291</td>\n",
 618 |        "      <td>0.001997</td>\n",
 619 |        "      <td>-0.024726</td>\n",
 620 |        "    </tr>\n",
 621 |        "    <tr>\n",
 622 |        "      <th>2</th>\n",
 623 |        "      <td>2.6072</td>\n",
 624 |        "      <td>-0.001996</td>\n",
 625 |        "      <td>0.060685</td>\n",
 626 |        "    </tr>\n",
 627 |        "    <tr>\n",
 628 |        "      <th>3</th>\n",
 629 |        "      <td>2.6431</td>\n",
 630 |        "      <td>-0.001997</td>\n",
 631 |        "      <td>-0.022668</td>\n",
 632 |        "    </tr>\n",
 633 |        "    <tr>\n",
 634 |        "      <th>4</th>\n",
 635 |        "      <td>3.2398</td>\n",
 636 |        "      <td>-0.001995</td>\n",
 637 |        "      <td>-0.048518</td>\n",
 638 |        "    </tr>\n",
 639 |        "  </tbody>\n",
 640 |        "</table>\n",
 641 |        "</div>"
 642 |       ],
 643 |       "text/plain": [
 644 |        "   None_TimeSeriesTransformer_max  None_TimeSeriesTransformer_mean  \\\n",
 645 |        "0                          2.5263                         0.001995   \n",
 646 |        "1                          2.6291                         0.001997   \n",
 647 |        "2                          2.6072                        -0.001996   \n",
 648 |        "3                          2.6431                        -0.001997   \n",
 649 |        "4                          3.2398                        -0.001995   \n",
 650 |        "\n",
 651 |        "   None_TimeSeriesTransformer_median  \n",
 652 |        "0                           0.011186  \n",
 653 |        "1                          -0.024726  \n",
 654 |        "2                           0.060685  \n",
 655 |        "3                          -0.022668  \n",
 656 |        "4                          -0.048518  "
 657 |       ]
 658 |      },
 659 |      "execution_count": 19,
 660 |      "metadata": {},
 661 |      "output_type": "execute_result"
 662 |     }
 663 |    ],
 664 |    "source": [
 665 |     "transformed_series.head().iloc[:, :3]"
 666 |    ]
 667 |   },
 668 |   {
 669 |    "cell_type": "markdown",
 670 |    "metadata": {},
 671 |    "source": [
 672 |     "We can also make use of the TSFresh transformer"
 673 |    ]
 674 |   },
 675 |   {
 676 |    "cell_type": "code",
 677 |    "execution_count": 20,
 678 |    "metadata": {
 679 |     "collapsed": true
 680 |    },
 681 |    "outputs": [],
 682 |    "source": [
 683 |     "from xpandas.transformers import TsFreshSeriesTransformer"
 684 |    ]
 685 |   },
 686 |   {
 687 |    "cell_type": "code",
 688 |    "execution_count": 21,
 689 |    "metadata": {
 690 |     "collapsed": true
 691 |    },
 692 |    "outputs": [],
 693 |    "source": [
 694 |     "tr = TsFreshSeriesTransformer()\n",
 695 |     "tr.fit(X.head())\n",
 696 |     "transformed_series = tr.transform(X.head())"
 697 |    ]
 698 |   },
 699 |   {
 700 |    "cell_type": "code",
 701 |    "execution_count": 22,
 702 |    "metadata": {},
 703 |    "outputs": [
 704 |     {
 705 |      "data": {
 706 |       "text/html": [
 707 |        "<div>\n",
 708 |        "<style>\n",
 709 |        "    .dataframe thead tr:only-child th {\n",
 710 |        "        text-align: right;\n",
 711 |        "    }\n",
 712 |        "\n",
 713 |        "    .dataframe thead th {\n",
 714 |        "        text-align: left;\n",
 715 |        "    }\n",
 716 |        "\n",
 717 |        "    .dataframe tbody tr th {\n",
 718 |        "        vertical-align: top;\n",
 719 |        "    }\n",
 720 |        "</style>\n",
 721 |        "<table border=\"1\" class=\"dataframe\">\n",
 722 |        "  <thead>\n",
 723 |        "    <tr style=\"text-align: right;\">\n",
 724 |        "      <th></th>\n",
 725 |        "      <th>None__abs_energy</th>\n",
 726 |        "      <th>None__absolute_sum_of_changes</th>\n",
 727 |        "      <th>None__agg_autocorrelation__f_agg_\"mean\"</th>\n",
 728 |        "    </tr>\n",
 729 |        "  </thead>\n",
 730 |        "  <tbody>\n",
 731 |        "    <tr>\n",
 732 |        "      <th>0</th>\n",
 733 |        "      <td>500.000126</td>\n",
 734 |        "      <td>134.513280</td>\n",
 735 |        "      <td>-0.012049</td>\n",
 736 |        "    </tr>\n",
 737 |        "    <tr>\n",
 738 |        "      <th>1</th>\n",
 739 |        "      <td>499.999290</td>\n",
 740 |        "      <td>114.289925</td>\n",
 741 |        "      <td>0.003075</td>\n",
 742 |        "    </tr>\n",
 743 |        "    <tr>\n",
 744 |        "      <th>2</th>\n",
 745 |        "      <td>500.001514</td>\n",
 746 |        "      <td>164.089622</td>\n",
 747 |        "      <td>-0.013172</td>\n",
 748 |        "    </tr>\n",
 749 |        "    <tr>\n",
 750 |        "      <th>3</th>\n",
 751 |        "      <td>499.999445</td>\n",
 752 |        "      <td>103.510040</td>\n",
 753 |        "      <td>-0.005639</td>\n",
 754 |        "    </tr>\n",
 755 |        "    <tr>\n",
 756 |        "      <th>4</th>\n",
 757 |        "      <td>500.003011</td>\n",
 758 |        "      <td>154.299542</td>\n",
 759 |        "      <td>0.001552</td>\n",
 760 |        "    </tr>\n",
 761 |        "  </tbody>\n",
 762 |        "</table>\n",
 763 |        "</div>"
 764 |       ],
 765 |       "text/plain": [
 766 |        "   None__abs_energy  None__absolute_sum_of_changes  \\\n",
 767 |        "0        500.000126                     134.513280   \n",
 768 |        "1        499.999290                     114.289925   \n",
 769 |        "2        500.001514                     164.089622   \n",
 770 |        "3        499.999445                     103.510040   \n",
 771 |        "4        500.003011                     154.299542   \n",
 772 |        "\n",
 773 |        "   None__agg_autocorrelation__f_agg_\"mean\"  \n",
 774 |        "0                                -0.012049  \n",
 775 |        "1                                 0.003075  \n",
 776 |        "2                                -0.013172  \n",
 777 |        "3                                -0.005639  \n",
 778 |        "4                                 0.001552  "
 779 |       ]
 780 |      },
 781 |      "execution_count": 22,
 782 |      "metadata": {},
 783 |      "output_type": "execute_result"
 784 |     }
 785 |    ],
 786 |    "source": [
 787 |     "transformed_series.head().iloc[:, :3]"
 788 |    ]
 789 |   },
 790 |   {
 791 |    "cell_type": "markdown",
 792 |    "metadata": {},
 793 |    "source": [
 794 |     "### Custom inline Transformer"
 795 |    ]
 796 |   },
 797 |   {
 798 |    "cell_type": "markdown",
 799 |    "metadata": {},
 800 |    "source": [
 801 |     "One can also create an inline ``CustomTransformer`` like this:"
 802 |    ]
 803 |   },
 804 |   {
 805 |    "cell_type": "code",
 806 |    "execution_count": 23,
 807 |    "metadata": {
 808 |     "collapsed": true
 809 |    },
 810 |    "outputs": [],
 811 |    "source": [
 812 |     "from xpandas.transformers import XSeriesTransformer"
 813 |    ]
 814 |   },
 815 |   {
 816 |    "cell_type": "code",
 817 |    "execution_count": 24,
 818 |    "metadata": {
 819 |     "collapsed": true
 820 |    },
 821 |    "outputs": [],
 822 |    "source": [
 823 |     "my_awesome_transfomer = XSeriesTransformer(transform_function=lambda x: x.std())"
 824 |    ]
 825 |   },
 826 |   {
 827 |    "cell_type": "code",
 828 |    "execution_count": 25,
 829 |    "metadata": {},
 830 |    "outputs": [
 831 |     {
 832 |      "data": {
 833 |       "text/plain": [
 834 |        "XSeriesTransformer(data_types=None, name='XSeriesTransformer',\n",
 835 |        "          transform_function=<function <lambda> at 0x11929ad90>)"
 836 |       ]
 837 |      },
 838 |      "execution_count": 25,
 839 |      "metadata": {},
 840 |      "output_type": "execute_result"
 841 |     }
 842 |    ],
 843 |    "source": [
 844 |     "my_awesome_transfomer.fit(X)"
 845 |    ]
 846 |   },
 847 |   {
 848 |    "cell_type": "code",
 849 |    "execution_count": 26,
 850 |    "metadata": {},
 851 |    "outputs": [
 852 |     {
 853 |      "data": {
 854 |       "text/plain": [
 855 |        "0    0.999998\n",
 856 |        "1    0.999997\n",
 857 |        "2    1.000000\n",
 858 |        "3    0.999997\n",
 859 |        "4    1.000001\n",
 860 |        "dtype: float64\n",
 861 |        "data_type: <class 'numpy.float64'>"
 862 |       ]
 863 |      },
 864 |      "execution_count": 26,
 865 |      "metadata": {},
 866 |      "output_type": "execute_result"
 867 |     }
 868 |    ],
 869 |    "source": [
 870 |     "my_awesome_transfomer.transform(X).head()"
 871 |    ]
 872 |   },
 873 |   {
 874 |    "cell_type": "markdown",
 875 |    "metadata": {},
 876 |    "source": [
 877 |     "If you want to create a custom transformer with more complex logic, please take a look at the internal implementation of the transformers."
 878 |    ]
 879 |   },
 880 |   {
 881 |    "cell_type": "markdown",
 882 |    "metadata": {},
 883 |    "source": [
 884 |     "## XDataFrame transformer"
 885 |    ]
 886 |   },
 887 |   {
 888 |    "cell_type": "markdown",
 889 |    "metadata": {},
 890 |    "source": [
 891 |     "To transform an **XDataFrame**, one has to specify the transformation logic for the columns that should be transformed using an **XDataFrameTransformer**.\n",
 892 |     "\n",
 893 |     "The constructor of **XDataFrameTransformer** takes a mapping dictionary of the form {col_name: XSeries transformer}.\n",
 894 |     "\n",
 895 |     "For example, let's apply **TimeSeriesWindowTransformer** to the $X$ column and **TimeSeriesTransformer** to the $X_1$ column.\n",
 896 |     "\n",
 897 |     "When a transformation is applied to a column, *the column is replaced with its transformed version*."
 898 |    ]
 899 |   },
 900 |   {
 901 |    "cell_type": "code",
 902 |    "execution_count": 27,
 903 |    "metadata": {
 904 |     "collapsed": true
 905 |    },
 906 |    "outputs": [],
 907 |    "source": [
 908 |     "from xpandas.transformers import XDataFrameTransformer"
 909 |    ]
 910 |   },
 911 |   {
 912 |    "cell_type": "code",
 913 |    "execution_count": 28,
 914 |    "metadata": {
 915 |     "collapsed": true
 916 |    },
 917 |    "outputs": [],
 918 |    "source": [
 919 |     "df_transformer = XDataFrameTransformer({\n",
 920 |     "    'X': TimeSeriesWindowTransformer(windows_size=4),\n",
 921 |     "    'X_1': TimeSeriesTransformer()\n",
 922 |     "})"
 923 |    ]
 924 |   },
 925 |   {
 926 |    "cell_type": "code",
 927 |    "execution_count": 29,
 928 |    "metadata": {},
 929 |    "outputs": [
 930 |     {
 931 |      "data": {
 932 |       "text/plain": [
 933 |        "XDataFrameTransformer(transformations={'X': [TimeSeriesWindowTransformer(windows_size=4)], 'X_1': [TimeSeriesTransformer(features=None)]})"
 934 |       ]
 935 |      },
 936 |      "execution_count": 29,
 937 |      "metadata": {},
 938 |      "output_type": "execute_result"
 939 |     }
 940 |    ],
 941 |    "source": [
 942 |     "df_transformer.fit(df)"
 943 |    ]
 944 |   },
 945 |   {
 946 |    "cell_type": "code",
 947 |    "execution_count": 30,
 948 |    "metadata": {
 949 |     "collapsed": true
 950 |    },
 951 |    "outputs": [],
 952 |    "source": [
 953 |     "transformed_df = df_transformer.transform(df)"
 954 |    ]
 955 |   },
 956 |   {
 957 |    "cell_type": "code",
 958 |    "execution_count": 31,
 959 |    "metadata": {},
 960 |    "outputs": [
 961 |     {
 962 |      "data": {
 963 |       "text/html": [
 964 |        "<div>\n",
 965 |        "<style>\n",
 966 |        "    .dataframe thead tr:only-child th {\n",
 967 |        "        text-align: right;\n",
 968 |        "    }\n",
 969 |        "\n",
 970 |        "    .dataframe thead th {\n",
 971 |        "        text-align: left;\n",
 972 |        "    }\n",
 973 |        "\n",
 974 |        "    .dataframe tbody tr th {\n",
 975 |        "        vertical-align: top;\n",
 976 |        "    }\n",
 977 |        "</style>\n",
 978 |        "<table border=\"1\" class=\"dataframe\">\n",
 979 |        "  <thead>\n",
 980 |        "    <tr style=\"text-align: right;\">\n",
 981 |        "      <th></th>\n",
 982 |        "      <th>X_TimeSeriesWindowTransformer</th>\n",
 983 |        "      <th>Y</th>\n",
 984 |        "      <th>X_1_TimeSeriesTransformer_max</th>\n",
 985 |        "    </tr>\n",
 986 |        "  </thead>\n",
 987 |        "  <tbody>\n",
 988 |        "    <tr>\n",
 989 |        "      <th>0</th>\n",
 990 |        "      <td>3     -0.016460\n",
 991 |        "4     -0.789610\n",
 992 |        "5     -1.48761...</td>\n",
 993 |        "      <td>1</td>\n",
 994 |        "      <td>2.383478</td>\n",
 995 |        "    </tr>\n",
 996 |        "    <tr>\n",
 997 |        "      <th>1</th>\n",
 998 |        "      <td>3      0.416408\n",
 999 |        "4      0.613542\n",
1000 |        "5      0.77304...</td>\n",
1001 |        "      <td>1</td>\n",
1002 |        "      <td>2.451725</td>\n",
1003 |        "    </tr>\n",
1004 |        "    <tr>\n",
1005 |        "      <th>2</th>\n",
1006 |        "      <td>3     -1.315175\n",
1007 |        "4     -1.083680\n",
1008 |        "5     -0.54600...</td>\n",
1009 |        "      <td>0</td>\n",
1010 |        "      <td>2.164009</td>\n",
1011 |        "    </tr>\n",
1012 |        "    <tr>\n",
1013 |        "      <th>3</th>\n",
1014 |        "      <td>3      0.268788\n",
1015 |        "4      0.203539\n",
1016 |        "5      0.13194...</td>\n",
1017 |        "      <td>1</td>\n",
1018 |        "      <td>2.951486</td>\n",
1019 |        "    </tr>\n",
1020 |        "    <tr>\n",
1021 |        "      <th>4</th>\n",
1022 |        "      <td>3      0.255629\n",
1023 |        "4      0.176381\n",
1024 |        "5      0.10033...</td>\n",
1025 |        "      <td>1</td>\n",
1026 |        "      <td>2.453836</td>\n",
1027 |        "    </tr>\n",
1028 |        "  </tbody>\n",
1029 |        "</table>\n",
1030 |        "</div>"
1031 |       ],
1032 |       "text/plain": [
1033 |        "                       X_TimeSeriesWindowTransformer  Y  \\\n",
1034 |        "0  3     -0.016460\n",
1035 |        "4     -0.789610\n",
1036 |        "5     -1.48761...  1   \n",
1037 |        "1  3      0.416408\n",
1038 |        "4      0.613542\n",
1039 |        "5      0.77304...  1   \n",
1040 |        "2  3     -1.315175\n",
1041 |        "4     -1.083680\n",
1042 |        "5     -0.54600...  0   \n",
1043 |        "3  3      0.268788\n",
1044 |        "4      0.203539\n",
1045 |        "5      0.13194...  1   \n",
1046 |        "4  3      0.255629\n",
1047 |        "4      0.176381\n",
1048 |        "5      0.10033...  1   \n",
1049 |        "\n",
1050 |        "   X_1_TimeSeriesTransformer_max  \n",
1051 |        "0                       2.383478  \n",
1052 |        "1                       2.451725  \n",
1053 |        "2                       2.164009  \n",
1054 |        "3                       2.951486  \n",
1055 |        "4                       2.453836  "
1056 |       ]
1057 |      },
1058 |      "execution_count": 31,
1059 |      "metadata": {},
1060 |      "output_type": "execute_result"
1061 |     }
1062 |    ],
1063 |    "source": [
1064 |     "transformed_df.head().iloc[:, :3]"
1065 |    ]
1066 |   },
1067 |   {
1068 |    "cell_type": "markdown",
1069 |    "metadata": {},
1070 |    "source": [
1071 |     "## Pipeline transformer"
1072 |    ]
1073 |   },
1074 |   {
1075 |    "cell_type": "markdown",
1076 |    "metadata": {},
1077 |    "source": [
1078 |     "Well, that's a nice transformer, but can I create [pipelines](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) as in scikit-learn?\n",
1079 |     "\n",
1080 |     "Sure! Let's look at an example where we combine ``TimeSeriesTransformer`` and ``TimeSeriesWindowTransformer`` into a single pipeline using a ``PipeLineChain``.\n",
1081 |     "\n",
1082 |     "First, let's see an example of ``PipeLineChain`` with ``XSeries`` and then with ``XDataFrame``."
1083 |    ]
1084 |   },
1085 |   {
1086 |    "cell_type": "code",
1087 |    "execution_count": 32,
1088 |    "metadata": {
1089 |     "collapsed": true
1090 |    },
1091 |    "outputs": [],
1092 |    "source": [
1093 |     "from xpandas.transformers import PipeLineChain"
1094 |    ]
1095 |   },
1096 |   {
1097 |    "cell_type": "code",
1098 |    "execution_count": 33,
1099 |    "metadata": {},
1100 |    "outputs": [
1101 |     {
1102 |      "data": {
1103 |       "text/plain": [
1104 |        "PipeLineChain(steps=[('moving average trans', TimeSeriesWindowTransformer(windows_size=5)), ('extract features', TimeSeriesTransformer(features=None))])"
1105 |       ]
1106 |      },
1107 |      "execution_count": 33,
1108 |      "metadata": {},
1109 |      "output_type": "execute_result"
1110 |     }
1111 |    ],
1112 |    "source": [
1113 |     "chain = PipeLineChain([\n",
1114 |     "    ('moving average trans', TimeSeriesWindowTransformer(windows_size=5)),\n",
1115 |     "    ('extract features', TimeSeriesTransformer())\n",
1116 |     "])\n",
1117 |     "chain.fit(X)"
1118 |    ]
1119 |   },
1120 |   {
1121 |    "cell_type": "code",
1122 |    "execution_count": 34,
1123 |    "metadata": {},
1124 |    "outputs": [
1125 |     {
1126 |      "data": {
1127 |       "text/plain": [
1128 |        "<bound method Pipeline.get_params of PipeLineChain(steps=[('moving average trans', TimeSeriesWindowTransformer(windows_size=5)), ('extract features', TimeSeriesTransformer(features=None))])>"
1129 |       ]
1130 |      },
1131 |      "execution_count": 34,
1132 |      "metadata": {},
1133 |      "output_type": "execute_result"
1134 |     }
1135 |    ],
1136 |    "source": [
1137 |     "chain.get_params"
1138 |    ]
1139 |   },
1140 |   {
1141 |    "cell_type": "code",
1142 |    "execution_count": 35,
1143 |    "metadata": {
1144 |     "collapsed": true
1145 |    },
1146 |    "outputs": [],
1147 |    "source": [
1148 |     "transformed_X = chain.transform(X)"
1149 |    ]
1150 |   },
1151 |   {
1152 |    "cell_type": "code",
1153 |    "execution_count": 36,
1154 |    "metadata": {},
1155 |    "outputs": [
1156 |     {
1157 |      "data": {
1158 |       "text/html": [
1159 |        "<div>\n",
1160 |        "<style>\n",
1161 |        "    .dataframe thead tr:only-child th {\n",
1162 |        "        text-align: right;\n",
1163 |        "    }\n",
1164 |        "\n",
1165 |        "    .dataframe thead th {\n",
1166 |        "        text-align: left;\n",
1167 |        "    }\n",
1168 |        "\n",
1169 |        "    .dataframe tbody tr th {\n",
1170 |        "        vertical-align: top;\n",
1171 |        "    }\n",
1172 |        "</style>\n",
1173 |        "<table border=\"1\" class=\"dataframe\">\n",
1174 |        "  <thead>\n",
1175 |        "    <tr style=\"text-align: right;\">\n",
1176 |        "      <th></th>\n",
1177 |        "      <th>None_TimeSeriesWindowTransformer_TimeSeriesTransformer_max</th>\n",
1178 |        "      <th>None_TimeSeriesWindowTransformer_TimeSeriesTransformer_mean</th>\n",
1179 |        "    </tr>\n",
1180 |        "  </thead>\n",
1181 |        "  <tbody>\n",
1182 |        "    <tr>\n",
1183 |        "      <th>0</th>\n",
1184 |        "      <td>2.16144</td>\n",
1185 |        "      <td>0.002078</td>\n",
1186 |        "    </tr>\n",
1187 |        "    <tr>\n",
1188 |        "      <th>1</th>\n",
1189 |        "      <td>2.39636</td>\n",
1190 |        "      <td>-0.002229</td>\n",
1191 |        "    </tr>\n",
1192 |        "    <tr>\n",
1193 |        "      <th>2</th>\n",
1194 |        "      <td>2.32512</td>\n",
1195 |        "      <td>0.005656</td>\n",
1196 |        "    </tr>\n",
1197 |        "    <tr>\n",
1198 |        "      <th>3</th>\n",
1199 |        "      <td>2.44430</td>\n",
1200 |        "      <td>0.000632</td>\n",
1201 |        "    </tr>\n",
1202 |        "    <tr>\n",
1203 |        "      <th>4</th>\n",
1204 |        "      <td>2.64094</td>\n",
1205 |        "      <td>-0.001295</td>\n",
1206 |        "    </tr>\n",
1207 |        "  </tbody>\n",
1208 |        "</table>\n",
1209 |        "</div>"
1210 |       ],
1211 |       "text/plain": [
1212 |        "   None_TimeSeriesWindowTransformer_TimeSeriesTransformer_max  \\\n",
1213 |        "0                                            2.16144            \n",
1214 |        "1                                            2.39636            \n",
1215 |        "2                                            2.32512            \n",
1216 |        "3                                            2.44430            \n",
1217 |        "4                                            2.64094            \n",
1218 |        "\n",
1219 |        "   None_TimeSeriesWindowTransformer_TimeSeriesTransformer_mean  \n",
1220 |        "0                                           0.002078            \n",
1221 |        "1                                          -0.002229            \n",
1222 |        "2                                           0.005656            \n",
1223 |        "3                                           0.000632            \n",
1224 |        "4                                          -0.001295            "
1225 |       ]
1226 |      },
1227 |      "execution_count": 36,
1228 |      "metadata": {},
1229 |      "output_type": "execute_result"
1230 |     }
1231 |    ],
1232 |    "source": [
1233 |     "transformed_X.head().iloc[:, :2]"
1234 |    ]
1235 |   },
1236 |   {
1237 |    "cell_type": "markdown",
1238 |    "metadata": {
1239 |     "collapsed": true
1240 |    },
1241 |    "source": [
1242 |     "All right! Let's try to add a scikit-learn transformer to the PipeLineChain. For example, let's do PCA on transformed_X."
1243 |    ]
1244 |   },
1245 |   {
1246 |    "cell_type": "code",
1247 |    "execution_count": 37,
1248 |    "metadata": {
1249 |     "collapsed": true
1250 |    },
1251 |    "outputs": [],
1252 |    "source": [
1253 |     "from sklearn.decomposition import PCA"
1254 |    ]
1255 |   },
1256 |   {
1257 |    "cell_type": "code",
1258 |    "execution_count": 38,
1259 |    "metadata": {},
1260 |    "outputs": [
1261 |     {
1262 |      "data": {
1263 |       "text/plain": [
1264 |        "PipeLineChain(steps=[('moving average trans', TimeSeriesWindowTransformer(windows_size=5)), ('extract features', TimeSeriesTransformer(features=None)), ('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,\n",
1265 |        "  svd_solver='auto', tol=0.0, whiten=False))])"
1266 |       ]
1267 |      },
1268 |      "execution_count": 38,
1269 |      "metadata": {},
1270 |      "output_type": "execute_result"
1271 |     }
1272 |    ],
1273 |    "source": [
1274 |     "chain = PipeLineChain([\n",
1275 |     "    ('moving average trans', TimeSeriesWindowTransformer(windows_size=5)),\n",
1276 |     "    ('extract features', TimeSeriesTransformer()),\n",
1277 |     "    ('pca', PCA(n_components=5))\n",
1278 |     "])\n",
1279 |     "chain.fit(X)"
1280 |    ]
1281 |   },
1282 |   {
1283 |    "cell_type": "code",
1284 |    "execution_count": 39,
1285 |    "metadata": {},
1286 |    "outputs": [],
1287 |    "source": [
1288 |     "transformed_X = chain.transform(X)"
1289 |    ]
1290 |   },
1291 |   {
1292 |    "cell_type": "code",
1293 |    "execution_count": 40,
1294 |    "metadata": {},
1295 |    "outputs": [
1296 |     {
1297 |      "data": {
1298 |       "text/html": [
1299 |        "<div>\n",
1300 |        "<style>\n",
1301 |        "    .dataframe thead tr:only-child th {\n",
1302 |        "        text-align: right;\n",
1303 |        "    }\n",
1304 |        "\n",
1305 |        "    .dataframe thead th {\n",
1306 |        "        text-align: left;\n",
1307 |        "    }\n",
1308 |        "\n",
1309 |        "    .dataframe tbody tr th {\n",
1310 |        "        vertical-align: top;\n",
1311 |        "    }\n",
1312 |        "</style>\n",
1313 |        "<table border=\"1\" class=\"dataframe\">\n",
1314 |        "  <thead>\n",
1315 |        "    <tr style=\"text-align: right;\">\n",
1316 |        "      <th></th>\n",
1317 |        "      <th>0</th>\n",
1318 |        "      <th>1</th>\n",
1319 |        "      <th>2</th>\n",
1320 |        "      <th>3</th>\n",
1321 |        "      <th>4</th>\n",
1322 |        "    </tr>\n",
1323 |        "  </thead>\n",
1324 |        "  <tbody>\n",
1325 |        "    <tr>\n",
1326 |        "      <th>0</th>\n",
1327 |        "      <td>-0.133152</td>\n",
1328 |        "      <td>-0.242552</td>\n",
1329 |        "      <td>0.097523</td>\n",
1330 |        "      <td>-0.004435</td>\n",
1331 |        "      <td>-0.009747</td>\n",
1332 |        "    </tr>\n",
1333 |        "    <tr>\n",
1334 |        "      <th>1</th>\n",
1335 |        "      <td>-0.125413</td>\n",
1336 |        "      <td>0.076021</td>\n",
1337 |        "      <td>-0.089267</td>\n",
1338 |        "      <td>0.010531</td>\n",
1339 |        "      <td>0.017437</td>\n",
1340 |        "    </tr>\n",
1341 |        "    <tr>\n",
1342 |        "      <th>2</th>\n",
1343 |        "      <td>-0.028607</td>\n",
1344 |        "      <td>-0.088828</td>\n",
1345 |        "      <td>0.205043</td>\n",
1346 |        "      <td>0.098009</td>\n",
1347 |        "      <td>0.032338</td>\n",
1348 |        "    </tr>\n",
1349 |        "    <tr>\n",
1350 |        "      <th>3</th>\n",
1351 |        "      <td>0.071478</td>\n",
1352 |        "      <td>-0.058813</td>\n",
1353 |        "      <td>-0.247669</td>\n",
1354 |        "      <td>-0.023550</td>\n",
1355 |        "      <td>-0.052968</td>\n",
1356 |        "    </tr>\n",
1357 |        "    <tr>\n",
1358 |        "      <th>4</th>\n",
1359 |        "      <td>0.200611</td>\n",
1360 |        "      <td>0.110884</td>\n",
1361 |        "      <td>0.064200</td>\n",
1362 |        "      <td>0.012187</td>\n",
1363 |        "      <td>-0.038497</td>\n",
1364 |        "    </tr>\n",
1365 |        "  </tbody>\n",
1366 |        "</table>\n",
1367 |        "</div>"
1368 |       ],
1369 |       "text/plain": [
1370 |        "          0         1         2         3         4\n",
1371 |        "0 -0.133152 -0.242552  0.097523 -0.004435 -0.009747\n",
1372 |        "1 -0.125413  0.076021 -0.089267  0.010531  0.017437\n",
1373 |        "2 -0.028607 -0.088828  0.205043  0.098009  0.032338\n",
1374 |        "3  0.071478 -0.058813 -0.247669 -0.023550 -0.052968\n",
1375 |        "4  0.200611  0.110884  0.064200  0.012187 -0.038497"
1376 |       ]
1377 |      },
1378 |      "execution_count": 40,
1379 |      "metadata": {},
1380 |      "output_type": "execute_result"
1381 |     }
1382 |    ],
1383 |    "source": [
1384 |     "transformed_X.head()"
1385 |    ]
1386 |   },
1387 |   {
1388 |    "cell_type": "markdown",
1389 |    "metadata": {},
1390 |    "source": [
1391 |     "Let's do something even more interesting: add a scikit-learn estimator at the end of the PipeLineChain!"
1392 |    ]
1393 |   },
1394 |   {
1395 |    "cell_type": "code",
1396 |    "execution_count": 41,
1397 |    "metadata": {
1398 |     "collapsed": true
1399 |    },
1400 |    "outputs": [],
1401 |    "source": [
1402 |     "from sklearn.linear_model import LogisticRegression\n",
1403 |     "from sklearn.model_selection import train_test_split\n",
1404 |     "from sklearn.metrics import accuracy_score"
1405 |    ]
1406 |   },
1407 |   {
1408 |    "cell_type": "code",
1409 |    "execution_count": 42,
1410 |    "metadata": {
1411 |     "collapsed": true
1412 |    },
1413 |    "outputs": [],
1414 |    "source": [
1415 |     "X_train, X_test, y_train, y_test = train_test_split(X, Y)"
1416 |    ]
1417 |   },
1418 |   {
1419 |    "cell_type": "markdown",
1420 |    "metadata": {},
1421 |    "source": [
1422 |     "Make sure that the types of X_train and X_test are XSeries."
1423 |    ]
1424 |   },
1425 |   {
1426 |    "cell_type": "code",
1427 |    "execution_count": 43,
1428 |    "metadata": {},
1429 |    "outputs": [
1430 |     {
1431 |      "name": "stdout",
1432 |      "output_type": "stream",
1433 |      "text": [
1434 |       "<class 'xpandas.data_container.data_container.XSeries'>\n",
1435 |       "<class 'xpandas.data_container.data_container.XSeries'>\n"
1436 |      ]
1437 |     }
1438 |    ],
1439 |    "source": [
1440 |     "print(type(X_train))\n",
1441 |     "print(type(X_test))"
1442 |    ]
1443 |   },
1444 |   {
1445 |    "cell_type": "code",
1446 |    "execution_count": 44,
1447 |    "metadata": {
1448 |     "collapsed": true
1449 |    },
1450 |    "outputs": [],
1451 |    "source": [
1452 |     "chain = PipeLineChain([\n",
1453 |     "    ('moving average trans', TimeSeriesWindowTransformer(windows_size=5)),\n",
1454 |     "    ('extract features', TimeSeriesTransformer()),\n",
1455 |     "    ('pca', PCA(n_components=5)),\n",
1456 |     "    ('logit_regression', LogisticRegression())\n",
1457 |     "    \n",
1458 |     "])\n",
1459 |     "chain = chain.fit(X_train, y_train)"
1460 |    ]
1461 |   },
1462 |   {
1463 |    "cell_type": "code",
1464 |    "execution_count": 45,
1465 |    "metadata": {
1466 |     "collapsed": true
1467 |    },
1468 |    "outputs": [],
1469 |    "source": [
1470 |     "prediction = chain.predict(X_test)"
1471 |    ]
1472 |   },
1473 |   {
1474 |    "cell_type": "code",
1475 |    "execution_count": 46,
1476 |    "metadata": {},
1477 |    "outputs": [
1478 |     {
1479 |      "data": {
1480 |       "text/plain": [
1481 |        "0.5004061738424046"
1482 |       ]
1483 |      },
1484 |      "execution_count": 46,
1485 |      "metadata": {},
1486 |      "output_type": "execute_result"
1487 |     }
1488 |    ],
1489 |    "source": [
1490 |     "accuracy_score(y_test, prediction)"
1491 |    ]
1492 |   },
1493 |   {
1494 |    "cell_type": "markdown",
1495 |    "metadata": {},
1496 |    "source": [
1497 |     "Let's now try ``PipeLineChain`` with ``XDataFrameTransformer``.\n",
1498 |     "\n",
1499 |     "Imagine a data set with the feature columns gender (0 or 1), age (int) and series (pandas.Series), and a target (0 or 1). Let's create a ``PipeLineChain`` that extracts features from series, performs ``PCA`` over the whole feature set and then performs logistic regression classification."
1500 |    ]
1501 |   },
1502 |   {
1503 |    "cell_type": "code",
1504 |    "execution_count": 47,
1505 |    "metadata": {
1506 |     "collapsed": true
1507 |    },
1508 |    "outputs": [],
1509 |    "source": [
1510 |     "n = 100\n",
1511 |     "\n",
1512 |     "df_features = XDataFrame({\n",
1513 |     "    'gender': XSeries(np.random.binomial(1, 0.7, n)),\n",
1514 |     "    'age': XSeries(np.random.poisson(25, n)),\n",
1515 |     "    'series': XSeries([\n",
1516 |     "        pd.Series(np.random.normal(size=500))\n",
1517 |     "    ] * n)\n",
1518 |     "})\n",
1519 |     "\n",
1520 |     "target = XSeries(np.random.binomial(1, 0.45, n))"
1521 |    ]
1522 |   },
1523 |   {
1524 |    "cell_type": "code",
1525 |    "execution_count": 48,
1526 |    "metadata": {
1527 |     "collapsed": true
1528 |    },
1529 |    "outputs": [],
1530 |    "source": [
1531 |     "features_transformer = XDataFrameTransformer({\n",
1532 |     "    'series': TimeSeriesTransformer()\n",
1533 |     "})"
1534 |    ]
1535 |   },
1536 |   {
1537 |    "cell_type": "code",
1538 |    "execution_count": 49,
1539 |    "metadata": {
1540 |     "collapsed": true
1541 |    },
1542 |    "outputs": [],
1543 |    "source": [
1544 |     "pipe_line = PipeLineChain([\n",
1545 |     "    ('extract_from_series', features_transformer),\n",
1546 |     "    ('pca', PCA(n_components=5)),\n",
1547 |     "    ('logit_regression', LogisticRegression())\n",
1548 |     "])"
1549 |    ]
1550 |   },
1551 |   {
1552 |    "cell_type": "code",
1553 |    "execution_count": 50,
1554 |    "metadata": {
1555 |     "collapsed": true
1556 |    },
1557 |    "outputs": [],
1558 |    "source": [
1559 |     "df_features_train, df_features_test, \\\n",
1560 |     "        y_train, y_test = train_test_split(df_features, target)"
1561 |    ]
1562 |   },
1563 |   {
1564 |    "cell_type": "code",
1565 |    "execution_count": 51,
1566 |    "metadata": {},
1567 |    "outputs": [
1568 |     {
1569 |      "data": {
1570 |       "text/plain": [
1571 |        "PipeLineChain(steps=[('extract_from_series', XDataFrameTransformer(transformations={'series': [TimeSeriesTransformer(features=None)]})), ('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,\n",
1572 |        "  svd_solver='auto', tol=0.0, whiten=False)), ('logit_regression', LogisticRegression(C=1.0, cla...ty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
1573 |        "          verbose=0, warm_start=False))])"
1574 |       ]
1575 |      },
1576 |      "execution_count": 51,
1577 |      "metadata": {},
1578 |      "output_type": "execute_result"
1579 |     }
1580 |    ],
1581 |    "source": [
1582 |     "pipe_line.fit(df_features_train, y_train)"
1583 |    ]
1584 |   },
1585 |   {
1586 |    "cell_type": "code",
1587 |    "execution_count": 52,
1588 |    "metadata": {},
1589 |    "outputs": [
1590 |     {
1591 |      "data": {
1592 |       "text/plain": [
1593 |        "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
1594 |        "       0, 0])"
1595 |       ]
1596 |      },
1597 |      "execution_count": 52,
1598 |      "metadata": {},
1599 |      "output_type": "execute_result"
1600 |     }
1601 |    ],
1602 |    "source": [
1603 |     "pipe_line.predict(df_features_test)"
1604 |    ]
1605 |   },
1606 |   {
1607 |    "cell_type": "code",
1608 |    "execution_count": null,
1609 |    "metadata": {
1610 |     "collapsed": true
1611 |    },
1612 |    "outputs": [],
1613 |    "source": []
1614 |   }
1615 |  ],
1616 |  "metadata": {
1617 |   "kernelspec": {
1618 |    "display_name": "Python 3",
1619 |    "language": "python",
1620 |    "name": "python3"
1621 |   },
1622 |   "language_info": {
1623 |    "codemirror_mode": {
1624 |     "name": "ipython",
1625 |     "version": 3
1626 |    },
1627 |    "file_extension": ".py",
1628 |    "mimetype": "text/x-python",
1629 |    "name": "python",
1630 |    "nbconvert_exporter": "python",
1631 |    "pygments_lexer": "ipython3",
1632 |    "version": "3.6.1"
1633 |   }
1634 |  },
1635 |  "nbformat": 4,
1636 |  "nbformat_minor": 2
1637 | }
1638 | 


--------------------------------------------------------------------------------