├── __init__.py ├── tests ├── __init__.py ├── test_image_transformers.py ├── test_dataframe_transformer.py ├── test_ts_fresh.py ├── test_bag_of_features.py ├── test_data_type.py └── test_transformer.py ├── xpandas ├── __init__.py ├── transformers │ ├── text_transformer │ │ ├── __init__.py │ │ └── text_transformer.py │ ├── image_transformer │ │ ├── __init__.py │ │ └── image_transformer.py │ ├── identity_transformer │ │ ├── __init__.py │ │ └── identity_transformer.py │ ├── pipeline_transformer │ │ ├── __init__.py │ │ └── pipeline_transformer.py │ ├── bag_of_features_transformer │ │ ├── __init__.py │ │ └── bag_of_features_transformer.py │ ├── series_transformers │ │ ├── __init__.py │ │ └── series_transformer.py │ ├── __init__.py │ └── transformer.py └── data_container │ ├── __init__.py │ └── data_container.py ├── docs ├── _static │ ├── .gitignore │ ├── Logo.png │ └── GitHub-Mark-32px.png ├── authors.rst ├── changes.rst ├── license.rst ├── docsapi.rst ├── contributing.rst ├── example.rst ├── index.rst ├── installation.rst ├── conf.py ├── introduction.rst └── Makefile ├── requirements-docs.txt ├── examples ├── imgs │ ├── Logo.png │ ├── XSeries.png │ ├── logo.sketch │ ├── Transformer.png │ └── XDataFrame.png ├── container_example.py ├── transformer_example.py └── ExampleUsage.ipynb ├── CHANGES.rst ├── requirements.txt ├── Pipfile ├── setup.py ├── AUTHORS.rst ├── .travis.yml ├── README.md ├── LICENSE.txt ├── .gitignore ├── CONTRIBUTING.md ├── CODE_OF_CONDUCT.md └── github_deploy_key.enc /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /xpandas/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- 1 | # Empty directory 2 | -------------------------------------------------------------------------------- /xpandas/transformers/text_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/authors.rst: -------------------------------------------------------------------------------- 1 | .. _authors: 2 | .. include:: ../AUTHORS.rst 3 | -------------------------------------------------------------------------------- /docs/changes.rst: -------------------------------------------------------------------------------- 1 | .. _changes: 2 | .. include:: ../CHANGES.rst 3 | -------------------------------------------------------------------------------- /xpandas/transformers/text_transformer/text_transformer.py: -------------------------------------------------------------------------------- 1 | # TODO 2 | -------------------------------------------------------------------------------- /requirements-docs.txt: -------------------------------------------------------------------------------- 1 | ipython==7.0.1 2 | Sphinx==1.4.3 3 | nbsphinx==0.3.5 -------------------------------------------------------------------------------- /xpandas/data_container/__init__.py: -------------------------------------------------------------------------------- 1 | from .data_container import XDataFrame, XSeries 2 | -------------------------------------------------------------------------------- /xpandas/transformers/image_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .image_transformer import ImageTransformer 2 | -------------------------------------------------------------------------------- 
/docs/_static/Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/docs/_static/Logo.png -------------------------------------------------------------------------------- /xpandas/transformers/identity_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .identity_transformer import IdentityTransformer -------------------------------------------------------------------------------- /xpandas/transformers/pipeline_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline_transformer import PipeLineChain 2 | -------------------------------------------------------------------------------- /examples/imgs/Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/Logo.png -------------------------------------------------------------------------------- /docs/license.rst: -------------------------------------------------------------------------------- 1 | .. _license: 2 | 3 | ======= 4 | License 5 | ======= 6 | 7 | .. 
literalinclude:: ../LICENSE.txt 8 | -------------------------------------------------------------------------------- /examples/imgs/XSeries.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/XSeries.png -------------------------------------------------------------------------------- /examples/imgs/logo.sketch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/logo.sketch -------------------------------------------------------------------------------- /examples/imgs/Transformer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/Transformer.png -------------------------------------------------------------------------------- /examples/imgs/XDataFrame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/examples/imgs/XDataFrame.png -------------------------------------------------------------------------------- /xpandas/transformers/bag_of_features_transformer/__init__.py: -------------------------------------------------------------------------------- 1 | from .bag_of_features_transformer import BagOfWordsTransformer 2 | -------------------------------------------------------------------------------- /docs/_static/GitHub-Mark-32px.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/alan-turing-institute/xpandas/HEAD/docs/_static/GitHub-Mark-32px.png -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Changelog 3 | 
========= 4 | 5 | Version 1.0b 6 | ============ 7 | 8 | - First public release (beta) of XPandas -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.15.2 2 | scikit-image==0.14.1 3 | scikit-learn==0.20.0 4 | scipy==1.1.0 5 | pytest==3.8.2 6 | pandas==0.23.4 7 | tsfresh==0.11.1 8 | -------------------------------------------------------------------------------- /docs/docsapi.rst: -------------------------------------------------------------------------------- 1 | API Documentation 2 | ================= 3 | 4 | Information on specific functions, classes, and methods. 5 | 6 | .. toctree:: 7 | :glob: 8 | 9 | api/* 10 | -------------------------------------------------------------------------------- /xpandas/transformers/series_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .series_transformer import TimeSeriesWindowTransformer, TsFreshSeriesTransformer, \ 2 | TimeSeriesTransformer, MeanSeriesTransformer 3 | -------------------------------------------------------------------------------- /docs/contributing.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ++++++++++++ 3 | 4 | We welcome any contribution to the XPandas project. 5 | 6 | Please read our `contribution guide `_ on Github. -------------------------------------------------------------------------------- /docs/example.rst: -------------------------------------------------------------------------------- 1 | Example 2 | ======= 3 | 4 | `Read as jupyter notebook on Github `_ 5 | 6 | .. 
toctree:: 7 | :maxdepth: 4 8 | 9 | ExampleUsage 10 | -------------------------------------------------------------------------------- /xpandas/transformers/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline_transformer import PipeLineChain 2 | from .series_transformers import TimeSeriesWindowTransformer, MeanSeriesTransformer, TimeSeriesTransformer, TsFreshSeriesTransformer 3 | from .transformer import XSeriesTransformer, XDataFrameTransformer 4 | from .identity_transformer import IdentityTransformer -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | 3 | url = "https://pypi.python.org/simple" 4 | verify_ssl = true 5 | name = "pypi" 6 | 7 | 8 | [packages] 9 | 10 | numpy = "*" 11 | scipy = "*" 12 | pandas = "*" 13 | scikit-learn = "*" 14 | tsfresh = "*" 15 | pytest = "*" 16 | scikit-image = "*" 17 | Sphinx = "*" 18 | nbsphinx = "*" 19 | 20 | 21 | [dev-packages] 22 | -------------------------------------------------------------------------------- /xpandas/transformers/identity_transformer/identity_transformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..transformer import XSeriesTransformer 4 | 5 | 6 | class IdentityTransformer(XSeriesTransformer): 7 | ''' 8 | Performs identity transformer X -> X 9 | ''' 10 | def __init__(self): 11 | super(IdentityTransformer, self).__init__(transform_function=lambda x: x) 12 | -------------------------------------------------------------------------------- /examples/container_example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from xpandas.data_container import XSeries, XDataFrame 4 | 5 | n = 1000 6 | 7 | xseries = XSeries([ 8 | pd.Series(np.random.normal(size=500)) 9 
| ] * n) 10 | 11 | xdataframe = XDataFrame({ 12 | 'gender': XSeries(np.random.binomial(1, 0.7, n)), 13 | 'age': XSeries(np.random.poisson(25, n)), 14 | 'series': xseries 15 | }) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | 4 | setup(name='XPandas', 5 | version='1.0.2', 6 | description='1d/2d data container with map-reduce transformers', 7 | url='https://github.com/alan-turing-institute/xpandas', 8 | author='Vitaly Davydov (@iwitaly)', 9 | author_email='1061040@gmail.com', 10 | license='BSD', 11 | keywords='data container sklearn pandas map reduce transformer', 12 | packages=find_packages(), 13 | zip_safe=False) -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | ================================= 2 | Welcome to XPandas documentation! 3 | ================================= 4 | 5 | .. _xpandasgit: https://github.com/alan-turing-institute/xpandas 6 | 7 | 8 | .. |git label image| image:: _static/GitHub-Mark-32px.png 9 | :target: xpandasgit_ 10 | 11 | Welcome to the documentation of **XPandas** -- data containers and transformations for storing 1D and 2D data of any type. The source code is available on |git label image| 12 | 13 | 14 | tl;dr 15 | ##### 16 | 17 | Install :code:`pip install xpandas` and check out the :ref:`Example`. 18 | 19 | 20 | Table of Contents 21 | ################# 22 | 23 | .. 
toctree:: 24 | :maxdepth: 2 25 | 26 | introduction 27 | installation 28 | example 29 | docsapi 30 | contributing 31 | changes 32 | authors 33 | license -------------------------------------------------------------------------------- /xpandas/transformers/pipeline_transformer/pipeline_transformer.py: -------------------------------------------------------------------------------- 1 | from sklearn.pipeline import Pipeline 2 | 3 | from ...data_container import XDataFrame, XSeries 4 | 5 | 6 | class PipeLineChain(Pipeline): 7 | ''' 8 | PipeLine transformer. Can chain multiple transformers and estimator from scikit-learn. 9 | Based on scikit-learn Pipeline 10 | http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline 11 | ''' 12 | 13 | def transform(self, X, **kwargs): 14 | transformed_object = super(PipeLineChain, self).transform(X, **kwargs) 15 | if type(transformed_object) != XSeries and type(transformed_object) != XDataFrame: 16 | transformed_object = XDataFrame(transformed_object) 17 | return transformed_object 18 | -------------------------------------------------------------------------------- /AUTHORS.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | Developers 3 | ========== 4 | 5 | **XPandas** is developed open source under the direction of `Dr Franz Király`_. Check out the `public code repository`_ to learn more. 6 | 7 | **Active** 8 | 9 | * Vitaly Davydov `iwitaly `_: principal developer and curator 10 | * Franz Kiraly `fkiraly `_: project manager and designated point of contact 11 | * Frithjof Gressmann `frthjf `_: contributor 12 | 13 | **Former/inactive** 14 | 15 | None 16 | 17 | If you would like to contribute, read our `contribution guide `_. 18 | 19 | .. _public code repository: https://github.com/alan-turing-institute/xpandas 20 | .. 
_Dr Franz Király: https://www.ucl.ac.uk/statistics/people/franz-kiraly -------------------------------------------------------------------------------- /tests/test_image_transformers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import skimage.transform as skimage_transform 3 | 4 | from ..xpandas.data_container import XSeries 5 | from ..xpandas.transformers.image_transformer import ImageTransformer 6 | 7 | n = 20 8 | m = 20 9 | colours_n = 255 10 | 11 | 12 | def generate_image(is_3d=True): 13 | if is_3d: 14 | return (np.random.rand(30, 30, 3) * 255).astype('uint8') 15 | return (np.random.rand(30, 30) * 255).astype('uint8') 16 | 17 | 18 | def test_image_transformation(): 19 | s = XSeries([generate_image(False) for _ in range(100)]) 20 | 21 | try: 22 | image_transformer = ImageTransformer().fit() 23 | assert False 24 | except: 25 | assert True 26 | 27 | image_transformer = ImageTransformer(skimage_transform.hough_circle, radius=5).fit() 28 | s_transformed = image_transformer.transform(s) 29 | 30 | assert s_transformed.data_type == np.ndarray 31 | 32 | image_transformer = ImageTransformer(skimage_transform.resize, output_shape=(10, 10)).fit() 33 | s_transformed = image_transformer.transform(s) 34 | 35 | assert s_transformed.data_type == np.ndarray 36 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 3.6 4 | 5 | addons: 6 | apt: 7 | packages: 8 | - pandoc 9 | 10 | install: 11 | - pip install -r requirements.txt 12 | - pip install -r requirements-docs.txt 13 | 14 | script: 15 | - pytest 16 | - set -e 17 | - pip install -e . 18 | - cd docs 19 | - make buildapi 20 | - make html 21 | - pip install doctr 22 | - cd .. 23 | - doctr deploy . 
--built-docs docs/_build/html/ 24 | 25 | env: 26 | global: 27 | - secure: "VbInh8C4nFX7sDvyLPIs4YQnOOSORzPP85PsWqWQet5gx2AqWPmdkuYbSthXTKIl/V0HPWrwOMFyki9FpeC+DJrzyiftAyNoLmBO1Yha3y5NcwYovZ4u4kVsfORpQAm5pSCKKQxt17w6PgBRyZ4T2LnTF5k9Ig9AkxDllR01PItw4mLN2u8M8/hx4eKhyD4dTy596Dtd2AiQuoK7LhkLaMNUBx5Q26yLhw9muyLDBS55kHp4xXlB2ggZnr7S4FJp0WV+pzKPFVEs6l1kK1U+1OyB1CszjC/mLjYPzn/QoNtKXnSnRM1GxZp+/z7guldoEG81YSx2/F3xTHGJ28g3larhoL+YjbVq+FAce9g/0i7Ee6I2Gaa4Jwj2IcK02KguIPJp+Aj6wj6hgmvPaKYyaF7EPRiZMOKyKA3kpyXdOfJj2oUaPevHonPsxdkNXnB8UuUU4ulKPCjeo8L/+9O5gM9zDngFNFXbnJ2cVZtta0L7Uug00IwKS1FGunl7bGh9V3jP4OLrzKyykDYf/lRMB5YXPExlGi9+TWQ+MmXVbqVjR02YRkWAvPnD096z62eEidn8DFGXRFFndRJcBD3Z56S0RPkd7oIF0+sKujLMqXYzRV6NttuOuvnICd1nTsjQV3h9GcAL2cOfNGEE9MEaDnyvBVoEP3a0CDDcVChETIk=" 28 | -------------------------------------------------------------------------------- /xpandas/transformers/image_transformer/image_transformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..transformer import XSeriesTransformer 4 | 5 | 6 | class ImageTransformer(XSeriesTransformer): 7 | ''' 8 | Performs image transformation based on skimage transformation function 9 | http://scikit-image.org/docs/dev/api/skimage.transform.html 10 | ''' 11 | def __init__(self, skimage_function=None, **function_params): 12 | ''' 13 | :param skimage_function: transformation function from skimage 14 | ''' 15 | accepted_types = [ 16 | list, np.ndarray, np.array 17 | ] 18 | 19 | if skimage_function is None: 20 | raise Exception('Please specify transform function from scikit-image' 21 | ' http://scikit-image.org/docs/dev/api/skimage.transform.html') 22 | 23 | def image_transform_function(img): 24 | return skimage_function(img, **function_params) 25 | 26 | super(ImageTransformer, self).__init__(data_types=accepted_types, 27 | columns=None, 28 | transform_function=image_transform_function) 29 | 
-------------------------------------------------------------------------------- /docs/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ************ 3 | 4 | Stable 5 | ^^^^^^^^^^^^^ 6 | 7 | The installation of the latest stable version is easy using the python package manager `pip`_. :: 8 | 9 | pip install xpandas 10 | 11 | That's it. You are now ready to go. We recommend reading the :doc:`examples ` to get started. 12 | 13 | 14 | Bleeding edge 15 | ^^^^^^^^^^^^^ 16 | 17 | To test or develop new features you may want to install the latest package version 18 | from the master branch (bleeding edge installation). You can install directly from the Git repository :: 19 | 20 | pip install git+https://github.com/alan-turing-institute/xpandas.git 21 | 22 | 23 | Or clone the source from our `public code repository`_ on GitHub and change into the XPandas directory. 24 | Make sure that all dependencies are installed :: 25 | 26 | pip install -r requirements.txt 27 | 28 | Then run :: 29 | 30 | python setup.py develop 31 | 32 | to install the package into the activated Python environment. 33 | If you would like to contribute to the documentation, please refer to :ref:`Contributing`. 34 | 35 | Note that bleeding edge installations are likely to contain bugs and are not recommended for production environments. 36 | 37 | 38 | .. _pip: http://www.pip-installer.org/ 39 | .. 
_public code repository: https://github.com/alan-turing-institute/xpandas -------------------------------------------------------------------------------- /examples/transformer_example.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from xpandas.data_container import XSeries, XDataFrame 4 | 5 | from xpandas.transformers import XDataFrameTransformer 6 | from xpandas.transformers import PipeLineChain 7 | from xpandas.transformers import XSeriesTransformer 8 | from xpandas.transformers import TimeSeriesWindowTransformer 9 | 10 | from sklearn.linear_model import LogisticRegression 11 | from sklearn.decomposition import PCA 12 | 13 | 14 | n = 1000 15 | 16 | xseries = XSeries([ 17 | pd.Series(np.random.normal(size=500)) 18 | ] * n) 19 | my_awesome_transfomer = XSeriesTransformer(transform_function=lambda x: x.std()) 20 | my_awesome_transfomer.fit(X) 21 | print(my_awesome_transfomer.transform(X).head()) 22 | 23 | 24 | xdataframe = XDataFrame({ 25 | 'gender': XSeries(np.random.binomial(1, 0.7, n)), 26 | 'age': XSeries(np.random.poisson(25, n)), 27 | 'series': xseries 28 | }) 29 | df_transformer = XDataFrameTransformer({ 30 | 'series': TimeSeriesWindowTransformer(windows_size=4), 31 | 'age': my_awesome_transfomer 32 | }) 33 | df_transformer.fit(df) 34 | transformed_df = df_transformer.transform(df) 35 | 36 | 37 | chain = PipeLineChain([ 38 | ('moving average trans', TimeSeriesWindowTransformer(windows_size=5)), 39 | ('extract features', my_awesome_transfomer), 40 | ('pca', PCA(n_components=5)), 41 | ('logit_regression', LogisticRegression()) 42 | ]) 43 | chain.fit(X) 44 | print(chain.get_params) 45 | transformed_X = chain.transform(X) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Logo](/examples/imgs/Logo.png) 2 | 3 | [![Build 
Status](https://travis-ci.org/alan-turing-institute/xpandas.svg?branch=master)](https://travis-ci.org/alan-turing-institute/xpandas) 4 | [![PyPI version](https://badge.fury.io/py/XPandas.svg)](https://badge.fury.io/py/XPandas) 5 | 6 | **XPandas** (extended [`Pandas`](https://pandas.pydata.org/)) implements 1D and 2D data containers for storing type-heterogeneous tabular data of any type, 7 | and encapsulates feature extraction and transformation modelling in an sklearn-compatible transformer interface. 8 | 9 | ## Quickstart 10 | 11 | Install the latest version 12 | 13 | $ pip install xpandas 14 | 15 | and run the example Jupyter notebook 16 | 17 | $ jupyter notebook examples/ExampleUsage.ipynb 18 | 19 | ## Documentation 20 | 21 | The full documentation is available at [https://alan-turing-institute.github.io/xpandas/](https://alan-turing-institute.github.io/xpandas/). 22 | 23 | ## Acknowledgements 24 | 25 | - **Bernd Bischl (@berndbischl)**, who mentioned the idea of a general data container with transformers attached to columns in personal discussion with Franz Kiraly during a London visit in 2016. 26 | - **Franz Kiraly (@fkiraly)**, who initiated and funded the project up to release, and who substantially contributed to the API design. 27 | - **Haoran Xue (@HaoranXue)**, who, under the supervision of Franz Kiraly, earlier completed a thesis for a degree at UCL on the topic, and who wrote a similar package as part of it. No code was re-used in the creation of the XPandas package. 
28 | 29 | 30 | List of [developers and contributors](AUTHORS.rst) 31 | 32 | 33 | -------------------------------------------------------------------------------- /xpandas/transformers/bag_of_features_transformer/bag_of_features_transformer.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | 3 | import numpy as np 4 | import pandas as pd 5 | 6 | from ..transformer import XSeriesTransformer 7 | 8 | 9 | class BagOfWordsTransformer(XSeriesTransformer): 10 | ''' 11 | Performs bag-of-features transformer for strings of any categorical data. 12 | ''' 13 | def __init__(self, dictionary=None, **kwargs): 14 | ''' 15 | :param dictionary: custom dictionary to count against. if None, calculate dictionary from dataset 16 | ''' 17 | self.dictionary = dictionary 18 | 19 | accepted_types = [ 20 | pd.Series, list, np.array, tuple 21 | ] 22 | 23 | def bag_of_words_transform_function(corpus): 24 | counter = Counter(corpus) 25 | for el in self.dictionary: 26 | if counter.get(el) is None: 27 | counter[el] = 0 28 | return counter 29 | 30 | super(BagOfWordsTransformer, self).__init__(data_types=accepted_types, 31 | columns=None, 32 | transform_function=bag_of_words_transform_function) 33 | 34 | def __calculate_dictionary(self, X): 35 | dictionary = set() 36 | for el in X: 37 | dictionary = dictionary.union(el) 38 | return dictionary 39 | 40 | def fit(self, X=None, y=None, **kwargs): 41 | super(BagOfWordsTransformer, self).fit(X, y, **kwargs) 42 | if self.dictionary is not None: 43 | return self 44 | self.dictionary = self.__calculate_dictionary(X) 45 | return self 46 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, shared by: 2 | The Alan Turing institute, UK; 3 | University College London, UK; 4 | Vitaly Davydov; 5 | Franz J Kiraly. 6 | All rights reserved. 
7 | 8 | Redistribution and use in source and binary forms, with or without modification, 9 | are permitted provided that the following conditions are met: 10 | 11 | * Redistributions of source code must retain the above copyright notice, this 12 | list of conditions and the following disclaimer. 13 | 14 | * Redistributions in binary form must reproduce the above copyright notice, this 15 | list of conditions and the following disclaimer in the documentation and/or 16 | other materials provided with the distribution. 17 | 18 | * Neither the name of the copyright holders nor the names of the project's 19 | contributors may be used to endorse or promote products derived from 20 | this software without specific prior written permission of all copyright holders. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 23 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 24 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 25 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR 26 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 27 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 29 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 30 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 31 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-------------------------------------------------------------------------------- /tests/test_dataframe_transformer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | here = os.path.dirname(os.path.realpath(__file__)) 5 | sys.path.append(os.path.join(here, "..")) 6 | 7 | from ..xpandas.data_container import XDataFrame, XSeries 8 | from ..xpandas.transformers import XSeriesTransformer, TimeSeriesTransformer, \ 9 | TimeSeriesWindowTransformer, MeanSeriesTransformer, IdentityTransformer, \ 10 | XDataFrameTransformer, PipeLineChain 11 | import pandas as pd 12 | import numpy as np 13 | 14 | 15 | def test_naming(): 16 | X = XSeries([ 17 | pd.Series(np.random.normal(0, 1, 100), name='X') 18 | ]) 19 | df = XDataFrame({ 20 | 'X': X 21 | }) 22 | 23 | dataframe_transformer = XDataFrameTransformer({ 24 | 'X': [TimeSeriesTransformer()] 25 | }) 26 | 27 | dataframe_transformer.fit(df) 28 | transformed_df = dataframe_transformer.transform(df) 29 | 30 | for col_name in transformed_df.columns: 31 | assert col_name.startswith('X_TimeSeriesTransformer') 32 | 33 | 34 | def test_multiple_transformers_for_one_column(): 35 | X = XSeries([ 36 | pd.Series(np.random.normal(0, 1, 100), name='X') 37 | ]) 38 | df = XDataFrame({ 39 | 'X': X 40 | }) 41 | 42 | dataframe_transformer = XDataFrameTransformer({ 43 | 'X': [TimeSeriesTransformer(), IdentityTransformer(), MeanSeriesTransformer()] 44 | }) 45 | 46 | dataframe_transformer.fit(df) 47 | transformed_df = dataframe_transformer.transform(df) 48 | 49 | for col_name in transformed_df.columns: 50 | assert col_name.startswith('X_TimeSeriesTransformer') or \ 51 | col_name.startswith('X_IdentityTransformer') or \ 52 | col_name.startswith('X_MeanSeriesTransformer') -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /docs/api/* 2 | 3 | # Created by .ignore 
support plugin (hsz.mobi) 4 | ### Python template 5 | # Byte-compiled / optimized / DLL files 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | env/ 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *,cover 51 | .hypothesis/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # dotenv 87 | .env 88 | 89 | # virtualenv 90 | .venv 91 | venv/ 92 | ENV/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | .idea/ 101 | *.csv 102 | ExampleUsage.rst 103 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import alabaster 3 | 4 | needs_sphinx = '1.4.3' 5 | 6 | html_theme_path = [alabaster.get_path()] 7 | 8 | sys.path.insert(0, 
os.path.abspath('../xpandas')) 9 | 10 | extensions = ['alabaster', 'sphinx.ext.autodoc', 'sphinx.ext.intersphinx', 'sphinx.ext.todo', 11 | 'sphinx.ext.autosummary', 'sphinx.ext.viewcode', 'sphinx.ext.coverage', 12 | 'sphinx.ext.doctest', 'sphinx.ext.ifconfig', 'sphinx.ext.pngmath', 13 | 'sphinx.ext.napoleon', 'nbsphinx', 'IPython.sphinxext.ipython_console_highlighting', 14 | 'sphinx.ext.autosectionlabel'] 15 | 16 | templates_path = ['_templates'] 17 | 18 | source_suffix = '.rst' 19 | 20 | master_doc = 'index' 21 | 22 | project = u'XPandas' 23 | copyright = u'2017, UCL' 24 | 25 | version = '' # Is set by calling `setup.py docs` 26 | release = '' # Is set by calling `setup.py docs` 27 | 28 | exclude_patterns = ['_build', '**.ipynb_checkpoints'] 29 | 30 | # pygments_style = 'sphinx' 31 | 32 | html_theme = 'alabaster' 33 | 34 | html_theme_options = { 35 | 'logo': 'Logo.png', 36 | 'github_user': 'alan-turing-institute', 37 | 'github_repo': 'xpandas', 38 | 'travis_button': True, 39 | 'analytics_id': 'UA-108477151-1' 40 | } 41 | 42 | 43 | try: 44 | from xpandas import __version__ as version 45 | except ImportError: 46 | pass 47 | else: 48 | release = version 49 | 50 | html_static_path = ['_static'] 51 | 52 | html_sidebars = { 53 | '**': [ 54 | 'about.html', 55 | 'navigation.html', 56 | 'searchbox.html', 57 | 'donate.html', 58 | ] 59 | } 60 | 61 | html_show_sourcelink = False 62 | 63 | html_show_sphinx = False 64 | 65 | htmlhelp_basename = 'XPandas-doc' 66 | 67 | 68 | python_version = '.'.join(map(str, sys.version_info[0:2])) 69 | intersphinx_mapping = { 70 | 'sphinx': ('http://sphinx.pocoo.org', None), 71 | 'python': ('http://docs.python.org/' + python_version, None), 72 | 'matplotlib': ('http://matplotlib.sourceforge.net', None), 73 | 'numpy': ('http://docs.scipy.org/doc/numpy', None), 74 | 'sklearn': ('http://scikit-learn.org/stable', None), 75 | 'pandas': ('http://pandas.pydata.org/pandas-docs/stable', None), 76 | 'scipy': ('http://docs.scipy.org/doc/scipy/reference/', 
def test_ts_fresh_series():
    '''
    TsFreshSeriesTransformer must map an XSeries of pandas.Series
    onto an XDataFrame.
    '''
    # Five independent uniform(0, 100) series with 100 points each.
    series = XSeries([
        pd.Series(np.random.uniform(0, 100, 100))
        for _ in range(5)
    ], name='Y')

    transformer = TsFreshSeriesTransformer()
    transformer.fit(series)
    transformed = transformer.transform(series)

    assert type(transformed) == XDataFrame
def test_bag_of_words_for_series_pipeline():
    '''
    Run tokenising -> bag of words -> PCA as a single PipeLineChain
    over the first 100 newsgroup posts.
    '''
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    n = 100
    series = XSeries(dataset.data[:n])
    assert series.data_type == str

    translator = str.maketrans('', '', string.punctuation)

    def tokenize(text):
        # Lower-case, strip punctuation, split on whitespace.
        return text.lower().translate(translator).strip().split()

    pipeline = PipeLineChain([
        ('preprocessing', XSeriesTransformer(transform_function=tokenize)),
        ('extractor', BagOfWordsTransformer()),
        ('pca', PCA(n_components=10)),
    ])

    pipeline = pipeline.fit(series)
    transformed_series = pipeline.transform(series)

    # The pipeline transforms row by row, so one output row per input text
    # is expected whatever container type the last step returns.
    assert len(transformed_series) == n
20 | 21 | #### **Do you intend to add a new feature or change an existing one?** 22 | 23 | * Suggest your change in an issue and offer to implement the feature. 24 | 25 | * Wait for positive feedback in order to avoid double work (maybe your idea is already in development). 26 | 27 | * Implement and send a PR 28 | 29 | #### **Do you want to contribute to the xpandas documentation?** 30 | 31 | * Understand how the documentation is built (see below) 32 | * Send a PR that proposes changes to the docs directory 33 | 34 | We use a custom [Makefile](docs/Makefile) that configures an automatic generation of the `*.rst` API documentation of each entity (class or module) in the [docs/api](docs/api) directory: 35 | 36 | buildapi: 37 | sphinx-apidoc -fMeT ../xpandas -o api 38 | @echo "Auto-generation of API documentation finished. " \ 39 | "The generated files are in 'api/'" 40 | 41 | Before building the documentation, the entity files therefore have to be regenerated from the source using the `make buildapi` command. Then, `make html` creates the HTML documentation which includes a conversion of the [examples notebook](examples/ExampleUsage.ipynb) into *ExampleUsage.rst* before building. 42 | 43 | **Deployment of the documentation** 44 | 45 | This documentation is hosted on GitHub Pages instead of [ReadTheDocs](https://readthedocs.org/) to avoid adverts and keep all things together on GitHub. 46 | 47 | As GitHub Pages does not support Sphinx we make use of the [Doctr](https://drdoctr.github.io/doctr/) package that automatically updates our docs 48 | on GH Pages branch using Travis CI; the build process is triggered by commits to the master branch that pass the tests. Please take a look at the [.travis.yml](.travis.yml) file for more details. 49 | 50 | XPandas is a team effort. We encourage you to pitch in and join us! 51 | 52 | Thanks!
:heart: :heart: :heart: 53 | 54 | **Xpandas Team** 55 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at f.kiraly@ucl.ac.uk. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 
40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /docs/introduction.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ************ 3 | 4 | XPandas (extended `Pandas <https://pandas.pydata.org/>`_) implements 1D and 2D data containers for storing type-heterogeneous tabular data of any type, and encapsulates feature extraction and transformation modelling in an sklearn-compatible transformer interface. 5 | 6 | Description 7 | +++++++++++ 8 | 9 | **XPandas** provides a universal 1D typed list (`XSeries`) and a 2D type-heterogeneous data-frame (`XDataFrame`) container, and provides extended sklearn-like transformer classes interfacing said containers. Transformers can be used for automated feature extraction and map-reduce style transformations but are not limited to them. 10 | 11 | `XSeries` is based on `pandas.Series` that can store objects of any type. An example would be a series of image containers, or a series of `pandas.Series` objects stored as `XSeries`. `XSeries` can be visualised according to a schema. 12 | 13 | 14 | .. image:: ../examples/imgs/XSeries.png 15 | 16 | `XDataFrame` extends `pandas.DataFrame` by allowing arbitrary object types per column. It provides the same convenient sub-setting interface and extended abstract access methods. Each column is internally stored as an `XSeries` container, all of the same length. 17 | 18 | One example could be a medical data set where each row is a different patient, say, in a hospital.
The columns would correspond to a type-heterogeneous set of features like numbers (age, height, weight, etc.), categorical (gender, hair color, etc.), images (x-ray pictures), time series (heart beat, lab history), and other parts of a medical record. 19 | 20 | With `XDataFrame` one can store all this information in a single 2D data container instead of a tedious collection of custom nested lists or arrays. 21 | 22 | .. image:: ../examples/imgs/XDataFrame.png 23 | 24 | Another advantage of XPandas is the clean interface it provides to ready-to-go machine learning algorithms in `scikit-learn <http://scikit-learn.org/>`_. The transformers interface can be used to easily convert the types in an `XDataFrame` to the primitive types with which sklearn can interface, as part of a modelling pipeline. In the example with patient data, one may want to extract summary features from each `pandas.Series`, or extract features from each image, say in a deep learning model. 25 | 26 | More technically, the implemented `XSeriesTransformer` class allows for the implementation of transformation defaults for `XSeries`; similarly `XDataFrameTransformer` implements a transformation for `XDataFrame` type objects. From a mathematical point of view `XSeriesTransformer` encapsulates abstract functions of the type `XSeries -> XSeries or XDataFrame` whereas 27 | `XDataFrameTransformer` represents mappings from `XDataFrame -> XDataFrame`. Each of the transformers follows the familiar fit/transform/parameters API of sklearn. 28 | 29 | .. image:: ../examples/imgs/Transformer.png 30 | 31 | Data types 32 | ++++++++++ 33 | 34 | Notably, XPandas comes with several pre-implemented transformers for the most common non-primitive data types. 35 | 36 | Time series 37 | 38 | 39 | - ``TimeSeriesTransformer(features)`` — extract ``features`` from each 40 | series.
``features`` is a subset of [ 'mean', 'std', 'max', 'min', 41 | 'median', 'quantile\_25', 'quantile\_75', 'quantile\_90', 42 | 'quantile\_95'] 43 | 44 | - ``TimeSeriesWindowTransformer(windows_size)`` — calculate rolling 45 | mean with given ``windows_size`` 46 | - ``TsFreshSeriesTransformer`` — extract features using the 47 | `tsfresh <https://github.com/blue-yonder/tsfresh>`__ package 48 | 49 | Image 50 | 51 | 52 | - ``ImageTransformer`` — Performs image transformation based on skimage 53 | `transformation 54 | function <http://scikit-image.org/docs/dev/api/skimage.transform.html>`__ 55 | 56 | Categorical data 57 | 58 | 59 | - ``BagOfWordsTransformer(dictionary)`` — Performs a bag-of-features 60 | transformation for strings of any categorical data 61 | 62 | XPandas also allows for pipelining, via the ``PipeLineChain`` 63 | transformer, which can chain multiple transformers and ``scikit-learn`` 64 | predictors into a single pipeline. ``PipeLineChain`` is based on the 65 | ``scikit-learn`` 66 | `Pipeline <http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html>`__. 67 | -------------------------------------------------------------------------------- /github_deploy_key.enc: -------------------------------------------------------------------------------- 1 | 
gAAAAABZ6lhCyRariN85qLf0ABv3ReKJYlFI7dWpM4yfrbUQ-sTpD0YzjL61mAhOZGx606cINu-RetLgTUcs6m1J75AwiOQl2HuXMvpinOJ5bIqH5xdyFQqC_5HiHGwyn0M4PIFdiAPsT9V7mJf39_7wGnSHHRTB6tJbfIoIv5iV_rrC__T-UpkA2s5RCl1J8KmF3CFU7OvPvtpnSlL8JJfTs3yCRwDjI5Y7gs2jwkkAvl75Ry2uRzEOLQq6pY8GZmZJx-eqi6gFzEhZOzWwoSAgoRov-Bp1ZIwfAKAMLoAZfuxQiBeDz1mwqiTYY4lEoG_yng2lrUXx1o7ktz_hujoj39ZR2NDahOvBdsauJB-g_4iRHazl1IhY3vxrh9V7gQYK-fLZot967rJ5FIno4GyQIVgchzuvNVrmamAHkJuyc54PmdDk52E88cLVcKblyfTd3j4xWjDRai5md7jAUuSwF5B6LvdSnLCUgOPIrgNCHfehtsGLDPCUEbuAeEIOLPuirFJn_GTMjO5xu77YIIekesWHebG8OuMCm_cUFIqWavLWjCO3RLXKH6_fyxaoynwCwY4bCTbBjkSvTg8M_V8g5rWHjTL8qzu4eoQZ8EDqycxvr5ZTw-Lwdp-czT6uOatH88t_ucZhffHT0FEpMOQPWddPxCQV1ewetXQk-x5pZphwLl4dJq7cor7AUE-SxDCiALPv_Tdi0yfxC-qMRawcBLADPPAgiVal-vd0arplCFnccoFrgPR-t4DT53n8yCRxzXH-HrlkeZAH29pYcsCKiKgR5qOdHN5_QIAG5T4ovVTf-AKtOZFIsnoug-e08k4afYWcNXx3NE2ourDfJYHIMd34w1FfZG9Rf5xDFfJlfSPSubCWtkfB4UPqtVFwWWBv5DHCfkbNnevib3rwwDENKO0cvB85qvOBSGHRVbUDMf6LZ6qMr-dWFNb2iGm_faQ9NPdHRa5_fQ9DfT5TNmIcbBPj0oPHwQvYFDBosnNaQZpjwphVjhN1j-EIr2l4n1MnoPAbF9rW6L9oYFHOindytsGS39LumUK_ON3n7ZoN_dqUFFQH5qOcQn3JLzjUVCxd_9HsFtz1inw1XzZ-SrLZl2bhp1uztNZumma5utqzr1UYhElRkQdxKWFQwhzwe9LQ22F_26x0THTXrmI7-8tY1eh6K9mj5ogYCX_fYgj8MwK6bgXlun8yjph1kAnYOqDWvw-PaQGWRojEUjyq6Fv5tetawrCApiVRTD5LOKGFxQCQ4lIThs5DiqcLTkVUrfefPfjahqcXtsvCd5FYF4u-bpJVwt2nZj0Ux_U5x20PC9dp1l-FhgkoLh6SrPEaGZQUKiOfRhkU2ia9HbcRUODlpIPc5VsqILYF5K9V-mmyaablu0_51KZcWS5-tcldYznHMq7U-eYcun7HgutFNMQ_SUWZMMHSyXA_0epQJyieMjggqUbRCm4FmEo-TVNgj_Yv8eG9gPEcy86W_K8k60CG9lX02AVKqdfUUpZTXSjM5iokIqM2PTmuGtZGW0dVE2WwuCpASvEivb9pN2jn_Zydak6JlL84Dcd3gS_AQ-Fw3OjrNVnXxk_iRL7Pho01EH7bTCiCuzhevbnX2gomH6drDZtm8X8OuQW0SeUixlPW2535QQL_H8ZCQs85Gm5Mv1R73UD8iwIr31PNIoYsIDwLSvGtlM70KrJW1bP9nYv_ye-AmJPXO2WAKF9fxV5K_HC3Fbs323KTVrlougq93Go_Mt867nhu59JxPm2cnp_TNYMvH42enTbKixNZxjCQm9quwydLM88NEeFVhL9gSO9B8YZjVxnLv7Yos4SkNIOEGt9mfnBtvm6UMv-Rd8mGcgAbzrOSMDNHnqGJpjbWlSeXOyGPSDx6Fzq0OONDCZaQbznCtTUGDZkKaFn2699ttY2cqo5BqtwGoGKWynrlkiN-XuJNgoT1VJ5kp07jRK2sxjiaqe0n_IjCtOwKq25G4ee-ZuVEvKysvvqq
ClfcbrNyN9cw-eIhCtZdyG-6ZkgZqwBwmSN0MjDJH6Oj9Chs40sz4GvXDIRJLk99ggr2S4NvTneqdoYvxCCH1J1r2XEfUr-e5OeP0fzTpeTzwXF_re41Od0Je8-A2Ug7lF2LPGPRhH4Ee-cBN0bAi5MC2SPzLAAqRLy7mZHNQmyWNaniShBV-Z3WTcfS07Mt5eMwT1AOADVAHrCJwYTIOi7l1UpAyT2FWNIq0z3_Ho0i9xwZHEG74IsXHeyHAnWWTffe836mDkcxlnRDobR4ZHsbDFmmxAg0Drni8S__OWs5mOrgq0DV2962AOqDqgbOmSRYIPluxkVNcrE46Y6rTHi0XLXpRYCUBiFhIzrk9N9l8z3QZPxHgqrF6S9xDbCCz04jRIZX-HUYNPYR8N754ah9lMU4kbYGrY76EfbmXj_Ek5rsXSvLssS_VrYMe8iXIBye9zu46enie0qYC6G0J9K9rdppJ7iFWrGGhbt0-VzSPqSNQ8ajgrrxeK1oH5kB0R1YDQGZDLr0ABgWbwwnsFcBwfFX9C13yxCsqJLfn6xqMGYfx44jOavI-JWlRIDXWAToK6Y2cDOAcuISi8eRcbzdh1JRFGU3z0bch-6-mCe48K5TnsMpAcOFFBQJ7PnkouzZbpVbtDJBKtC9DnfVDDyfa38jO28qDN2Ne3mOJ-H3fdae8wT5mKgSOeUcn028r9CJFb8wt9R5dbWgB32H3_8clqggkHQkgUmdQiTAehEUc_TiFee20s-W3Ux_ljKjCqpZOzjgnLTfhP7VwRQs9U9lBg7PA-7C-5jHdI5yhs6rPgTihPD14bl2cmflq3ogU8g0ZZApEyN47OdsOgAwsdnoe6m0MMYyXCgfVwDdmP0bwkHOuhjHaOtZt0XevWbKtsHE83fsrCi3KiIxSo8Ii5qH7C1Ud2eE6VSkog6LVTL8rc2W2yy9avcEtW9ihs3zHavQN5KbP8tm8dP8_BgkutEq5dqxsPgu3sBJa11y4F6kk794196cDdWkesFwx-C7rThB91KKK_69fo90UUonNfO4oopioOLKqkjCoaLrJdSfJSyhE4M_7lLXrNYs70ndMM3WtyRQX2_ueiTBLa5BQdRfjdzTsX29jxRSjA-7I2sGZjXncsqyCsAASeveHk0Bv6tDVBpPzAf6pHF--uBpbmN7RDyb1FnxBt4bZnG-pTSknMIVxuCrUqUwlgjFIXL5WLy8johIxLLeE3KxvOQl1b1xX7VsRA7ViRgOetY5-_4Cmzey3EbLE23IFRjZdoUZ7-EajTXQFOXZChZh_Mx7yVlrfK4uOjf_uMizM7kqZFgJ5HCk0szXHlv9k8C64-AJw-zTpaKAWmAUXtMbbdigMqYyQJzdS2l-3i72WOqbfFBlnNbkDvJmYXWvIPEaqMb7LyuI6k6risdyasF2wpHhFIXVNcgwv6vNWJop6y3Mmt-IsHZHoz6IoU_0mxi1hyGziAKcSnm9TiPWBwSyV10oDSWWT0aamm9XiszrFVcyZECwSzIU_bKCH-oqysA5YQKWKe9RKLK2EnwDZ33qJf7h6PspfcUYCCeQQBlIgMkEONm4z74IqZ4Gbm4Pbh-DeTVYd7kqPk-aR_VpJM6vLtyM-s2_J3wuKy36YMMKGlAFLvuQTEChj1h14nHpkOSTUrTbqzZUnqY9BKRuN--usMZoCPcO-lnBB7cIKmsonV9ePYNnHE6wCkqg_f7mInVi7qA-GCMTxqz9kdYGLcISJf6XeCyXvfYMV1yqqOyxv9ykp41pXe6Jf-IdFzlJRQ5Ffrzhh7hblydf1PufzAybvR4ILe0B8vJEkEggGmtpz4VtzHwp37h9w83CIlQ92jwlsoylVzEMS9ZV3GRhADdIb_KZeFuFsox-AK1_GBhszco_5Wpjc2KpR-njOxLioCbn7wQibjHuKPQV1UgKeb4ikyZHO6iEzmd6hqn5xjGhCM7eXdI30j4YRPyxP6ez15b1llKgxXI9Sz8hkxGOhBDU
class TimeSeriesTransformer(XSeriesTransformer):
    '''
    Extract common features 'mean', 'std', 'max', 'min',
    'median', 'quantile_25', 'quantile_75',
    'quantile_90', 'quantile_95' from pandas.Series.
    Transform XSeries to XDataFrame.

    Every element is mapped to a dict ``{feature_name: value}``.
    '''
    FEATURES = [
        'mean', 'std', 'max', 'min',
        'median', 'quantile_25', 'quantile_75',
        'quantile_90', 'quantile_95'
    ]

    def __init__(self, features=None, **kwargs):
        '''
        :param features: list of features from FEATURES property;
            None selects every entry of FEATURES
        :param kwargs: accepted but not used by this class
        :raises ValueError: if a requested feature is not in FEATURES
        '''
        accepted_types = [
            pd.Series
        ]

        if features is not None:
            for f in features:
                if f not in self.FEATURES:
                    raise ValueError('Unrecognized feature {}. Available features {}'.format(f, self.FEATURES))

        # Keep the constructor argument on the instance exactly as given,
        # the same way TimeSeriesWindowTransformer keeps windows_size, so
        # that attribute-based parameter introspection can find it.
        self.features = features

        def series_transform(series):
            # Resolved at call time so a later change of self.features is honoured.
            selected = self.FEATURES if self.features is None else self.features
            transformed_series = {}

            for f in selected:
                if f.startswith('quantile_'):
                    # 'quantile_25' -> 0.25
                    quant_rate = int(f.split('_')[1]) / 100.
                    transformed_series[f] = series.quantile(quant_rate)
                else:
                    # Remaining feature names are pandas.Series method names.
                    transformed_series[f] = getattr(series, f)()

            return transformed_series

        super(TimeSeriesTransformer, self).__init__(data_types=accepted_types,
                                                    transform_function=series_transform)


class TimeSeriesWindowTransformer(XSeriesTransformer):
    '''
    Calculate rolling mean over XSeries of pandas.Series.
    '''
    def __init__(self, windows_size=3, **kwargs):
        '''
        :param windows_size: size of window for rolling mean
        :param kwargs: accepted but not used by this class
        '''
        accepted_types = [
            pd.Series
        ]

        self.windows_size = windows_size

        def series_transform(series, **params):
            # self.windows_size is read on every call, so changing the
            # attribute after construction takes effect. The leading
            # incomplete windows produce NaN and are dropped.
            return series.rolling(window=self.windows_size).mean().dropna()

        super(TimeSeriesWindowTransformer, self).__init__(data_types=accepted_types,
                                                          transform_function=series_transform)


class MeanSeriesTransformer(XSeriesTransformer):
    '''
    Example transformer: maps every pandas.Series to its own mean minus
    the overall mean computed in fit().
    '''
    def __init__(self, **kwargs):
        '''
        :param kwargs: accepted but not used by this class
        '''
        # Set by fit(); stays None until then.
        self.total_mean = None

        def mean_minus_mean_function(s, total_mean=None):
            # An explicit total_mean overrides the fitted one.
            if total_mean is None:
                total_mean = self.total_mean
            return s.mean() - total_mean

        accepted_types = [
            pd.Series
        ]

        super(MeanSeriesTransformer, self).__init__(data_types=accepted_types,
                                                    transform_function=mean_minus_mean_function)

    def fit(self, X, y=None, **kwargs):
        '''
        Compute the mean over all values of all series in X.

        :param X: container whose elements provide sum() and len()
        :param y: ignored
        :param kwargs: forwarded to the parent fit()
        :return: self
        '''
        super(MeanSeriesTransformer, self).fit(X, **kwargs)
        # One (sum, length) pair per element, then pooled so that longer
        # series weigh proportionally more.
        sum_and_size = X.apply(lambda s: (s.sum(), len(s)))
        sum_total = sum(x[0] for x in sum_and_size)
        total_size = sum(x[1] for x in sum_and_size)
        self.total_mean = sum_total / total_size

        return self


class TsFreshSeriesTransformer(XSeriesTransformer):
    '''
    Performs transformation with tsfresh http://tsfresh.readthedocs.io/en/latest/ package
    over XSeries of pandas.Series.
    '''
    def __init__(self, **kwargs):
        '''
        :param kwargs: accepted but not used by this class
        '''
        accepted_types = [
            pd.Series
        ]

        # NOTE(review): _do_extraction_on_chunk is a private tsfresh helper;
        # its signature may differ between tsfresh releases - confirm
        # against the pinned version.
        default_fc_parameters = ComprehensiveFCParameters()
        extraction_function = partial(_do_extraction_on_chunk,
                                      default_fc_parameters=default_fc_parameters,
                                      kind_to_fc_parameters=None)

        def series_transform(series):
            series_name = series.name
            if series_name is None:
                # self.name is only assigned inside transform(); fall back
                # to None instead of failing with AttributeError when the
                # function is used before transform() has run.
                series_name = getattr(self, 'name', None)

            # Chunk passed to tsfresh: constant id 1, the name, the data.
            input_series = (
                1, series_name, series
            )
            extracted_data = extraction_function(input_series)
            # Flatten the extracted records into {variable: value}.
            extracted_data_flat = {
                x['variable']: x['value']
                for x in extracted_data
            }
            return extracted_data_flat

        super(TsFreshSeriesTransformer, self).__init__(data_types=accepted_types,
                                                       columns=None,
                                                       transform_function=series_transform)

    def transform(self, X):
        '''
        Remember the name of X as fallback name for unnamed elements,
        then delegate to the parent transform().

        :param X: object with a ``name`` attribute to transform
        :return: result of the parent transform()
        '''
        self.name = X.name
        return super(TsFreshSeriesTransformer, self).transform(X)
def test_series_slise_type():
    '''
    A slice of an XSeries of pandas.Series still reports
    pd.Series as its data_type.
    '''
    first = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
    second = pd.Series([4, 5, 6], index=['d', 'e', 'g'])
    third = pd.Series([7, 8, 9])
    whole = XSeries([first, second, third])

    head = whole[:2]

    assert head.data_type == pd.Series
def test_series_map_transformer():
    '''
    XSeries.map keeps pd.Series as data_type for a series-to-series
    function and yields np.float64 for a series-to-scalar function.
    '''
    source = XSeries([
        pd.Series([1, 2, 3], index=['a', 'b', 'c']),
        pd.Series([4, 5, 6], index=['d', 'e', 'g'])
    ])

    # Series -> Series mapping.
    incremented = source.map(lambda series: series + 1)
    expected_first = pd.Series([2, 3, 4], index=['a', 'b', 'c'])
    assert incremented.data_type == pd.Series
    assert incremented[0].equals(expected_first)

    # Series -> scalar mapping.
    means = source.map(lambda series: series.mean())
    assert means.data_type == np.float64
def test_dataframe_to_pandas_dataframe():
    '''
    to_pandas_dataframe() must fail for a frame with non-primitive
    columns and return a plain pd.DataFrame for primitive ones.
    '''
    s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                  pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    s2 = XSeries([1, 2, 3])
    s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    s4 = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
        'third_col': s3,
        'fourth_col': s4
    })

    # The failure check lives in the else branch: an `assert False` inside
    # the try body would be swallowed by the handler and the test could
    # never fail.
    try:
        df.to_pandas_dataframe()
    except Exception:
        pass
    else:
        raise AssertionError('to_pandas_dataframe() accepted non-primitive columns')

    s1 = XSeries([4, 5, 6])
    s2 = XSeries([1, 2, 3])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
    })

    # Any exception raised here fails the test on its own.
    df = df.to_pandas_dataframe()

    assert type(df) == pd.DataFrame
def test_transformer_series_transformer():
    '''
    TimeSeriesTransformer with default features turns an XSeries of
    pandas.Series into an XDataFrame.
    '''
    left = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
    right = pd.Series([4, 5, 6], index=['d', 'e', 'g'])
    s = XSeries([left, right])

    fitted = TimeSeriesTransformer().fit()
    result = fitted.transform(s)

    assert type(result) == XDataFrame
= TimeSeriesWindowTransformer(windows_size=5) 80 | series_to_series_transformer.set_params(windows_size=3) 81 | series_to_series_transformer.fit() 82 | transformed_series = series_to_series_transformer.transform(s) 83 | 84 | assert series_to_series_transformer.transform_function(s[0]).equals(transformed_series[0]) 85 | assert transformed_series.data_type == pd.Series 86 | assert type(transformed_series) == XSeries 87 | 88 | 89 | def test_transformer_data_frame(): 90 | s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']), 91 | pd.Series([4, 5, 6], index=['d', 'e', 'g'])]) 92 | s2 = XSeries([1, 2, 3]) 93 | s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}]) 94 | s4 = XSeries(['f', 's', 't']) 95 | 96 | df = XDataFrame({ 97 | 'second_col': s2, 98 | 'third_col': s3, 99 | 'fourth_col': s4 100 | }) 101 | 102 | data_frame_transformer = TimeSeriesTransformer().fit() 103 | try: 104 | data_frame_transformer.transform(df) 105 | assert False 106 | except: 107 | assert True 108 | 109 | s1 = XSeries([ 110 | pd.Series(np.random.normal(size=10)), 111 | pd.Series(np.random.normal(size=15)) 112 | ]) 113 | s2 = XSeries([ 114 | pd.Series(np.random.normal(size=10)), 115 | pd.Series(np.random.normal(size=10)) 116 | ]) 117 | s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}]) 118 | s4 = XSeries(['f', 's']) 119 | df = XDataFrame({ 120 | 'first_col': s1, 121 | 'second_col': s2, 122 | 'third_col': s3, 123 | 'fourth_col': s4 124 | }) 125 | 126 | # print( 127 | # df['first_col'].shape 128 | # ) 129 | 130 | data_frame_transformer = XDataFrameTransformer(transformations={ 131 | 'first_col': TimeSeriesTransformer(), 132 | 'second_col': TimeSeriesTransformer() 133 | }) 134 | 135 | data_frame_transformer.fit(df) 136 | transformers_df = data_frame_transformer.transform(df) 137 | # print(transformers_df.head()) 138 | 139 | 140 | def test_pipeline_transformer_for_series(): 141 | from sklearn.decomposition import PCA 142 | 143 | s1 = XSeries([ 144 | pd.Series(np.random.normal(size=15)), 145 | 
pd.Series(np.random.normal(size=15)), 146 | pd.Series(np.random.normal(size=15)), 147 | pd.Series(np.random.normal(size=15)), 148 | pd.Series(np.random.normal(size=15)), 149 | pd.Series(np.random.normal(size=15)), 150 | pd.Series(np.random.normal(size=15)) 151 | ]) 152 | 153 | pipeline = PipeLineChain( 154 | [ 155 | ('first_transformer', TimeSeriesWindowTransformer()), 156 | ('mean_transformer', TimeSeriesTransformer()) 157 | ] 158 | ) 159 | pipeline = pipeline.fit(s1) 160 | transformed_ts = pipeline.transform(s1) 161 | 162 | pipeline = PipeLineChain( 163 | [ 164 | ('first_transformer', TimeSeriesWindowTransformer()), 165 | ('mean_transformer', TimeSeriesTransformer()), 166 | ('pca', PCA(n_components=4)) 167 | ] 168 | ) 169 | pipeline.fit(s1) 170 | transformed_ts = pipeline.transform(s1) 171 | 172 | 173 | def test_mean_transformer(): 174 | s1 = XSeries([ 175 | pd.Series(np.random.normal(size=10)), 176 | pd.Series(np.random.normal(size=15)) 177 | ]) 178 | s2 = XSeries([ 179 | pd.Series(np.random.normal(size=10)), 180 | pd.Series(np.random.normal(size=15)), 181 | pd.Series(np.random.normal(size=100)) 182 | ]) 183 | 184 | tr = MeanSeriesTransformer() 185 | tr = tr.fit(s1) 186 | 187 | transformed_s = tr.transform(s2) 188 | 189 | assert transformed_s.shape[0] == 3 190 | assert type(transformed_s) == XSeries 191 | 192 | 193 | def test_mean_transformer_data_frame(): 194 | s1 = XSeries([ 195 | pd.Series(np.random.normal(size=10)), 196 | pd.Series(np.random.normal(size=15)) 197 | ]) 198 | s2 = XSeries([ 199 | pd.Series(np.random.normal(size=10)), 200 | pd.Series(np.random.normal(size=15)) 201 | ]) 202 | 203 | df = XDataFrame({ 204 | 's1': s1, 205 | 's2': s2 206 | }) 207 | 208 | tr = MeanSeriesTransformer() 209 | try: 210 | tr = tr.fit(df) 211 | assert False 212 | except: 213 | assert True 214 | -------------------------------------------------------------------------------- /xpandas/transformers/transformer.py: 
from sklearn.base import BaseEstimator, TransformerMixin

from ..data_container import XDataFrame, XSeries


class XSeriesTransformer(BaseEstimator, TransformerMixin):
    '''
    XSeriesTransformer is a base class for all custom transformers.
    It is a high level abstraction that transforms an XSeries of
    specific data_types into another XSeries or an XDataFrame.
    XSeriesTransformer encapsulates the transformation and is based on scikit-learn BaseEstimator
    http://scikit-learn.org/stable/modules/generated/sklearn.base.BaseEstimator.html
    '''
    _TRANSFORM_ARG_FUNCTION_NAME = 'transform_function'

    def __init__(self, transform_function=None, data_types=None, name=None, **kwargs):
        '''
        :param transform_function: callable that performs the actual transform of one element
        :param data_types: list of data_type that this transformer can work with. If None,
        no data_type check is done and an error might be raised at run time
        :param name: name of the transformer. If None, the class name is used
        :param kwargs: additional arguments; accepted but not used by this base class
        :raises ValueError: if transform_function is given but is not callable
        '''
        if transform_function is not None and not callable(transform_function):
            raise ValueError('transform_function must be callable')

        self.transform_function = transform_function
        self.data_types = data_types
        self.name = self.__class__.__name__ if name is None else name

    def _check_input(self, input_data):
        '''
        Check that the input is valid: input_data is an XSeries and the transformer
        "knows" how to work with input_data.data_type.
        :param input_data: object passed to fit or transform
        :raises ValueError: if input_data is not an XSeries or its data_type is not supported
        '''
        # isinstance also admits XSeries subclasses, which an exact type
        # comparison would reject.
        if not isinstance(input_data, XSeries):
            raise ValueError('X must be XSeries type')

        if self.data_types is not None and input_data.data_type not in self.data_types:
            raise ValueError('Estimator does not support {} type'.format(input_data.data_type))

    def fit(self, X=None, y=None, **kwargs):
        '''
        Fit the transformer on the given data.
        The base implementation only validates X; child classes overwrite it.
        :param X: XSeries to fit the transformer on
        :param y: labels column for X
        :param kwargs: additional arguments for the transformer
        :return: fitted self object
        '''
        if X is not None:
            self._check_input(X)

        return self

    def _transform_series(self, custom_series):
        '''
        Helper method to transform an XSeries
        :param custom_series: XSeries object
        :return: transformed data.
        It could be an XSeries or an XDataFrame object
        :raises ValueError: if no transform_function has been set
        '''
        # Fail here with a clear message; otherwise a missing function only
        # surfaces as an IndexError deep inside XSeries.apply.
        if self.transform_function is None:
            raise ValueError('transform_function is not set; pass a callable transform_function')

        return custom_series.apply(func=self.transform_function, prefix=self.name)

    def transform(self, X):
        '''
        Apply the transformation to X with the current transformer
        :param X: input XSeries
        :return: transformed data with the index of X.
        It could be an XSeries or an XDataFrame object
        :raises ValueError: if X is not a supported XSeries or no transform function is available
        '''
        if not hasattr(self, self._TRANSFORM_ARG_FUNCTION_NAME):
            raise ValueError('You mast pass transform_function argument with a function')

        self._check_input(X)

        transform_series = self._transform_series(X)
        # Re-attach the index of the input to the result.
        transform_series.index = X.index

        return transform_series


class XDataFrameTransformer(BaseEstimator, TransformerMixin):
    '''
    XDataFrameTransformer is a set of XSeriesTransformer instances.
    XDataFrameTransformer can transform an XDataFrame object into another XDataFrame
    based on a set of XSeriesTransformer transformers.
    '''

    def _validate_transformations(self, transformations):
        '''
        Check that every key is a string and every value is an
        XSeriesTransformer or a list of XSeriesTransformer objects.
        :param transformations: dict {column_name: Transformer object or [Transformer object]}
        :raises TypeError: on a non-string key or a non-transformer value
        '''
        for k, v in transformations.items():
            if not isinstance(k, str):
                raise TypeError('Key must be a string {}'.format(k))

            if isinstance(v, list):
                for t in v:
                    if not isinstance(t, XSeriesTransformer):
                        raise TypeError('All objects of {} must be a Transformer object. Issue with {}'.format(v, t))
            elif not isinstance(v, XSeriesTransformer):
                raise TypeError('Value must be a Transformer object {}'.format(v))

    def _wrap_transformers_in_list(self, transformations):
        '''
        Normalise the mapping so that every value is a list of transformers.
        :param transformations: validated dict of transformations
        :return: new dict {column_name: [Transformer object, ...]}
        '''
        return {
            k: v if isinstance(v, list) else [v]
            for k, v in transformations.items()
        }

    def __init__(self, transformations):
        '''
        Init XDataFrameTransformer with a dict of transformations.
        Each transformation specifies a column and a transformer object
        :param transformations: dict {column_name: Transformer object or [Transformer object]}
        :raises TypeError: if the dict does not pass validation
        '''
        self._validate_transformations(transformations)
        self.transformations = self._wrap_transformers_in_list(transformations)

    def fit(self, X=None, y=None, **kwargs):
        '''
        Fit each transformer of the self.transformations dictionary
        on its column of X.
        :param X: XDataFrame that contains every configured column
        :param y: unused
        :param kwargs: unused
        :return: fitted self object
        :raises TypeError: if X is not an XDataFrame
        '''
        if not isinstance(X, XDataFrame):
            raise TypeError('X must be a XDataFrame type. Not {}'.format(type(X)))

        for col_name, transformations in self.transformations.items():
            for t in transformations:
                t.fit(X[col_name])

        return self

    def transform(self, X, columns_mapping=None):
        '''
        Transform X with the fitted dictionary self.transformations.
        A transformer that returns an XSeries replaces the source column by a
        column named after the result; a transformer that returns a data frame
        replaces the source column by the columns of that frame.
        :param X: XDataFrame to transform; it is copied, not modified
        :param columns_mapping: {old_col: new_col} mapping between columns in fit data set and current X
        :return: transformed XDataFrame
        '''
        if columns_mapping is None:
            columns_mapping = {}

        transformers_df = X.copy()

        for col_name, transformations in self.transformations.items():
            new_col_name = columns_mapping.get(col_name, col_name)

            for t in transformations:
                # Every transformer reads the untouched column of X, not the
                # partially transformed copy.
                transformed_column = t.transform(X[new_col_name])

                if isinstance(transformed_column, XSeries):
                    transformers_df.rename(columns={
                        new_col_name: transformed_column.name
                    }, inplace=True)
                    transformers_df[transformed_column.name] = transformed_column
                else:
                    # With several transformers on one column the source column
                    # is already gone (dropped or renamed) after the first one;
                    # dropping it again unconditionally raises KeyError.
                    if new_col_name in transformers_df.columns:
                        transformers_df.drop(new_col_name, inplace=True, axis=1)

                    transformers_df = XDataFrame.concat_dataframes(
                        [transformers_df, transformed_column]
                    )

        return transformers_df
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

# Every target below is a command name, not a file, so all of them are phony.
.PHONY: help clean html livehtml dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf latexpdfja text man texinfo info changes linkcheck doctest gettext xml pseudoxml buildapi

help:
	@echo "Please use \`make <target>' where <target> is one of"
	@echo "  html       to make standalone HTML files"
	@echo "  dirhtml    to make HTML files named index.html in directories"
	@echo "  singlehtml to make a single large HTML file"
	@echo "  pickle     to make pickle files"
	@echo "  json       to make JSON files"
	@echo "  htmlhelp   to make HTML files and a HTML help project"
	@echo "  qthelp     to make HTML files and a qthelp project"
	@echo "  devhelp    to make HTML files and a Devhelp project"
	@echo "  epub       to make an epub"
	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
	@echo "  text       to make text files"
	@echo "  man        to make manual pages"
	@echo "  texinfo    to make Texinfo files"
	@echo "  info       to make Texinfo files and run them through makeinfo"
	@echo "  gettext    to make PO message catalogs"
	@echo "  changes    to make an overview of all changed/added/deprecated items"
	@echo "  xml        to make Docutils-native XML files"
	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
	@echo "  linkcheck  to check all external links for integrity"
	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"

clean:
	rm -rf $(BUILDDIR)/*

# The example notebook is converted to reST first so Sphinx can include it.
# `jupyter nbconvert` replaces the deprecated `ipython nbconvert` entry point.
html:
	jupyter nbconvert ../examples/ExampleUsage.ipynb --to rst
	mv ../examples/ExampleUsage.rst .
	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

livehtml:
	sphinx-autobuild -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html

dirhtml:
	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
	@echo
	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."

singlehtml:
	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
	@echo
	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."

pickle:
	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
	@echo
	@echo "Build finished; now you can process the pickle files."

json:
	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
	@echo
	@echo "Build finished; now you can process the JSON files."

htmlhelp:
	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
	@echo
	@echo "Build finished; now you can run HTML Help Workshop with the" \
	      ".hhp project file in $(BUILDDIR)/htmlhelp."

qthelp:
	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
	@echo
	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/XPandas.qhcp"
	@echo "To view the help file:"
	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/XPandas.qhc"

# `$$HOME` is passed to the shell as `$HOME`; a single `$` would make `make`
# expand the (empty) variable `H` and print "OME/.local/...".
devhelp:
	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
	@echo
	@echo "Build finished."
	@echo "To view the help file:"
	@echo "# mkdir -p $$HOME/.local/share/devhelp/XPandas"
	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/XPandas"
	@echo "# devhelp"

epub:
	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
	@echo
	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."

latex:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo
	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
	@echo "Run \`make' in that directory to run these through (pdf)latex" \
	      "(use \`make latexpdf' here to do that automatically)."

latexpdf:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through pdflatex..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

latexpdfja:
	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
	@echo "Running LaTeX files through platex and dvipdfmx..."
	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."

text:
	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
	@echo
	@echo "Build finished. The text files are in $(BUILDDIR)/text."

man:
	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
	@echo
	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."

texinfo:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo
	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
	@echo "Run \`make' in that directory to run these through makeinfo" \
	      "(use \`make info' here to do that automatically)."

# $(MAKE) instead of a literal `make` keeps command-line flags and the
# jobserver working in the sub-make, like the latexpdf targets above.
info:
	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
	@echo "Running Texinfo files through makeinfo..."
	$(MAKE) -C $(BUILDDIR)/texinfo info
	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."

gettext:
	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
	@echo
	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
157 | 158 | changes: 159 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 160 | @echo 161 | @echo "The overview file is in $(BUILDDIR)/changes." 162 | 163 | linkcheck: 164 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 165 | @echo 166 | @echo "Link check complete; look for any errors in the above output " \ 167 | "or in $(BUILDDIR)/linkcheck/output.txt." 168 | 169 | doctest: 170 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 171 | @echo "Testing of doctests in the sources finished, look at the " \ 172 | "results in $(BUILDDIR)/doctest/output.txt." 173 | 174 | xml: 175 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 176 | @echo 177 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 178 | 179 | pseudoxml: 180 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 181 | @echo 182 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 183 | 184 | buildapi: 185 | sphinx-apidoc -fMT ../xpandas -o api 186 | @echo "Auto-generation of API documentation finished. " \ 187 | "The generated files are in 'api/'" 188 | -------------------------------------------------------------------------------- /xpandas/data_container/data_container.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def _check_all_elements_have_the_same_property(array, func): 6 | ''' 7 | Helper function that checks if all elements have the same func(element) value. 8 | :param array: input values 9 | :param func: any callable object 10 | :return: tuple. 
the first element indicates is all elements are have the same func(element) value, 11 | second element is a value of func(element) 12 | ''' 13 | if len(array) == 0: 14 | return True, None 15 | try: 16 | first_element_type = func(array[0]) 17 | except: 18 | return True, None 19 | do_all_have_property = all(func(x) == first_element_type 20 | for x in array) 21 | 22 | return do_all_have_property, first_element_type 23 | 24 | 25 | def _is_class_a_primitive(cls): 26 | ''' 27 | Check if class is a number or string including numpy numbers 28 | :param cls: any class 29 | :return: True if class is a primitive class, else False 30 | ''' 31 | primitives = [ 32 | np.float16, np.float32, np.float64, np.float128, 33 | np.int8, np.int16, np.int32, np.int64, 34 | bool, str, np.uint8, np.uint16, np.uint32, np.uint64, 35 | int, float 36 | ] 37 | return cls in primitives 38 | 39 | 40 | class XSeries(pd.Series): 41 | ''' 42 | XSeries is an homogeneous abstract 1d container that encapsulates any data type inside. 43 | It is an extension of pandas.Series class. 44 | XSeries has a property data_type that is a type ot objects that are inside XSeries. 45 | ''' 46 | _metadata = ['data_type'] 47 | 48 | @property 49 | def _constructor(self): 50 | return XSeries 51 | 52 | @property 53 | def _constructor_expanddim(self): 54 | return XDataFrame 55 | 56 | def __init__(self, *args, **kwargs): 57 | ''' 58 | The same arguments as for pandas.Series 59 | https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html 60 | 61 | In order to create XSeries of any data_type, data argument must be a pythons list. 62 | For example, to create XSeries of pandas.Series, pass data should be 63 | data = [s_1, s2, ..., s3] where s_i is a instance of pandas.Series. 
64 | ''' 65 | super(XSeries, self).__init__(*args, **kwargs) 66 | 67 | data = kwargs.get('data') 68 | if data is None: 69 | data = args[0] 70 | 71 | check_result, data_type = _check_all_elements_have_the_same_property(data, type) 72 | if not check_result: 73 | raise ValueError('Not all elements the same type') 74 | 75 | if data_type is not None: 76 | self._data_type = data_type 77 | else: 78 | self._data_type = type(data._values[0]) 79 | 80 | def apply(self, *args, **kwargs): 81 | ''' 82 | Overwrite standart pandas.Series method. 83 | Apply transform function to all elements in self. 84 | *If transform function return dict like object, 85 | transform XSeries to XDataFrame see XDataFrame constructor* 86 | 87 | :param func: function to apply 88 | :param prefix: prefix for columns if needs to return XDataFrame object 89 | :return: XSeries of XDataFrame depending on transformation 90 | ''' 91 | func = kwargs.get('func') 92 | if func is None: 93 | func = args[0] 94 | 95 | # TODO 96 | # Possibly change to handle NaN 97 | mapped_series = self.dropna() 98 | mapped_series = mapped_series.map(func, na_action='ignore') 99 | mapped_data_type = mapped_series.data_type 100 | 101 | custom_prefix = kwargs.get('prefix') 102 | if custom_prefix is None: 103 | custom_prefix = self.name 104 | else: 105 | custom_prefix = '{}_{}'.format(self.name, custom_prefix) 106 | 107 | if mapped_series.__is_data_type_dict_like(): 108 | custom_df = XDataFrame.from_records(mapped_series.values) 109 | 110 | if custom_prefix is not None: 111 | custom_df.columns = custom_df.columns.map(lambda x: '{}_{}'.format(custom_prefix, x)) 112 | return custom_df 113 | elif mapped_data_type == pd.DataFrame: 114 | return pd.concat(mapped_series.values, ignore_index=True) 115 | else: 116 | mapped_series.name = custom_prefix 117 | 118 | return mapped_series 119 | 120 | def __is_data_type_dict_like(self): 121 | ''' 122 | Check if data encapsulated by self is instance of dict 123 | ''' 124 | return 
isinstance(self.iloc[0], dict) 125 | 126 | @property 127 | def data_type(self): 128 | ''' 129 | Getter for a data_type property 130 | data_type is a data type that self encapsulates 131 | For example, if self is contains images, that data_type would be Image 132 | ''' 133 | first_element_data_type = type(self.iloc[0]) 134 | self._data_type = first_element_data_type 135 | return self._data_type 136 | 137 | @data_type.setter 138 | def data_type(self, data_type): 139 | ''' 140 | Setter for a data_type property 141 | data_type is a data type that self encapsulates 142 | For example, if self is contains images, that data_type would be Image 143 | ''' 144 | 145 | self._data_type = data_type 146 | 147 | def to_pandas_series(self): 148 | ''' 149 | Convert self to pandas.Series if data_type is a primitive type 150 | etc. number of string 151 | :return: Pandas Series or raise exception if data_type is not a primitive type 152 | ''' 153 | is_primitive = _is_class_a_primitive(self.data_type) 154 | if is_primitive: 155 | self.__class__ = pd.Series 156 | else: 157 | raise ValueError('Unable to cast to pd.Series. 
{} is not a primitive type.'.format(self.data_type)) 158 | return self 159 | 160 | def __str__(self): 161 | s = super(XSeries, self).__str__() 162 | return '{}\ndata_type: {}'.format(s, self.data_type) 163 | 164 | def __getitem__(self, key): 165 | return super(XSeries, self).__getitem__(key) 166 | 167 | def __setitem__(self, key, value): 168 | value_type = type(value) 169 | if value_type != self.data_type: 170 | raise ValueError('Can not assign key {} with {} wrong data_type {} correct is {}'.format( 171 | key, value, value_type, self.data_type 172 | )) 173 | 174 | return super(XSeries, self).__setitem__(key, value) 175 | 176 | 177 | class XDataFrame(pd.DataFrame): 178 | ''' 179 | XDataFrame is 2d container that stores XSeries objects 180 | XDataFrame is an extension of pandas.DataFrame object 181 | ''' 182 | 183 | @property 184 | def _constructor(self): 185 | return XDataFrame 186 | 187 | @property 188 | def _constructor_sliced(self): 189 | return XSeries 190 | 191 | def __init__(self, *args, **kwargs): 192 | ''' 193 | The same arguments as for pandas.DataFrame 194 | https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html 195 | 196 | data argument should be a list of XSeries objects or dict of XSeries objects. 197 | In dict is passed, key must be a string and it's indicate appropriate column name. 
198 | For example, to create XDataFrame data should looks like 199 | data = {'col_1': s_1, 'col_2': s_2, ..., 'col_n': s_n} where s_i is a XSeries 200 | ''' 201 | data = kwargs.get('data') 202 | if data is None: 203 | data = args[0] 204 | 205 | data_to_check = [] 206 | if isinstance(data, list): 207 | data_to_check = data 208 | elif isinstance(data, dict): 209 | data_to_check = data.values() 210 | 211 | for d in data_to_check: 212 | if not isinstance(d, XSeries): 213 | raise ValueError('All data must be XSeries instances') 214 | super(XDataFrame, self).__init__(*args, **kwargs) 215 | 216 | def get_columns_of_type(self, column_type): 217 | ''' 218 | Get all columns from XDataFrame with given column_type 219 | :param column_type: list of types or a single type 220 | :return: tuple. the first element is subMultiDataFrame and second is a list of column of a given column_type 221 | ''' 222 | if type(column_type) != list: 223 | column_type = [column_type] 224 | 225 | columns_to_select = [ 226 | col_name 227 | for col_name in self 228 | if self[col_name].data_type in column_type 229 | ] 230 | 231 | return self[columns_to_select], columns_to_select 232 | 233 | def get_data_types(self): 234 | ''' 235 | Get a list of data_types of each XSeries inside XDataFrame 236 | :return: list of data_type 237 | ''' 238 | data_types = [ 239 | self[column].data_type 240 | for column in self 241 | ] 242 | return data_types 243 | 244 | def to_pandas_dataframe(self): 245 | ''' 246 | Convert self to pandas.DataFrame if all columns are primitive types. 247 | See more at XSeries.to_pandas_series 248 | :return: 249 | ''' 250 | data_types = self.get_data_types() 251 | is_all_columns_are_primitive = all( 252 | _is_class_a_primitive(dt) 253 | for dt in data_types 254 | ) 255 | if is_all_columns_are_primitive: 256 | self.__class__ = pd.DataFrame 257 | else: 258 | raise ValueError('Unable to cast to pd.DataFrame. 
{} is not all primitives.'.format(self.data_types)) 259 | return self 260 | 261 | @classmethod 262 | def concat_dataframes(cls, data_frames): 263 | ''' 264 | Concatenate XDataFrame using pandas.concat method 265 | https://pandas.pydata.org/pandas-docs/stable/generated/pandas.concat.html 266 | over columns 267 | :param data_frames: list of XDataFrame instances 268 | :return: XDataFrame — concatenated list of data_frames 269 | ''' 270 | return pd.concat(data_frames, axis=1) 271 | -------------------------------------------------------------------------------- /examples/ExampleUsage.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This notebook presents basic usage examples of the XPandas package." 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Example dataset" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [ 22 | { 23 | "name": "stderr", 24 | "output_type": "stream", 25 | "text": [ 26 | "/Users/iwitaly/anaconda/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. 
Please use the pandas.tseries module instead.\n", 27 | " from pandas.core import datetools\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "from io import BytesIO\n", 33 | "from zipfile import ZipFile\n", 34 | "from urllib.request import urlopen\n", 35 | "\n", 36 | "import numpy as np\n", 37 | "import pandas as pd\n", 38 | "import os, sys\n", 39 | "import requests\n", 40 | "\n", 41 | "sys.path.insert(0, '..')\n", 42 | "\n", 43 | "from xpandas.data_container import *\n", 44 | "from xpandas.transformers import TimeSeriesTransformer, TimeSeriesWindowTransformer" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "The usage example shown is based on open source time series [data set](http://timeseriesclassification.com/Downloads/FordA.zip).\n", 52 | "\n", 53 | "The first thing we need to do is to read data. Here, we use the `urlopen` function from Python's built-in urllib to download data set and limit the length of each data series." 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 2, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "url = \"http://timeseriesclassification.com/Downloads/FordA.zip\"\n", 65 | "series_offset = 505" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 3, 71 | "metadata": { 72 | "collapsed": true 73 | }, 74 | "outputs": [], 75 | "source": [ 76 | "url = urlopen(url)\n", 77 | "zipfile = ZipFile(BytesIO(url.read()))\n", 78 | "lines = zipfile.open('FordA/FordA.csv').readlines()\n", 79 | "lines = [l.decode('utf-8') for l in lines]\n", 80 | "lines = lines[series_offset:]" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "``lines`` is now a list of strings representing timeseries in a comma separated format that we can convert into floats" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 4, 93 | "metadata": { 94 | "collapsed": true 95 | }, 96 | 
"outputs": [], 97 | "source": [ 98 | "lines = [list(map(float, l.split(','))) for l in lines]" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "data": { 108 | "text/plain": [ 109 | "[1.1871,\n", 110 | " 0.4096,\n", 111 | " -0.43154,\n", 112 | " -1.231,\n", 113 | " -1.9055,\n", 114 | " -2.3824,\n", 115 | " -2.588,\n", 116 | " -2.5018,\n", 117 | " -2.1353,\n", 118 | " -1.574]" 119 | ] 120 | }, 121 | "execution_count": 5, 122 | "metadata": {}, 123 | "output_type": "execute_result" 124 | } 125 | ], 126 | "source": [ 127 | "lines[0][:10]" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "Let's convert each embedded list into more convenient ``pandas.Series`` object." 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 6, 140 | "metadata": { 141 | "collapsed": true 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "lines = [pd.Series(l) for l in lines]" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 7, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/plain": [ 156 | "0 1.18710\n", 157 | "1 0.40960\n", 158 | "2 -0.43154\n", 159 | "3 -1.23100\n", 160 | "4 -1.90550\n", 161 | "5 -2.38240\n", 162 | "6 -2.58800\n", 163 | "7 -2.50180\n", 164 | "8 -2.13530\n", 165 | "9 -1.57400\n", 166 | "dtype: float64" 167 | ] 168 | }, 169 | "execution_count": 7, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "lines[0][:10]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "# XPandas: Data structures" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### XSeries" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "``XSeries`` is a 1d data container that can store 
any objects inside.\n", 197 | "\n", 198 | "Using the ``pandas.Series`` objects we can encapsulate the list ``lines`` into an ``XSeries`` object. The object has a global index of series and a sub-index for each ``pandas.Series``." 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 8, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "X = XSeries(lines)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 9, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "0 0 1.187100\n", 221 | "1 0.409600\n", 222 | "2 -0.43154...\n", 223 | "1 0 0.094261\n", 224 | "1 0.310310\n", 225 | "2 0.53060...\n", 226 | "2 0 -1.157000\n", 227 | "1 -1.592600\n", 228 | "2 -1.50960...\n", 229 | "3 0 0.356960\n", 230 | "1 0.300850\n", 231 | "2 0.24314...\n", 232 | "4 0 0.307980\n", 233 | "1 0.370350\n", 234 | "2 0.26015...\n", 235 | "dtype: object\n", 236 | "data_type: " 237 | ] 238 | }, 239 | "execution_count": 9, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "X.head()" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "The output reveals the ``data_type`` property of the ``XSeries`` object which contains the type of the contained objects, in this case, ``pandas.Series``. The ``XSeries`` is thus built up of ``pandas.Series``. Specifically, ``X`` supports all methods of its containing object ``pandas.Series``." 
253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "### XDataFrame" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "``XDataFrame`` is an abstract 2d container that is based on ``pandas.DataFrame`` and stores ``XSeries`` objects.\n", 267 | "\n", 268 | "The main feature of the ``XDataFrame`` is its columns of ``XSeries``, which can contain and manage any **data_type**. For example, one may have a data set consisting of series, images, texts, plain numbers, or even custom objects. Ideally, we would want to handle such different data types in a unified 2d data container, e.g. by using a chain of transformers to create a simple 2d matrix of training data.\n", 269 | "\n", 270 | "The following example illustrates such a ``XDataFrame`` workflow.\n", 271 | "\n", 272 | "Let ``Y`` be a vector of labels for each row." 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 10, 278 | "metadata": { 279 | "collapsed": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "Y = np.random.binomial(1, 0.5, X.shape[0])\n", 284 | "Y = XSeries(Y)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 11, 290 | "metadata": { 291 | "collapsed": true 292 | }, 293 | "outputs": [], 294 | "source": [ 295 | "df = XDataFrame({\n", 296 | " 'X': X,\n", 297 | " 'Y': Y\n", 298 | "})" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 12, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/html": [ 309 | "
\n", 310 | "\n", 323 | "\n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | "
XY
00 1.187100\n", 335 | "1 0.409600\n", 336 | "2 -0.43154...1
10 0.094261\n", 342 | "1 0.310310\n", 343 | "2 0.53060...1
20 -1.157000\n", 349 | "1 -1.592600\n", 350 | "2 -1.50960...0
30 0.356960\n", 356 | "1 0.300850\n", 357 | "2 0.24314...1
40 0.307980\n", 363 | "1 0.370350\n", 364 | "2 0.26015...1
\n", 369 | "
" 370 | ], 371 | "text/plain": [ 372 | " X Y\n", 373 | "0 0 1.187100\n", 374 | "1 0.409600\n", 375 | "2 -0.43154... 1\n", 376 | "1 0 0.094261\n", 377 | "1 0.310310\n", 378 | "2 0.53060... 1\n", 379 | "2 0 -1.157000\n", 380 | "1 -1.592600\n", 381 | "2 -1.50960... 0\n", 382 | "3 0 0.356960\n", 383 | "1 0.300850\n", 384 | "2 0.24314... 1\n", 385 | "4 0 0.307980\n", 386 | "1 0.370350\n", 387 | "2 0.26015... 1" 388 | ] 389 | }, 390 | "execution_count": 12, 391 | "metadata": {}, 392 | "output_type": "execute_result" 393 | } 394 | ], 395 | "source": [ 396 | "df.head()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "Add new column to XDataFrame:" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 13, 409 | "metadata": { 410 | "collapsed": true 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "df['X_1'] = XSeries([\n", 415 | " pd.Series(np.random.normal(size=100))\n", 416 | " for _ in range(X.shape[0])\n", 417 | "])" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "# XPandas: Transformers" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "A major motivation for this project is the common data science task of extracting features from some complex objects (for example series) before proceeding with the machine learning.\n", 432 | "\n", 433 | "Given a ``XSeries`` of ``pandas.Series`` one would, for instance, like to extract features from each series. That's where *Transformers* play a vital role.\n", 434 | "\n", 435 | "Each ``Transformer`` object support ``fit, transform`` methods just like [scikit-learn transformers](http://scikit-learn.org/stable/data_transforms.html).\n", 436 | "\n", 437 | "Let's explore some examples." 
438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "### TimeSeriesWindowTransformer\n", 445 | "\n", 446 | "This transformer calculates a moving average with a given window size." 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 14, 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "outputs": [], 456 | "source": [ 457 | "tr = TimeSeriesWindowTransformer(windows_size=5)\n", 458 | "tr.fit(X)\n", 459 | "transformed_series = tr.transform(X)" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 15, 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "data": { 469 | "text/plain": [ 470 | "0 4 -0.394268\n", 471 | "5 -1.108168\n", 472 | "6 -1.70768...\n", 473 | "1 4 0.509686\n", 474 | "5 0.680500\n", 475 | "6 0.80574...\n", 476 | "2 4 -1.098344\n", 477 | "5 -0.755320\n", 478 | "6 -0.21608...\n", 479 | "3 4 0.234223\n", 480 | "5 0.165730\n", 481 | "6 0.09269...\n", 482 | "4 4 0.202701\n", 483 | "5 0.154336\n", 484 | "6 0.14082...\n", 485 | "dtype: object\n", 486 | "data_type: " 487 | ] 488 | }, 489 | "execution_count": 15, 490 | "metadata": {}, 491 | "output_type": "execute_result" 492 | } 493 | ], 494 | "source": [ 495 | "transformed_series.head()" 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "metadata": {}, 501 | "source": [ 502 | "Of course, with windows_size = 5 the first 4 elements of the moving average are NaN, which is why each transformed series shown here starts at index 4." 
503 | ] 504 | }, 505 | { 506 | "cell_type": "code", 507 | "execution_count": 16, 508 | "metadata": {}, 509 | "outputs": [ 510 | { 511 | "data": { 512 | "text/plain": [ 513 | "4 -0.394268\n", 514 | "5 -1.108168\n", 515 | "6 -1.707688\n", 516 | "7 -2.121740\n", 517 | "8 -2.302600\n", 518 | "9 -2.236300\n", 519 | "10 -1.942152\n", 520 | "11 -1.469980\n", 521 | "12 -0.891442\n", 522 | "13 -0.287676\n", 523 | "dtype: float64" 524 | ] 525 | }, 526 | "execution_count": 16, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "transformed_series[0].head(10)" 533 | ] 534 | }, 535 | { 536 | "cell_type": "markdown", 537 | "metadata": {}, 538 | "source": [ 539 | "### TimeSeriesTransformer\n", 540 | "\n", 541 | "Let's try another transformer, probably the most common one. It extracts several quantitative features from each pandas.Series, such as mean, std and quantiles. You can also pass your own list of features. As a result we obtain an ``XDataFrame`` object." 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 17, 547 | "metadata": { 548 | "collapsed": true 549 | }, 550 | "outputs": [], 551 | "source": [ 552 | "tr = TimeSeriesTransformer()\n", 553 | "tr.fit(X)\n", 554 | "transformed_series = tr.transform(X)" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 18, 560 | "metadata": {}, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "text/plain": [ 565 | "xpandas.data_container.data_container.XDataFrame" 566 | ] 567 | }, 568 | "execution_count": 18, 569 | "metadata": {}, 570 | "output_type": "execute_result" 571 | } 572 | ], 573 | "source": [ 574 | "type(transformed_series)" 575 | ] 576 | }, 577 | { 578 | "cell_type": "code", 579 | "execution_count": 19, 580 | "metadata": {}, 581 | "outputs": [ 582 | { 583 | "data": { 584 | "text/html": [ 585 | "
\n", 586 | "\n", 599 | "\n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | "
None_TimeSeriesTransformer_maxNone_TimeSeriesTransformer_meanNone_TimeSeriesTransformer_median
02.52630.0019950.011186
12.62910.001997-0.024726
22.6072-0.0019960.060685
32.6431-0.001997-0.022668
43.2398-0.001995-0.048518
\n", 641 | "
" 642 | ], 643 | "text/plain": [ 644 | " None_TimeSeriesTransformer_max None_TimeSeriesTransformer_mean \\\n", 645 | "0 2.5263 0.001995 \n", 646 | "1 2.6291 0.001997 \n", 647 | "2 2.6072 -0.001996 \n", 648 | "3 2.6431 -0.001997 \n", 649 | "4 3.2398 -0.001995 \n", 650 | "\n", 651 | " None_TimeSeriesTransformer_median \n", 652 | "0 0.011186 \n", 653 | "1 -0.024726 \n", 654 | "2 0.060685 \n", 655 | "3 -0.022668 \n", 656 | "4 -0.048518 " 657 | ] 658 | }, 659 | "execution_count": 19, 660 | "metadata": {}, 661 | "output_type": "execute_result" 662 | } 663 | ], 664 | "source": [ 665 | "transformed_series.head().iloc[:, :3]" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "We can also make use of the TSFresh transformer" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 20, 678 | "metadata": { 679 | "collapsed": true 680 | }, 681 | "outputs": [], 682 | "source": [ 683 | "from xpandas.transformers import TsFreshSeriesTransformer" 684 | ] 685 | }, 686 | { 687 | "cell_type": "code", 688 | "execution_count": 21, 689 | "metadata": { 690 | "collapsed": true 691 | }, 692 | "outputs": [], 693 | "source": [ 694 | "tr = TsFreshSeriesTransformer()\n", 695 | "tr.fit(X.head())\n", 696 | "transformed_series = tr.transform(X.head())" 697 | ] 698 | }, 699 | { 700 | "cell_type": "code", 701 | "execution_count": 22, 702 | "metadata": {}, 703 | "outputs": [ 704 | { 705 | "data": { 706 | "text/html": [ 707 | "
\n", 708 | "\n", 721 | "\n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | "
None__abs_energyNone__absolute_sum_of_changesNone__agg_autocorrelation__f_agg_\"mean\"
0500.000126134.513280-0.012049
1499.999290114.2899250.003075
2500.001514164.089622-0.013172
3499.999445103.510040-0.005639
4500.003011154.2995420.001552
\n", 763 | "
" 764 | ], 765 | "text/plain": [ 766 | " None__abs_energy None__absolute_sum_of_changes \\\n", 767 | "0 500.000126 134.513280 \n", 768 | "1 499.999290 114.289925 \n", 769 | "2 500.001514 164.089622 \n", 770 | "3 499.999445 103.510040 \n", 771 | "4 500.003011 154.299542 \n", 772 | "\n", 773 | " None__agg_autocorrelation__f_agg_\"mean\" \n", 774 | "0 -0.012049 \n", 775 | "1 0.003075 \n", 776 | "2 -0.013172 \n", 777 | "3 -0.005639 \n", 778 | "4 0.001552 " 779 | ] 780 | }, 781 | "execution_count": 22, 782 | "metadata": {}, 783 | "output_type": "execute_result" 784 | } 785 | ], 786 | "source": [ 787 | "transformed_series.head().iloc[:, :3]" 788 | ] 789 | }, 790 | { 791 | "cell_type": "markdown", 792 | "metadata": {}, 793 | "source": [ 794 | "### Custom inline Transformer" 795 | ] 796 | }, 797 | { 798 | "cell_type": "markdown", 799 | "metadata": {}, 800 | "source": [ 801 | "One can also create inline ``CustomTransfomer`` like this" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": 23, 807 | "metadata": { 808 | "collapsed": true 809 | }, 810 | "outputs": [], 811 | "source": [ 812 | "from xpandas.transformers import XSeriesTransformer" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": 24, 818 | "metadata": { 819 | "collapsed": true 820 | }, 821 | "outputs": [], 822 | "source": [ 823 | "my_awesome_transfomer = XSeriesTransformer(transform_function=lambda x: x.std())" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 25, 829 | "metadata": {}, 830 | "outputs": [ 831 | { 832 | "data": { 833 | "text/plain": [ 834 | "XSeriesTransformer(data_types=None, name='XSeriesTransformer',\n", 835 | " transform_function= at 0x11929ad90>)" 836 | ] 837 | }, 838 | "execution_count": 25, 839 | "metadata": {}, 840 | "output_type": "execute_result" 841 | } 842 | ], 843 | "source": [ 844 | "my_awesome_transfomer.fit(X)" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 26, 850 | "metadata": 
{}, 851 | "outputs": [ 852 | { 853 | "data": { 854 | "text/plain": [ 855 | "0 0.999998\n", 856 | "1 0.999997\n", 857 | "2 1.000000\n", 858 | "3 0.999997\n", 859 | "4 1.000001\n", 860 | "dtype: float64\n", 861 | "data_type: " 862 | ] 863 | }, 864 | "execution_count": 26, 865 | "metadata": {}, 866 | "output_type": "execute_result" 867 | } 868 | ], 869 | "source": [ 870 | "my_awesome_transfomer.transform(X).head()" 871 | ] 872 | }, 873 | { 874 | "cell_type": "markdown", 875 | "metadata": {}, 876 | "source": [ 877 | "If you want to create your own custom transformer with more complex logic, please take a look at the internal implementation of the transformers." 878 | ] 879 | }, 880 | { 881 | "cell_type": "markdown", 882 | "metadata": {}, 883 | "source": [ 884 | "## XDataFrame transformer" 885 | ] 886 | }, 887 | { 888 | "cell_type": "markdown", 889 | "metadata": {}, 890 | "source": [ 891 | "To transform a **XDataFrame** one has to specify the transformation logic for the columns that should be transformed using a **XDataFrameTransformer**.\n", 892 | "\n", 893 | "The constructor of **XDataFrameTransformer** takes as input a mapping dictionary of {col_name: XSeries transformer}.\n", 894 | "\n", 895 | "For example, let's apply **TimeSeriesWindowTransformer** to the $X$ column and **TimeSeriesTransformer** to the $X_1$ column.\n", 896 | "\n", 897 | "When a transformation is applied to a column, *the column is replaced with its transformed version*." 
898 | ] 899 | }, 900 | { 901 | "cell_type": "code", 902 | "execution_count": 27, 903 | "metadata": { 904 | "collapsed": true 905 | }, 906 | "outputs": [], 907 | "source": [ 908 | "from xpandas.transformers import XDataFrameTransformer" 909 | ] 910 | }, 911 | { 912 | "cell_type": "code", 913 | "execution_count": 28, 914 | "metadata": { 915 | "collapsed": true 916 | }, 917 | "outputs": [], 918 | "source": [ 919 | "df_transformer = XDataFrameTransformer({\n", 920 | " 'X': TimeSeriesWindowTransformer(windows_size=4),\n", 921 | " 'X_1': TimeSeriesTransformer()\n", 922 | "})" 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": 29, 928 | "metadata": {}, 929 | "outputs": [ 930 | { 931 | "data": { 932 | "text/plain": [ 933 | "XDataFrameTransformer(transformations={'X': [TimeSeriesWindowTransformer(windows_size=4)], 'X_1': [TimeSeriesTransformer(features=None)]})" 934 | ] 935 | }, 936 | "execution_count": 29, 937 | "metadata": {}, 938 | "output_type": "execute_result" 939 | } 940 | ], 941 | "source": [ 942 | "df_transformer.fit(df)" 943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": 30, 948 | "metadata": { 949 | "collapsed": true 950 | }, 951 | "outputs": [], 952 | "source": [ 953 | "transformed_df = df_transformer.transform(df)" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": 31, 959 | "metadata": {}, 960 | "outputs": [ 961 | { 962 | "data": { 963 | "text/html": [ 964 | "
\n", 965 | "\n", 978 | "\n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | "
X_TimeSeriesWindowTransformerYX_1_TimeSeriesTransformer_max
03 -0.016460\n", 991 | "4 -0.789610\n", 992 | "5 -1.48761...12.383478
13 0.416408\n", 999 | "4 0.613542\n", 1000 | "5 0.77304...12.451725
23 -1.315175\n", 1007 | "4 -1.083680\n", 1008 | "5 -0.54600...02.164009
33 0.268788\n", 1015 | "4 0.203539\n", 1016 | "5 0.13194...12.951486
43 0.255629\n", 1023 | "4 0.176381\n", 1024 | "5 0.10033...12.453836
\n", 1030 | "
" 1031 | ], 1032 | "text/plain": [ 1033 | " X_TimeSeriesWindowTransformer Y \\\n", 1034 | "0 3 -0.016460\n", 1035 | "4 -0.789610\n", 1036 | "5 -1.48761... 1 \n", 1037 | "1 3 0.416408\n", 1038 | "4 0.613542\n", 1039 | "5 0.77304... 1 \n", 1040 | "2 3 -1.315175\n", 1041 | "4 -1.083680\n", 1042 | "5 -0.54600... 0 \n", 1043 | "3 3 0.268788\n", 1044 | "4 0.203539\n", 1045 | "5 0.13194... 1 \n", 1046 | "4 3 0.255629\n", 1047 | "4 0.176381\n", 1048 | "5 0.10033... 1 \n", 1049 | "\n", 1050 | " X_1_TimeSeriesTransformer_max \n", 1051 | "0 2.383478 \n", 1052 | "1 2.451725 \n", 1053 | "2 2.164009 \n", 1054 | "3 2.951486 \n", 1055 | "4 2.453836 " 1056 | ] 1057 | }, 1058 | "execution_count": 31, 1059 | "metadata": {}, 1060 | "output_type": "execute_result" 1061 | } 1062 | ], 1063 | "source": [ 1064 | "transformed_df.head().iloc[:, :3]" 1065 | ] 1066 | }, 1067 | { 1068 | "cell_type": "markdown", 1069 | "metadata": {}, 1070 | "source": [ 1071 | "## Pipeline transformer" 1072 | ] 1073 | }, 1074 | { 1075 | "cell_type": "markdown", 1076 | "metadata": {}, 1077 | "source": [ 1078 | "Well, that's a nice transformer, but can I create [pipelines](http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) as in scikit-learn?\n", 1079 | "\n", 1080 | "Sure! Let's see on example where we combine ``TimeSeriesTransformer`` and ``TimeSeriesWindowTransformer`` into a combined pipeline using a ``PipeLineChain``.\n", 1081 | "\n", 1082 | "First let's see example of ``PipeLineChain`` with ``XSeries`` and then with ``XDataFrame``." 
1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "execution_count": 32, 1088 | "metadata": { 1089 | "collapsed": true 1090 | }, 1091 | "outputs": [], 1092 | "source": [ 1093 | "from xpandas.transformers import PipeLineChain" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": 33, 1099 | "metadata": {}, 1100 | "outputs": [ 1101 | { 1102 | "data": { 1103 | "text/plain": [ 1104 | "PipeLineChain(steps=[('moving average trans', TimeSeriesWindowTransformer(windows_size=5)), ('extract features', TimeSeriesTransformer(features=None))])" 1105 | ] 1106 | }, 1107 | "execution_count": 33, 1108 | "metadata": {}, 1109 | "output_type": "execute_result" 1110 | } 1111 | ], 1112 | "source": [ 1113 | "chain = PipeLineChain([\n", 1114 | " ('moving average trans', TimeSeriesWindowTransformer(windows_size=5)),\n", 1115 | " ('extract features', TimeSeriesTransformer())\n", 1116 | "])\n", 1117 | "chain.fit(X)" 1118 | ] 1119 | }, 1120 | { 1121 | "cell_type": "code", 1122 | "execution_count": 34, 1123 | "metadata": {}, 1124 | "outputs": [ 1125 | { 1126 | "data": { 1127 | "text/plain": [ 1128 | "" 1129 | ] 1130 | }, 1131 | "execution_count": 34, 1132 | "metadata": {}, 1133 | "output_type": "execute_result" 1134 | } 1135 | ], 1136 | "source": [ 1137 | "chain.get_params" 1138 | ] 1139 | }, 1140 | { 1141 | "cell_type": "code", 1142 | "execution_count": 35, 1143 | "metadata": { 1144 | "collapsed": true 1145 | }, 1146 | "outputs": [], 1147 | "source": [ 1148 | "transformed_X = chain.transform(X)" 1149 | ] 1150 | }, 1151 | { 1152 | "cell_type": "code", 1153 | "execution_count": 36, 1154 | "metadata": {}, 1155 | "outputs": [ 1156 | { 1157 | "data": { 1158 | "text/html": [ 1159 | "
\n", 1160 | "\n", 1173 | "\n", 1174 | " \n", 1175 | " \n", 1176 | " \n", 1177 | " \n", 1178 | " \n", 1179 | " \n", 1180 | " \n", 1181 | " \n", 1182 | " \n", 1183 | " \n", 1184 | " \n", 1185 | " \n", 1186 | " \n", 1187 | " \n", 1188 | " \n", 1189 | " \n", 1190 | " \n", 1191 | " \n", 1192 | " \n", 1193 | " \n", 1194 | " \n", 1195 | " \n", 1196 | " \n", 1197 | " \n", 1198 | " \n", 1199 | " \n", 1200 | " \n", 1201 | " \n", 1202 | " \n", 1203 | " \n", 1204 | " \n", 1205 | " \n", 1206 | " \n", 1207 | " \n", 1208 | "
None_TimeSeriesWindowTransformer_TimeSeriesTransformer_maxNone_TimeSeriesWindowTransformer_TimeSeriesTransformer_mean
02.161440.002078
12.39636-0.002229
22.325120.005656
32.444300.000632
42.64094-0.001295
\n", 1209 | "
" 1210 | ], 1211 | "text/plain": [ 1212 | " None_TimeSeriesWindowTransformer_TimeSeriesTransformer_max \\\n", 1213 | "0 2.16144 \n", 1214 | "1 2.39636 \n", 1215 | "2 2.32512 \n", 1216 | "3 2.44430 \n", 1217 | "4 2.64094 \n", 1218 | "\n", 1219 | " None_TimeSeriesWindowTransformer_TimeSeriesTransformer_mean \n", 1220 | "0 0.002078 \n", 1221 | "1 -0.002229 \n", 1222 | "2 0.005656 \n", 1223 | "3 0.000632 \n", 1224 | "4 -0.001295 " 1225 | ] 1226 | }, 1227 | "execution_count": 36, 1228 | "metadata": {}, 1229 | "output_type": "execute_result" 1230 | } 1231 | ], 1232 | "source": [ 1233 | "transformed_X.head().iloc[:, :2]" 1234 | ] 1235 | }, 1236 | { 1237 | "cell_type": "markdown", 1238 | "metadata": { 1239 | "collapsed": true 1240 | }, 1241 | "source": [ 1242 | "All right! Let's try to add scikit-learn transformer to the PipeLineChain. For example, let's do PCA on transformed_X." 1243 | ] 1244 | }, 1245 | { 1246 | "cell_type": "code", 1247 | "execution_count": 37, 1248 | "metadata": { 1249 | "collapsed": true 1250 | }, 1251 | "outputs": [], 1252 | "source": [ 1253 | "from sklearn.decomposition import PCA" 1254 | ] 1255 | }, 1256 | { 1257 | "cell_type": "code", 1258 | "execution_count": 38, 1259 | "metadata": {}, 1260 | "outputs": [ 1261 | { 1262 | "data": { 1263 | "text/plain": [ 1264 | "PipeLineChain(steps=[('moving average trans', TimeSeriesWindowTransformer(windows_size=5)), ('extract features', TimeSeriesTransformer(features=None)), ('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,\n", 1265 | " svd_solver='auto', tol=0.0, whiten=False))])" 1266 | ] 1267 | }, 1268 | "execution_count": 38, 1269 | "metadata": {}, 1270 | "output_type": "execute_result" 1271 | } 1272 | ], 1273 | "source": [ 1274 | "chain = PipeLineChain([\n", 1275 | " ('moving average trans', TimeSeriesWindowTransformer(windows_size=5)),\n", 1276 | " ('extract features', TimeSeriesTransformer()),\n", 1277 | " ('pca', PCA(n_components=5))\n", 1278 | "])\n", 1279 | 
"chain.fit(X)" 1280 | ] 1281 | }, 1282 | { 1283 | "cell_type": "code", 1284 | "execution_count": 39, 1285 | "metadata": {}, 1286 | "outputs": [], 1287 | "source": [ 1288 | "transformed_X = chain.transform(X)" 1289 | ] 1290 | }, 1291 | { 1292 | "cell_type": "code", 1293 | "execution_count": 40, 1294 | "metadata": {}, 1295 | "outputs": [ 1296 | { 1297 | "data": { 1298 | "text/html": [ 1299 | "
\n", 1300 | "\n", 1313 | "\n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | "
01234
0-0.133152-0.2425520.097523-0.004435-0.009747
1-0.1254130.076021-0.0892670.0105310.017437
2-0.028607-0.0888280.2050430.0980090.032338
30.071478-0.058813-0.247669-0.023550-0.052968
40.2006110.1108840.0642000.012187-0.038497
\n", 1367 | "
" 1368 | ], 1369 | "text/plain": [ 1370 | " 0 1 2 3 4\n", 1371 | "0 -0.133152 -0.242552 0.097523 -0.004435 -0.009747\n", 1372 | "1 -0.125413 0.076021 -0.089267 0.010531 0.017437\n", 1373 | "2 -0.028607 -0.088828 0.205043 0.098009 0.032338\n", 1374 | "3 0.071478 -0.058813 -0.247669 -0.023550 -0.052968\n", 1375 | "4 0.200611 0.110884 0.064200 0.012187 -0.038497" 1376 | ] 1377 | }, 1378 | "execution_count": 40, 1379 | "metadata": {}, 1380 | "output_type": "execute_result" 1381 | } 1382 | ], 1383 | "source": [ 1384 | "transformed_X.head()" 1385 | ] 1386 | }, 1387 | { 1388 | "cell_type": "markdown", 1389 | "metadata": {}, 1390 | "source": [ 1391 | "Let's do even more interesting things! Adding a scikit-learn estimator at the end of PipeLineChain!" 1392 | ] 1393 | }, 1394 | { 1395 | "cell_type": "code", 1396 | "execution_count": 41, 1397 | "metadata": { 1398 | "collapsed": true 1399 | }, 1400 | "outputs": [], 1401 | "source": [ 1402 | "from sklearn.linear_model import LogisticRegression\n", 1403 | "from sklearn.model_selection import train_test_split\n", 1404 | "from sklearn.metrics import accuracy_score" 1405 | ] 1406 | }, 1407 | { 1408 | "cell_type": "code", 1409 | "execution_count": 42, 1410 | "metadata": { 1411 | "collapsed": true 1412 | }, 1413 | "outputs": [], 1414 | "source": [ 1415 | "X_train, X_test, y_train, y_test = train_test_split(X, Y)" 1416 | ] 1417 | }, 1418 | { 1419 | "cell_type": "markdown", 1420 | "metadata": {}, 1421 | "source": [ 1422 | "Be sure that types of X_train and X_test are XSeries." 
1423 | ] 1424 | }, 1425 | { 1426 | "cell_type": "code", 1427 | "execution_count": 43, 1428 | "metadata": {}, 1429 | "outputs": [ 1430 | { 1431 | "name": "stdout", 1432 | "output_type": "stream", 1433 | "text": [ 1434 | "\n", 1435 | "\n" 1436 | ] 1437 | } 1438 | ], 1439 | "source": [ 1440 | "print(type(X_train))\n", 1441 | "print(type(X_test))" 1442 | ] 1443 | }, 1444 | { 1445 | "cell_type": "code", 1446 | "execution_count": 44, 1447 | "metadata": { 1448 | "collapsed": true 1449 | }, 1450 | "outputs": [], 1451 | "source": [ 1452 | "chain = PipeLineChain([\n", 1453 | " ('moving average trans', TimeSeriesWindowTransformer(windows_size=5)),\n", 1454 | " ('extract features', TimeSeriesTransformer()),\n", 1455 | " ('pca', PCA(n_components=5)),\n", 1456 | " ('logit_regression', LogisticRegression())\n", 1457 | " \n", 1458 | "])\n", 1459 | "chain = chain.fit(X_train, y_train)" 1460 | ] 1461 | }, 1462 | { 1463 | "cell_type": "code", 1464 | "execution_count": 45, 1465 | "metadata": { 1466 | "collapsed": true 1467 | }, 1468 | "outputs": [], 1469 | "source": [ 1470 | "prediction = chain.predict(X_test)" 1471 | ] 1472 | }, 1473 | { 1474 | "cell_type": "code", 1475 | "execution_count": 46, 1476 | "metadata": {}, 1477 | "outputs": [ 1478 | { 1479 | "data": { 1480 | "text/plain": [ 1481 | "0.5004061738424046" 1482 | ] 1483 | }, 1484 | "execution_count": 46, 1485 | "metadata": {}, 1486 | "output_type": "execute_result" 1487 | } 1488 | ], 1489 | "source": [ 1490 | "accuracy_score(y_test, prediction)" 1491 | ] 1492 | }, 1493 | { 1494 | "cell_type": "markdown", 1495 | "metadata": {}, 1496 | "source": [ 1497 | "Let's now try ``PipeLineChain`` with ``XDataFrameTransformer``.\n", 1498 | "\n", 1499 | "Imagine a data set with the feature columns gender (0 or 1), age (int) and series (pandas.Series), and a target (0 or 1). 
Let's create a ``PipeLineChain`` that extracts features from the series, performs ``PCA`` over the whole feature set and then performs ``LogisticRegression`` classification." 
1500 | ] 1501 | }, 1502 | { 1503 | "cell_type": "code", 1504 | "execution_count": 47, 1505 | "metadata": { 1506 | "collapsed": true 1507 | }, 1508 | "outputs": [], 1509 | "source": [ 1510 | "n = 100\n", 1511 | "\n", 1512 | "df_features = XDataFrame({\n", 1513 | " 'gender': XSeries(np.random.binomial(1, 0.7, n)),\n", 1514 | " 'age': XSeries(np.random.poisson(25, n)),\n", 1515 | " 'series': XSeries([\n", 1516 | " pd.Series(np.random.normal(size=500))\n", 1517 | " ] * n)\n", 1518 | "})\n", 1519 | "\n", 1520 | "target = XSeries(np.random.binomial(1, 0.45, n))" 1521 | ] 1522 | }, 1523 | { 1524 | "cell_type": "code", 1525 | "execution_count": 48, 1526 | "metadata": { 1527 | "collapsed": true 1528 | }, 1529 | "outputs": [], 1530 | "source": [ 1531 | "features_transformer = XDataFrameTransformer({\n", 1532 | " 'series': TimeSeriesTransformer()\n", 1533 | "})" 1534 | ] 1535 | }, 1536 | { 1537 | "cell_type": "code", 1538 | "execution_count": 49, 1539 | "metadata": { 1540 | "collapsed": true 1541 | }, 1542 | "outputs": [], 1543 | "source": [ 1544 | "pipe_line = PipeLineChain([\n", 1545 | " ('extract_from_series', features_transformer),\n", 1546 | " ('pca', PCA(n_components=5)),\n", 1547 | " ('logit_regression', LogisticRegression())\n", 1548 | "])" 1549 | ] 1550 | }, 1551 | { 1552 | "cell_type": "code", 1553 | "execution_count": 50, 1554 | "metadata": { 1555 | "collapsed": true 1556 | }, 1557 | "outputs": [], 1558 | "source": [ 1559 | "df_features_train, df_features_test, \\\n", 1560 | " y_train, y_test = train_test_split(df_features, target)" 1561 | ] 1562 | }, 1563 | { 1564 | "cell_type": "code", 1565 | "execution_count": 51, 1566 | "metadata": {}, 1567 | "outputs": [ 1568 | { 1569 | "data": { 1570 | "text/plain": [ 1571 | "PipeLineChain(steps=[('extract_from_series', XDataFrameTransformer(transformations={'series': [TimeSeriesTransformer(features=None)]})), ('pca', PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,\n", 1572 | " svd_solver='auto', 
tol=0.0, whiten=False)), ('logit_regression', LogisticRegression(C=1.0, cla...ty='l2', random_state=None, solver='liblinear', tol=0.0001,\n", 1573 | " verbose=0, warm_start=False))])" 1574 | ] 1575 | }, 1576 | "execution_count": 51, 1577 | "metadata": {}, 1578 | "output_type": "execute_result" 1579 | } 1580 | ], 1581 | "source": [ 1582 | "pipe_line.fit(df_features_train, y_train)" 1583 | ] 1584 | }, 1585 | { 1586 | "cell_type": "code", 1587 | "execution_count": 52, 1588 | "metadata": {}, 1589 | "outputs": [ 1590 | { 1591 | "data": { 1592 | "text/plain": [ 1593 | "array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 1594 | " 0, 0])" 1595 | ] 1596 | }, 1597 | "execution_count": 52, 1598 | "metadata": {}, 1599 | "output_type": "execute_result" 1600 | } 1601 | ], 1602 | "source": [ 1603 | "pipe_line.predict(df_features_test)" 1604 | ] 1605 | }, 1606 | { 1607 | "cell_type": "code", 1608 | "execution_count": null, 1609 | "metadata": { 1610 | "collapsed": true 1611 | }, 1612 | "outputs": [], 1613 | "source": [] 1614 | } 1615 | ], 1616 | "metadata": { 1617 | "kernelspec": { 1618 | "display_name": "Python 3", 1619 | "language": "python", 1620 | "name": "python3" 1621 | }, 1622 | "language_info": { 1623 | "codemirror_mode": { 1624 | "name": "ipython", 1625 | "version": 3 1626 | }, 1627 | "file_extension": ".py", 1628 | "mimetype": "text/x-python", 1629 | "name": "python", 1630 | "nbconvert_exporter": "python", 1631 | "pygments_lexer": "ipython3", 1632 | "version": "3.6.1" 1633 | } 1634 | }, 1635 | "nbformat": 4, 1636 | "nbformat_minor": 2 1637 | } 1638 | --------------------------------------------------------------------------------