├── 1_problem.py ├── 2_transform_solution.py ├── 3_feature_union_solution.py ├── LICENSE ├── pandas_feature_union.py ├── pandas_transform.py └── readme.md /1_problem.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.datasets import load_iris 3 | from sklearn.pipeline import FeatureUnion, make_pipeline 4 | from sklearn.preprocessing import FunctionTransformer 5 | 6 | 7 | def main(): 8 | raw_data = load_iris() 9 | data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"]) 10 | 11 | pipeline = FeatureUnion([ 12 | ("1", make_pipeline( 13 | FunctionTransformer(lambda X: X.loc[:, ["sepal length (cm)"]]), 14 | # other transformations 15 | )), 16 | ("2", make_pipeline( 17 | FunctionTransformer(lambda X: X.loc[:, ["sepal width (cm)"]]), 18 | # other transformations 19 | )) 20 | ]) 21 | 22 | X = pipeline.fit_transform(data) 23 | print(X["sepal length (cm)"].mean()) 24 | print(X["sepal width (cm)"].mean()) 25 | 26 | 27 | if __name__ == '__main__': 28 | main() 29 | -------------------------------------------------------------------------------- /2_transform_solution.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.datasets import load_iris 3 | from sklearn.pipeline import FeatureUnion, make_pipeline 4 | 5 | from pandas_transform import PandasTransform 6 | 7 | 8 | def main(): 9 | raw_data = load_iris() 10 | data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"]) 11 | data.loc[:, "class"] = raw_data["target"] 12 | 13 | pipeline = FeatureUnion([ 14 | ("1", make_pipeline( 15 | PandasTransform(lambda X: X.loc[:, ["sepal length (cm)"]]), 16 | # other transformations 17 | )), 18 | ("2", make_pipeline( 19 | PandasTransform(lambda X: X.loc[:, ["sepal width (cm)"]]), 20 | # other transformations 21 | )) 22 | ]) 23 | 24 | X = pipeline.fit_transform(data) 25 | print(X["sepal length (cm)"].mean()) 26 | print(X["sepal width (cm)"].mean()) 27 | 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /3_feature_union_solution.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.datasets import load_iris 3 | from sklearn.pipeline import make_pipeline 4 | 5 | from pandas_feature_union import PandasFeatureUnion 6 | from pandas_transform import PandasTransform 7 | 8 | 9 | def main(): 10 | raw_data = load_iris() 11 | data = pd.DataFrame(raw_data["data"], columns=raw_data["feature_names"]) 12 | data.loc[:, "class"] = raw_data["target"] 13 | 14 | pipeline = PandasFeatureUnion([ 15 | ("1", make_pipeline( 16 | PandasTransform(lambda X: X.loc[:, ["sepal length (cm)"]]), 17 | # other transformations 18 | )), 19 | ("2", make_pipeline( 20 | PandasTransform(lambda X: X.loc[:, ["sepal width (cm)"]]), 21 | # other transformations 22 | )) 23 | ]) 24 | 25 | X = pipeline.fit_transform(data) 26 | print(X["sepal length (cm)"].mean()) 27 | print(X["sepal width (cm)"].mean()) 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Marcin Zabłocki 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /pandas_feature_union.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from sklearn.externals.joblib import Parallel, delayed 4 | from sklearn.pipeline import FeatureUnion, _fit_transform_one, _transform_one 5 | from scipy import sparse 6 | 7 | 8 | class PandasFeatureUnion(FeatureUnion): 9 | def fit_transform(self, X, y=None, **fit_params): 10 | self._validate_transformers() 11 | result = Parallel(n_jobs=self.n_jobs)( 12 | delayed(_fit_transform_one)( 13 | transformer=trans, 14 | X=X, 15 | y=y, 16 | weight=weight, 17 | **fit_params) 18 | for name, trans, weight in self._iter()) 19 | 20 | if not result: 21 | # All transformers are None 22 | return np.zeros((X.shape[0], 0)) 23 | Xs, transformers = zip(*result) 24 | self._update_transformer_list(transformers) 25 | if any(sparse.issparse(f) for f in Xs): 26 | Xs = sparse.hstack(Xs).tocsr() 27 | else: 28 | Xs = self.merge_dataframes_by_column(Xs) 29 | return Xs 30 | 31 | def merge_dataframes_by_column(self, Xs): 32 | return pd.concat(Xs, axis="columns", copy=False) 33 | 34 | def transform(self, X): 35 | Xs = Parallel(n_jobs=self.n_jobs)( 36 | delayed(_transform_one)( 37 | transformer=trans, 38 | X=X, 39 | y=None, 40 | weight=weight) 41 | for name, trans, weight in self._iter()) 42 | if not Xs: 43 | # All transformers are None 44 | return np.zeros((X.shape[0], 0)) 45 | if any(sparse.issparse(f) for f in Xs): 46 | Xs = sparse.hstack(Xs).tocsr() 47 | else: 48 | Xs = self.merge_dataframes_by_column(Xs) 49 | return Xs 50 | -------------------------------------------------------------------------------- /pandas_transform.py: -------------------------------------------------------------------------------- 1 | from sklearn.base import BaseEstimator, TransformerMixin 2 | 3 | 4 | class PandasTransform(TransformerMixin, BaseEstimator): 5 | def __init__(self, fn): 6 | self.fn = fn 7 | 8 | def fit(self, X, y=None): 9 | return self 10 | 11 | def transform(self, X, y=None, copy=None): 12 | return self.fn(X) -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Pandas DataFrame Feature Union 2 | This repo contains code that might help you to use Pandas DataFrame objects inside of scikit-learn Pipelines and FeatureUnion abstractions. 3 | 4 | ## How to use 5 | Implementation description and example usage can be found in my blog post: 6 | https://zablo.net/blog/post/pandas-dataframe-in-scikit-learn-feature-union --------------------------------------------------------------------------------