├── .github
│   └── workflows
│       ├── release.yaml
│       └── tests.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── featuretools_sklearn_transformer
│   ├── __init__.py
│   ├── tests
│   │   ├── __init__.py
│   │   └── test_transformer.py
│   └── transformer.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── test-requirements.txt

/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
on:
  release:
    types: [published]

name: Release
jobs:
  pypi:
    name: Release to PyPI
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Upload to PyPI
        uses: FeatureLabs/gh-action-pypi-upload@v1
        env:
          PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
          PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
          TEST_PYPI_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
          TEST_PYPI_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
on:
  pull_request:
    types: [opened, synchronize]
  push:
    branches:
      - main

name: Tests
jobs:
  entry_point_test:
    name: Entry Point Test - Python ${{ matrix.python_version }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python_version: ["3.7", "3.8"]
    steps:
      - name: Set up python ${{ matrix.python_version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python_version }}
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
      - name: Build source distribution
        run: make package_build
      - name: Install package with test requirements
        run: |
          pip config --site set global.progress_bar off
          pip install --upgrade pip
          pip install dist/package/
          pip install -r dist/package/test-requirements.txt
      - name: Test entry point
        run: make entry-point-test

  lint_tests:
    name: Lint Tests - Python ${{ matrix.python-version }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.7", "3.8"]
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
      - name: Build source distribution
        run: make package_build
      - name: Install package with test requirements
        run: |
          pip config --site set global.progress_bar off
          pip install --upgrade pip
          pip install dist/package/
          pip install -r dist/package/test-requirements.txt
      - name: Run lint tests
        run: cd dist/package && make lint-tests -f ../../Makefile

  unit_tests:
    name: Unit Tests - Python ${{ matrix.python-version }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.7", "3.8"]
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
      - name: Build source distribution
        run: make package_build
      - name: Install package with test requirements
        run: |
          pip config --site set global.progress_bar off
          pip install --upgrade pip
          pip install dist/package/
          pip install -r dist/package/test-requirements.txt
      - name: Run unit tests (no code coverage)
        if: ${{ matrix.python-version != 3.8 }}
        run: cd dist/package && make unit_tests -f ../../Makefile
      - name: Run unit tests with code coverage
        if: ${{ matrix.python-version == 3.8 }}
        run: cd dist/package && make unit_tests addopts="--cov=featuretools_sklearn_transformer" -f ../../Makefile
      - name: Upload coverage to Codecov
        if: ${{ matrix.python-version == 3.8 }}
        uses: codecov/codecov-action@v1
        with:
          fail_ci_if_error: true
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# IDE
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
docs/source/generated

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2019, Feature Labs, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include requirements.txt
include test-requirements.txt
include LICENSE
include README.md
recursive-exclude * __pycache__
recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: clean
clean:
	find . -name '*.pyo' -delete
	find . -name '*.pyc' -delete
	find . -name __pycache__ -delete
	find . -name '*~' -delete

.PHONY: lint-tests
lint-tests:
	flake8 featuretools_sklearn_transformer && isort --check-only --recursive featuretools_sklearn_transformer

.PHONY: lint-fix
lint-fix:
	autopep8 --in-place --recursive --max-line-length=100 --select="E225,E303,E302,E203,E128,E231,E251,E271,E127,E126,E301,W291,W293,E226,E306,E221" featuretools_sklearn_transformer
	isort --recursive featuretools_sklearn_transformer

.PHONY: unit_tests
unit_tests:
	pytest --cache-clear --show-capture=stderr -vv ${addopts}

.PHONY: installdeps
installdeps:
	pip install --upgrade pip -q
	pip install -e .
	pip install -r test-requirements.txt -q

.PHONY: entry-point-test
entry-point-test:
	cd ~ && python -c "from featuretools.wrappers import DFSTransformer"

.PHONY: package_build
package_build:
	rm -rf dist/package
	python setup.py sdist
	$(eval package=$(shell python setup.py --fullname))
	tar -zxvf "dist/${package}.tar.gz"
	mv ${package} dist/package
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# featuretools-sklearn-transformer

![Tests](https://github.com/FeatureLabs/featuretools-sklearn-transformer/workflows/Tests/badge.svg)
[![Coverage Status](https://codecov.io/gh/alteryx/featuretools-sklearn-transformer/branch/main/graph/badge.svg)](https://codecov.io/gh/alteryx/featuretools-sklearn-transformer)
[![PyPI version](https://badge.fury.io/py/featuretools-sklearn-transformer.svg?maxAge=2592000)](https://badge.fury.io/py/featuretools-sklearn-transformer)

[Featuretools](https://github.com/alteryx/featuretools)' DFS as a scikit-learn transformer

### Install
```shell
pip install featuretools_sklearn_transformer
```

### Use

To use the transformer in a pipeline, initialize an instance of the transformer by passing in
the parameters you would like to use for calculating features. To fit the model and generate
features for the training data, pass an EntitySet (or a list of dataframes and relationships)
containing only the relevant training data as the `X` input, along with the training targets
as the `y` input. To generate a feature matrix from test data, pass in an EntitySet containing
only the relevant test data as the `X` input.

The input supplied for `X` can take several formats:

- To use a Featuretools EntitySet without cutoff times, simply pass in the EntitySet
- To use a Featuretools EntitySet with a cutoff times DataFrame, pass in a tuple of the form (EntitySet, cutoff_time_df)
- To use a list of DataFrames and Relationships without cutoff times, pass a tuple of the form (dataframes, relationships)
- To use a list of DataFrames and Relationships with a cutoff times DataFrame, pass a tuple of the form ((dataframes, relationships), cutoff_time_df)

Note that because this transformer requires a Featuretools EntitySet or dataframes and relationships as input, it does not currently work
with certain methods such as `sklearn.model_selection.cross_val_score` or `sklearn.model_selection.GridSearchCV`, which expect the `X` values
to be an iterable that the method can split. One way to work around this is sketched below.
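If you do need cross-validation or a grid search, one option is to run the transformer once up front and hand scikit-learn the resulting feature matrix, which it can split like any DataFrame. The snippet below is a minimal sketch of that workaround, not part of this package's API; the numeric filtering and the assumption that `y` is ordered to match the feature matrix index are illustrative simplifications:

```python
import featuretools as ft

from featuretools.wrappers import DFSTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

es = ft.demo.load_mock_customer(return_entityset=True, n_customers=5)
y = [True, False, True, False, True]  # one label per customer, in index order

# Compute the feature matrix once, outside of any pipeline
fm = DFSTransformer(target_dataframe_name="customers").fit_transform(es)

# Keep only numeric features and fill gaps so a plain estimator can handle them
X = fm.select_dtypes("number").fillna(0)
scores = cross_val_score(ExtraTreesClassifier(n_estimators=10), X, y, cv=2)
```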
The example below shows how to use the transformer with an EntitySet, both with and without a cutoff time DataFrame.

```python
import featuretools as ft
import pandas as pd

from featuretools.wrappers import DFSTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

# Get example data
train_es = ft.demo.load_mock_customer(return_entityset=True, n_customers=3)
test_es = ft.demo.load_mock_customer(return_entityset=True, n_customers=2)
y = [True, False, True]

# Build pipeline
pipeline = Pipeline(steps=[
    ('ft', DFSTransformer(target_dataframe_name="customers",
                          max_features=2)),
    ('et', ExtraTreesClassifier(n_estimators=100))
])

# Fit and predict
pipeline.fit(X=train_es, y=y)  # fit on customers in training entityset
pipeline.predict_proba(test_es)  # predict probability of each class on test entityset
pipeline.predict(test_es)  # predict on test entityset

# Same as above, but using cutoff times
train_ct = pd.DataFrame()
train_ct['customer_id'] = [1, 2, 3]
train_ct['time'] = pd.to_datetime(['2014-1-1 04:00',
                                   '2014-1-2 17:20',
                                   '2014-1-4 09:53'])

pipeline.fit(X=(train_es, train_ct), y=y)

test_ct = pd.DataFrame()
test_ct['customer_id'] = [1, 2]
test_ct['time'] = pd.to_datetime(['2014-1-4 13:48',
                                  '2014-1-5 15:32'])
pipeline.predict_proba((test_es, test_ct))
pipeline.predict((test_es, test_ct))
```
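The transformer accepts the list-of-DataFrames form in the same way. The sketch below continues from the example above and assumes the column names of the `load_mock_customer` demo data: each entry in `dataframes` maps a name to a `(dataframe, index[, time_index])` tuple, and each relationship is a `(parent_dataframe, parent_column, child_dataframe, child_column)` tuple, mirroring the arguments of `featuretools.dfs`.

```python
# Continues from the example above (reuses ft, DFSTransformer, and train_ct)
data = ft.demo.load_mock_customer(n_customers=3)

dataframes = {
    "customers": (data["customers"], "customer_id"),
    "sessions": (data["sessions"], "session_id", "session_start"),
}
relationships = [("customers", "customer_id", "sessions", "customer_id")]

# Without cutoff times, X is (dataframes, relationships)
transformer = DFSTransformer(target_dataframe_name="customers")
fm = transformer.fit((dataframes, relationships)).transform((dataframes, relationships))

# With cutoff times, X is ((dataframes, relationships), cutoff_time_df)
fm_ct = transformer.transform(((dataframes, relationships), train_ct))
```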
## Built at Alteryx Innovation Labs

Alteryx Innovation Labs
--------------------------------------------------------------------------------
/featuretools_sklearn_transformer/__init__.py:
--------------------------------------------------------------------------------
from .transformer import DFSTransformer  # noqa: F401

__version__ = "1.0.0"
--------------------------------------------------------------------------------
/featuretools_sklearn_transformer/tests/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/featuretools_sklearn_transformer/tests/test_transformer.py:
--------------------------------------------------------------------------------
import copy

import numpy as np
import pandas as pd
import pytest
from featuretools.demo.mock_customer import load_mock_customer
from featuretools.wrappers import DFSTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler


def select_numeric(df):
    return df.select_dtypes(exclude=["object", pd.CategoricalDtype])


@pytest.fixture
def es():
    es = load_mock_customer(
        n_customers=15,
        n_products=15,
        n_sessions=75,
        n_transactions=1000,
        random_seed=0,
        return_entityset=True,
    )
    return es


@pytest.fixture
def es_customer_filtered(es):
    new_es = copy.deepcopy(es)
    customers_df = es["customers"]
    sessions_df = es["sessions"]
    products_df = es["products"]
    transactions_df = es["transactions"]
    customer_ids = [1, 2, 3]
    customers_df = customers_df.loc[customer_ids]
    sessions_df = sessions_df[sessions_df["customer_id"].isin(customer_ids)]
    transactions_df = transactions_df[
        transactions_df["session_id"].isin(sessions_df["session_id"].values)
    ]
    products_df = products_df[
        products_df["product_id"].isin(transactions_df["product_id"].values)
    ]
    new_es.replace_dataframe("customers", customers_df)
    new_es.replace_dataframe("sessions", sessions_df)
    new_es.replace_dataframe("transactions", transactions_df)
    new_es.replace_dataframe("products", products_df)

    return new_es


def get_dataframes_and_relationships(es):
    dataframes = {}
    relationships = []

    for df in es.dataframes:
        dataframes[df.ww.name] = (
            df,
            df.ww.index,
            df.ww.time_index,
            df.ww.logical_types,
        )

    for rel in es.relationships:
        relationships.append(
            (
                rel._parent_dataframe_name,
                rel._parent_column_name,
                rel._child_dataframe_name,
                rel._child_column_name,
            )
        )

    return dataframes, relationships


@pytest.fixture
def df(es):
    df = es["customers"].copy()
    df["target"] = np.random.randint(1, 3, df.shape[0])  # 1 or 2 values
    return df
@pytest.fixture
def pipeline():
    pipeline = Pipeline(
        steps=[
            ("ft", DFSTransformer(target_dataframe_name="customers", max_features=20)),
            ("numeric", FunctionTransformer(select_numeric, validate=False)),
            ("imp", SimpleImputer()),
            ("et", ExtraTreesClassifier(n_estimators=10)),
        ]
    )
    return pipeline


def test_sklearn_transformer_with_entityset(es):
    # Using with transformers
    pipeline = Pipeline(
        steps=[
            ("ft", DFSTransformer(target_dataframe_name="customers")),
            ("numeric", FunctionTransformer(select_numeric, validate=False)),
            ("sc", StandardScaler()),
        ]
    )

    X_train = pipeline.fit(es).transform(es)

    assert X_train.shape[0] == 15


def test_sklearn_transformer_with_dataframes_and_relationships(es):
    # Using with transformers
    pipeline = Pipeline(
        steps=[
            ("ft", DFSTransformer(target_dataframe_name="customers")),
            ("numeric", FunctionTransformer(select_numeric, validate=False)),
            ("sc", StandardScaler()),
        ]
    )
    dataframes, relationships = get_dataframes_and_relationships(es)

    X_train = pipeline.fit((dataframes, relationships)).transform(
        (dataframes, relationships)
    )

    assert X_train.shape[0] == 15


def test_sklearn_estimator_with_entityset(df, es, pipeline):
    # Using with estimator
    pipeline.fit(es, y=df.target.values).predict(es)
    result = pipeline.score(es, df.target.values)

    assert isinstance(result, (float))

    # Pickling / Unpickling Pipeline
    # TODO fix this
    # s = pickle.dumps(pipeline)
    # pipe_pickled = pickle.loads(s)
    # result = pipe_pickled.score(df['customer_id'].values, df.target.values)
    # assert isinstance(result, (float))


def test_sklearn_estimator_with_dataframes_and_relationships(df, es, pipeline):
    # Using with estimator
    dataframes, relationships = get_dataframes_and_relationships(es)
    pipeline.fit((dataframes, relationships), y=df.target.values).predict(
        (dataframes, relationships)
    )
    result = pipeline.score((dataframes, relationships), df.target.values)

    assert isinstance(result, (float))


# cross_val_score cannot split entityset input
@pytest.mark.xfail
def test_sklearn_cross_val_score(df, es, pipeline):
    # Using with cross_val_score
    results = cross_val_score(
        pipeline, X=es, y=df.target.values, cv=2, scoring="accuracy"
    )

    assert isinstance(results[0], (float))
    assert isinstance(results[1], (float))


# GridSearchCV cannot split entityset input
@pytest.mark.xfail
def test_sklearn_gridsearchcv(df, es, pipeline):
    # Using with GridSearchCV
    params = {"et__max_depth": [5, 10]}
    grid = GridSearchCV(estimator=pipeline, param_grid=params, cv=3)
    grid.fit(es, df.target.values)

    assert len(grid.predict(df["customer_id"].values)) == 15


def test_sklearn_cutoff_with_entityset(pipeline, es_customer_filtered):
    # Using cutoff_time to filter data
    ct = pd.DataFrame()
    ct["customer_id"] = [1, 2, 3]
    ct["time"] = pd.to_datetime(["2014-1-1 04:00", "2014-1-1 04:00", "2014-1-1 04:00"])
    ct["label"] = [True, True, False]

    results = pipeline.fit(X=(es_customer_filtered, ct), y=ct.label).predict(
        X=(es_customer_filtered, ct)
    )

    assert len(results) == 3


def test_sklearn_cutoff_with_dataframes_and_relationships(
    pipeline, es_customer_filtered
):
    # Using cutoff_time to filter data
    ct = pd.DataFrame()
    ct["customer_id"] = [1, 2, 3]
    ct["time"] = pd.to_datetime(["2014-1-1 04:00", "2014-1-1 04:00", "2014-1-1 04:00"])
    ct["label"] = [True, True, False]

    dataframes, relationships = get_dataframes_and_relationships(es_customer_filtered)
    results = pipeline.fit(X=((dataframes, relationships), ct), y=ct.label).predict(
        X=((dataframes, relationships), ct)
    )

    assert len(results) == 3


def test_cfm_uses_filtered_target_df_with_entityset(es):
    pipeline = Pipeline(
        steps=[("ft", DFSTransformer(target_dataframe_name="transactions"))]
    )

    train_ids = [1, 2, 3]
    test_ids = [10, 55, 853]

    train_es = filter_transactions(es, ids=train_ids)
    test_es = filter_transactions(es, ids=test_ids)

    fm_train = pipeline.fit_transform(X=train_es)
    assert all(fm_train["sessions.COUNT(transactions)"] == [1, 1, 1])
    assert set(fm_train.index.values) == set(train_ids)

    fm_test = pipeline.transform(test_es)

    assert all(fm_test["sessions.COUNT(transactions)"] == [1, 2, 2])
    assert set(fm_test.index.values) == set(test_ids)


def test_cfm_uses_filtered_target_df_with_dataframes_and_relationships(es):
    pipeline = Pipeline(
        steps=[("ft", DFSTransformer(target_dataframe_name="transactions"))]
    )

    train_ids = [3, 1, 2]
    test_ids = [853, 55, 10]

    train_es = filter_transactions(es, ids=train_ids)
    test_es = filter_transactions(es, ids=test_ids)
    train_dataframes, train_relationships = get_dataframes_and_relationships(train_es)
    test_dataframes, test_relationships = get_dataframes_and_relationships(test_es)

    fm_train = pipeline.fit_transform(X=(train_dataframes, train_relationships))
    assert all(fm_train["sessions.COUNT(transactions)"] == [1, 1, 1])
    assert set(fm_train.index.values) == set(train_ids)
    fm_test = pipeline.transform(X=(test_dataframes, test_relationships))
    assert all(fm_test["sessions.COUNT(transactions)"] == [2, 2, 1])
    assert set(fm_test.index.values) == set(test_ids)


def filter_transactions(es, ids):
    new_es = copy.deepcopy(es)
    customers_df = es["customers"]
    sessions_df = es["sessions"]
    products_df = es["products"]
    transactions_df = es["transactions"]
    transactions_df = transactions_df.loc[ids]
    sessions_df = sessions_df[
        sessions_df["session_id"].isin(transactions_df["session_id"].values)
    ]
    products_df = products_df[
        products_df["product_id"].isin(transactions_df["product_id"].values)
    ]
    customers_df = customers_df[
        customers_df["customer_id"].isin(sessions_df["customer_id"].values)
    ]
    new_es.replace_dataframe("customers", customers_df)
    new_es.replace_dataframe("sessions", sessions_df)
    new_es.replace_dataframe("transactions", transactions_df, already_sorted=True)
    new_es.replace_dataframe("products", products_df)

    return new_es
--------------------------------------------------------------------------------
/featuretools_sklearn_transformer/transformer.py:
--------------------------------------------------------------------------------
from featuretools.computational_backends import calculate_feature_matrix
from featuretools.synthesis import dfs
from sklearn.base import TransformerMixin


class DFSTransformer(TransformerMixin):
    """Transformer following the scikit-learn interface, for use in Pipelines."""

    def __init__(self,
                 target_dataframe_name=None,
                 agg_primitives=None,
                 trans_primitives=None,
                 allowed_paths=None,
                 max_depth=2,
                 ignore_dataframes=None,
                 ignore_columns=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_primitives=None,
                 max_features=-1,
                 verbose=False):
        """Creates Transformer

        Args:

            target_dataframe_name (str): Name of dataframe on which to make
                predictions.

            agg_primitives (list[str or AggregationPrimitive], optional): List
                of Aggregation Feature types to apply.

                Default: ["sum", "std", "max", "skew", "min", "mean",
                "count", "percent_true", "num_unique", "mode"]

            trans_primitives (list[str or TransformPrimitive], optional):
                List of Transform Feature functions to apply.

                Default: ["day", "year", "month", "weekday", "haversine",
                "num_words", "num_characters"]

            allowed_paths (list[list[str]]): Allowed dataframe paths on which to
                make features.

            max_depth (int): Maximum allowed depth of features.

            ignore_dataframes (list[str], optional): List of dataframes to
                blacklist when creating features.

            ignore_columns (dict[str -> list[str]], optional): List of
                specific columns within each dataframe to blacklist when
                creating features.

            seed_features (list[:class:`.FeatureBase`]): List of manually
                defined features to use.

            drop_contains (list[str], optional): Drop features that contain
                these strings in their names.

            drop_exact (list[str], optional): Drop features whose names
                exactly match these strings.

            where_primitives (list[str or PrimitiveBase], optional):
                List of Primitives names (or types) to apply with where
                clauses.

                Default:

                    ["count"]

            max_features (int, optional): Cap the number of generated features
                to this number.
                If -1, no limit.

            verbose (bool, optional): Whether to display progress when
                building and calculating features. Defaults to False.

        Example:
            .. ipython:: python

                import featuretools as ft
                import pandas as pd

                from featuretools.wrappers import DFSTransformer
                from sklearn.pipeline import Pipeline
                from sklearn.ensemble import ExtraTreesClassifier

                # Get example data
                train_es = ft.demo.load_mock_customer(return_entityset=True, n_customers=3)
                test_es = ft.demo.load_mock_customer(return_entityset=True, n_customers=2)
                y = [True, False, True]

                # Build pipeline
                pipeline = Pipeline(steps=[
                    ('ft', DFSTransformer(target_dataframe_name="customers",
                                          max_features=2)),
                    ('et', ExtraTreesClassifier(n_estimators=100))
                ])

                # Fit and predict
                pipeline.fit(X=train_es, y=y)  # fit on customers in training entityset
                pipeline.predict_proba(test_es)  # predict probability of each class on test entityset
                pipeline.predict(test_es)  # predict on test entityset

                # Same as above, but using cutoff times
                train_ct = pd.DataFrame()
                train_ct['customer_id'] = [1, 2, 3]
                train_ct['time'] = pd.to_datetime(['2014-1-1 04:00',
                                                   '2014-1-2 17:20',
                                                   '2014-1-4 09:53'])

                pipeline.fit(X=(train_es, train_ct), y=y)

                test_ct = pd.DataFrame()
                test_ct['customer_id'] = [1, 2]
                test_ct['time'] = pd.to_datetime(['2014-1-4 13:48',
                                                  '2014-1-5 15:32'])
                pipeline.predict_proba((test_es, test_ct))
                pipeline.predict((test_es, test_ct))

        """
        self.feature_defs = []
        self.target_dataframe_name = target_dataframe_name
        self.agg_primitives = agg_primitives
        self.trans_primitives = trans_primitives
        self.allowed_paths = allowed_paths
        self.max_depth = max_depth
        self.ignore_dataframes = ignore_dataframes
        self.ignore_columns = ignore_columns
        self.seed_features = seed_features
        self.drop_contains = drop_contains
        self.drop_exact = drop_exact
        self.where_primitives = where_primitives
        self.max_features = max_features
        self.verbose = verbose

    def fit(self, X, y=None):
        """Wrapper for DFS

        Calculates a list of features given a dictionary of dataframes and a list
        of relationships. Alternatively, an EntitySet can be passed instead of
        the dataframes and relationships.

        Args:
            X (featuretools.EntitySet or tuple): EntitySet to calculate features on.
                If a tuple is passed it can take one of these forms:
                (entityset, cutoff_time_dataframe), (dataframes, relationships),
                or ((dataframes, relationships), cutoff_time_dataframe)
            y (iterable): Training targets

        See Also:
            :func:`synthesis.dfs`
        """
        es, dataframes, relationships, _ = parse_x_input(X)

        self.feature_defs = dfs(entityset=es,
                                dataframes=dataframes,
                                relationships=relationships,
                                target_dataframe_name=self.target_dataframe_name,
                                agg_primitives=self.agg_primitives,
                                trans_primitives=self.trans_primitives,
                                allowed_paths=self.allowed_paths,
                                max_depth=self.max_depth,
                                ignore_dataframes=self.ignore_dataframes,
                                ignore_columns=self.ignore_columns,
                                seed_features=self.seed_features,
                                drop_contains=self.drop_contains,
                                drop_exact=self.drop_exact,
                                where_primitives=self.where_primitives,
                                max_features=self.max_features,
                                features_only=True,
                                verbose=self.verbose)

        return self

    def transform(self, X):
        """Wrapper for calculate_feature_matrix

        Calculates a feature matrix for the given input data and calculation times.

        Args:
            X (featuretools.EntitySet or tuple): EntitySet to calculate features on.
                If a tuple is passed it can take one of these forms:
                (entityset, cutoff_time_dataframe), (dataframes, relationships),
                or ((dataframes, relationships), cutoff_time_dataframe)

        See Also:
            :func:`computational_backends.calculate_feature_matrix`
        """
        es, dataframes, relationships, cutoff_time = parse_x_input(X)

        X_transformed = calculate_feature_matrix(
            features=self.feature_defs,
            instance_ids=None,
            cutoff_time=cutoff_time,
            entityset=es,
            dataframes=dataframes,
            relationships=relationships,
            verbose=self.verbose)

        return X_transformed

    def get_params(self, deep=True):
        out = {
            'target_dataframe_name': self.target_dataframe_name,
            'agg_primitives': self.agg_primitives,
            'trans_primitives': self.trans_primitives,
            'allowed_paths': self.allowed_paths,
            'max_depth': self.max_depth,
            'ignore_dataframes': self.ignore_dataframes,
            'ignore_columns': self.ignore_columns,
            'seed_features': self.seed_features,
            'drop_contains': self.drop_contains,
            'drop_exact': self.drop_exact,
            'where_primitives': self.where_primitives,
            'max_features': self.max_features,
            'verbose': self.verbose,
        }
        return out


def parse_x_input(X):
    if isinstance(X, tuple):
        if isinstance(X[0], tuple):
            # Input of ((dataframes, relationships), cutoff_time)
            dataframes = X[0][0]
            relationships = X[0][1]
            es = None
            cutoff_time = X[1]
        elif isinstance(X[0], dict):
            # Input of (dataframes, relationships)
            dataframes = X[0]
            relationships = X[1]
            es = None
            cutoff_time = None
        else:
            # Input of (entityset, cutoff_time)
            es = X[0]
            dataframes = None
            relationships = None
            cutoff_time = X[1]
    else:
        # Input of entityset
        es = X
        dataframes = None
        relationships = None
        cutoff_time = None

    return es, dataframes, relationships, cutoff_time
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
featuretools>=1.0.0
numpy
pandas
scikit-learn>=0.20.0,!=0.22
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[tool:pytest]
python_files = featuretools_sklearn_transformer/tests/*
filterwarnings =
    ignore::DeprecationWarning
    ignore::PendingDeprecationWarning
[flake8]
exclude = docs/*
ignore = E501,W504
[metadata]
description-file = README.md
[aliases]
test=pytest
[isort]
forced_separate=featuretools_sklearn_transformer
multi_line_output=3
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from os import path

from setuptools import find_packages, setup

this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, "README.md")) as f:
    long_description = f.read()

setup(
    name="featuretools_sklearn_transformer",
    version="1.0.0",
    author="Feature Labs, Inc.",
    author_email="support@featurelabs.com",
    license="BSD 3-clause",
    url="http://www.featurelabs.com/",
    python_requires=">=3.7, <4",
    install_requires=open("requirements.txt").readlines(),
    packages=find_packages(),
    description="Featuretools Transformer for Scikit-Learn Pipeline use.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    include_package_data=True,
    entry_points={
        "featuretools_plugin": [
            "wrappers = featuretools_sklearn_transformer",
        ],
    },
)
--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
pytest~=5.3.1
pytest-xdist~=1.30.0
pytest-cov~=2.8.1
codecov~=2.0.15
flake8~=3.7.9
autopep8~=1.4.4
isort~=4.3.21
--------------------------------------------------------------------------------