├── .github
│   └── workflows
│       ├── release.yaml
│       └── tests.yml
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.md
├── featuretools_sklearn_transformer
│   ├── __init__.py
│   ├── tests
│   │   ├── __init__.py
│   │   └── test_transformer.py
│   └── transformer.py
├── requirements.txt
├── setup.cfg
├── setup.py
└── test-requirements.txt

/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
on:
  release:
    types: [published]

name: Release
jobs:
  pypi:
    name: Release to PyPI
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2
      - name: Upload to PyPI
        uses: FeatureLabs/gh-action-pypi-upload@v1
        env:
          PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }}
          PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
          TEST_PYPI_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }}
          TEST_PYPI_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }}
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
on:
  pull_request:
    types: [opened, synchronize]
  push:
    branches:
      - main

name: Tests
jobs:
  entry_point_test:
    name: Entry Point Test - Python ${{ matrix.python_version }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python_version: ["3.7", "3.8"]
    steps:
      - name: Set up python ${{ matrix.python_version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python_version }}
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
      - name: Build source distribution
        run: make package_build
      - name: Install package with test requirements
        run: |
          pip config --site set global.progress_bar off
          pip install --upgrade pip
          pip install dist/package/
          pip install -r dist/package/test-requirements.txt
      - name: Test entry point
        run: make entry-point-test

  lint_tests:
    name: Lint Tests - Python ${{ matrix.python-version }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.7", "3.8"]
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
      - name: Build source distribution
        run: make package_build
      - name: Install package with test requirements
        run: |
          pip config --site set global.progress_bar off
          pip install --upgrade pip
          pip install dist/package/
          pip install -r dist/package/test-requirements.txt
      - name: Run lint tests
        run: cd dist/package && make lint-tests -f ../../Makefile

  unit_tests:
    name: Unit Tests - Python ${{ matrix.python-version }}
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.7", "3.8"]
    steps:
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v2
        with:
          python-version: ${{ matrix.python-version }}
      - name: Checkout repository
        uses: actions/checkout@v2
        with:
          ref: ${{ github.event.pull_request.head.ref }}
          repository: ${{ github.event.pull_request.head.repo.full_name }}
      - name: Build source distribution
        run: make package_build
      - name: Install package with test requirements
        run: |
          pip config --site set global.progress_bar off
          pip install --upgrade pip
          pip install dist/package/
          pip install -r dist/package/test-requirements.txt
      - name: Run unit tests (no code coverage)
        if: ${{ matrix.python-version != 3.8 }}
        run: cd dist/package && make unit_tests -f ../../Makefile
      - name: Run unit tests with code coverage
        if: ${{ matrix.python-version == 3.8 }}
        run: cd dist/package && make unit_tests addopts="--cov=featuretools_sklearn_transformer" -f ../../Makefile
      - name: Upload coverage to Codecov
        if: ${{ matrix.python-version == 3.8 }}
        uses: codecov/codecov-action@v1
        with:
          fail_ci_if_error: true
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# IDE
.vscode

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/
docs/source/generated

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
BSD 3-Clause License

Copyright (c) 2019, Feature Labs, Inc.
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

* Redistributions of source code must retain the above copyright notice, this
  list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright notice,
  this list of conditions and the following disclaimer in the documentation
  and/or other materials provided with the distribution.

* Neither the name of the copyright holder nor the names of its
  contributors may be used to endorse or promote products derived from
  this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include requirements.txt
include test-requirements.txt
include LICENSE
include README.md
recursive-exclude * __pycache__
recursive-exclude * *.py[co]
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
.PHONY: clean
clean:
	find . -name '*.pyo' -delete
	find . -name '*.pyc' -delete
	find . -name __pycache__ -delete
	find . -name '*~' -delete

.PHONY: lint-tests
lint-tests:
	flake8 featuretools_sklearn_transformer && isort --check-only --recursive featuretools_sklearn_transformer

.PHONY: lint-fix
lint-fix:
	autopep8 --in-place --recursive --max-line-length=100 --select="E225,E303,E302,E203,E128,E231,E251,E271,E127,E126,E301,W291,W293,E226,E306,E221" featuretools_sklearn_transformer
	isort --recursive featuretools_sklearn_transformer

.PHONY: unit_tests
unit_tests:
	pytest --cache-clear --show-capture=stderr -vv ${addopts}

.PHONY: installdeps
installdeps:
	pip install --upgrade pip -q
	pip install -e .
	pip install -r test-requirements.txt -q

.PHONY: entry-point-test
entry-point-test:
	cd ~ && python -c "from featuretools.wrappers import DFSTransformer"

.PHONY: package_build
package_build:
	rm -rf dist/package
	python setup.py sdist
	$(eval package=$(shell python setup.py --fullname))
	tar -zxvf "dist/${package}.tar.gz"
	mv ${package} dist/package
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# featuretools-sklearn-transformer

![Tests](https://github.com/FeatureLabs/featuretools-sklearn-transformer/workflows/Tests/badge.svg)
[![Coverage Status](https://codecov.io/gh/alteryx/featuretools-sklearn-transformer/branch/main/graph/badge.svg)](https://codecov.io/gh/alteryx/featuretools-sklearn-transformer)
[![PyPI version](https://badge.fury.io/py/featuretools-sklearn-transformer.svg?maxAge=2592000)](https://badge.fury.io/py/featuretools-sklearn-transformer)

[Featuretools](https://github.com/alteryx/featuretools)' DFS as a scikit-learn transformer

### Install
```shell
pip install featuretools_sklearn_transformer
```

### Use

To use the transformer in a pipeline, initialize an instance of the transformer by passing in
the parameters you would like to use for calculating features. To fit the model and generate
features for the training data, pass an EntitySet (or a list of dataframes and relationships)
containing only the relevant training data as the `X` input, along with the training targets
as the `y` input. To generate a feature matrix from test data, pass in an EntitySet containing
only the relevant test data as the `X` input.

The input supplied for `X` can take several formats:

- To use a Featuretools EntitySet without cutoff times, simply pass in the EntitySet
- To use a Featuretools EntitySet with a cutoff times DataFrame, pass in a tuple of the form (EntitySet, cutoff_time_df)
- To use a list of DataFrames and Relationships without cutoff times, pass a tuple of the form (dataframes, relationships)
- To use a list of DataFrames and Relationships with a cutoff times DataFrame, pass a tuple of the form ((dataframes, relationships), cutoff_time_df)

Note that because this transformer requires a Featuretools EntitySet or dataframes and relationships as input, it does not currently work
with certain methods such as `sklearn.model_selection.cross_val_score` or `sklearn.model_selection.GridSearchCV`, which expect the `X` values
to be an iterable that the method can split. One way to work around this is sketched below.
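If you do need cross-validation or a grid search, one option is to run the transformer once up front and hand scikit-learn the resulting feature matrix, which it can split like any DataFrame. The snippet below is a minimal sketch of that workaround, not part of this package's API; the numeric filtering and the assumption that `y` is ordered to match the feature matrix index are illustrative simplifications:

```python
import featuretools as ft

from featuretools.wrappers import DFSTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_val_score

es = ft.demo.load_mock_customer(return_entityset=True, n_customers=5)
y = [True, False, True, False, True]  # one label per customer, in index order

# Compute the feature matrix once, outside of any pipeline
fm = DFSTransformer(target_dataframe_name="customers").fit_transform(es)

# Keep only numeric features and fill gaps so a plain estimator can handle them
X = fm.select_dtypes("number").fillna(0)
scores = cross_val_score(ExtraTreesClassifier(n_estimators=10), X, y, cv=2)
```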
The example below shows how to use the transformer with an EntitySet, both with and without a cutoff time DataFrame.

```python
import featuretools as ft
import pandas as pd

from featuretools.wrappers import DFSTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier

# Get example data
train_es = ft.demo.load_mock_customer(return_entityset=True, n_customers=3)
test_es = ft.demo.load_mock_customer(return_entityset=True, n_customers=2)
y = [True, False, True]

# Build pipeline
pipeline = Pipeline(steps=[
    ('ft', DFSTransformer(target_dataframe_name="customers",
                          max_features=2)),
    ('et', ExtraTreesClassifier(n_estimators=100))
])

# Fit and predict
pipeline.fit(X=train_es, y=y)  # fit on customers in training entityset
pipeline.predict_proba(test_es)  # predict probability of each class on test entityset
pipeline.predict(test_es)  # predict on test entityset

# Same as above, but using cutoff times
train_ct = pd.DataFrame()
train_ct['customer_id'] = [1, 2, 3]
train_ct['time'] = pd.to_datetime(['2014-1-1 04:00',
                                   '2014-1-2 17:20',
                                   '2014-1-4 09:53'])

pipeline.fit(X=(train_es, train_ct), y=y)

test_ct = pd.DataFrame()
test_ct['customer_id'] = [1, 2]
test_ct['time'] = pd.to_datetime(['2014-1-4 13:48',
                                  '2014-1-5 15:32'])
pipeline.predict_proba((test_es, test_ct))
pipeline.predict((test_es, test_ct))
```
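The transformer accepts the list-of-DataFrames form in the same way. The sketch below continues from the example above and assumes the column names of the `load_mock_customer` demo data: each entry in `dataframes` maps a name to a `(dataframe, index[, time_index])` tuple, and each relationship is a `(parent_dataframe, parent_column, child_dataframe, child_column)` tuple, mirroring the arguments of `featuretools.dfs`.

```python
# Continues from the example above (reuses ft, DFSTransformer, and train_ct)
data = ft.demo.load_mock_customer(n_customers=3)

dataframes = {
    "customers": (data["customers"], "customer_id"),
    "sessions": (data["sessions"], "session_id", "session_start"),
}
relationships = [("customers", "customer_id", "sessions", "customer_id")]

# Without cutoff times, X is (dataframes, relationships)
transformer = DFSTransformer(target_dataframe_name="customers")
fm = transformer.fit((dataframes, relationships)).transform((dataframes, relationships))

# With cutoff times, X is ((dataframes, relationships), cutoff_time_df)
fm_ct = transformer.transform(((dataframes, relationships), train_ct))
```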
## Built at Alteryx Innovation Labs

Alteryx Innovation Labs
--------------------------------------------------------------------------------
/featuretools_sklearn_transformer/__init__.py:
--------------------------------------------------------------------------------
from .transformer import DFSTransformer  # noqa: F401

__version__ = "1.0.0"
--------------------------------------------------------------------------------
/featuretools_sklearn_transformer/tests/__init__.py:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/featuretools_sklearn_transformer/tests/test_transformer.py:
--------------------------------------------------------------------------------
import copy

import numpy as np
import pandas as pd
import pytest
from featuretools.demo.mock_customer import load_mock_customer
from featuretools.wrappers import DFSTransformer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler


def select_numeric(df):
    return df.select_dtypes(exclude=["object", pd.CategoricalDtype])


@pytest.fixture
def es():
    es = load_mock_customer(
        n_customers=15,
        n_products=15,
        n_sessions=75,
        n_transactions=1000,
        random_seed=0,
        return_entityset=True,
    )
    return es


@pytest.fixture
def es_customer_filtered(es):
    new_es = copy.deepcopy(es)
    customers_df = es["customers"]
    sessions_df = es["sessions"]
    products_df = es["products"]
    transactions_df = es["transactions"]
    customer_ids = [1, 2, 3]
    customers_df = customers_df.loc[customer_ids]
    sessions_df = sessions_df[sessions_df["customer_id"].isin(customer_ids)]
    transactions_df = transactions_df[
        transactions_df["session_id"].isin(sessions_df["session_id"].values)
    ]
    products_df = products_df[
        products_df["product_id"].isin(transactions_df["product_id"].values)
    ]
    new_es.replace_dataframe("customers", customers_df)
    new_es.replace_dataframe("sessions", sessions_df)
    new_es.replace_dataframe("transactions", transactions_df)
    new_es.replace_dataframe("products", products_df)

    return new_es


def get_dataframes_and_relationships(es):
    dataframes = {}
    relationships = []

    for df in es.dataframes:
        dataframes[df.ww.name] = (
            df,
            df.ww.index,
            df.ww.time_index,
            df.ww.logical_types,
        )

    for rel in es.relationships:
        relationships.append(
            (
                rel._parent_dataframe_name,
                rel._parent_column_name,
                rel._child_dataframe_name,
                rel._child_column_name,
            )
        )

    return dataframes, relationships


@pytest.fixture
def df(es):
    df = es["customers"].copy()
    df["target"] = np.random.randint(1, 3, df.shape[0])  # 1 or 2 values
    return df
@pytest.fixture
def pipeline():
    pipeline = Pipeline(
        steps=[
            ("ft", DFSTransformer(target_dataframe_name="customers", max_features=20)),
            ("numeric", FunctionTransformer(select_numeric, validate=False)),
            ("imp", SimpleImputer()),
            ("et", ExtraTreesClassifier(n_estimators=10)),
        ]
    )
    return pipeline


def test_sklearn_transformer_with_entityset(es):
    # Using with transformers
    pipeline = Pipeline(
        steps=[
            ("ft", DFSTransformer(target_dataframe_name="customers")),
            ("numeric", FunctionTransformer(select_numeric, validate=False)),
            ("sc", StandardScaler()),
        ]
    )

    X_train = pipeline.fit(es).transform(es)

    assert X_train.shape[0] == 15


def test_sklearn_transformer_with_dataframes_and_relationships(es):
    # Using with transformers
    pipeline = Pipeline(
        steps=[
            ("ft", DFSTransformer(target_dataframe_name="customers")),
            ("numeric", FunctionTransformer(select_numeric, validate=False)),
            ("sc", StandardScaler()),
        ]
    )
    dataframes, relationships = get_dataframes_and_relationships(es)

    X_train = pipeline.fit((dataframes, relationships)).transform(
        (dataframes, relationships)
    )

    assert X_train.shape[0] == 15


def test_sklearn_estimator_with_entityset(df, es, pipeline):
    # Using with estimator
    pipeline.fit(es, y=df.target.values).predict(es)
    result = pipeline.score(es, df.target.values)

    assert isinstance(result, (float))

    # Pickling / Unpickling Pipeline
    # TODO fix this
    # s = pickle.dumps(pipeline)
    # pipe_pickled = pickle.loads(s)
    # result = pipe_pickled.score(df['customer_id'].values, df.target.values)
    # assert isinstance(result, (float))


def test_sklearn_estimator_with_dataframes_and_relationships(df, es, pipeline):
    # Using with estimator
    dataframes, relationships = get_dataframes_and_relationships(es)
    pipeline.fit((dataframes, relationships), y=df.target.values).predict(
        (dataframes, relationships)
    )
    result = pipeline.score((dataframes, relationships), df.target.values)

    assert isinstance(result, (float))


# cross_val_score cannot split entityset input
@pytest.mark.xfail
def test_sklearn_cross_val_score(df, es, pipeline):
    # Using with cross_val_score
    results = cross_val_score(
        pipeline, X=es, y=df.target.values, cv=2, scoring="accuracy"
    )

    assert isinstance(results[0], (float))
    assert isinstance(results[1], (float))


# GridSearchCV cannot split entityset input
@pytest.mark.xfail
def test_sklearn_gridsearchcv(df, es, pipeline):
    # Using with GridSearchCV
    params = {"et__max_depth": [5, 10]}
    grid = GridSearchCV(estimator=pipeline, param_grid=params, cv=3)
    grid.fit(es, df.target.values)

    assert len(grid.predict(df["customer_id"].values)) == 15


def test_sklearn_cutoff_with_entityset(pipeline, es_customer_filtered):
    # Using cutoff_time to filter data
    ct = pd.DataFrame()
    ct["customer_id"] = [1, 2, 3]
    ct["time"] = pd.to_datetime(["2014-1-1 04:00", "2014-1-1 04:00", "2014-1-1 04:00"])
    ct["label"] = [True, True, False]

    results = pipeline.fit(X=(es_customer_filtered, ct), y=ct.label).predict(
        X=(es_customer_filtered, ct)
    )

    assert len(results) == 3


def test_sklearn_cutoff_with_dataframes_and_relationships(
    pipeline, es_customer_filtered
):
    # Using cutoff_time to filter data
    ct = pd.DataFrame()
    ct["customer_id"] = [1, 2, 3]
    ct["time"] = pd.to_datetime(["2014-1-1 04:00", "2014-1-1 04:00", "2014-1-1 04:00"])
    ct["label"] = [True, True, False]

    dataframes, relationships = get_dataframes_and_relationships(es_customer_filtered)
    results = pipeline.fit(X=((dataframes, relationships), ct), y=ct.label).predict(
        X=((dataframes, relationships), ct)
    )

    assert len(results) == 3


def test_cfm_uses_filtered_target_df_with_entityset(es):
    pipeline = Pipeline(
        steps=[("ft", DFSTransformer(target_dataframe_name="transactions"))]
    )

    train_ids = [1, 2, 3]
    test_ids = [10, 55, 853]

    train_es = filter_transactions(es, ids=train_ids)
    test_es = filter_transactions(es, ids=test_ids)

    fm_train = pipeline.fit_transform(X=train_es)
    assert all(fm_train["sessions.COUNT(transactions)"] == [1, 1, 1])
    assert set(fm_train.index.values) == set(train_ids)

    fm_test = pipeline.transform(test_es)

    assert all(fm_test["sessions.COUNT(transactions)"] == [1, 2, 2])
    assert set(fm_test.index.values) == set(test_ids)


def test_cfm_uses_filtered_target_df_with_dataframes_and_relationships(es):
    pipeline = Pipeline(
        steps=[("ft", DFSTransformer(target_dataframe_name="transactions"))]
    )

    train_ids = [3, 1, 2]
    test_ids = [853, 55, 10]

    train_es = filter_transactions(es, ids=train_ids)
    test_es = filter_transactions(es, ids=test_ids)
    train_dataframes, train_relationships = get_dataframes_and_relationships(train_es)
    test_dataframes, test_relationships = get_dataframes_and_relationships(test_es)

    fm_train = pipeline.fit_transform(X=(train_dataframes, train_relationships))
    assert all(fm_train["sessions.COUNT(transactions)"] == [1, 1, 1])
    assert set(fm_train.index.values) == set(train_ids)
    fm_test = pipeline.transform(X=(test_dataframes, test_relationships))
    assert all(fm_test["sessions.COUNT(transactions)"] == [2, 2, 1])
    assert set(fm_test.index.values) == set(test_ids)


def filter_transactions(es, ids):
    new_es = copy.deepcopy(es)
    customers_df = es["customers"]
    sessions_df = es["sessions"]
    products_df = es["products"]
    transactions_df = es["transactions"]
    transactions_df = transactions_df.loc[ids]
    sessions_df = sessions_df[
        sessions_df["session_id"].isin(transactions_df["session_id"].values)
    ]
    products_df = products_df[
        products_df["product_id"].isin(transactions_df["product_id"].values)
    ]
    customers_df = customers_df[
        customers_df["customer_id"].isin(sessions_df["customer_id"].values)
    ]
    new_es.replace_dataframe("customers", customers_df)
    new_es.replace_dataframe("sessions", sessions_df)
    new_es.replace_dataframe("transactions", transactions_df, already_sorted=True)
    new_es.replace_dataframe("products", products_df)

    return new_es
--------------------------------------------------------------------------------
/featuretools_sklearn_transformer/transformer.py:
--------------------------------------------------------------------------------
from featuretools.computational_backends import calculate_feature_matrix
from featuretools.synthesis import dfs
from sklearn.base import TransformerMixin


class DFSTransformer(TransformerMixin):
    """Transformer following the scikit-learn interface, for use in Pipelines."""

    def __init__(self,
                 target_dataframe_name=None,
                 agg_primitives=None,
                 trans_primitives=None,
                 allowed_paths=None,
                 max_depth=2,
                 ignore_dataframes=None,
                 ignore_columns=None,
                 seed_features=None,
                 drop_contains=None,
                 drop_exact=None,
                 where_primitives=None,
                 max_features=-1,
                 verbose=False):
        """Creates Transformer

        Args:

            target_dataframe_name (str): Name of dataframe on which to make
                predictions.

            agg_primitives (list[str or AggregationPrimitive], optional): List
                of Aggregation Feature types to apply.

                Default: ["sum", "std", "max", "skew", "min", "mean",
                "count", "percent_true", "num_unique", "mode"]

            trans_primitives (list[str or TransformPrimitive], optional):
                List of Transform Feature functions to apply.

                Default: ["day", "year", "month", "weekday", "haversine",
                "num_words", "num_characters"]

            allowed_paths (list[list[str]]): Allowed dataframe paths on which to
                make features.

            max_depth (int): Maximum allowed depth of features.

            ignore_dataframes (list[str], optional): List of dataframes to
                blacklist when creating features.

            ignore_columns (dict[str -> list[str]], optional): List of
                specific columns within each dataframe to blacklist when
                creating features.

            seed_features (list[:class:`.FeatureBase`]): List of manually
                defined features to use.

            drop_contains (list[str], optional): Drop features that contain
                these strings in their names.

            drop_exact (list[str], optional): Drop features whose names
                exactly match these strings.

            where_primitives (list[str or PrimitiveBase], optional):
                List of Primitives names (or types) to apply with where
                clauses.

                Default:

                    ["count"]

            max_features (int, optional): Cap the number of generated features
                to this number.
                If -1, no limit.

            verbose (bool, optional): Whether to display progress when
                building and calculating features. Defaults to False.

        Example:
            .. ipython:: python

                import featuretools as ft
                import pandas as pd

                from featuretools.wrappers import DFSTransformer
                from sklearn.pipeline import Pipeline
                from sklearn.ensemble import ExtraTreesClassifier

                # Get example data
                train_es = ft.demo.load_mock_customer(return_entityset=True, n_customers=3)
                test_es = ft.demo.load_mock_customer(return_entityset=True, n_customers=2)
                y = [True, False, True]

                # Build pipeline
                pipeline = Pipeline(steps=[
                    ('ft', DFSTransformer(target_dataframe_name="customers",
                                          max_features=2)),
                    ('et', ExtraTreesClassifier(n_estimators=100))
                ])

                # Fit and predict
                pipeline.fit(X=train_es, y=y)  # fit on customers in training entityset
                pipeline.predict_proba(test_es)  # predict probability of each class on test entityset
                pipeline.predict(test_es)  # predict on test entityset

                # Same as above, but using cutoff times
                train_ct = pd.DataFrame()
                train_ct['customer_id'] = [1, 2, 3]
                train_ct['time'] = pd.to_datetime(['2014-1-1 04:00',
                                                   '2014-1-2 17:20',
                                                   '2014-1-4 09:53'])

                pipeline.fit(X=(train_es, train_ct), y=y)

                test_ct = pd.DataFrame()
                test_ct['customer_id'] = [1, 2]
                test_ct['time'] = pd.to_datetime(['2014-1-4 13:48',
                                                  '2014-1-5 15:32'])
                pipeline.predict_proba((test_es, test_ct))
                pipeline.predict((test_es, test_ct))

        """
        self.feature_defs = []
        self.target_dataframe_name = target_dataframe_name
        self.agg_primitives = agg_primitives
        self.trans_primitives = trans_primitives
        self.allowed_paths = allowed_paths
        self.max_depth = max_depth
        self.ignore_dataframes = ignore_dataframes
        self.ignore_columns = ignore_columns
        self.seed_features = seed_features
        self.drop_contains = drop_contains
        self.drop_exact = drop_exact
        self.where_primitives = where_primitives
        self.max_features = max_features
        self.verbose = verbose

    def fit(self, X, y=None):
        """Wrapper for DFS

        Calculates a list of features given a dictionary of dataframes and a list
        of relationships. Alternatively, an EntitySet can be passed instead of
        the dataframes and relationships.

        Args:
            X (featuretools.EntitySet or tuple): EntitySet to calculate features on.
                If a tuple is passed it can take one of these forms:
                (entityset, cutoff_time_dataframe), (dataframes, relationships),
                or ((dataframes, relationships), cutoff_time_dataframe)
            y (iterable): Training targets

        See Also:
            :func:`synthesis.dfs`
        """
        es, dataframes, relationships, _ = parse_x_input(X)

        self.feature_defs = dfs(entityset=es,
                                dataframes=dataframes,
                                relationships=relationships,
                                target_dataframe_name=self.target_dataframe_name,
                                agg_primitives=self.agg_primitives,
                                trans_primitives=self.trans_primitives,
                                allowed_paths=self.allowed_paths,
                                max_depth=self.max_depth,
                                ignore_dataframes=self.ignore_dataframes,
                                ignore_columns=self.ignore_columns,
                                seed_features=self.seed_features,
                                drop_contains=self.drop_contains,
                                drop_exact=self.drop_exact,
                                where_primitives=self.where_primitives,
                                max_features=self.max_features,
                                features_only=True,
                                verbose=self.verbose)

        return self

    def transform(self, X):
        """Wrapper for calculate_feature_matrix

        Calculates a feature matrix for the given input data and calculation times.

        Args:
            X (featuretools.EntitySet or tuple): EntitySet to calculate features on.
                If a tuple is passed it can take one of these forms:
                (entityset, cutoff_time_dataframe), (dataframes, relationships),
                or ((dataframes, relationships), cutoff_time_dataframe)

        See Also:
            :func:`computational_backends.calculate_feature_matrix`
        """
        es, dataframes, relationships, cutoff_time = parse_x_input(X)

        X_transformed = calculate_feature_matrix(
            features=self.feature_defs,
            instance_ids=None,
            cutoff_time=cutoff_time,
            entityset=es,
            dataframes=dataframes,
            relationships=relationships,
            verbose=self.verbose)

        return X_transformed

    def get_params(self, deep=True):
        out = {
            'target_dataframe_name': self.target_dataframe_name,
            'agg_primitives': self.agg_primitives,
            'trans_primitives': self.trans_primitives,
            'allowed_paths': self.allowed_paths,
            'max_depth': self.max_depth,
            'ignore_dataframes': self.ignore_dataframes,
            'ignore_columns': self.ignore_columns,
            'seed_features': self.seed_features,
            'drop_contains': self.drop_contains,
            'drop_exact': self.drop_exact,
            'where_primitives': self.where_primitives,
            'max_features': self.max_features,
            'verbose': self.verbose,
        }
        return out


def parse_x_input(X):
    if isinstance(X, tuple):
        if isinstance(X[0], tuple):
            # Input of ((dataframes, relationships), cutoff_time)
            dataframes = X[0][0]
            relationships = X[0][1]
            es = None
            cutoff_time = X[1]
        elif isinstance(X[0], dict):
            # Input of (dataframes, relationships)
            dataframes = X[0]
            relationships = X[1]
            es = None
            cutoff_time = None
        else:
            # Input of (entityset, cutoff_time)
            es = X[0]
            dataframes = None
            relationships = None
            cutoff_time = X[1]
    else:
        # Input of entityset
        es = X
        dataframes = None
        relationships = None
        cutoff_time = None

    return es, dataframes, relationships, cutoff_time
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
featuretools>=1.0.0
numpy
pandas
scikit-learn>=0.20.0,!=0.22
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[tool:pytest]
python_files = featuretools_sklearn_transformer/tests/*
filterwarnings =
    ignore::DeprecationWarning
    ignore::PendingDeprecationWarning
[flake8]
exclude = docs/*
ignore = E501,W504
[metadata]
description-file = README.md
[aliases]
test=pytest
[isort]
forced_separate=featuretools_sklearn_transformer
multi_line_output=3
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from os import path

from setuptools import find_packages, setup

this_directory = path.abspath(path.dirname(__file__))
with open(path.join(this_directory, "README.md")) as f:
    long_description = f.read()

setup(
    name="featuretools_sklearn_transformer",
    version="1.0.0",
    author="Feature Labs, Inc.",
    author_email="support@featurelabs.com",
    license="BSD 3-clause",
    url="http://www.featurelabs.com/",
    python_requires=">=3.7, <4",
    install_requires=open("requirements.txt").readlines(),
    packages=find_packages(),
    description="Featuretools Transformer for Scikit-Learn Pipeline use.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    include_package_data=True,
    entry_points={
        "featuretools_plugin": [
            "wrappers = featuretools_sklearn_transformer",
        ],
    },
)
--------------------------------------------------------------------------------
/test-requirements.txt:
--------------------------------------------------------------------------------
pytest~=5.3.1
pytest-xdist~=1.30.0
pytest-cov~=2.8.1
codecov~=2.0.15
flake8~=3.7.9
autopep8~=1.4.4
isort~=4.3.21
--------------------------------------------------------------------------------