├── .flake8
├── MANIFEST.in
├── .gitattributes
├── docs
├── lofo_logo.png
├── plot_importance.png
├── pydata_feb19_lofo.pdf
└── plot_importance_box.png
├── requirements.txt
├── lofo
├── __init__.py
├── infer_defaults.py
├── utils.py
├── plotting.py
├── lofo_importance.py
├── flofo_importance.py
└── dataset.py
├── setup.py
├── LICENSE
├── .github
└── workflows
│ ├── python-publish.yml
│ └── python-app.yml
├── data
└── test_data.py
├── tests
├── test_dataset.py
├── test_flofo_importance.py
└── test_lofo_importance.py
├── .gitignore
├── README.md
└── LOFOImportance Example.ipynb
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length=120
3 | exclude = */__init__.py
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include *.txt
3 | include README.md
4 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py linguist-language=python
2 | *.ipynb linguist-documentation
--------------------------------------------------------------------------------
/docs/lofo_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aerdem4/lofo-importance/HEAD/docs/lofo_logo.png
--------------------------------------------------------------------------------
/docs/plot_importance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aerdem4/lofo-importance/HEAD/docs/plot_importance.png
--------------------------------------------------------------------------------
/docs/pydata_feb19_lofo.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aerdem4/lofo-importance/HEAD/docs/pydata_feb19_lofo.pdf
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.17.3
2 | pandas
3 | scipy
4 | scikit-learn>=0.20.3
5 | tqdm
6 | lightgbm
7 | networkx
--------------------------------------------------------------------------------
/docs/plot_importance_box.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aerdem4/lofo-importance/HEAD/docs/plot_importance_box.png
--------------------------------------------------------------------------------
/lofo/__init__.py:
--------------------------------------------------------------------------------
1 | from .lofo_importance import LOFOImportance
2 | from .flofo_importance import FLOFOImportance
3 | from .dataset import Dataset
4 | from .plotting import plot_importance
5 |
--------------------------------------------------------------------------------
/lofo/infer_defaults.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.preprocessing import LabelEncoder
3 | from lightgbm import LGBMClassifier, LGBMRegressor
4 | from lofo.utils import flatten_list
5 |
6 |
def infer_model(df, features, y, n_jobs):
    """Choose a default LightGBM model and label-encode non-numeric feature columns.

    Parameters
    ----------
    df : pandas dataframe
        Dataframe holding the features; non-numeric feature columns are overwritten in place
    features : list of lists of strings
        Nested feature names, flattened before the columns are selected
    y : array-like
        Target values; label-encoded when they contain exactly two unique values
    n_jobs : int or None
        Forwarded to the LightGBM model

    Returns
    -------
    model, df, categoricals, y
        Untrained model, the dataframe, the encoded column names and the target
    """
    is_binary = len(np.unique(y)) == 2
    if is_binary:
        y = LabelEncoder().fit_transform(y)
    model_class = LGBMClassifier if is_binary else LGBMRegressor

    feature_frame = df[flatten_list(features)]
    categoricals = feature_frame.select_dtypes(exclude=[np.number]).columns.tolist()
    for column in categoricals:
        column_as_text = df[column].astype(str)
        df[column] = LabelEncoder().fit_transform(column_as_text)

    # one percent of the row count
    one_percent_of_rows = int(0.01*df.shape[0])
    model = model_class(min_child_samples=one_percent_of_rows, n_jobs=n_jobs)

    return model, df, categoricals, y
22 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
"""Packaging script for lofo-importance."""
from setuptools import setup, find_packages

# Read with an explicit encoding so the build does not depend on the platform default,
# and drop blank/comment lines so they never end up as empty requirement strings.
with open('requirements.txt', encoding="utf-8") as f:
    requirements = [
        line.strip() for line in f.read().splitlines()
        if line.strip() and not line.strip().startswith("#")
    ]

with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setup(
    name='lofo-importance',
    version='0.3.5',
    url="https://github.com/aerdem4/lofo-importance",
    author="Ahmet Erdem",
    author_email="ahmeterd4@gmail.com",
    description="Leave One Feature Out Importance",
    keywords="feature importance selection explainable data-science machine-learning",
    long_description=long_description,
    long_description_content_type="text/markdown",
    packages=find_packages(),
    install_requires=requirements,
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ]
)
27 |
--------------------------------------------------------------------------------
/lofo/utils.py:
--------------------------------------------------------------------------------
1 | import multiprocessing
2 | import pandas as pd
3 |
4 |
def lofo_to_df(lofo_scores, feature_list):
    """Build the importance dataframe from a (n_features, n_scores) score matrix.

    Parameters
    ----------
    lofo_scores : numpy array
        One row per feature, one column per validation score
    feature_list : list
        Feature names, in the same order as the rows of lofo_scores

    Returns
    -------
    importance_df : pandas dataframe
        Columns feature, importance_mean, importance_std and one val_imp_<i> column
        per score column, sorted by importance_mean in descending order
    """
    columns = {
        "feature": feature_list,
        "importance_mean": lofo_scores.mean(axis=1),
        "importance_std": lofo_scores.std(axis=1),
    }
    # Transposing yields one vector per validation score column.
    for score_idx, score_column in enumerate(lofo_scores.T):
        columns["val_imp_{}".format(score_idx)] = score_column

    frame = pd.DataFrame(columns)
    return frame.sort_values("importance_mean", ascending=False)
15 |
16 |
def parallel_apply(cv_func, feature_list, n_jobs):
    """Run cv_func for every feature in a process pool and collect the queued results.

    Parameters
    ----------
    cv_func : callable
        Called in a worker as cv_func(feature, result_queue); it is expected to put
        exactly one item on result_queue
    feature_list : list
        Features to dispatch, one task per feature
    n_jobs : int
        Number of worker processes

    Returns
    -------
    lofo_cv_result : list
        Items taken from the queue, in the order the workers put them (not input order)

    Raises
    ------
    Exception
        Whatever exception a worker raised is re-raised here. Without this, a failed
        worker would leave the queue short and the final reads would block forever.
    """
    # The manager context shuts down its server process on exit, even on error.
    with multiprocessing.Manager() as manager:
        result_queue = manager.Queue()
        pool = multiprocessing.Pool(n_jobs)
        try:
            pending = [pool.apply_async(cv_func, (f, result_queue)) for f in feature_list]
            pool.close()
            pool.join()
        finally:
            # Harmless after close()/join(); releases the workers if dispatch failed.
            pool.terminate()

        # Surface worker failures before draining the queue.
        for job in pending:
            job.get()

        lofo_cv_result = [result_queue.get() for _ in pending]
    return lofo_cv_result
30 |
31 |
def flatten_list(nested_list):
    """Concatenate the items of every sub-iterable into one flat list (one level deep)."""
    flat = []
    for inner in nested_list:
        flat.extend(inner)
    return flat
34 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Ahmet Erdem
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 |
21 | runs-on: ubuntu-latest
22 |
23 | steps:
24 | - uses: actions/checkout@v3
25 | - name: Set up Python
26 | uses: actions/setup-python@v3
27 | with:
28 | python-version: '3.x'
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install build
33 | - name: Build package
34 | run: python -m build
35 | - name: Publish package
36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 | with:
38 | user: __token__
39 | password: ${{ secrets.PYPI_KEY }}
40 |
--------------------------------------------------------------------------------
/lofo/plotting.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 |
def plot_importance(importance_df, figsize=(8, 8), kind="default"):
    """Plot feature importance

    Parameters
    ----------
    importance_df : pandas dataframe
        Output dataframe from LOFO/FLOFO get_importance
    kind : string
        plot type can be default or box; any other value triggers a warning
        and falls back to default
    figsize : tuple
    """
    # Work on a copy so the caller's dataframe gets neither the color column nor the re-sort.
    importance_df = importance_df.copy()
    importance_df["color"] = (importance_df["importance_mean"] > 0).map({True: 'g', False: 'r'})
    importance_df.sort_values("importance_mean", inplace=True)

    available_kinds = {"default", "box"}
    if kind not in available_kinds:
        warnings.warn("{kind} not in {ak}. Setting to default".format(kind=kind, ak=available_kinds))
        # Actually apply the fallback the warning announces; previously nothing was plotted.
        kind = "default"

    if kind == "default":
        # Horizontal bars of the mean importance with the std as error bars.
        importance_df.plot(x="feature", y="importance_mean", xerr="importance_std",
                           kind='barh', color=importance_df["color"], figsize=figsize)
    elif kind == "box":
        # One box per feature, built from the val_imp_* columns.
        lofo_score_cols = [col for col in importance_df.columns if col.startswith("val_imp")]
        features = importance_df["feature"].values.tolist()
        importance_df.set_index("feature")[lofo_score_cols].T.boxplot(column=features, vert=False, figsize=figsize)
30 |
--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: Python application
5 |
6 | on:
7 | push:
8 | branches: [ "master" ]
9 | pull_request:
10 | branches: [ "master" ]
11 |
12 | permissions:
13 | contents: read
14 |
15 | jobs:
16 | build:
17 |
18 | runs-on: ubuntu-latest
19 |
20 | steps:
21 | - uses: actions/checkout@v3
22 | - name: Set up Python 3.10
23 | uses: actions/setup-python@v3
24 | with:
25 | python-version: "3.10"
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | pip install flake8 pytest matplotlib
30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
31 | - name: Lint with flake8
32 | run: |
33 | # stop the build if there are Python syntax errors or undefined names
34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
36 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
37 | - name: Test with pytest
38 | run: |
39 | python -m pytest tests
40 |
--------------------------------------------------------------------------------
/data/test_data.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | def _to_binary(target):
6 | return (target > target.median()).astype(int)
7 |
8 |
def generate_test_data(data_size, text=False):
    """Create a deterministic synthetic dataframe (numpy global seed is reset to 0).

    Columns A-D are uniform random, D2 is D plus a small random offset with values
    above 1 blanked out, target is noise + A*D + 2*B and binary_target marks
    targets above the median. With text=True a random name column T is added and
    both targets are zeroed wherever T is not "Todd".
    """
    np.random.seed(0)

    df = pd.DataFrame()
    # Drawn one after another so the random stream is consumed in A, B, C, D order.
    for column in ("A", "B", "C", "D"):
        df[column] = np.random.rand(data_size)

    d2_offset = 0.1*np.random.rand(data_size)
    df["D2"] = df["D"].values + d2_offset
    df.loc[df["D2"] > 1, "D2"] = None

    noise = 0.2 * np.random.rand(data_size)
    df["target"] = noise + df["A"] * df["D"] + 2 * df["B"]
    df["binary_target"] = _to_binary(df["target"])

    if not text:
        return df

    df["T"] = np.random.choice(["Bojack", "Horseman", "Todd", "Chavez"], data_size)
    for target_column in ("target", "binary_target"):
        df[target_column] *= (df["T"] == "Todd")
    return df
30 |
31 |
def generate_unstructured_test_data(data_size, text=False):
    """Extend generate_test_data with missing values in A and a categorical column E.

    Roughly 30% of A is blanked out, E is a random three-level categorical, target
    is zeroed wherever E equals "category2" and binary_target is recomputed from it.
    """
    df = generate_test_data(data_size, text)

    blank_rows = np.random.rand(data_size) < 0.3
    df.loc[blank_rows, "A"] = None

    df["E"] = np.random.choice(["category1", "category2", "category3"], data_size)
    df["E"] = df["E"].astype("category")

    not_category2 = df["E"] != "category2"
    df["target"] = not_category2*df["target"]
    df["binary_target"] = _to_binary(df["target"])
    return df
41 |
--------------------------------------------------------------------------------
/tests/test_dataset.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | from data.test_data import generate_unstructured_test_data
3 | from lofo import Dataset
4 |
5 |
def test_dataset():
    """Dataset rejects malformed feature groups and auto-groups correlated features."""
    df = generate_unstructured_test_data(1000, text=True)
    features = ["A", "B", "C", "D", "D2", "E"]

    invalid_feature_groups = [
        # Exception: feature group row count is not equal to the features' row count
        {"interactions": df[["A", "B"]].values[:10]*df[["C", "D"]].values[:10]},
        # Exception: Feature group name A is in use by other features
        {"A": df[["A", "B"]].values*df[["C", "D"]].values},
        # Exception: Feature group type is not numpy.ndarray or scipy.csr.csr_matrix
        {"F": df[["A", "B"]]},
    ]
    for feature_groups in invalid_feature_groups:
        with pytest.raises(Exception):
            assert Dataset(df=df, target="binary_target", features=features, feature_groups=feature_groups)

    dataset = Dataset(df=df, target="binary_target", features=features,
                      feature_groups={"F": df[["A", "B"]].values}, auto_group_threshold=0.5)
    names = dataset.feature_names
    assert "D" not in names and "D2" not in names
    assert "D & D2" in names and "F" in dataset.feature_groups.keys()
29 |
--------------------------------------------------------------------------------
/tests/test_flofo_importance.py:
--------------------------------------------------------------------------------
1 | from sklearn.model_selection import train_test_split
2 | from lightgbm import LGBMClassifier
3 | from data.test_data import generate_test_data
4 | from lofo.plotting import plot_importance
5 | from lofo.flofo_importance import FLOFOImportance
6 |
7 |
def test_flofo_importance():
    """FLOFO on a held-out split: sequential and parallel runs agree and B ranks first."""
    df = generate_test_data(100000)
    df.loc[df["A"] < df["A"].median(), "A"] = None

    train_df, val_df = train_test_split(df, test_size=0.2, random_state=0)
    val_df_checkpoint = val_df.copy()

    features = ["A", "B", "C", "D"]

    lgbm = LGBMClassifier(random_state=0, n_jobs=1)
    lgbm.fit(train_df[features], train_df["binary_target"])

    # Evaluate on the validation split only: passing the full df would score the model on
    # its own training rows and would leave the val_df mutation check below vacuous.
    flofo = FLOFOImportance(lgbm, val_df, features, 'binary_target', scoring='roc_auc')
    flofo_parallel = FLOFOImportance(lgbm, val_df, features, 'binary_target', scoring='roc_auc', n_jobs=3)

    importance_df = flofo.get_importance()
    importance_df_parallel = flofo_parallel.get_importance()
    is_feature_order_same = importance_df["feature"].values == importance_df_parallel["feature"].values

    plot_importance(importance_df)

    assert is_feature_order_same.sum() == len(features), "Parallel FLOFO returned different result!"
    assert val_df.equals(val_df_checkpoint), "FLOFOImportance mutated the dataframe!"
    assert len(features) == importance_df.shape[0], "Missing importance value for some features!"
    assert importance_df["feature"].values[0] == "B", "Most important feature is different than B!"
33 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | .idea
107 |
--------------------------------------------------------------------------------
/tests/test_lofo_importance.py:
--------------------------------------------------------------------------------
1 | from sklearn.feature_extraction.text import CountVectorizer
2 | from sklearn.linear_model import LogisticRegression
3 | from lightgbm import LGBMClassifier
4 | from sklearn.model_selection import KFold
5 | from lofo import LOFOImportance, Dataset, plot_importance
6 | from data.test_data import generate_test_data, generate_unstructured_test_data
7 |
8 |
def test_lofo_importance():
    """LOFO with an explicit LightGBM classifier returns one row per feature, B on top."""
    feature_columns = ["A", "B", "C", "D"]
    data = generate_test_data(1000)
    dataset = Dataset(df=data, target="binary_target", features=feature_columns)

    classifier = LGBMClassifier(random_state=0, n_jobs=4)
    importance_df = LOFOImportance(dataset, model=classifier, cv=4, scoring='roc_auc').get_importance()

    plot_importance(importance_df)

    n_rows = importance_df.shape[0]
    top_feature = importance_df["feature"].values[0]
    assert len(feature_columns) == n_rows, "Missing importance value for some features!"
    assert top_feature == "B", "Most important feature is different than B!"
24 |
25 |
def test_multithreading():
    """LOFO with n_jobs=3 and a logistic regression still ranks B first."""
    feature_columns = ["A", "B", "C", "D"]
    data = generate_test_data(100000)
    dataset = Dataset(df=data, target="binary_target", features=feature_columns)

    splitter = KFold(n_splits=4, shuffle=True, random_state=0)
    estimator = LogisticRegression(solver='liblinear')
    lofo = LOFOImportance(dataset, model=estimator, cv=splitter, scoring='roc_auc', n_jobs=3)

    importance_df = lofo.get_importance()

    n_rows = importance_df.shape[0]
    top_feature = importance_df["feature"].values[0]
    assert len(feature_columns) == n_rows, "Missing importance value for some features!"
    assert top_feature == "B", "Most important feature is different than B!"
40 |
41 |
def test_default_model():
    """Without an explicit model, LOFO detects the categorical column and leaves df untouched."""
    df = generate_unstructured_test_data(1000)
    features = ["A", "B", "C", "D", "E"]

    # Regression target first.
    regression_dataset = Dataset(df=df, target="target", features=features)
    lofo = LOFOImportance(regression_dataset, cv=4, scoring='neg_mean_absolute_error')
    importance_df = lofo.get_importance()
    detected = lofo.fit_params["categorical_feature"]
    assert "E" in detected, "Categorical feature is not detected!"
    assert len(features) == importance_df.shape[0], "Missing importance value for some features!"

    # Snapshot taken before the binary run so the mutation check below covers it.
    df_checkpoint = df.copy()

    binary_dataset = Dataset(df=df, target="binary_target", features=features)
    lofo = LOFOImportance(binary_dataset, cv=4, scoring='roc_auc')
    importance_df = lofo.get_importance()

    detected = lofo.fit_params["categorical_feature"]
    assert "E" in detected, "Categorical feature is not detected!"
    assert df.equals(df_checkpoint), "LOFOImportance mutated the dataframe!"
    assert importance_df["feature"].values[0] == "E", "Most important feature is different than E!"
61 |
62 |
def test_feature_groups():
    """Feature groups (sparse text n-grams and a dense interaction matrix) get their own rows."""
    df = generate_test_data(1000, text=True)
    features = ["A", "B", "C", "D"]

    vectorizer = CountVectorizer(ngram_range=(3, 3), analyzer="char")
    feature_groups = {
        "names": vectorizer.fit_transform(df["T"]),
        "interactions": df[["A", "B"]].values*df[["C", "D"]].values,
    }

    dataset = Dataset(df=df, target="binary_target", features=features, feature_groups=feature_groups)

    classifier = LGBMClassifier(random_state=0, n_jobs=4)
    importance_df = LOFOImportance(dataset, model=classifier, cv=4, scoring='roc_auc').get_importance()

    expected_rows = len(features) + len(feature_groups)
    assert expected_rows == importance_df.shape[0], "Missing importance value for some features!"
    assert importance_df["feature"].values[0] == "names", "Most important feature is different than 'names'!"
82 |
--------------------------------------------------------------------------------
/lofo/lofo_importance.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sklearn.model_selection import cross_validate
3 | from tqdm.autonotebook import tqdm
4 | import warnings
5 | from lofo.infer_defaults import infer_model
6 | from lofo.utils import lofo_to_df, parallel_apply
7 | import sklearn
8 |
9 |
class LOFOImportance:
    """
    Leave One Feature Out Importance
    Given a model and cross-validation scheme, calculates the feature importances.

    Parameters
    ----------
    dataset: LOFO Dataset object
    scoring: string or callable
        Same as scoring in sklearn API
    model: model (sklearn API), optional
        Not trained model object. When omitted, a default model is inferred from the dataset.
    fit_params : dict, optional
        fit parameters for the model
    cv: int or iterable
        Same as cv in sklearn API
    n_jobs: int, optional
        Number of jobs for parallel computation
    cv_groups: array-like, with shape (n_samples,), optional
        Group labels for the samples used while splitting the dataset into train/test set.
        Only used in conjunction with a “Group” cv instance (e.g., GroupKFold).
    """

    def __init__(self, dataset, scoring, model=None, fit_params=None, cv=4, n_jobs=None, cv_groups=None):

        # Copy so that adding "categorical_feature" below never mutates the caller's dict.
        self.fit_params = dict(fit_params) if fit_params else dict()
        if model is None:
            model, dataset.df, categoricals, dataset.y = infer_model(dataset.df, dataset.features, dataset.y, n_jobs)
            self.fit_params["categorical_feature"] = categoricals
            # n_jobs was handed to the inferred model, so the LOFO loop itself runs sequentially.
            n_jobs = 1

        self.model = model
        self.dataset = dataset
        self.scoring = scoring
        self.cv = cv
        self.cv_groups = cv_groups
        self.n_jobs = n_jobs
        if self.n_jobs is not None and self.n_jobs > 1:
            warning_str = ("Warning: If your model is multithreaded, please initialise the number "
                           "of jobs of LOFO to be equal to 1, otherwise you may experience performance issues.")
            warnings.warn(warning_str)

        # cross_validate is given the fit parameters as "params" from sklearn 1.4 on,
        # and as "fit_params" on older versions.
        sklearn_version = tuple(map(int, sklearn.__version__.split(".")[:2]))
        self._cv_param_name = "params" if sklearn_version >= (1, 4) else "fit_params"

    def _get_cv_score(self, feature_to_remove):
        """Cross-validate the model with one feature left out (None keeps all features).

        Returns the array of per-fold test scores.
        """
        X, fit_params = self.dataset.getX(feature_to_remove=feature_to_remove, fit_params=self.fit_params)
        y = self.dataset.y

        kwargs = {self._cv_param_name: fit_params,
                  "cv": self.cv, "scoring": self.scoring, "groups": self.cv_groups}

        # Warnings raised during cross-validation are suppressed.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cv_results = cross_validate(self.model, X, y, **kwargs)
        return cv_results['test_score']

    def _get_cv_score_parallel(self, feature, result_queue):
        """Worker entry point: score one feature and put (feature, scores) on the queue."""
        test_score = self._get_cv_score(feature_to_remove=feature)
        result_queue.put((feature, test_score))
        return test_score

    def get_importance(self):
        """Run LOFO to get feature importances

        Returns
        -------
        importance_df : pandas dataframe
            Dataframe with feature names and corresponding importance mean and std (sorted by importance)
        """
        base_cv_score = self._get_cv_score(feature_to_remove=None)
        feature_list = self.dataset.feature_names + list(self.dataset.feature_groups.keys())

        # Importance is the baseline score minus the score obtained without the feature.
        if self.n_jobs is not None and self.n_jobs > 1:
            lofo_cv_result = parallel_apply(self._get_cv_score_parallel, feature_list, self.n_jobs)
            lofo_cv_scores_normalized = np.array([base_cv_score - lofo_cv_score for _, lofo_cv_score in lofo_cv_result])
            # Parallel results arrive in queue order, so the feature names are re-read from them.
            feature_list = [feature for feature, _ in lofo_cv_result]
        else:
            lofo_cv_scores = [self._get_cv_score(feature_to_remove=f) for f in tqdm(feature_list)]
            lofo_cv_scores_normalized = np.array([base_cv_score - lofo_cv_score for lofo_cv_score in lofo_cv_scores])

        return lofo_to_df(lofo_cv_scores_normalized, feature_list)
94 |
--------------------------------------------------------------------------------
/lofo/flofo_importance.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from tqdm.autonotebook import tqdm
4 | import warnings
5 | from sklearn.metrics import check_scoring
6 | from lofo.utils import lofo_to_df, parallel_apply
7 |
8 |
class FLOFOImportance:
    """
    Fast LOFO Importance
    Applies already trained model on validation set by noising one feature each time.

    Parameters
    ----------
    trained_model: model (sklearn API)
        The model should be trained already
    validation_df: pandas dataframe
    features: list of strings
        List of column names for features within validation_df
    target: string
        Column name for target within validation_df
    scoring: string or callable
        Same as scoring in sklearn API
    n_jobs: int, optional
        Number of jobs for parallel computation
    """

    def __init__(self, trained_model, validation_df, features, target,
                 scoring, n_jobs=None):
        self.trained_model = trained_model
        # Private copy: the caller's validation dataframe is never modified.
        self.df = validation_df.copy()
        self.features = features
        self.target = target
        self.n_jobs = n_jobs
        self.scorer = check_scoring(estimator=self.trained_model, scoring=scoring)

        # FLOFO defaults
        self.num_bins = 10
        self.shuffle_func = np.random.permutation
        self.feature_group_len = 2
        self.num_sampling = 10

        min_data_needed = 10*(self.num_bins**self.feature_group_len)
        if self.df.shape[0] < min_data_needed:
            raise Exception("Small validation set (<{})".format(min_data_needed))
        if len(self.features) <= self.feature_group_len:
            raise Exception("FLOFO needs more than {} features".format(self.feature_group_len))

        if self.n_jobs is not None and self.n_jobs > 1:
            warning_str = ("Warning: If your model is multithreaded, please initialise the number "
                           "of jobs of LOFO to be equal to 1, otherwise you may experience performance issues.")
            warnings.warn(warning_str)

        self._bin_features()

    def _bin_features(self):
        """Build self.bin_df: each feature's percentile rank scaled into integer bins 0..num_bins-1."""
        # Keeps the top percentile rank (1.0) from landing in bin num_bins.
        epsilon = 1e-10
        self.bin_df = pd.DataFrame()
        for feature in self.features:
            # Missing values are replaced by the feature median before ranking.
            self.bin_df[feature] = self.df[feature].fillna(self.df[feature].median())
            self.bin_df[feature] = (self.bin_df[feature].rank(pct=True)*(self.num_bins - epsilon)).astype(np.int32)

    def _get_score(self, updated_df):
        """Score the trained model on updated_df's feature columns against the stored target."""
        return self.scorer(self.trained_model, updated_df[self.features], self.df[self.target])

    def _run(self, feature_name, n):
        """Return n scores, each computed after shuffling feature_name within bin groups.

        For every repetition, feature_group_len other features are drawn at random and
        the values of feature_name are shuffled inside each combination of their bins.
        """
        scores = np.zeros(n)
        other_features = [feature for feature in self.features if feature != feature_name]
        for i in range(n):
            feature_list = np.random.choice(other_features,
                                            size=self.feature_group_len, replace=False).tolist()
            # Temporary column holding the raw feature values to shuffle per group.
            self.bin_df["__f__"] = self.df[feature_name].values
            mutated_df = self.df.copy()
            mutated_df[feature_name] = self.bin_df.groupby(feature_list)["__f__"].transform(self.shuffle_func).values
            scores[i] = self._get_score(mutated_df)
        return scores

    def _run_parallel(self, feature_name, result_queue):
        """Worker entry point: score one feature and put (feature_name, scores) on the queue."""
        test_score = self._run(feature_name, self.num_sampling)
        result_queue.put((feature_name, test_score))
        return test_score

    def get_importance(self, num_sampling=10, random_state=0):
        """Run FLOFO to get feature importances

        Parameters
        ----------
        num_sampling : int, optional
            Number of times features are shuffled
        random_state : int, optional
            Random seed

        Returns
        -------
        importance_df : pandas dataframe
            Dataframe with feature names and corresponding importance mean and std (sorted by importance)
        """
        np.random.seed(random_state)
        base_score = self._get_score(self.df)
        self.num_sampling = num_sampling

        if self.n_jobs is not None and self.n_jobs > 1:
            lofo_cv_scores = parallel_apply(self._run_parallel, self.features, self.n_jobs)
            lofo_cv_scores_normalized = np.array([base_score - lofo_cv_score for _, lofo_cv_score in lofo_cv_scores])
            # Parallel results arrive in queue order. Keep that order in a local only:
            # self.features must stay in the column order the model is scored with.
            feature_order = [feature_name for feature_name, _ in lofo_cv_scores]
        else:
            lofo_cv_scores = [self._run(f, num_sampling) for f in tqdm(self.features)]
            lofo_cv_scores_normalized = np.array([base_score - lofo_cv_score for lofo_cv_score in lofo_cv_scores])
            feature_order = self.features

        return lofo_to_df(lofo_cv_scores_normalized, feature_order)
113 |
--------------------------------------------------------------------------------
/lofo/dataset.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import numpy as np
3 | import scipy.sparse as ss
4 | from scipy.stats import spearmanr
5 | from lofo.utils import flatten_list
6 | import networkx as nx
7 |
8 |
class Dataset:
    """
    Dataset for LOFO

    Holds a copy of the dataframe, the target values and the list of features
    (single columns, automatically built correlation groups and user supplied
    feature-group matrices) that LOFO removes one at a time.

    Parameters
    ----------
    df: pandas dataframe
    target: string
        Column name for target within df
    features: list of strings
        List of column names within df
    feature_groups: dict, optional
        Name, value dictionary of feature groups as numpy.ndarray or scipy.sparse.csr_matrix
    auto_group_threshold: float, optional
        Threshold for grouping correlated features together, must be between 0 and 1

    Raises
    ------
    TypeError
        If a feature group is neither a numpy.ndarray nor a scipy.sparse.csr_matrix
    ValueError
        If a feature group has a wrong number of rows, if its name collides with a
        feature name, or if auto_group_threshold is outside [0, 1]
    """

    def __init__(self, df, target, features, feature_groups=None, auto_group_threshold=1.0):
        self.df = df.copy()
        self.features = list(features)
        self.feature_groups = feature_groups if feature_groups else dict()

        self.num_rows = df.shape[0]
        self.target_name = target
        self.y = df[self.target_name].values

        grouped_features, auto_groups = self.auto_group_features(auto_group_threshold)
        grouped = set(grouped_features)
        # dict.fromkeys removes duplicates like a set would, but keeps the caller's column
        # order so that the resulting feature order is reproducible between runs.
        ungrouped = [f for f in dict.fromkeys(self.features) if f not in grouped]
        # From here on every entry of self.features is a list of column names.
        self.features = [[f] for f in ungrouped] + auto_groups
        self.feature_names = [" & ".join(feature_list) for feature_list in self.features]

        if len(auto_groups) > 0:
            print("Automatically grouped features by correlation:")
            for i, group in enumerate(auto_groups, start=1):
                print(i, group)

        for feature_name, feature_matrix in self.feature_groups.items():
            # ss.csr_matrix is the public name; the ss.csr submodule is a deprecated private namespace.
            if not isinstance(feature_matrix, (np.ndarray, ss.csr_matrix)):
                raise TypeError("Data type {dtype} is not a valid type!".format(dtype=type(feature_matrix)))

            if feature_matrix.shape[0] != self.num_rows:
                raise ValueError("Expected {expected} rows but got {n} rows!".format(expected=self.num_rows,
                                                                                     n=feature_matrix.shape[0]))

            if feature_name in self.feature_names:
                same_name_exception = "Feature group name '{name}' is the same with one of the features!"
                raise ValueError(same_name_exception.format(name=feature_name))

    def auto_group_features(self, auto_group_threshold):
        """Group features whose absolute Spearman correlation exceeds the threshold

        Parameters
        ----------
        auto_group_threshold : float
            1.0 disables grouping, 0.0 puts every feature into a single group, values in
            between group the connected components of the "correlation > threshold" graph

        Returns
        -------
        grouped_features : list of strings
            Features that ended up in some group
        auto_groups : list of lists of strings
            The groups, each one sorted by feature name

        Raises
        ------
        ValueError
            If auto_group_threshold is not between 0 and 1 (inclusive)
        """
        if auto_group_threshold == 1.0:
            return [], []
        elif auto_group_threshold == 0.0:
            grouped_features = list(self.features)
            # A sorted list (not a set) keeps the group name deterministic and
            # consistent with the groups produced by the correlation branch below.
            auto_groups = [sorted(set(self.features))]
            return grouped_features, auto_groups
        elif 0 < auto_group_threshold < 1:
            num_features = len(self.features)
            if num_features < 2:
                # Nothing to correlate with, so nothing can be grouped.
                return [], []

            # Explicit float matrix: taking df[...].values would inherit an integer or
            # object dtype and silently truncate / break the encoded columns.
            feature_matrix = np.zeros((self.num_rows, num_features), dtype=float)

            for i, feature in enumerate(self.features):
                if self.df[feature].dtype.name == "category":
                    # Categorical columns are replaced by the mean target of their category.
                    feature_series = self.df.groupby(feature)[self.target_name].transform("mean")
                else:
                    feature_series = self.df[feature]
                feature_matrix[:, i] = feature_series.fillna(feature_series.mean()).fillna(0).values

            corr_matrix, _ = spearmanr(np.nan_to_num(feature_matrix))
            corr_matrix = np.abs(np.asarray(corr_matrix, dtype=float))
            if corr_matrix.ndim == 0:
                # With exactly two columns spearmanr returns a single coefficient
                # instead of a matrix; expand it so the indexing below works.
                rho = float(corr_matrix)
                corr_matrix = np.array([[1.0, rho], [rho, 1.0]])

            G = nx.Graph()

            for i, j in itertools.combinations(range(num_features), 2):
                if corr_matrix[i, j] > auto_group_threshold:
                    G.add_edge(i, j)

            # Only nodes with at least one edge are in G, so every component has 2+ features.
            groups = [[self.features[node] for node in component] for component in nx.connected_components(G)]

            auto_groups = [sorted(g) for g in groups]
            grouped_features = list(itertools.chain.from_iterable(groups))
            return grouped_features, auto_groups
        else:
            raise ValueError("auto_group_threshold must be between 0 and 1 (inclusive)!")

    def getX(self, feature_to_remove, fit_params):
        """Get feature matrix and fit_params after removing a feature

        Parameters
        ----------
        feature_to_remove : string
            feature name to remove
        fit_params : dict
            fit parameters for the model

        Returns
        -------
        X : numpy.ndarray or scipy sparse matrix
            Feature matrix; sparse when any remaining feature group is a csr_matrix
        fit_params: dict
            Updated fit_params after feature removal
        """
        feature_lists = [feature_list for feature_list, feature_name in zip(self.features, self.feature_names)
                         if feature_name != feature_to_remove]
        feature_list = flatten_list(feature_lists)
        concat_list = [self.df[feature_list].values]

        has_sparse = False
        for feature_name, feature_matrix in self.feature_groups.items():
            if feature_name != feature_to_remove:
                concat_list.append(feature_matrix)
                if isinstance(feature_matrix, ss.csr_matrix):
                    has_sparse = True

        # Work on a copy so the caller's fit_params is never modified.
        fit_params = fit_params.copy()
        if "categorical_feature" in fit_params:
            # Names are translated into column positions of the reduced dataframe part of X.
            cat_features = [f for f in fit_params["categorical_feature"] if f != feature_to_remove]
            fit_params["categorical_feature"] = [ix for ix, f in enumerate(feature_list) if (f in cat_features)]

        # np.hstack cannot stack sparse matrices, so switch to the scipy version when needed.
        concat = ss.hstack if has_sparse else np.hstack

        return concat(concat_list), fit_params
133 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | LOFO (Leave One Feature Out) Importance calculates the importances of a set of features based on a metric of choice, for a model of choice, by iteratively removing each feature from the set, and evaluating the performance of the model, with a validation scheme of choice, based on the chosen metric.
4 |
5 | LOFO first evaluates the performance of the model with all the input features included, then iteratively removes one feature at a time, retrains the model, and evaluates its performance on a validation set. The mean and standard deviation (across the folds) of the importance of each feature are then reported.
6 |
7 | If a model is not passed as an argument to LOFO Importance, it will run LightGBM as a default model.
8 |
9 | ## Install
10 |
11 | LOFO Importance can be installed using
12 |
13 | ```
14 | pip install lofo-importance
15 | ```
16 |
17 | ## Advantages of LOFO Importance
18 |
19 | LOFO has several advantages compared to other importance types:
20 |
21 | * It does not favor granular features
22 | * It generalises well to unseen test sets
23 | * It is model agnostic
24 | * It gives negative importance to features that hurt performance upon inclusion
25 | * It can group features, which is especially useful for high-dimensional features such as TF-IDF or one-hot-encoded (OHE) features.
26 | * It can automatically group highly correlated features to avoid underestimating their importance.
27 |
28 | ## Example on Kaggle's Microsoft Malware Prediction Competition
29 |
30 | In this Kaggle competition, Microsoft provides a malware dataset to predict whether or not a machine will soon be hit with malware. One of the features, Census_OSVersion, is very predictive on the training set, since some OS versions are probably more prone to bugs and failures than others. However, upon splitting the data out of time, we obtain validation sets with OS versions that have not occurred in the training set. Therefore, the model will not have learned the relationship between the target and this seasonal feature. By evaluating this feature's importance using other importance types, Census_OSVersion seems to have high importance, because its importance was evaluated using only the training set. However, LOFO Importance depends on a validation scheme, so it will not only give this feature low importance, but even negative importance.
31 |
32 | ```python
33 | import pandas as pd
34 | from sklearn.model_selection import KFold
35 | from lofo import LOFOImportance, Dataset, plot_importance
36 | %matplotlib inline
37 |
38 | # import data
39 | train_df = pd.read_csv("../input/train.csv", dtype=dtypes)
40 |
41 | # extract a sample of the data
42 | sample_df = train_df.sample(frac=0.01, random_state=0)
43 | sample_df.sort_values("AvSigVersion", inplace=True) # Sort by time for time split validation
44 |
45 | # define the validation scheme
46 | cv = KFold(n_splits=4, shuffle=False, random_state=None) # Don't shuffle, to keep the time-split validation
47 |
48 | # define the binary target and the features
49 | dataset = Dataset(df=sample_df, target="HasDetections", features=[col for col in train_df.columns if col != "HasDetections"])
50 |
51 | # define the validation scheme and scorer. The default model is LightGBM
52 | lofo_imp = LOFOImportance(dataset, cv=cv, scoring="roc_auc")
53 |
54 | # get the mean and standard deviation of the importances in pandas format
55 | importance_df = lofo_imp.get_importance()
56 |
57 | # plot the means and standard deviations of the importances
58 | plot_importance(importance_df, figsize=(12, 20))
59 | ```
60 |
61 | 
62 |
63 | ## Another Example: Kaggle's TReNDS Competition
64 |
65 | In this Kaggle competition, participants are asked to predict some cognitive properties of patients.
66 | Independent component features (IC) from sMRI and very high dimensional correlation features (FNC) from 3D fMRIs are provided.
67 | LOFO can group the fMRI correlation features into one.
68 |
69 | ```python
70 | def get_lofo_importance(target):
71 | cv = KFold(n_splits=7, shuffle=True, random_state=17)
72 |
73 | dataset = Dataset(df=df[df[target].notnull()], target=target, features=loading_features,
74 | feature_groups={"fnc": df[df[target].notnull()][fnc_features].values
75 | })
76 |
77 | model = Ridge(alpha=0.01)
78 | lofo_imp = LOFOImportance(dataset, cv=cv, scoring="neg_mean_absolute_error", model=model)
79 |
80 | return lofo_imp.get_importance()
81 |
82 | plot_importance(get_lofo_importance(target="domain1_var1"), figsize=(8, 8), kind="box")
83 | ```
84 |
85 | 
86 |
87 | ## Flofo Importance
88 |
89 | If running the LOFO Importance package is too time-costly for you, you can use Fast LOFO. Fast LOFO, or FLOFO takes, as inputs, an already trained model and a validation set, and does a pseudo-random permutation on the values of each feature, one by one, then uses the trained model to make predictions on the validation set. The mean of the FLOFO importance is then the difference in the performance of the model on the validation set over several randomised permutations.
90 | The difference between FLOFO importance and permutation importance is that the permutations on a feature's values are done within groups, where groups are obtained by grouping the validation set by k=2 features. These k features are chosen at random n=10 times, and the mean and standard deviation of the FLOFO importance are calculated based on these n runs.
91 | The reason this grouping makes the measure of importance better is that permuting a feature's value is no longer completely random. In fact, the permutations are done within groups of similar samples, so the permutations are equivalent to noising the samples. This ensures that:
92 |
93 | * The permuted feature values are very unlikely to be replaced by unrealistic values.
94 | * A feature that is predictable by features among the chosen n*k features will be replaced by very similar values during permutation. Therefore, it will only slightly affect the model performance (and will yield a small FLOFO importance). This solves the correlated feature overestimation problem.
95 |
--------------------------------------------------------------------------------
/LOFOImportance Example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "ExecuteTime": {
8 | "end_time": "2019-01-14T12:48:04.450500Z",
9 | "start_time": "2019-01-14T12:48:03.586150Z"
10 | }
11 | },
12 | "outputs": [
13 | {
14 | "name": "stderr",
15 | "output_type": "stream",
16 | "text": [
17 | "/home/aerdem/projects/lofo-importance/lofo/lofo_importance.py:3: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
18 | " from tqdm.autonotebook import tqdm\n"
19 | ]
20 | }
21 | ],
22 | "source": [
23 | "import numpy as np\n",
24 | "import pandas as pd\n",
25 | "from sklearn.linear_model import LinearRegression\n",
26 | "from sklearn.ensemble import RandomForestClassifier\n",
27 | "from sklearn.model_selection import KFold\n",
28 | "from lofo import LOFOImportance, FLOFOImportance, Dataset, plot_importance"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "metadata": {
35 | "ExecuteTime": {
36 | "end_time": "2019-01-14T12:48:04.513904Z",
37 | "start_time": "2019-01-14T12:48:04.453322Z"
38 | }
39 | },
40 | "outputs": [
41 | {
42 | "data": {
43 | "text/html": [
44 | "
\n",
45 | "\n",
58 | "
\n",
59 | " \n",
60 | " \n",
61 | " | \n",
62 | " A | \n",
63 | " B | \n",
64 | " C | \n",
65 | " D | \n",
66 | " D2 | \n",
67 | " target | \n",
68 | " binary_target | \n",
69 | "
\n",
70 | " \n",
71 | " \n",
72 | " \n",
73 | " | 0 | \n",
74 | " 0.548814 | \n",
75 | " 0.592880 | \n",
76 | " 0.811518 | \n",
77 | " 0.413962 | \n",
78 | " 0.443227 | \n",
79 | " 1.486305 | \n",
80 | " 1 | \n",
81 | "
\n",
82 | " \n",
83 | " | 1 | \n",
84 | " 0.715189 | \n",
85 | " 0.010064 | \n",
86 | " 0.476084 | \n",
87 | " 0.629618 | \n",
88 | " 0.686270 | \n",
89 | " 0.529949 | \n",
90 | " 0 | \n",
91 | "
\n",
92 | " \n",
93 | " | 2 | \n",
94 | " 0.602763 | \n",
95 | " 0.475826 | \n",
96 | " 0.523156 | \n",
97 | " 0.778584 | \n",
98 | " 0.792326 | \n",
99 | " 1.434674 | \n",
100 | " 1 | \n",
101 | "
\n",
102 | " \n",
103 | " | 3 | \n",
104 | " 0.544883 | \n",
105 | " 0.708770 | \n",
106 | " 0.250521 | \n",
107 | " 0.851558 | \n",
108 | " 0.886529 | \n",
109 | " 1.952046 | \n",
110 | " 1 | \n",
111 | "
\n",
112 | " \n",
113 | " | 4 | \n",
114 | " 0.423655 | \n",
115 | " 0.043975 | \n",
116 | " 0.605043 | \n",
117 | " 0.816413 | \n",
118 | " 0.821734 | \n",
119 | " 0.480267 | \n",
120 | " 0 | \n",
121 | "
\n",
122 | " \n",
123 | "
\n",
124 | "
"
125 | ],
126 | "text/plain": [
127 | " A B C D D2 target binary_target\n",
128 | "0 0.548814 0.592880 0.811518 0.413962 0.443227 1.486305 1\n",
129 | "1 0.715189 0.010064 0.476084 0.629618 0.686270 0.529949 0\n",
130 | "2 0.602763 0.475826 0.523156 0.778584 0.792326 1.434674 1\n",
131 | "3 0.544883 0.708770 0.250521 0.851558 0.886529 1.952046 1\n",
132 | "4 0.423655 0.043975 0.605043 0.816413 0.821734 0.480267 0"
133 | ]
134 | },
135 | "execution_count": 2,
136 | "metadata": {},
137 | "output_type": "execute_result"
138 | }
139 | ],
140 | "source": [
141 | "from data.test_data import generate_test_data, generate_unstructured_test_data\n",
142 | "\n",
143 | "df = generate_test_data(1000)\n",
144 | "df.head()"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 3,
150 | "metadata": {
151 | "ExecuteTime": {
152 | "end_time": "2019-01-14T12:48:04.689772Z",
153 | "start_time": "2019-01-14T12:48:04.527994Z"
154 | }
155 | },
156 | "outputs": [
157 | {
158 | "data": {
159 | "application/vnd.jupyter.widget-view+json": {
160 | "model_id": "581710f7ae654f5387a06a2befa408e4",
161 | "version_major": 2,
162 | "version_minor": 0
163 | },
164 | "text/plain": [
165 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))"
166 | ]
167 | },
168 | "metadata": {},
169 | "output_type": "display_data"
170 | },
171 | {
172 | "name": "stdout",
173 | "output_type": "stream",
174 | "text": [
175 | "\n"
176 | ]
177 | },
178 | {
179 | "data": {
180 | "text/html": [
181 | "\n",
182 | "\n",
195 | "
\n",
196 | " \n",
197 | " \n",
198 | " | \n",
199 | " feature | \n",
200 | " importance_mean | \n",
201 | " importance_std | \n",
202 | " val_imp_0 | \n",
203 | " val_imp_1 | \n",
204 | " val_imp_2 | \n",
205 | " val_imp_3 | \n",
206 | " val_imp_4 | \n",
207 | " val_imp_5 | \n",
208 | " val_imp_6 | \n",
209 | " val_imp_7 | \n",
210 | " val_imp_8 | \n",
211 | " val_imp_9 | \n",
212 | "
\n",
213 | " \n",
214 | " \n",
215 | " \n",
216 | " | 1 | \n",
217 | " B | \n",
218 | " 0.540217 | \n",
219 | " 0.016008 | \n",
220 | " 0.532118 | \n",
221 | " 0.544225 | \n",
222 | " 0.513259 | \n",
223 | " 0.524307 | \n",
224 | " 0.525612 | \n",
225 | " 0.542536 | \n",
226 | " 0.550588 | \n",
227 | " 5.682745e-01 | \n",
228 | " 0.559869 | \n",
229 | " 0.541383 | \n",
230 | "
\n",
231 | " \n",
232 | " | 3 | \n",
233 | " D | \n",
234 | " 0.089187 | \n",
235 | " 0.002629 | \n",
236 | " 0.088832 | \n",
237 | " 0.086291 | \n",
238 | " 0.087612 | \n",
239 | " 0.085380 | \n",
240 | " 0.086004 | \n",
241 | " 0.090378 | \n",
242 | " 0.091582 | \n",
243 | " 9.345964e-02 | \n",
244 | " 0.090800 | \n",
245 | " 0.091527 | \n",
246 | "
\n",
247 | " \n",
248 | " | 0 | \n",
249 | " A | \n",
250 | " 0.088167 | \n",
251 | " 0.002935 | \n",
252 | " 0.090739 | \n",
253 | " 0.086158 | \n",
254 | " 0.085259 | \n",
255 | " 0.093299 | \n",
256 | " 0.088281 | \n",
257 | " 0.088402 | \n",
258 | " 0.083172 | \n",
259 | " 9.189529e-02 | \n",
260 | " 0.087086 | \n",
261 | " 0.087376 | \n",
262 | "
\n",
263 | " \n",
264 | " | 2 | \n",
265 | " C | \n",
266 | " 0.000002 | \n",
267 | " 0.000033 | \n",
268 | " 0.000088 | \n",
269 | " -0.000020 | \n",
270 | " -0.000012 | \n",
271 | " -0.000027 | \n",
272 | " -0.000016 | \n",
273 | " 0.000004 | \n",
274 | " 0.000031 | \n",
275 | " -8.312825e-07 | \n",
276 | " -0.000002 | \n",
277 | " -0.000021 | \n",
278 | "
\n",
279 | " \n",
280 | "
\n",
281 | "
"
282 | ],
283 | "text/plain": [
284 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n",
285 | "1 B 0.540217 0.016008 0.532118 0.544225 0.513259 \n",
286 | "3 D 0.089187 0.002629 0.088832 0.086291 0.087612 \n",
287 | "0 A 0.088167 0.002935 0.090739 0.086158 0.085259 \n",
288 | "2 C 0.000002 0.000033 0.000088 -0.000020 -0.000012 \n",
289 | "\n",
290 | " val_imp_3 val_imp_4 val_imp_5 val_imp_6 val_imp_7 val_imp_8 \\\n",
291 | "1 0.524307 0.525612 0.542536 0.550588 5.682745e-01 0.559869 \n",
292 | "3 0.085380 0.086004 0.090378 0.091582 9.345964e-02 0.090800 \n",
293 | "0 0.093299 0.088281 0.088402 0.083172 9.189529e-02 0.087086 \n",
294 | "2 -0.000027 -0.000016 0.000004 0.000031 -8.312825e-07 -0.000002 \n",
295 | "\n",
296 | " val_imp_9 \n",
297 | "1 0.541383 \n",
298 | "3 0.091527 \n",
299 | "0 0.087376 \n",
300 | "2 -0.000021 "
301 | ]
302 | },
303 | "execution_count": 3,
304 | "metadata": {},
305 | "output_type": "execute_result"
306 | }
307 | ],
308 | "source": [
309 | "lr = LinearRegression()\n",
310 | "lr.fit(df[[\"A\", \"B\", \"C\", \"D\"]], df[\"target\"])\n",
311 | "\n",
312 | "fi = FLOFOImportance(lr, df, [\"A\", \"B\", \"C\", \"D\"], 'target', scoring=\"neg_mean_absolute_error\")\n",
313 | "\n",
314 | "importances = fi.get_importance()\n",
315 | "importances"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 4,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "data": {
325 | "application/vnd.jupyter.widget-view+json": {
326 | "model_id": "83d88dddf5bd4965937f0ba96949fcd2",
327 | "version_major": 2,
328 | "version_minor": 0
329 | },
330 | "text/plain": [
331 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))"
332 | ]
333 | },
334 | "metadata": {},
335 | "output_type": "display_data"
336 | },
337 | {
338 | "name": "stdout",
339 | "output_type": "stream",
340 | "text": [
341 | "\n"
342 | ]
343 | },
344 | {
345 | "data": {
346 | "text/html": [
347 | "\n",
348 | "\n",
361 | "
\n",
362 | " \n",
363 | " \n",
364 | " | \n",
365 | " feature | \n",
366 | " importance_mean | \n",
367 | " importance_std | \n",
368 | " val_imp_0 | \n",
369 | " val_imp_1 | \n",
370 | " val_imp_2 | \n",
371 | " val_imp_3 | \n",
372 | "
\n",
373 | " \n",
374 | " \n",
375 | " \n",
376 | " | 1 | \n",
377 | " B | \n",
378 | " 0.447206 | \n",
379 | " 0.024244 | \n",
380 | " 0.432768 | \n",
381 | " 0.418559 | \n",
382 | " 0.454714 | \n",
383 | " 0.482782 | \n",
384 | "
\n",
385 | " \n",
386 | " | 3 | \n",
387 | " A | \n",
388 | " 0.053247 | \n",
389 | " 0.006699 | \n",
390 | " 0.049021 | \n",
391 | " 0.044487 | \n",
392 | " 0.060269 | \n",
393 | " 0.059213 | \n",
394 | "
\n",
395 | " \n",
396 | " | 2 | \n",
397 | " D | \n",
398 | " 0.052560 | \n",
399 | " 0.003008 | \n",
400 | " 0.051912 | \n",
401 | " 0.057638 | \n",
402 | " 0.050646 | \n",
403 | " 0.050044 | \n",
404 | "
\n",
405 | " \n",
406 | " | 0 | \n",
407 | " C | \n",
408 | " -0.000057 | \n",
409 | " 0.000116 | \n",
410 | " 0.000119 | \n",
411 | " -0.000165 | \n",
412 | " -0.000023 | \n",
413 | " -0.000159 | \n",
414 | "
\n",
415 | " \n",
416 | "
\n",
417 | "
"
418 | ],
419 | "text/plain": [
420 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n",
421 | "1 B 0.447206 0.024244 0.432768 0.418559 0.454714 \n",
422 | "3 A 0.053247 0.006699 0.049021 0.044487 0.060269 \n",
423 | "2 D 0.052560 0.003008 0.051912 0.057638 0.050646 \n",
424 | "0 C -0.000057 0.000116 0.000119 -0.000165 -0.000023 \n",
425 | "\n",
426 | " val_imp_3 \n",
427 | "1 0.482782 \n",
428 | "3 0.059213 \n",
429 | "2 0.050044 \n",
430 | "0 -0.000159 "
431 | ]
432 | },
433 | "execution_count": 4,
434 | "metadata": {},
435 | "output_type": "execute_result"
436 | }
437 | ],
438 | "source": [
439 | "from sklearn.metrics import make_scorer, mean_absolute_error\n",
440 | "\n",
441 | "scorer = make_scorer(mean_absolute_error, greater_is_better=False)\n",
442 | "cv = KFold(n_splits=4, shuffle=True, random_state=0)\n",
443 | "\n",
444 | "dataset = Dataset(df=df, target=\"target\", features=[\"A\", \"B\", \"C\", \"D\"])\n",
445 | "fi = LOFOImportance(dataset, scoring=scorer, model=LinearRegression(), cv=cv)\n",
446 | "\n",
447 | "importances = fi.get_importance()\n",
448 | "importances"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 5,
454 | "metadata": {
455 | "ExecuteTime": {
456 | "end_time": "2019-01-14T12:48:05.103111Z",
457 | "start_time": "2019-01-14T12:48:04.692682Z"
458 | }
459 | },
460 | "outputs": [
461 | {
462 | "data": {
463 | "application/vnd.jupyter.widget-view+json": {
464 | "model_id": "b3feb8d91fc54a699c316d64a3729bb6",
465 | "version_major": 2,
466 | "version_minor": 0
467 | },
468 | "text/plain": [
469 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))"
470 | ]
471 | },
472 | "metadata": {},
473 | "output_type": "display_data"
474 | },
475 | {
476 | "name": "stdout",
477 | "output_type": "stream",
478 | "text": [
479 | "\n"
480 | ]
481 | },
482 | {
483 | "data": {
484 | "text/html": [
485 | "\n",
486 | "\n",
499 | "
\n",
500 | " \n",
501 | " \n",
502 | " | \n",
503 | " feature | \n",
504 | " importance_mean | \n",
505 | " importance_std | \n",
506 | " val_imp_0 | \n",
507 | " val_imp_1 | \n",
508 | " val_imp_2 | \n",
509 | " val_imp_3 | \n",
510 | "
\n",
511 | " \n",
512 | " \n",
513 | " \n",
514 | " | 1 | \n",
515 | " B | \n",
516 | " 0.414 | \n",
517 | " 0.025377 | \n",
518 | " 0.432 | \n",
519 | " 0.416 | \n",
520 | " 0.436 | \n",
521 | " 0.372 | \n",
522 | "
\n",
523 | " \n",
524 | " | 2 | \n",
525 | " D | \n",
526 | " 0.040 | \n",
527 | " 0.007483 | \n",
528 | " 0.032 | \n",
529 | " 0.052 | \n",
530 | " 0.040 | \n",
531 | " 0.036 | \n",
532 | "
\n",
533 | " \n",
534 | " | 3 | \n",
535 | " A | \n",
536 | " 0.038 | \n",
537 | " 0.012806 | \n",
538 | " 0.044 | \n",
539 | " 0.024 | \n",
540 | " 0.056 | \n",
541 | " 0.028 | \n",
542 | "
\n",
543 | " \n",
544 | " | 0 | \n",
545 | " C | \n",
546 | " 0.016 | \n",
547 | " 0.007483 | \n",
548 | " 0.016 | \n",
549 | " 0.028 | \n",
550 | " 0.008 | \n",
551 | " 0.012 | \n",
552 | "
\n",
553 | " \n",
554 | "
\n",
555 | "
"
556 | ],
557 | "text/plain": [
558 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n",
559 | "1 B 0.414 0.025377 0.432 0.416 0.436 \n",
560 | "2 D 0.040 0.007483 0.032 0.052 0.040 \n",
561 | "3 A 0.038 0.012806 0.044 0.024 0.056 \n",
562 | "0 C 0.016 0.007483 0.016 0.028 0.008 \n",
563 | "\n",
564 | " val_imp_3 \n",
565 | "1 0.372 \n",
566 | "2 0.036 \n",
567 | "3 0.028 \n",
568 | "0 0.012 "
569 | ]
570 | },
571 | "execution_count": 5,
572 | "metadata": {},
573 | "output_type": "execute_result"
574 | }
575 | ],
576 | "source": [
577 | "rf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)\n",
578 | "\n",
579 | "dataset = Dataset(df=df, target=\"binary_target\", features=[\"A\", \"B\", \"C\", \"D\"])\n",
580 | "fi = LOFOImportance(dataset, scoring='accuracy', model=rf, cv=cv)\n",
581 | "\n",
582 | "importances = fi.get_importance()\n",
583 | "importances"
584 | ]
585 | },
586 | {
587 | "cell_type": "code",
588 | "execution_count": 6,
589 | "metadata": {},
590 | "outputs": [
591 | {
592 | "data": {
593 | "application/vnd.jupyter.widget-view+json": {
594 | "model_id": "8c0f86bcbf3147ed8944d3da1e96aa6c",
595 | "version_major": 2,
596 | "version_minor": 0
597 | },
598 | "text/plain": [
599 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))"
600 | ]
601 | },
602 | "metadata": {},
603 | "output_type": "display_data"
604 | },
605 | {
606 | "name": "stdout",
607 | "output_type": "stream",
608 | "text": [
609 | "\n"
610 | ]
611 | },
612 | {
613 | "data": {
614 | "text/html": [
615 | "\n",
616 | "\n",
629 | "
\n",
630 | " \n",
631 | " \n",
632 | " | \n",
633 | " feature | \n",
634 | " importance_mean | \n",
635 | " importance_std | \n",
636 | " val_imp_0 | \n",
637 | " val_imp_1 | \n",
638 | " val_imp_2 | \n",
639 | " val_imp_3 | \n",
640 | "
\n",
641 | " \n",
642 | " \n",
643 | " \n",
644 | " | 1 | \n",
645 | " B | \n",
646 | " 0.210579 | \n",
647 | " 0.006848 | \n",
648 | " 0.218465 | \n",
649 | " 0.199696 | \n",
650 | " 0.210972 | \n",
651 | " 0.213183 | \n",
652 | "
\n",
653 | " \n",
654 | " | 3 | \n",
655 | " A | \n",
656 | " 0.011146 | \n",
657 | " 0.006722 | \n",
658 | " 0.002382 | \n",
659 | " 0.007626 | \n",
660 | " 0.014447 | \n",
661 | " 0.020127 | \n",
662 | "
\n",
663 | " \n",
664 | " | 2 | \n",
665 | " D | \n",
666 | " 0.008892 | \n",
667 | " 0.005367 | \n",
668 | " 0.000255 | \n",
669 | " 0.008582 | \n",
670 | " 0.012859 | \n",
671 | " 0.013872 | \n",
672 | "
\n",
673 | " \n",
674 | " | 0 | \n",
675 | " C | \n",
676 | " -0.000171 | \n",
677 | " 0.004295 | \n",
678 | " -0.001984 | \n",
679 | " -0.005786 | \n",
680 | " 0.001156 | \n",
681 | " 0.005931 | \n",
682 | "
\n",
683 | " \n",
684 | "
\n",
685 | "
"
686 | ],
687 | "text/plain": [
688 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n",
689 | "1 B 0.210579 0.006848 0.218465 0.199696 0.210972 \n",
690 | "3 A 0.011146 0.006722 0.002382 0.007626 0.014447 \n",
691 | "2 D 0.008892 0.005367 0.000255 0.008582 0.012859 \n",
692 | "0 C -0.000171 0.004295 -0.001984 -0.005786 0.001156 \n",
693 | "\n",
694 | " val_imp_3 \n",
695 | "1 0.213183 \n",
696 | "3 0.020127 \n",
697 | "2 0.013872 \n",
698 | "0 0.005931 "
699 | ]
700 | },
701 | "execution_count": 6,
702 | "metadata": {},
703 | "output_type": "execute_result"
704 | }
705 | ],
706 | "source": [
707 | "df = generate_unstructured_test_data(10000)\n",
708 | "\n",
709 | "dataset = Dataset(df=df, target=\"binary_target\", features=[\"A\", \"B\", \"C\", \"D\"])\n",
710 | "fi = LOFOImportance(dataset, 'roc_auc')\n",
711 | "\n",
712 | "importances = fi.get_importance()\n",
713 | "importances"
714 | ]
715 | },
716 | {
717 | "cell_type": "code",
718 | "execution_count": 7,
719 | "metadata": {},
720 | "outputs": [
721 | {
722 | "data": {
723 | "application/vnd.jupyter.widget-view+json": {
724 | "model_id": "0bebf9dcd42d41cb862a45b1678d6807",
725 | "version_major": 2,
726 | "version_minor": 0
727 | },
728 | "text/plain": [
729 | "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))"
730 | ]
731 | },
732 | "metadata": {},
733 | "output_type": "display_data"
734 | },
735 | {
736 | "name": "stdout",
737 | "output_type": "stream",
738 | "text": [
739 | "\n"
740 | ]
741 | },
742 | {
743 | "data": {
744 | "text/html": [
745 | "\n",
746 | "\n",
759 | "
\n",
760 | " \n",
761 | " \n",
762 | " | \n",
763 | " feature | \n",
764 | " importance_mean | \n",
765 | " importance_std | \n",
766 | " val_imp_0 | \n",
767 | " val_imp_1 | \n",
768 | " val_imp_2 | \n",
769 | " val_imp_3 | \n",
770 | "
\n",
771 | " \n",
772 | " \n",
773 | " \n",
774 | " | 0 | \n",
775 | " E | \n",
776 | " 0.535000 | \n",
777 | " 0.004561 | \n",
778 | " 0.532726 | \n",
779 | " 0.529386 | \n",
780 | " 0.541721 | \n",
781 | " 0.536167 | \n",
782 | "
\n",
783 | " \n",
784 | " | 2 | \n",
785 | " B | \n",
786 | " 0.300475 | \n",
787 | " 0.003054 | \n",
788 | " 0.299546 | \n",
789 | " 0.301227 | \n",
790 | " 0.296328 | \n",
791 | " 0.304798 | \n",
792 | "
\n",
793 | " \n",
794 | " | 4 | \n",
795 | " D | \n",
796 | " 0.047203 | \n",
797 | " 0.001125 | \n",
798 | " 0.048536 | \n",
799 | " 0.047944 | \n",
800 | " 0.046706 | \n",
801 | " 0.045625 | \n",
802 | "
\n",
803 | " \n",
804 | " | 3 | \n",
805 | " A | \n",
806 | " 0.038683 | \n",
807 | " 0.001377 | \n",
808 | " 0.038191 | \n",
809 | " 0.040831 | \n",
810 | " 0.037031 | \n",
811 | " 0.038679 | \n",
812 | "
\n",
813 | " \n",
814 | " | 1 | \n",
815 | " C | \n",
816 | " -0.000693 | \n",
817 | " 0.000305 | \n",
818 | " -0.000725 | \n",
819 | " -0.000689 | \n",
820 | " -0.000249 | \n",
821 | " -0.001109 | \n",
822 | "
\n",
823 | " \n",
824 | "
\n",
825 | "
"
826 | ],
827 | "text/plain": [
828 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n",
829 | "0 E 0.535000 0.004561 0.532726 0.529386 0.541721 \n",
830 | "2 B 0.300475 0.003054 0.299546 0.301227 0.296328 \n",
831 | "4 D 0.047203 0.001125 0.048536 0.047944 0.046706 \n",
832 | "3 A 0.038683 0.001377 0.038191 0.040831 0.037031 \n",
833 | "1 C -0.000693 0.000305 -0.000725 -0.000689 -0.000249 \n",
834 | "\n",
835 | " val_imp_3 \n",
836 | "0 0.536167 \n",
837 | "2 0.304798 \n",
838 | "4 0.045625 \n",
839 | "3 0.038679 \n",
840 | "1 -0.001109 "
841 | ]
842 | },
843 | "execution_count": 7,
844 | "metadata": {},
845 | "output_type": "execute_result"
846 | }
847 | ],
848 | "source": [
849 | "dataset = Dataset(df=df, target=\"target\", features=[\"A\", \"B\", \"C\", \"D\", \"E\"])\n",
850 | "fi = LOFOImportance(dataset, scorer, n_jobs=-1)\n",
851 | "\n",
852 | "importances = fi.get_importance()\n",
853 | "importances"
854 | ]
855 | },
856 | {
857 | "cell_type": "code",
858 | "execution_count": 8,
859 | "metadata": {},
860 | "outputs": [
861 | {
862 | "data": {
863 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAekAAAHSCAYAAADIczP5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAYI0lEQVR4nO3df7DddZ3f8dcnCRCWX62AFoGY4CDIEsKPm1SYsgZYRUdFC4zKug4w66BLUTvtSHHakaVOu+2uXaxoBWe74DK4OtphxtrW+mPJLGgVLjboCBiEDZhqazarIbIJmx/v/pFrDCE/Tticcz733sdj5g7n1z3f9/3cy33me873nNuqKgBAf+aMewAAYPdEGgA6JdIA0CmRBoBOiTQAdEqkAaBT88Y9wK6OOeaYWrhw4bjHAICRePDBB/+qqo7d3XXdRXrhwoWZnJwc9xgAMBKttSf3dJ2HuwGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0Cn5o17gFFoN7VxjwDANFY31li2a08aADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6NW/YG2itbU3yvZ0u+mxV/bthbxcApruhRzrJxqo6cwTbAYAZZRSRBoDp5fbnnl1+z/Idp1esWDGyMUbxnPShrbWVO328bdcbtNauaa1NttYm165dO4KRAKB/raqGu4HWflFVhw96+4mJiZqcnDywM9zUDuj9ATC71I3Da2Vr7cGqmtjddY7uBoBOiTQAdGoUB44d2lpbudP5L1fVDSPYLgBMa0OPdFXNHfY2AGAm8nA3AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE7NG/cAo1A31rhHAID9Zk8aADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOzRv3AKPQbmrjHgFmvbqxxj0CTDv2pAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFNDjXRrbWtrbWVr7aHW2ndaa+cNc3sAMJPMG/L9b6yqM5OktXZxkt9P8uohbxMAZoRhR3pnRyb52Qi3B4zb7b86ufye5UmSFStWjGUUmI6GHelDW2srk8xPclySC3d3o9baNUmuSZIFCxYMeSQAmB5aVQ3vzlv7RVUdPnX63CR/nOT02stGJyYmanJy8sDOcVM7oPcH7L+6cXi/a2A6a609WFUTu7tuZEd3V9X/SnJMkmNHtU0AmM5GFunW2qlJ5iZZN6ptAsB0NqrnpJOkJbmyqrYOeZsAMCMMNdJVNXeY9w8AM5l3HAOATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQqXnjHmAU6sYa9wgAsN/sSQNAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQA
dEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0Kl54x5gFNpNbdwj7FA31rhHAGCasCcNAJ0SaQDolEgDQKdEGgA6JdIA0Kl9Rrpt99uttQ9NnV/QWls2/NEAYHYbZE/6PyU5N8kVU+c3JPnE0CYCAJIM9jrpf1hVZ7fW/neSVNXPWmsHD3kuAJj1BtmT3txam5ukkqS1dmySbUOdCgAYKNIfS3J3khe31v5NkvuS/NuhTgUA7P3h7tbanCR/meT6JBclaUneUlWPjGA2AJjV9hrpqtrWWvtEVZ2V5NERzQQAZLCHu7/eWrustdbPX6kAgFlgkEi/O8nnkzzbWnu6tbahtfb0kOcCgFlvny/BqqojRjEIAPBc+4x0a+03dnd5Vf3FgR8HAPilQd7M5AM7nZ6fZFmSB5NcOJSJAIAkgz3c/aadz7fWTkzy0X19Xmtta5LvJTkoyZYkf5rk5qryRigAMIBB9qR3tSbJKwe43caqOjNJWmsvTvKZJEcmufEFbBMAZp1BnpO+JVNvCZrtR4OfmeQ7+7ORqvppa+2aJA+01n6vqmqfnzRDLV++PCtWrBj3GABMA4PsSU/udHpLkj+rqm/s74aq6omp9wB/cZL/t/N1UwG/JkkWLFiwv3cNADPSIJH+e1X1H3e+oLX2/l0v+7uoqk8l+VSSTExMzOi9bHvRAAxqkDczuXI3l121vxtqrZ2UZGuSn+7v5wLAbLTHPenW2hVJfivJotbaF3e66ogkf70/G5n685a3Jvn4bH4+GgD2x94e7v5mkp8kOSbJf9jp8g1JvjvAfR/aWluZX70E684kf/QC5wSAWWePka6qJ5M8meTcF3LHVTX3hQ4FAAzwnHRr7VWttQdaa79orf1ta22rP7ABAMM3yIFjH09yRZLHkhya5F1JPjHMoQCAwSKdqvphkrlVtbWqbk/yuuGOBQAM8jrpv2mtHZxkZWvtD7L9YLKB4g4AvHCDxPadU7e7LskzSU5MctkwhwIABvsrWE+21g5NclxV3TSCmQCADHZ095uSrEzy5anzZ+7y5iYAwBAM8nD37yVZluTnSVJVK5MsGuJMAEAGi/Tmqlq/y2Xe2hMAhmyQo7u/31r7rSRzW2snJ3lftr9lKAAwRHvck26t3Tl18vEkv57k2SR/luTpJP90+KMBwOy2tz3pc1prL03ytiQX5Ll/ZOPXkmwa5mAAMNvtLdK3Jvl6kpOSTO50ecv256RPGuJcADDr7fHh7qr6WFW9MsmfVNVJO30sqiqBBoAh2+fR3VX1u6MYBAB4Lu/BDQCdGuQlWNNe3ehl3QBMP/akAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDo1LxxDzAK7aY28m3WjTXybQIws9iTBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATg090q21t7TWqrV26rC3BQAzySj2pK9Ict/UfwGAAQ010q21w5P8oyS/k+Ttw9xWV25Pli9fPu4pAJjmhr0n/eYkX66qVUnWtdbO2d2NWmvXtNYmW2uTa9euHfJIADA9DDvSVyT57NTpz2YPD3lX1aeqaqKqJo499tghjzQCVycrVqwY9xQATHPzhnXHrbUXJbkwyeLWWiWZm6Raax+oqhrWdgFg
phjmnvTlSe6sqpdV1cKqOjHJXyY5f4jbBIAZY5iRviLJ3btc9l/iKG8AGMjQHu6uqgt2c9nHhrU9AJhpvOMYAHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE7NG/cAo1A31rhHAID9Zk8aADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOzfxItzbuCQDgBZn5kQaAaUqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRq6JFurf2D1tpnW2uPt9YebK3999baK4a9XQCY7uYN885bay3J3Uk+XVVvn7psSZKXJFk1zG0DwHQ37D3pC5Jsrqpbf3lBVT1UVfcOebs7LB/VhgDgABt2pE9P8uC+btRau6a1Ntlam1y7du2QRwKA6aGLA8eq6lNVNVFVE8cee+wBve8VB/TeAGB0hh3p7yc5Z8jbAIAZadiR/vMkh7TWrvnlBa21M1pr5w95uwAw7Q010lVVSf5xkt+cegnW95P8fpL/O8ztAsBMMNSXYCVJVf04yVuHvR0AmGm6OHAMAHg+kQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOjU0P9UJQDPt3nz5qxZsyabNm0a9yiMyPz583PCCSfkoIMOGvhzRBpgDNasWZMjjjgiCxcuTGtt3OMwZFWVdevWZc2aNVm0aNHAnzfzH+6uGvcEAM+zadOmHH300QI9S7TWcvTRR+/3IyczP9IAnRLo2eWFfL9FGgA6JdIAPWjtwH4M4LzzzhvyF/Vcq1evzmc+85mRbnO6E2mAWeqb3/zmyLa1ZcsWkX4BRBpgljr88MOTJCtWrMirX/3qvPnNb85JJ52UG264IXfddVeWLVuWxYsX5/HHH0+SXHXVVXnPe96TiYmJvOIVr8iXvvSlJNsPgrv66quzePHinHXWWbnnnnuSJHfccUcuueSSXHjhhbnoootyww035N57782ZZ56Zm2++OatXr87555+fs88+O2efffaOfzSsWLEiy5cvz+WXX55TTz0173jHO1JTBwE/8MADOe+887JkyZIsW7YsGzZsyNatW/OBD3wgS5cuzRlnnJHbbrttj1/zoF/r2rVrc9lll2Xp0qVZunRpvvGNbyRJ7r///px77rk566yzct555+UHP/jBjq/10ksvzete97qcfPLJuf766w/MN6mquvo455xzCmCme/jhh597wfbXohy4jwEcdthhVVV1zz331FFHHVU//vGPa9OmTfXSl760PvShD1VV1Uc/+tF6//vfX1VVV155ZV188cW1devWWrVqVR1//PG1cePG+shHPlJXX311VVU98sgjdeKJJ9bGjRvr9ttvr+OPP77WrVu3YztveMMbdmz/mWeeqY0bN1ZV1apVq+qXv//vueeeOvLII+tHP/pRbd26tV71qlfVvffeW88++2wtWrSo7r///qqqWr9+fW3evLluu+22+vCHP1xVVZs2bapzzjmnnnjiid1+zYN+rVdccUXde++9VVX15JNP1qmnnvqcbVZVffWrX61LL720qqpuv/32WrRoUf385z+vjRs31oIFC+qpp5563vaf932vqiSTtYcmep00AFm6dGmOO+64JMnLX/7yvPa1r02SLF68eMeecZK89a1vzZw5c3LyySfnpJNO
yqOPPpr77rsv733ve5Mkp556al72spdl1apVSZLXvOY1edGLXrTbbW7evDnXXXddVq5cmblz5+74nCRZtmxZTjjhhCTJmWeemdWrV+eoo47Kcccdl6VLlyZJjjzyyCTJV77ylXz3u9/NF77whSTJ+vXr89hjj+3x9ciDfK1f+9rX8vDDD+/4nKeffjq/+MUvsn79+lx55ZV57LHH0lrL5s2bd9zmoosuylFHHZUkOe200/Lkk0/mxBNP3Nuy75NIA5BDDjlkx+k5c+bsOD9nzpxs2bJlx3W7voxoXy8rOuyww/Z43c0335yXvOQleeihh7Jt27bMnz9/t/PMnTv3OTPsqqpyyy235OKLL97rLLu77z19rdu2bcu3vvWt58yUJNddd10uuOCC3H333Vm9enWWL1/+gmYelOekARjY5z//+Wzbti2PP/54nnjiiZxyyik5//zzc9dddyVJVq1alaeeeiqnnHLK8z73iCOOyIYNG3acX79+fY477rjMmTMnd955Z7Zu3brXbZ9yyin5yU9+kgceeCBJsmHDhmzZsiUXX3xxPvnJT+7Yq121alWeeeaZv9PX+drXvja33HLLjvMrV67cMfPxxx+fZPvz0MMm0gA9ONDPSg/JggULsmzZsrz+9a/Prbfemvnz5+faa6/Ntm3bsnjx4rztbW/LHXfc8Zy9yl8644wzMnfu3CxZsiQ333xzrr322nz605/OkiVL8uijj+51rztJDj744Hzuc5/Le9/73ixZsiSvec1rsmnTprzrXe/KaaedlrPPPjunn3563v3ud/+d92I/9rGPZXJyMmeccUZOO+203HrrrUmS66+/Ph/84Adz1llnHZA95X1p1dnbZk5MTNTk5OS4xwAYqkceeSSvfOUrxz3Gfrnqqqvyxje+MZdffvm4R5m2dvd9b609WFUTu7u9PWkA6JQDxwAYyCiegz1Qvve97+Wd73zncy475JBD8u1vf3tME70wIg3AjLN48eIdB3tNZx7uBhiT3o4JYrheyPdbpAHGYP78+Vm3bp1QzxJVlXXr1j3vddf74uFugDE44YQTsmbNmqxdu3bcozAi8+fP3/EuaoMSaYAxOOigg/b4tpXwSx7uBoBOiTQAdEqkAaBT3b0taGttbZInD/DdHpPkrw7wfc5U1mpw1mpw1mpw1mpwM2WtXlZVx+7uiu4iPQyttck9vS8qz2WtBmetBmetBmetBjcb1srD3QDQKZEGgE7Nlkh/atwDTCPWanDWanDWanDWanAzfq1mxXPSADAdzZY9aQCYdmZUpFtrr2ut/aC19sPW2g27uf6Q1trnpq7/dmtt4ein7MMAa/UbrbXvtNa2tNYuH8eMvRhgrf5Za+3h1tp3W2tfb629bBxz9mCAtXpPa+17rbWVrbX7WmunjWPOHuxrrXa63WWttWqtzeijmPdmgJ+rq1pra6d+rla21t41jjmHoqpmxEeSuUkeT3JSkoOTPJTktF1uc22SW6dOvz3J58Y9d8drtTDJGUn+NMnl456587W6IMmvTZ3+XT9Xe12rI3c6fUmSL4977l7Xaup2RyT5iyTfSjIx7rl7XaskVyX5+LhnHcbHTNqTXpbkh1X1RFX9bZLPJnnzLrd5c5JPT53+QpKLWmtthDP2Yp9rVVWrq+q7SbaNY8CODLJW91TV30yd/VaS/fszNzPHIGv19E5nD0syWw+KGeT3VZJ8OMm/T7JplMN1ZtC1mpFmUqSPT/Kjnc6vmbpst7epqi1J1ic5eiTT9WWQtWK7/V2r30nyP4Y6Ub8GWqvW2j9prT2e5A+SvG9Es/Vmn2vVWjs7yYlV9d9GOViHBv1/8LKpp5y+0Fo7cTSjDd9MijSMVWvtt5NMJPnDcc/Ss6r6RFW9PMm/SPKvxj1Pj1prc5L8UZJ/Pu5Zpon/mmRhVZ2R5Kv51SOm095MivT/SbLzv55OmLpst7dprc1LclSSdSOZri+DrBXbDbRWrbXfTPIvk1xSVc+OaLbe
7O/P1WeTvGWoE/VrX2t1RJLTk6xora1O8qokX5ylB4/t8+eqqtbt9P/dHyc5Z0SzDd1MivQDSU5urS1qrR2c7QeGfXGX23wxyZVTpy9P8uc1ddTBLDPIWrHdPteqtXZWktuyPdA/HcOMvRhkrU7e6ewbkjw2wvl6ste1qqr1VXVMVS2sqoXZfqzDJVU1OZ5xx2qQn6vjdjp7SZJHRjjfUM0b9wAHSlVtaa1dl+R/ZvvRgH9SVd9vrf3rJJNV9cUk/znJna21Hyb562z/Zs86g6xVa21pkruT/P0kb2qt3VRVvz7GscdiwJ+rP0xyeJLPTx2H+FRVXTK2ocdkwLW6bupRh81JfpZf/aN5VhlwrcjAa/W+1tolSbZk++/2q8Y28AHmHccAoFMz6eFuAJhRRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDo1P8HOXhARHlxcfMAAAAASUVORK5CYII=\n",
864 | "text/plain": [
865 | ""
866 | ]
867 | },
868 | "metadata": {
869 | "needs_background": "light"
870 | },
871 | "output_type": "display_data"
872 | }
873 | ],
874 | "source": [
875 | "%matplotlib inline\n",
876 | "plot_importance(importances)"
877 | ]
878 | },
879 | {
880 | "cell_type": "code",
881 | "execution_count": 9,
882 | "metadata": {},
883 | "outputs": [
884 | {
885 | "name": "stdout",
886 | "output_type": "stream",
887 | "text": [
888 | "Automatically grouped features by correlation:\n",
889 | "1 ['D', 'D2']\n"
890 | ]
891 | },
892 | {
893 | "data": {
894 | "application/vnd.jupyter.widget-view+json": {
895 | "model_id": "1b33b416ef064bb7acd791d8345ae7fd",
896 | "version_major": 2,
897 | "version_minor": 0
898 | },
899 | "text/plain": [
900 | "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))"
901 | ]
902 | },
903 | "metadata": {},
904 | "output_type": "display_data"
905 | },
906 | {
907 | "name": "stdout",
908 | "output_type": "stream",
909 | "text": [
910 | "\n"
911 | ]
912 | },
913 | {
914 | "data": {
915 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfcAAAHSCAYAAADxFIKiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAS20lEQVR4nO3cf4zkd33f8dcbn5NcDVhtTK5Rod4EUZGUwxC2PxWaJUU0kVOTKCgCpZWpQKe2IfyRqOKkVkW0f2BKE6khbekpaXCIKqOmQXLjhEKTjKKI0PYMNifqlmKw27RqIQ5yOefa2O6nf+w6Xm/2fN899vb7nfc+HpJ1M7Pf/c773prz82Z2bmqMEQCgj+fMPQAAcLjEHQCaEXcAaEbcAaAZcQeAZsQdAJo5MfcAh+Wmm24aGxsbh3rOxx57LDfccMOhnrMje5rOrqaxp2nsabquu7r33nt/e4zxgr23t4n7xsZGzp8/f6jnXK1W2draOtRzdmRP09nVNPY0jT1N13VXVfXwfrd7WR4AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZk7MPQAArKtb3vXRPHrp8STJ877lbL7ywB2//7UbT16f+9/5ulnmEncAuEqPXno8D91xa5Lk9J1nf/9ykmycvWeusbwsDwDdiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7AFylh9/zPZOPraprOMkziTsANCPuANCMuANAM+IOAM2IOwA0I+4A0MyJuQe4nKp6MsmFXTfdNca4Y655AGBdLDbuSS6NMV4x9xAAsG68LA8AzSz5mfvJqrpv1/V3jzE+tPuAqjqT5EySnDp1KqvV6lAHuHjx4qGfsyN7ms6uprGnaexpumu5q93n3XsfG2fvueyx11KNMY7kjg6qqi6OMZ479fjNzc1x/vz5Q51htVpla2vrUM/ZkT1NZ1fT2NM09jTdtdpVVeWpjp6+83Qu3P70W8U2zt6Th+64dd9jD/H+7x1jbO693cvyANCMuANAM+v0M/ePjDHOzjYNAKyJxcZ9jHHd3DMAwDrysjwANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A8BVuvkdvzj52KP8uHdxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZk7MPQAArLONs/ckSZ73LU9fTpIbT14/10jiDgBX66E7bt117dbLHnfUvCwPAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADRzYu4BAPb6oV95LM95+HS+8sAdufHk9bn/na+beyRYK565A4vz2OPbvz50x6159NLj8w4Da0jcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHFqWqJt0GXJ64A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNLDLuVfVkVd1XVfdX1Ser6s/PPRMArIsTcw9wGZfGGK9Ikqr6S0neneQ75h0JANbDIp+57/H8JF+eewgAWBdLfeZ+sqruS/J1Sb4xyXfud1BVnUlyJklOnTqV1Wp1qENcvHjx0M/ZkT1NZ1cHs3tX9vYHeTxNd9x2tdS4735Z/s8l+dmqetkYY+w+aIxxLsm5JNnc3BxbW1uHOsRqtcphn7Mje5rOrg5ma2sr+cg9T1/mGTyepjtuu1r8y/JjjN9M
clOSF8w9CwCsg8XHvapemuS6JI/MPQsArIOlviz/1M/ck6SS3D7GeHLOgQBgXSwy7mOM6+aeAQDW1eJflgcADkbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxBxZlz6dMX/Y24PLEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3YLE2zt6TG09eP/cYsHZOzD0AwF4f+K4bsrV1Ye4xYG155g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPua+L0nadzy7s+OvcYAKwBcV8jj156fO4RAFgD4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+K+BqoqSfLwe75n5kkAWAfiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0MwV415VT1bVfVX1maq6v6p+tKr2/b6qeklV/fuq+nRV/dtnOedGVV2qqk9V1QM73/PmXV//wZ1zXKiqj1fVLVf1uwOAY+jEhGMujTFekSRV9Q1J/kWS5yd55z7Hnk3yT8cYP1NV33SF8z44xnjlznm/OckvVFWNMX4myReSfMcY48tV9d1JziX5M9N+SwBwvB3oZfkxxheTnEnytnrqY9Oe6feSvHDn2C8c4LyfT/IjSd6+c/3jY4wv73z5E0+dEwC4sinP3J9hjPH5qrouyTck+V97vvxgkndU1afGGL94wFN/MslL97n9LUl+eb9vqKoz2f7LRk6dOpXVanXAu3x2Fy9ePPRzfrWWNk+yzD0tlV1NY0/T2NN0x21XB4775VTVtyV5XZJXJvlYVf1Okt/MdvBfPMYYVzrFPud8Tbbj/u37fcMY41y2X7LP5ubm2Nrauur597NarXLY5/xqLW2eZJl7Wiq7msaeprGn6Y7brg4c952fjz+Z5It7vvTaJB8fY/xWVX1fkruTvD/JL00Ie7L9l4IHdt3Py5P8VJLvHmM8ctA5AeC4OtDP3KvqBdkO9k/uE+xPJXl9Vd04xvhPSd6b5MeS/NyE824k+YdJ3rdz/Y8n+YUkf3WM8dmDzAgAx92UZ+4nq+q+JNcneSLJB5P8+N6Dxhgfq6qfS/KJqvrdbL/j/a8l+UBVvXqM8aU93/LiqvpUkq9L8pUkPzHG+MDO1/5ukq9P8k923rf3xBhj88C/OwA4hq4Y9zHGdVNPNsb4sWw/W9/tA/sc91CSk89ynrcmeevU+wUAnuYT6gCgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsR9DTz1eUE3v+OgH9cPwHEk7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4r5EbT14/9wgArIETcw/ANBduvzD3CACsCc/cAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxH1hTt95Ore866NzjwHAGhP3BXr00uNzjwDAGhN3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoR9wWpqn0vA8BBiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Azi417VX1vVY2qeuncswDAOlls3JO8Kclv7PwKAEy0yLhX1XOTfHuStyR548zjAMBa
OTH3AJfx+iQfGWN8tqoeqapXjTHu3XtQVZ1JciZJTp06ldVqdahDXLx48dDPeRBz3vdBzL2ndWJX09jTNPY03XHb1VLj/qYk/2jn8l071/9A3McY55KcS5LNzc2xtbV1qEOsVqsc9jkPYs77Poi597RO7Goae5rGnqY7brtaXNyr6o8k+c4kp6tqJLkuyaiqvzXGGPNOBwDLt8Sfub8hyQfHGDePMTbGGC9K8oUkr555LgBYC0uM+5uSfHjPbf8q3jUPAJMs7mX5McZr9rntJ+aYBQDW0RKfuQMAXwVxB4BmxB0AmhF3AGhG3AGgGXEHgGbEfUF2fwCfD+MD4GqJOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuC3TjyevnHgGANXZi7gF4pgu3X5h7BADWnGfuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4v4sfvjhH557BAA4MHEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxP0yqmruEQDgqog7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM4uNe1X90aq6q6oerKp7q+qXqupPzD0XACzdibkH2E9tf4LMh5PcOcZ4485ttyQ5leSzc84GAEu3yLgneU2Sx8cY73/qhjHG/TPOAwBrY6lxf1mSe690UFWdSXImSU6dOpXVanXog1yLc3Zz8eJFe5rIrqaxp2nsabrjtqulxn2SMca5JOeSZHNzc2xtbR36fVyLc3azWq3saSK7msaeprGn6Y7brpb6hrrPJHnV3EMAwDpaatx/NcnX7rzsniSpqpdX1atnnAkA1sIi4z7GGEm+L8lrd/4p3GeSvDvJ/5x3MgBYvsX+zH2M8T+S/MDccwDAulnkM3cA4OqJOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7pex/U/tAWD9iDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLg/i/fd/L65RwCAAxN3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJqpMcbcMxyKqvpSkocP+bQ3JfntQz5nR/Y0nV1NY0/T2NN0XXd18xjjBXtvbBP3a6Gqzo8xNueeY+nsaTq7msaeprGn6Y7brrwsDwDNiDsANCPuz+7c3AOsCXuazq6msadp7Gm6Y7UrP3MHgGY8cweAZsQ9SVV9V1X956r6XFWd3efrX1tVH9r5+r+rqo2jn3J+E/b0F6rqk1X1RFW9YY4Zl2DCnn6kqv5jVX26qn6lqm6eY84lmLCrv15VF6rqvqr6jar61jnmnNuV9rTruO+vqlFVx+Zd4btNeDy9uaq+tPN4uq+q3jrHnEdijHGs/0tyXZIHk3xzkq9Jcn+Sb91zzN9M8v6dy29M8qG5517onjaSvDzJzyZ5w9wzL3hPr0nyh3Yu/43j+Hg6wK6ev+vybUk+MvfcS9zTznHPS/LrST6RZHPuuZe4pyRvTvKTc896FP955p786SSfG2N8fozxe0nuSvL6Pce8PsmdO5d/PslfrKo6whmX4Ip7GmM8NMb4dJL/N8eACzFlT782xvjdnaufSPLCI55xKabs6n/vunpDkuP4JqEp/49Kkr+f5D1J/s9RDrcgU/d0LIh78seS/Ldd139r57Z9jxljPJHk0SRffyTTLceUPXHwPb0lyS9f04mWa9KuquqH
qurBJP8gyduPaLYlueKequrbkrxojHHPUQ62MFP/7H3/zo/Efr6qXnQ0ox09cYeZVNVfSbKZ5L1zz7JkY4x/PMZ4cZJ3JPk7c8+zNFX1nCQ/nuRH555lDfzrJBtjjJcn+ViefkW2HXFP/nuS3X97e+HObfseU1UnktyY5JEjmW45puyJiXuqqtcm+dtJbhtj/N8jmm1pDvqYuivJ917TiZbpSnt6XpKXJVlV1UNJ/mySu4/hm+qu+HgaYzyy68/bTyV51RHNduTEPfkPSV5SVd9UVV+T7TfM3b3nmLuT3L5z+Q1JfnXsvDvjGJmyJybsqapemeSfZTvsX5xhxqWYsquX7Lp6a5L/coTzLcWz7mmM8egY46YxxsYYYyPb7+O4bYxxfp5xZzPl8fSNu67eluSBI5zvSJ2Ye4C5jTGeqKq3Jfk32X635T8fY3ymqv5ekvNjjLuT/HSSD1bV55L8TrYfNMfKlD1V1Z9K8uEkfzjJX66qd40x/uSMYx+5iY+n9yZ5bpJ/ufO+zP86xrhttqFnMnFXb9t5lePxJF/O03/JPjYm7unYm7int1fVbUmeyPb/y98828DXmE+oA4BmvCwPAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDP/HwaXyDutlQJMAAAAAElFTkSuQmCC\n",
916 | "text/plain": [
917 | ""
918 | ]
919 | },
920 | "metadata": {
921 | "needs_background": "light"
922 | },
923 | "output_type": "display_data"
924 | }
925 | ],
926 | "source": [
927 | "dataset = Dataset(df=df, target=\"target\", features=[\"A\", \"B\", \"C\", \"D\", \"D2\", \"E\"], \n",
928 | " auto_group_threshold=0.7)\n",
929 | "fi = LOFOImportance(dataset, scorer, n_jobs=-1)\n",
930 | "\n",
931 | "importances = fi.get_importance()\n",
932 | "importances\n",
933 | "\n",
934 | "\n",
935 | "plot_importance(importances, kind=\"box\")"
936 | ]
937 | },
938 | {
939 | "cell_type": "code",
940 | "execution_count": 10,
941 | "metadata": {},
942 | "outputs": [
943 | {
944 | "name": "stderr",
945 | "output_type": "stream",
946 | "text": [
947 | "/home/aerdem/projects/lofo-importance/lofo/lofo_importance.py:45: UserWarning: Warning: If your model is multithreaded, please initialise the numberof jobs of LOFO to be equal to 1, otherwise you may experience performance issues.\n",
948 | " warnings.warn(warning_str)\n"
949 | ]
950 | },
951 | {
952 | "data": {
953 | "text/html": [
954 | "\n",
955 | "\n",
968 | "
\n",
969 | " \n",
970 | " \n",
971 | " | \n",
972 | " feature | \n",
973 | " importance_mean | \n",
974 | " importance_std | \n",
975 | " val_imp_0 | \n",
976 | " val_imp_1 | \n",
977 | " val_imp_2 | \n",
978 | " val_imp_3 | \n",
979 | "
\n",
980 | " \n",
981 | " \n",
982 | " \n",
983 | " | 5 | \n",
984 | " names | \n",
985 | " 0.232698 | \n",
986 | " 0.025097 | \n",
987 | " 2.480101e-01 | \n",
988 | " 0.193688 | \n",
989 | " 0.229018 | \n",
990 | " 0.260076 | \n",
991 | "
\n",
992 | " \n",
993 | " | 3 | \n",
994 | " B | \n",
995 | " 0.002641 | \n",
996 | " 0.001413 | \n",
997 | " 4.608295e-03 | \n",
998 | " 0.003212 | \n",
999 | " 0.000838 | \n",
1000 | " 0.001906 | \n",
1001 | "
\n",
1002 | " \n",
1003 | " | 2 | \n",
1004 | " A | \n",
1005 | " 0.000484 | \n",
1006 | " 0.000535 | \n",
1007 | " 1.256808e-03 | \n",
1008 | " -0.000140 | \n",
1009 | " 0.000140 | \n",
1010 | " 0.000681 | \n",
1011 | "
\n",
1012 | " \n",
1013 | " | 1 | \n",
1014 | " D | \n",
1015 | " -0.000067 | \n",
1016 | " 0.000205 | \n",
1017 | " -1.110223e-16 | \n",
1018 | " 0.000140 | \n",
1019 | " 0.000000 | \n",
1020 | " -0.000408 | \n",
1021 | "
\n",
1022 | " \n",
1023 | " | 4 | \n",
1024 | " interactions | \n",
1025 | " -0.000071 | \n",
1026 | " 0.000155 | \n",
1027 | " 0.000000e+00 | \n",
1028 | " -0.000140 | \n",
1029 | " -0.000279 | \n",
1030 | " 0.000136 | \n",
1031 | "
\n",
1032 | " \n",
1033 | " | 0 | \n",
1034 | " C | \n",
1035 | " -0.000102 | \n",
1036 | " 0.000203 | \n",
1037 | " -1.396453e-04 | \n",
1038 | " 0.000140 | \n",
1039 | " 0.000000 | \n",
1040 | " -0.000408 | \n",
1041 | "
\n",
1042 | " \n",
1043 | "
\n",
1044 | "
"
1045 | ],
1046 | "text/plain": [
1047 | " feature importance_mean importance_std val_imp_0 val_imp_1 \\\n",
1048 | "5 names 0.232698 0.025097 2.480101e-01 0.193688 \n",
1049 | "3 B 0.002641 0.001413 4.608295e-03 0.003212 \n",
1050 | "2 A 0.000484 0.000535 1.256808e-03 -0.000140 \n",
1051 | "1 D -0.000067 0.000205 -1.110223e-16 0.000140 \n",
1052 | "4 interactions -0.000071 0.000155 0.000000e+00 -0.000140 \n",
1053 | "0 C -0.000102 0.000203 -1.396453e-04 0.000140 \n",
1054 | "\n",
1055 | " val_imp_2 val_imp_3 \n",
1056 | "5 0.229018 0.260076 \n",
1057 | "3 0.000838 0.001906 \n",
1058 | "2 0.000140 0.000681 \n",
1059 | "1 0.000000 -0.000408 \n",
1060 | "4 -0.000279 0.000136 \n",
1061 | "0 0.000000 -0.000408 "
1062 | ]
1063 | },
1064 | "execution_count": 10,
1065 | "metadata": {},
1066 | "output_type": "execute_result"
1067 | }
1068 | ],
1069 | "source": [
1070 | "from sklearn.feature_extraction.text import CountVectorizer\n",
1071 | "from lightgbm import LGBMClassifier\n",
1072 | "\n",
1073 | "df = generate_test_data(1000, text=True)\n",
1074 | "features = [\"A\", \"B\", \"C\", \"D\"]\n",
1075 | "\n",
1076 | "cv = CountVectorizer(ngram_range=(3, 3), analyzer=\"char\")\n",
1077 | "feature_groups = dict()\n",
1078 | "feature_groups[\"names\"] = cv.fit_transform(df[\"T\"])\n",
1079 | "feature_groups[\"interactions\"] = df[[\"A\", \"B\"]].values*df[[\"C\", \"D\"]].values\n",
1080 | "\n",
1081 | "dataset = Dataset(df=df, target=\"binary_target\", features=features, feature_groups=feature_groups)\n",
1082 | "\n",
1083 | "lgbm = LGBMClassifier(random_state=0, n_jobs=1)\n",
1084 | "\n",
1085 | "lofo = LOFOImportance(dataset, model=lgbm, cv=4, scoring='roc_auc', n_jobs=4)\n",
1086 | "\n",
1087 | "importances = lofo.get_importance()\n",
1088 | "importances"
1089 | ]
1090 | },
1091 | {
1092 | "cell_type": "code",
1093 | "execution_count": null,
1094 | "metadata": {},
1095 | "outputs": [],
1096 | "source": []
1097 | },
1098 | {
1099 | "cell_type": "code",
1100 | "execution_count": null,
1101 | "metadata": {},
1102 | "outputs": [],
1103 | "source": []
1104 | }
1105 | ],
1106 | "metadata": {
1107 | "kernelspec": {
1108 | "display_name": "Python 3",
1109 | "language": "python",
1110 | "name": "python3"
1111 | },
1112 | "language_info": {
1113 | "codemirror_mode": {
1114 | "name": "ipython",
1115 | "version": 3
1116 | },
1117 | "file_extension": ".py",
1118 | "mimetype": "text/x-python",
1119 | "name": "python",
1120 | "nbconvert_exporter": "python",
1121 | "pygments_lexer": "ipython3",
1122 | "version": "3.6.9"
1123 | },
1124 | "toc": {
1125 | "colors": {
1126 | "hover_highlight": "#DAA520",
1127 | "navigate_num": "#000000",
1128 | "navigate_text": "#333333",
1129 | "running_highlight": "#FF0000",
1130 | "selected_highlight": "#FFD700",
1131 | "sidebar_border": "#EEEEEE",
1132 | "wrapper_background": "#FFFFFF"
1133 | },
1134 | "moveMenuLeft": true,
1135 | "nav_menu": {
1136 | "height": "12px",
1137 | "width": "252px"
1138 | },
1139 | "navigate_menu": true,
1140 | "number_sections": true,
1141 | "sideBar": true,
1142 | "threshold": 4,
1143 | "toc_cell": false,
1144 | "toc_section_display": "block",
1145 | "toc_window_display": false,
1146 | "widenNotebook": false
1147 | }
1148 | },
1149 | "nbformat": 4,
1150 | "nbformat_minor": 2
1151 | }
1152 |
--------------------------------------------------------------------------------