├── .flake8 ├── MANIFEST.in ├── .gitattributes ├── docs ├── lofo_logo.png ├── plot_importance.png ├── pydata_feb19_lofo.pdf └── plot_importance_box.png ├── requirements.txt ├── lofo ├── __init__.py ├── infer_defaults.py ├── utils.py ├── plotting.py ├── lofo_importance.py ├── flofo_importance.py └── dataset.py ├── setup.py ├── LICENSE ├── .github └── workflows │ ├── python-publish.yml │ └── python-app.yml ├── data └── test_data.py ├── tests ├── test_dataset.py ├── test_flofo_importance.py └── test_lofo_importance.py ├── .gitignore ├── README.md └── LOFOImportance Example.ipynb /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length=120 3 | exclude = */__init__.py -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include *.txt 3 | include README.md 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py linguist-language=python 2 | *.ipynb linguist-documentation -------------------------------------------------------------------------------- /docs/lofo_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerdem4/lofo-importance/HEAD/docs/lofo_logo.png -------------------------------------------------------------------------------- /docs/plot_importance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerdem4/lofo-importance/HEAD/docs/plot_importance.png -------------------------------------------------------------------------------- /docs/pydata_feb19_lofo.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerdem4/lofo-importance/HEAD/docs/pydata_feb19_lofo.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy>=1.17.3 2 | pandas 3 | scipy 4 | scikit-learn>=0.20.3 5 | tqdm 6 | lightgbm 7 | networkx -------------------------------------------------------------------------------- /docs/plot_importance_box.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aerdem4/lofo-importance/HEAD/docs/plot_importance_box.png -------------------------------------------------------------------------------- /lofo/__init__.py: -------------------------------------------------------------------------------- 1 | from .lofo_importance import LOFOImportance 2 | from .flofo_importance import FLOFOImportance 3 | from .dataset import Dataset 4 | from .plotting import plot_importance 5 | -------------------------------------------------------------------------------- /lofo/infer_defaults.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.preprocessing import LabelEncoder 3 | from lightgbm import LGBMClassifier, LGBMRegressor 4 | from lofo.utils import flatten_list 5 | 6 | 7 | def infer_model(df, features, y, n_jobs): 8 | model_class = LGBMRegressor 9 | if len(np.unique(y)) == 2: 10 | y = LabelEncoder().fit_transform(y) 11 | model_class = LGBMClassifier 12 | 13 | categoricals = df[flatten_list(features)].select_dtypes(exclude=[np.number]).columns.tolist() 14 | for f in categoricals: 15 | df[f] = LabelEncoder().fit_transform(df[f].astype(str)) 16 | 17 | min_child_samples = int(0.01*df.shape[0]) 18 | 19 | model = model_class(min_child_samples=min_child_samples, n_jobs=n_jobs) 20 | 21 | return model, df, categoricals, y 22 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('requirements.txt') as f: 4 | requirements = f.read().splitlines() 5 | 6 | with open("README.md", "r") as fh: 7 | long_description = fh.read() 8 | 9 | setup( 10 | name='lofo-importance', 11 | version='0.3.5', 12 | url="https://github.com/aerdem4/lofo-importance", 13 | author="Ahmet Erdem", 14 | author_email="ahmeterd4@gmail.com", 15 | description="Leave One Feature Out Importance", 16 | keywords="feature importance selection explainable data-science machine-learning", 17 | long_description=long_description, 18 | long_description_content_type="text/markdown", 19 | packages=find_packages(), 20 | install_requires=requirements, 21 | classifiers=[ 22 | "Programming Language :: Python :: 3", 23 | "License :: OSI Approved :: MIT License", 24 | "Operating System :: OS Independent", 25 | ] 26 | ) 27 | -------------------------------------------------------------------------------- /lofo/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import pandas as pd 3 | 4 | 5 | def lofo_to_df(lofo_scores, feature_list): 6 | importance_df = pd.DataFrame() 7 | importance_df["feature"] = feature_list 8 | importance_df["importance_mean"] = lofo_scores.mean(axis=1) 9 | importance_df["importance_std"] = lofo_scores.std(axis=1) 10 | 11 | for val_score in range(lofo_scores.shape[1]): 12 | importance_df["val_imp_{}".format(val_score)] = lofo_scores[:, val_score] 13 | 14 | return importance_df.sort_values("importance_mean", ascending=False) 15 | 16 | 17 | def parallel_apply(cv_func, feature_list, n_jobs): 18 | pool = multiprocessing.Pool(n_jobs) 19 | manager = multiprocessing.Manager() 20 | result_queue = manager.Queue() 21 | 22 | for f in feature_list: 23 | pool.apply_async(cv_func, (f, result_queue)) 24 | 25 | pool.close() 26 | pool.join() 27 | 28 | lofo_cv_result = [result_queue.get() for _ in range(len(feature_list))] 29 | return lofo_cv_result 30 | 31 | 32 | def flatten_list(nested_list): 33 | return [item for sublist in nested_list for item in sublist] 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Ahmet Erdem 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_KEY }} 40 | -------------------------------------------------------------------------------- /lofo/plotting.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | 4 | def plot_importance(importance_df, figsize=(8, 8), kind="default"): 5 | """Plot feature importance 6 | 7 | Parameters 8 | ---------- 9 | importance_df : pandas dataframe 10 | Output dataframe from LOFO/FLOFO get_importance 11 | kind : string 12 | plot type can be default or box 13 | figsize : tuple 14 | """ 15 | importance_df = importance_df.copy() 16 | importance_df["color"] = (importance_df["importance_mean"] > 0).map({True: 'g', False: 'r'}) 17 | importance_df.sort_values("importance_mean", inplace=True) 18 | 19 | available_kinds = {"default", "box"} 20 | if kind not in available_kinds: 21 | warnings.warn("{kind} not in {ak}. Setting to default".format(kind=kind, ak=available_kinds)) 22 | 23 | if kind == "default": 24 | importance_df.plot(x="feature", y="importance_mean", xerr="importance_std", 25 | kind='barh', color=importance_df["color"], figsize=figsize) 26 | elif kind == "box": 27 | lofo_score_cols = [col for col in importance_df.columns if col.startswith("val_imp")] 28 | features = importance_df["feature"].values.tolist() 29 | importance_df.set_index("feature")[lofo_score_cols].T.boxplot(column=features, vert=False, figsize=figsize) 30 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python application 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | build: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python 3.10 23 | uses: actions/setup-python@v3 24 | with: 25 | python-version: "3.10" 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install flake8 pytest matplotlib 30 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # stop the build if there are Python syntax errors or undefined names 34 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 35 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 36 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 37 | - name: Test with pytest 38 | run: | 39 | python -m pytest tests 40 | -------------------------------------------------------------------------------- /data/test_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def _to_binary(target): 6 | return (target > target.median()).astype(int) 7 | 8 | 9 | def generate_test_data(data_size, text=False): 10 | df = pd.DataFrame() 11 | 12 | np.random.seed(0) 13 | df["A"] = np.random.rand(data_size) 14 | df["B"] = np.random.rand(data_size) 15 | df["C"] = np.random.rand(data_size) 16 | df["D"] = np.random.rand(data_size) 17 | 18 | df["D2"] = df["D"].values + 0.1*np.random.rand(data_size) 19 | df.loc[df["D2"] > 1, "D2"] = None 20 | 21 | df["target"] = 0.2 * np.random.rand(data_size) + df["A"] * df["D"] + 2 * df["B"] 22 | df["binary_target"] = _to_binary(df["target"]) 23 | 24 | if text: 25 | df["T"] = np.random.choice(["Bojack", "Horseman", "Todd", "Chavez"], data_size) 26 | df["target"] *= (df["T"] == "Todd") 27 | df["binary_target"] *= (df["T"] == "Todd") 28 | 29 | return df 30 | 31 | 32 | def generate_unstructured_test_data(data_size, text=False): 33 | df = generate_test_data(data_size, text) 34 | df.loc[np.random.rand(data_size) < 0.3, "A"] = None 35 | df["E"] = np.random.choice(["category1", "category2", "category3"], data_size) 36 | df["E"] = df["E"].astype("category") 37 | 38 | df["target"] = (df["E"] != "category2")*df["target"] 39 | df["binary_target"] = _to_binary(df["target"]) 40 | return df 41 | -------------------------------------------------------------------------------- /tests/test_dataset.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from data.test_data import generate_unstructured_test_data 3 | from lofo import Dataset 4 | 5 | 6 | def test_dataset(): 7 | df = generate_unstructured_test_data(1000, text=True) 8 | features = ["A", "B", "C", "D", "D2", "E"] 9 | 10 | # Exception: feature group row count is not equal to the features' row count 11 | feature_groups = {"interactions": df[["A", "B"]].values[:10]*df[["C", "D"]].values[:10]} 12 | with pytest.raises(Exception): 13 | assert Dataset(df=df, target="binary_target", features=features, feature_groups=feature_groups) 14 | 15 | # Exception: Feature group name A is in use by other features 16 | feature_groups = {"A": df[["A", "B"]].values*df[["C", "D"]].values} 17 | with pytest.raises(Exception): 18 | assert Dataset(df=df, target="binary_target", features=features, feature_groups=feature_groups) 19 | 20 | # Exception: Feature group type is not numpy.ndarray or scipy.csr.csr_matrix 21 | feature_groups = {"F": df[["A", "B"]]} 22 | with pytest.raises(Exception): 23 | assert Dataset(df=df, target="binary_target", features=features, feature_groups=feature_groups) 24 | 25 | d = Dataset(df=df, target="binary_target", features=features, feature_groups={"F": df[["A", "B"]].values}, 26 | auto_group_threshold=0.5) 27 | assert "D" not in d.feature_names and "D2" not in d.feature_names 28 | assert "D & D2" in d.feature_names and "F" in d.feature_groups.keys() 29 | -------------------------------------------------------------------------------- /tests/test_flofo_importance.py: -------------------------------------------------------------------------------- 1 | from sklearn.model_selection import train_test_split 2 | from lightgbm import LGBMClassifier 3 | from data.test_data import generate_test_data 4 | from lofo.plotting import plot_importance 5 | from lofo.flofo_importance import FLOFOImportance 6 | 7 | 8 | def test_flofo_importance(): 9 | df = generate_test_data(100000) 10 | df.loc[df["A"] < df["A"].median(), "A"] = None 11 | 12 | train_df, val_df = train_test_split(df, test_size=0.2, random_state=0) 13 | val_df_checkpoint = val_df.copy() 14 | 15 | features = ["A", "B", "C", "D"] 16 | 17 | lgbm = LGBMClassifier(random_state=0, n_jobs=1) 18 | lgbm.fit(train_df[features], train_df["binary_target"]) 19 | 20 | flofo = FLOFOImportance(lgbm, df, features, 'binary_target', scoring='roc_auc') 21 | flofo_parallel = FLOFOImportance(lgbm, df, features, 'binary_target', scoring='roc_auc', n_jobs=3) 22 | 23 | importance_df = flofo.get_importance() 24 | importance_df_parallel = flofo_parallel.get_importance() 25 | is_feature_order_same = importance_df["feature"].values == importance_df_parallel["feature"].values 26 | 27 | plot_importance(importance_df) 28 | 29 | assert is_feature_order_same.sum() == len(features), "Parallel FLOFO returned different result!" 30 | assert val_df.equals(val_df_checkpoint), "LOFOImportance mutated the dataframe!" 31 | assert len(features) == importance_df.shape[0], "Missing importance value for some features!" 32 | assert importance_df["feature"].values[0] == "B", "Most important feature is different than B!" 33 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | .idea 107 | -------------------------------------------------------------------------------- /tests/test_lofo_importance.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import CountVectorizer 2 | from sklearn.linear_model import LogisticRegression 3 | from lightgbm import LGBMClassifier 4 | from sklearn.model_selection import KFold 5 | from lofo import LOFOImportance, Dataset, plot_importance 6 | from data.test_data import generate_test_data, generate_unstructured_test_data 7 | 8 | 9 | def test_lofo_importance(): 10 | df = generate_test_data(1000) 11 | features = ["A", "B", "C", "D"] 12 | dataset = Dataset(df=df, target="binary_target", features=features) 13 | 14 | lgbm = LGBMClassifier(random_state=0, n_jobs=4) 15 | 16 | lofo = LOFOImportance(dataset, model=lgbm, cv=4, scoring='roc_auc') 17 | 18 | importance_df = lofo.get_importance() 19 | 20 | plot_importance(importance_df) 21 | 22 | assert len(features) == importance_df.shape[0], "Missing importance value for some features!" 23 | assert importance_df["feature"].values[0] == "B", "Most important feature is different than B!" 24 | 25 | 26 | def test_multithreading(): 27 | df = generate_test_data(100000) 28 | features = ["A", "B", "C", "D"] 29 | dataset = Dataset(df=df, target="binary_target", features=features) 30 | 31 | lr = LogisticRegression(solver='liblinear') 32 | cv = KFold(n_splits=4, shuffle=True, random_state=0) 33 | 34 | lofo = LOFOImportance(dataset, model=lr, cv=cv, scoring='roc_auc', n_jobs=3) 35 | 36 | importance_df = lofo.get_importance() 37 | 38 | assert len(features) == importance_df.shape[0], "Missing importance value for some features!" 39 | assert importance_df["feature"].values[0] == "B", "Most important feature is different than B!" 40 | 41 | 42 | def test_default_model(): 43 | df = generate_unstructured_test_data(1000) 44 | features = ["A", "B", "C", "D", "E"] 45 | dataset = Dataset(df=df, target="target", features=features) 46 | 47 | lofo = LOFOImportance(dataset, cv=4, scoring='neg_mean_absolute_error') 48 | importance_df = lofo.get_importance() 49 | assert "E" in lofo.fit_params["categorical_feature"], "Categorical feature is not detected!" 50 | assert len(features) == importance_df.shape[0], "Missing importance value for some features!" 51 | 52 | df_checkpoint = df.copy() 53 | 54 | dataset = Dataset(df=df, target="binary_target", features=features) 55 | lofo = LOFOImportance(dataset, cv=4, scoring='roc_auc') 56 | importance_df = lofo.get_importance() 57 | 58 | assert "E" in lofo.fit_params["categorical_feature"], "Categorical feature is not detected!" 59 | assert df.equals(df_checkpoint), "LOFOImportance mutated the dataframe!" 60 | assert importance_df["feature"].values[0] == "E", "Most important feature is different than E!" 61 | 62 | 63 | def test_feature_groups(): 64 | df = generate_test_data(1000, text=True) 65 | features = ["A", "B", "C", "D"] 66 | 67 | cv = CountVectorizer(ngram_range=(3, 3), analyzer="char") 68 | feature_groups = dict() 69 | feature_groups["names"] = cv.fit_transform(df["T"]) 70 | feature_groups["interactions"] = df[["A", "B"]].values*df[["C", "D"]].values 71 | 72 | dataset = Dataset(df=df, target="binary_target", features=features, feature_groups=feature_groups) 73 | 74 | lgbm = LGBMClassifier(random_state=0, n_jobs=4) 75 | 76 | lofo = LOFOImportance(dataset, model=lgbm, cv=4, scoring='roc_auc') 77 | 78 | importance_df = lofo.get_importance() 79 | 80 | assert len(features) + len(feature_groups) == importance_df.shape[0], "Missing importance value for some features!" 81 | assert importance_df["feature"].values[0] == "names", "Most important feature is different than 'names'!" 82 | -------------------------------------------------------------------------------- /lofo/lofo_importance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.model_selection import cross_validate 3 | from tqdm.autonotebook import tqdm 4 | import warnings 5 | from lofo.infer_defaults import infer_model 6 | from lofo.utils import lofo_to_df, parallel_apply 7 | import sklearn 8 | 9 | 10 | class LOFOImportance: 11 | """ 12 | Leave One Feature Out Importance 13 | Given a model and cross-validation scheme, calculates the feature importances. 14 | 15 | Parameters 16 | ---------- 17 | dataset: LOFO Dataset object 18 | scoring: string or callable 19 | Same as scoring in sklearn API 20 | model: model (sklearn API), optional 21 | Not trained model object 22 | fit_params : dict, optional 23 | fit parameters for the model 24 | cv: int or iterable 25 | Same as cv in sklearn API 26 | n_jobs: int, optional 27 | Number of jobs for parallel computation 28 | cv_groups: array-like, with shape (n_samples,), optional 29 | Group labels for the samples used while splitting the dataset into train/test set. 30 | Only used in conjunction with a “Group” cv instance (e.g., GroupKFold). 31 | """ 32 | 33 | def __init__(self, dataset, scoring, model=None, fit_params=None, cv=4, n_jobs=None, cv_groups=None): 34 | 35 | self.fit_params = fit_params if fit_params else dict() 36 | if model is None: 37 | model, dataset.df, categoricals, dataset.y = infer_model(dataset.df, dataset.features, dataset.y, n_jobs) 38 | self.fit_params["categorical_feature"] = categoricals 39 | n_jobs = 1 40 | 41 | self.model = model 42 | self.dataset = dataset 43 | self.scoring = scoring 44 | self.cv = cv 45 | self.cv_groups = cv_groups 46 | self.n_jobs = n_jobs 47 | if self.n_jobs is not None and self.n_jobs > 1: 48 | warning_str = ("Warning: If your model is multithreaded, please initialise the number" 49 | "of jobs of LOFO to be equal to 1, otherwise you may experience performance issues.") 50 | warnings.warn(warning_str) 51 | 52 | sklearn_version = tuple(map(int, sklearn.__version__.split(".")[:2])) 53 | self._cv_param_name = "params" if sklearn_version >= (1, 4) else "fit_params" 54 | 55 | def _get_cv_score(self, feature_to_remove): 56 | X, fit_params = self.dataset.getX(feature_to_remove=feature_to_remove, fit_params=self.fit_params) 57 | y = self.dataset.y 58 | 59 | kwargs = {self._cv_param_name: fit_params, 60 | "cv": self.cv, "scoring": self.scoring, "groups": self.cv_groups} 61 | 62 | with warnings.catch_warnings(): 63 | warnings.simplefilter("ignore") 64 | cv_results = cross_validate(self.model, X, y, **kwargs) 65 | return cv_results['test_score'] 66 | 67 | def _get_cv_score_parallel(self, feature, result_queue): 68 | test_score = self._get_cv_score(feature_to_remove=feature) 69 | result_queue.put((feature, test_score)) 70 | return test_score 71 | 72 | def get_importance(self): 73 | """Run LOFO to get feature importances 74 | 75 | Returns 76 | ------- 77 | importance_df : pandas dataframe 78 | Dataframe with feature names and corresponding importance mean and std (sorted by importance) 79 | """ 80 | base_cv_score = self._get_cv_score(feature_to_remove=None) 81 | feature_list = self.dataset.feature_names + list(self.dataset.feature_groups.keys()) 82 | 83 | if self.n_jobs is not None and self.n_jobs > 1: 84 | lofo_cv_result = parallel_apply(self._get_cv_score_parallel, feature_list, self.n_jobs) 85 | lofo_cv_scores_normalized = np.array([base_cv_score - lofo_cv_score for _, lofo_cv_score in lofo_cv_result]) 86 | feature_list = [feature for feature, _ in lofo_cv_result] 87 | else: 88 | lofo_cv_scores = [] 89 | for f in tqdm(feature_list): 90 | lofo_cv_scores.append(self._get_cv_score(feature_to_remove=f)) 91 | lofo_cv_scores_normalized = np.array([base_cv_score - lofo_cv_score for lofo_cv_score in lofo_cv_scores]) 92 | 93 | return lofo_to_df(lofo_cv_scores_normalized, feature_list) 94 | -------------------------------------------------------------------------------- /lofo/flofo_importance.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from tqdm.autonotebook import tqdm 4 | import warnings 5 | from sklearn.metrics import check_scoring 6 | from lofo.utils import lofo_to_df, parallel_apply 7 | 8 | 9 | class FLOFOImportance: 10 | """ 11 | Fast LOFO Importance 12 | Applies already trained model on validation set by noising one feature each time. 13 | 14 | Parameters 15 | ---------- 16 | trained_model: model (sklearn API) 17 | The model should be trained already 18 | validation_df: pandas dataframe 19 | features: list of strings 20 | List of column names for features within validation_df 21 | target: string 22 | Column name for target within validation_df 23 | scoring: string or callable 24 | Same as scoring in sklearn API 25 | n_jobs: int, optional 26 | Number of jobs for parallel computation 27 | """ 28 | 29 | def __init__(self, trained_model, validation_df, features, target, 30 | scoring, n_jobs=None): 31 | self.trained_model = trained_model 32 | self.df = validation_df.copy() 33 | self.features = features 34 | self.target = target 35 | self.n_jobs = n_jobs 36 | self.scorer = check_scoring(estimator=self.trained_model, scoring=scoring) 37 | 38 | # FLOFO defaults 39 | self.num_bins = 10 40 | self.shuffle_func = np.random.permutation 41 | self.feature_group_len = 2 42 | self.num_sampling = 10 43 | 44 | min_data_needed = 10*(self.num_bins**self.feature_group_len) 45 | if self.df.shape[0] < min_data_needed: 46 | raise Exception("Small validation set (<{})".format(min_data_needed)) 47 | if len(self.features) <= self.feature_group_len: 48 | raise Exception("FLOFO needs more than {} features".format(self.feature_group_len)) 49 | 50 | if self.n_jobs is not None and self.n_jobs > 1: 51 | warning_str = ("Warning: If your model is multithreaded, please initialise the number" 52 | "of jobs of LOFO to be equal to 1, otherwise you may experience performance issues.") 53 | warnings.warn(warning_str) 54 | 55 | self._bin_features() 56 | 57 | def _bin_features(self): 58 | epsilon = 1e-10 59 | self.bin_df = pd.DataFrame() 60 | for feature in self.features: 61 | self.bin_df[feature] = self.df[feature].fillna(self.df[feature].median()) 62 | self.bin_df[feature] = (self.bin_df[feature].rank(pct=True)*(self.num_bins - epsilon)).astype(np.int32) 63 | 64 | def _get_score(self, updated_df): 65 | return self.scorer(self.trained_model, updated_df[self.features], self.df[self.target]) 66 | 67 | def _run(self, feature_name, n): 68 | scores = np.zeros(n) 69 | for i in range(n): 70 | feature_list = np.random.choice([feature for feature in self.features if feature != feature_name], 71 | size=self.feature_group_len, replace=False).tolist() 72 | self.bin_df["__f__"] = self.df[feature_name].values 73 | mutated_df = self.df.copy() 74 | mutated_df[feature_name] = self.bin_df.groupby(feature_list)["__f__"].transform(self.shuffle_func).values 75 | scores[i] = self._get_score(mutated_df) 76 | return scores 77 | 78 | def _run_parallel(self, feature_name, result_queue): 79 | test_score = self._run(feature_name, self.num_sampling) 80 | result_queue.put((feature_name, test_score)) 81 | return test_score 82 | 83 | def get_importance(self, num_sampling=10, random_state=0): 84 | """Run FLOFO to get feature importances 85 | 86 | Parameters 87 | ---------- 88 | num_sampling : int, optional 89 | Number of times features are shuffled 90 | random_state : int, optional 91 | Random seed 92 | 93 | Returns 94 | ------- 95 | importance_df : pandas dataframe 96 | Dataframe with feature names and corresponding importance mean and std (sorted by importance) 97 | """ 98 | np.random.seed(random_state) 99 | base_score = self._get_score(self.df) 100 | self.num_sampling = num_sampling 101 | 102 | if self.n_jobs is not None and self.n_jobs > 1: 103 | lofo_cv_scores = parallel_apply(self._run_parallel, self.features, self.n_jobs) 104 | lofo_cv_scores_normalized = np.array([base_score - lofo_cv_score for f, lofo_cv_score in lofo_cv_scores]) 105 | self.features = [score[0] for score in lofo_cv_scores] 106 | else: 107 | lofo_cv_scores = [] 108 | for f in tqdm(self.features): 109 | lofo_cv_scores.append(self._run(f, num_sampling)) 110 | lofo_cv_scores_normalized = np.array([base_score - lofo_cv_score for lofo_cv_score in lofo_cv_scores]) 111 | 112 | return lofo_to_df(lofo_cv_scores_normalized, self.features) 113 | -------------------------------------------------------------------------------- /lofo/dataset.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import scipy.sparse as ss 4 | from scipy.stats import spearmanr 5 | from lofo.utils import flatten_list 6 | import networkx as nx 7 | 8 | 9 | class Dataset: 10 | """ 11 | Dataset for LOFO 12 | Parameters 13 | ---------- 14 | df: pandas dataframe 15 | target: string 16 | Column name for target within df 17 | features: list of strings 18 | List of column names within df 19 | feature_groups: dict, optional 20 | Name, value dictionary of feature groups as numpy.darray or scipy.csr.scr_matrix 21 | auto_group_threshold: float, optional 22 | Threshold for grouping correlated features together, must be between 0 and 1 23 | """ 24 | 25 | def __init__(self, df, target, features, feature_groups=None, auto_group_threshold=1.0): 26 | self.df = df.copy() 27 | self.features = list(features) 28 | self.feature_groups = feature_groups if feature_groups else dict() 29 | 30 | self.num_rows = df.shape[0] 31 | self.target_name = target 32 | self.y = df[self.target_name].values 33 | 34 | grouped_features, auto_groups = self.auto_group_features(auto_group_threshold) 35 | self.features = [[f] for f in list(set(self.features) - set(grouped_features))] + auto_groups 36 | self.feature_names = [" & ".join(feature_list) for feature_list in self.features] 37 | 38 | if len(auto_groups) > 0: 39 | print("Automatically grouped features by correlation:") 40 | for i in range(len(auto_groups)): 41 | print(i + 1, auto_groups[i]) 42 | 43 | for feature_name, feature_matrix in self.feature_groups.items(): 44 | if not (isinstance(feature_matrix, np.ndarray) or isinstance(feature_matrix, ss.csr.csr_matrix)): 45 | raise Exception("Data type {dtype} is not a valid type!".format(dtype=type(feature_matrix))) 46 | 47 | if feature_matrix.shape[0] != self.num_rows: 48 | raise Exception("Expected {expected} rows but got {n} rows!".format(expected=self.num_rows, 49 | n=feature_matrix.shape[0])) 50 | 51 | if feature_name in self.feature_names: 52 | same_name_exception = "Feature group name '{name}' is the same with one of the features!" 53 | raise Exception(same_name_exception.format(name=feature_name)) 54 | 55 | def auto_group_features(self, auto_group_threshold): 56 | if auto_group_threshold == 1.0: 57 | return [], [] 58 | elif auto_group_threshold == 0.0: 59 | grouped_features = list(self.features) 60 | auto_groups = [set(self.features)] 61 | return grouped_features, auto_groups 62 | elif 0 < auto_group_threshold < 1: 63 | feature_matrix = self.df[self.features].values 64 | 65 | for i, feature in enumerate(self.features): 66 | if self.df[feature].dtype.name == "category": 67 | feature_series = self.df.groupby(feature)[self.target_name].transform("mean") 68 | else: 69 | feature_series = self.df[feature] 70 | feature_matrix[:, i] = feature_series.fillna(feature_series.mean()).fillna(0).values 71 | 72 | corr_matrix, _ = spearmanr(np.nan_to_num(feature_matrix)) 73 | corr_matrix = np.abs(corr_matrix) 74 | 75 | G = nx.Graph() 76 | 77 | for i in range(len(self.features)): 78 | for j in range(i + 1, len(self.features)): 79 | if corr_matrix[i, j] > auto_group_threshold: 80 | G.add_edge(i, j) 81 | 82 | subgraphs = [G.subgraph(c) for c in nx.connected_components(G)] 83 | 84 | groups = [] 85 | for sg in subgraphs: 86 | groups.append([self.features[node] for node in sg.nodes()]) 87 | 88 | auto_groups = [sorted(g) for g in groups] 89 | grouped_features = list(itertools.chain(*[list(g) for g in groups])) 90 | return grouped_features, auto_groups 91 | else: 92 | raise Exception("auto_group_threshold must be between 0 and 1 (inclusive)!") 93 | 94 | def getX(self, feature_to_remove, fit_params): 95 | """Get feature matrix and fit_params after removing a feature 96 | Parameters 97 | ---------- 98 | feature_to_remove : string 99 | feature name to remove 100 | fit_params : dict 101 | fit parameters for the model 102 | Returns 103 | ------- 104 | X : numpy.darray or scipy.csr.scr_matrix 105 | Feature matrix 106 | fit_params: dict 107 | Updated fit_params after feature removal 108 | """ 109 | feature_lists = [self.features[i] for i, feature_name in enumerate(self.feature_names) 110 | if feature_name != feature_to_remove] 111 | feature_list = flatten_list(feature_lists) 112 | concat_list = [self.df[feature_list].values] 113 | 114 | for feature_name, feature_matrix in self.feature_groups.items(): 115 | if feature_name != feature_to_remove: 116 | concat_list.append(feature_matrix) 117 | 118 | fit_params = fit_params.copy() 119 | if "categorical_feature" in fit_params: 120 | cat_features = [f for f in fit_params["categorical_feature"] if f != feature_to_remove] 121 | fit_params["categorical_feature"] = [ix for ix, f in enumerate(feature_list) if (f in cat_features)] 122 | 123 | has_sparse = False 124 | for feature_name, feature_matrix in self.feature_groups.items(): 125 | if feature_name != feature_to_remove and isinstance(feature_matrix, ss.csr.csr_matrix): 126 | has_sparse = True 127 | 128 | concat = np.hstack 129 | if has_sparse: 130 | concat = ss.hstack 131 | 132 | return concat(concat_list), fit_params 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![alt text](docs/lofo_logo.png?raw=true "Title") 2 | 3 | LOFO (Leave One Feature Out) Importance calculates the importances of a set of features based on a metric of choice, for a model of choice, by iteratively removing each feature from the set, and evaluating the performance of the model, with a validation scheme of choice, based on the chosen metric. 4 | 5 | LOFO first evaluates the performance of the model with all the input features included, then iteratively removes one feature at a time, retrains the model, and evaluates its performance on a validation set. The mean and standard deviation (across the folds) of the importance of each feature is then reported. 6 | 7 | If a model is not passed as an argument to LOFO Importance, it will run LightGBM as a default model. 8 | 9 | ## Install 10 | 11 | LOFO Importance can be installed using 12 | 13 | ``` 14 | pip install lofo-importance 15 | ``` 16 | 17 | ## Advantages of LOFO Importance 18 | 19 | LOFO has several advantages compared to other importance types: 20 | 21 | * It does not favor granular features 22 | * It generalises well to unseen test sets 23 | * It is model agnostic 24 | * It gives negative importance to features that hurt performance upon inclusion 25 | * It can group the features. Especially useful for high dimensional features like TFIDF or OHE features. 26 | * It can automatically group highly correlated features to avoid underestimating their importance. 27 | 28 | ## Example on Kaggle's Microsoft Malware Prediction Competition 29 | 30 | In this Kaggle competition, Microsoft provides a malware dataset to predict whether or not a machine will soon be hit with malware. One of the features, Centos_OSVersion is very predictive on the training set, since some OS versions are probably more prone to bugs and failures than others. However, upon splitting the data out of time, we obtain validation sets with OS versions that have not occurred in the training set. Therefore, the model will not have learned the relationship between the target and this seasonal feature. By evaluating this feature's importance using other importance types, Centos_OSVersion seems to have high importance, because its importance was evaluated using only the training set. However, LOFO Importance depends on a validation scheme, so it will not only give this feature low importance, but even negative importance. 31 | 32 | ```python 33 | import pandas as pd 34 | from sklearn.model_selection import KFold 35 | from lofo import LOFOImportance, Dataset, plot_importance 36 | %matplotlib inline 37 | 38 | # import data 39 | train_df = pd.read_csv("../input/train.csv", dtype=dtypes) 40 | 41 | # extract a sample of the data 42 | sample_df = train_df.sample(frac=0.01, random_state=0) 43 | sample_df.sort_values("AvSigVersion", inplace=True) # Sort by time for time split validation 44 | 45 | # define the validation scheme 46 | cv = KFold(n_splits=4, shuffle=False, random_state=None) # Don't shuffle to keep the time split split validation 47 | 48 | # define the binary target and the features 49 | dataset = Dataset(df=sample_df, target="HasDetections", features=[col for col in train_df.columns if col != "HasDetections"]) 50 | 51 | # define the validation scheme and scorer. The default model is LightGBM 52 | lofo_imp = LOFOImportance(dataset, cv=cv, scoring="roc_auc") 53 | 54 | # get the mean and standard deviation of the importances in pandas format 55 | importance_df = lofo_imp.get_importance() 56 | 57 | # plot the means and standard deviations of the importances 58 | plot_importance(importance_df, figsize=(12, 20)) 59 | ``` 60 | 61 | ![alt text](docs/plot_importance.png?raw=true "Title") 62 | 63 | ## Another Example: Kaggle's TReNDS Competition 64 | 65 | In this Kaggle competition, pariticipants are asked to predict some cognitive properties of patients. 66 | Independent component features (IC) from sMRI and very high dimensional correlation features (FNC) from 3D fMRIs are provided. 67 | LOFO can group the fMRI correlation features into one. 68 | 69 | ```python 70 | def get_lofo_importance(target): 71 | cv = KFold(n_splits=7, shuffle=True, random_state=17) 72 | 73 | dataset = Dataset(df=df[df[target].notnull()], target=target, features=loading_features, 74 | feature_groups={"fnc": df[df[target].notnull()][fnc_features].values 75 | }) 76 | 77 | model = Ridge(alpha=0.01) 78 | lofo_imp = LOFOImportance(dataset, cv=cv, scoring="neg_mean_absolute_error", model=model) 79 | 80 | return lofo_imp.get_importance() 81 | 82 | plot_importance(get_lofo_importance(target="domain1_var1"), figsize=(8, 8), kind="box") 83 | ``` 84 | 85 | ![alt text](docs/plot_importance_box.png?raw=true "Title") 86 | 87 | ## Flofo Importance 88 | 89 | If running the LOFO Importance package is too time-costly for you, you can use Fast LOFO. Fast LOFO, or FLOFO takes, as inputs, an already trained model and a validation set, and does a pseudo-random permutation on the values of each feature, one by one, then uses the trained model to make predictions on the validation set. The mean of the FLOFO importance is then the difference in the performance of the model on the validation set over several randomised permutations. 90 | The difference between FLOFO importance and permutation importance is that the permutations on a feature's values are done within groups, where groups are obtained by grouping the validation set by k=2 features. These k features are chosen at random n=10 times, and the mean and standard deviation of the FLOFO importance are calculated based on these n runs. 91 | The reason this grouping makes the measure of importance better is that permuting a feature's value is no longer completely random. In fact, the permutations are done within groups of similar samples, so the permutations are equivalent to noising the samples. This ensures that: 92 | 93 | * The permuted feature values are very unlikely to be replaced by unrealistic values. 94 | * A feature that is predictable by features among the chosen n*k features will be replaced by very similar values during permutation. Therefore, it will only slightly affect the model performance (and will yield a small FLOFO importance). This solves the correlated feature overestimation problem. 95 | -------------------------------------------------------------------------------- /LOFOImportance Example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "ExecuteTime": { 8 | "end_time": "2019-01-14T12:48:04.450500Z", 9 | "start_time": "2019-01-14T12:48:03.586150Z" 10 | } 11 | }, 12 | "outputs": [ 13 | { 14 | "name": "stderr", 15 | "output_type": "stream", 16 | "text": [ 17 | "/home/aerdem/projects/lofo-importance/lofo/lofo_importance.py:3: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n", 18 | " from tqdm.autonotebook import tqdm\n" 19 | ] 20 | } 21 | ], 22 | "source": [ 23 | "import numpy as np\n", 24 | "import pandas as pd\n", 25 | "from sklearn.linear_model import LinearRegression\n", 26 | "from sklearn.ensemble import RandomForestClassifier\n", 27 | "from sklearn.model_selection import KFold\n", 28 | "from lofo import LOFOImportance, FLOFOImportance, Dataset, plot_importance" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "metadata": { 35 | "ExecuteTime": { 36 | "end_time": "2019-01-14T12:48:04.513904Z", 37 | "start_time": "2019-01-14T12:48:04.453322Z" 38 | } 39 | }, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/html": [ 44 | "
\n", 45 | "\n", 58 | "\n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | "
ABCDD2targetbinary_target
00.5488140.5928800.8115180.4139620.4432271.4863051
10.7151890.0100640.4760840.6296180.6862700.5299490
20.6027630.4758260.5231560.7785840.7923261.4346741
30.5448830.7087700.2505210.8515580.8865291.9520461
40.4236550.0439750.6050430.8164130.8217340.4802670
\n", 124 | "
" 125 | ], 126 | "text/plain": [ 127 | " A B C D D2 target binary_target\n", 128 | "0 0.548814 0.592880 0.811518 0.413962 0.443227 1.486305 1\n", 129 | "1 0.715189 0.010064 0.476084 0.629618 0.686270 0.529949 0\n", 130 | "2 0.602763 0.475826 0.523156 0.778584 0.792326 1.434674 1\n", 131 | "3 0.544883 0.708770 0.250521 0.851558 0.886529 1.952046 1\n", 132 | "4 0.423655 0.043975 0.605043 0.816413 0.821734 0.480267 0" 133 | ] 134 | }, 135 | "execution_count": 2, 136 | "metadata": {}, 137 | "output_type": "execute_result" 138 | } 139 | ], 140 | "source": [ 141 | "from data.test_data import generate_test_data, generate_unstructured_test_data\n", 142 | "\n", 143 | "df = generate_test_data(1000)\n", 144 | "df.head()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 3, 150 | "metadata": { 151 | "ExecuteTime": { 152 | "end_time": "2019-01-14T12:48:04.689772Z", 153 | "start_time": "2019-01-14T12:48:04.527994Z" 154 | } 155 | }, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "application/vnd.jupyter.widget-view+json": { 160 | "model_id": "581710f7ae654f5387a06a2befa408e4", 161 | "version_major": 2, 162 | "version_minor": 0 163 | }, 164 | "text/plain": [ 165 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))" 166 | ] 167 | }, 168 | "metadata": {}, 169 | "output_type": "display_data" 170 | }, 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "\n" 176 | ] 177 | }, 178 | { 179 | "data": { 180 | "text/html": [ 181 | "
\n", 182 | "\n", 195 | "\n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | "
featureimportance_meanimportance_stdval_imp_0val_imp_1val_imp_2val_imp_3val_imp_4val_imp_5val_imp_6val_imp_7val_imp_8val_imp_9
1B0.5402170.0160080.5321180.5442250.5132590.5243070.5256120.5425360.5505885.682745e-010.5598690.541383
3D0.0891870.0026290.0888320.0862910.0876120.0853800.0860040.0903780.0915829.345964e-020.0908000.091527
0A0.0881670.0029350.0907390.0861580.0852590.0932990.0882810.0884020.0831729.189529e-020.0870860.087376
2C0.0000020.0000330.000088-0.000020-0.000012-0.000027-0.0000160.0000040.000031-8.312825e-07-0.000002-0.000021
\n", 281 | "
" 282 | ], 283 | "text/plain": [ 284 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n", 285 | "1 B 0.540217 0.016008 0.532118 0.544225 0.513259 \n", 286 | "3 D 0.089187 0.002629 0.088832 0.086291 0.087612 \n", 287 | "0 A 0.088167 0.002935 0.090739 0.086158 0.085259 \n", 288 | "2 C 0.000002 0.000033 0.000088 -0.000020 -0.000012 \n", 289 | "\n", 290 | " val_imp_3 val_imp_4 val_imp_5 val_imp_6 val_imp_7 val_imp_8 \\\n", 291 | "1 0.524307 0.525612 0.542536 0.550588 5.682745e-01 0.559869 \n", 292 | "3 0.085380 0.086004 0.090378 0.091582 9.345964e-02 0.090800 \n", 293 | "0 0.093299 0.088281 0.088402 0.083172 9.189529e-02 0.087086 \n", 294 | "2 -0.000027 -0.000016 0.000004 0.000031 -8.312825e-07 -0.000002 \n", 295 | "\n", 296 | " val_imp_9 \n", 297 | "1 0.541383 \n", 298 | "3 0.091527 \n", 299 | "0 0.087376 \n", 300 | "2 -0.000021 " 301 | ] 302 | }, 303 | "execution_count": 3, 304 | "metadata": {}, 305 | "output_type": "execute_result" 306 | } 307 | ], 308 | "source": [ 309 | "lr = LinearRegression()\n", 310 | "lr.fit(df[[\"A\", \"B\", \"C\", \"D\"]], df[\"target\"])\n", 311 | "\n", 312 | "fi = FLOFOImportance(lr, df, [\"A\", \"B\", \"C\", \"D\"], 'target', scoring=\"neg_mean_absolute_error\")\n", 313 | "\n", 314 | "importances = fi.get_importance()\n", 315 | "importances" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 4, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "application/vnd.jupyter.widget-view+json": { 326 | "model_id": "83d88dddf5bd4965937f0ba96949fcd2", 327 | "version_major": 2, 328 | "version_minor": 0 329 | }, 330 | "text/plain": [ 331 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))" 332 | ] 333 | }, 334 | "metadata": {}, 335 | "output_type": "display_data" 336 | }, 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | "\n" 342 | ] 343 | }, 344 | { 345 | "data": { 346 | "text/html": [ 347 | "
\n", 348 | "\n", 361 | "\n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | "
featureimportance_meanimportance_stdval_imp_0val_imp_1val_imp_2val_imp_3
1B0.4472060.0242440.4327680.4185590.4547140.482782
3A0.0532470.0066990.0490210.0444870.0602690.059213
2D0.0525600.0030080.0519120.0576380.0506460.050044
0C-0.0000570.0001160.000119-0.000165-0.000023-0.000159
\n", 417 | "
" 418 | ], 419 | "text/plain": [ 420 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n", 421 | "1 B 0.447206 0.024244 0.432768 0.418559 0.454714 \n", 422 | "3 A 0.053247 0.006699 0.049021 0.044487 0.060269 \n", 423 | "2 D 0.052560 0.003008 0.051912 0.057638 0.050646 \n", 424 | "0 C -0.000057 0.000116 0.000119 -0.000165 -0.000023 \n", 425 | "\n", 426 | " val_imp_3 \n", 427 | "1 0.482782 \n", 428 | "3 0.059213 \n", 429 | "2 0.050044 \n", 430 | "0 -0.000159 " 431 | ] 432 | }, 433 | "execution_count": 4, 434 | "metadata": {}, 435 | "output_type": "execute_result" 436 | } 437 | ], 438 | "source": [ 439 | "from sklearn.metrics import make_scorer, mean_absolute_error\n", 440 | "\n", 441 | "scorer = make_scorer(mean_absolute_error, greater_is_better=False)\n", 442 | "cv = KFold(n_splits=4, shuffle=True, random_state=0)\n", 443 | "\n", 444 | "dataset = Dataset(df=df, target=\"target\", features=[\"A\", \"B\", \"C\", \"D\"])\n", 445 | "fi = LOFOImportance(dataset, scoring=scorer, model=LinearRegression(), cv=cv)\n", 446 | "\n", 447 | "importances = fi.get_importance()\n", 448 | "importances" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 5, 454 | "metadata": { 455 | "ExecuteTime": { 456 | "end_time": "2019-01-14T12:48:05.103111Z", 457 | "start_time": "2019-01-14T12:48:04.692682Z" 458 | } 459 | }, 460 | "outputs": [ 461 | { 462 | "data": { 463 | "application/vnd.jupyter.widget-view+json": { 464 | "model_id": "b3feb8d91fc54a699c316d64a3729bb6", 465 | "version_major": 2, 466 | "version_minor": 0 467 | }, 468 | "text/plain": [ 469 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))" 470 | ] 471 | }, 472 | "metadata": {}, 473 | "output_type": "display_data" 474 | }, 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "\n" 480 | ] 481 | }, 482 | { 483 | "data": { 484 | "text/html": [ 485 | "
\n", 486 | "\n", 499 | "\n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | "
featureimportance_meanimportance_stdval_imp_0val_imp_1val_imp_2val_imp_3
1B0.4140.0253770.4320.4160.4360.372
2D0.0400.0074830.0320.0520.0400.036
3A0.0380.0128060.0440.0240.0560.028
0C0.0160.0074830.0160.0280.0080.012
\n", 555 | "
" 556 | ], 557 | "text/plain": [ 558 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n", 559 | "1 B 0.414 0.025377 0.432 0.416 0.436 \n", 560 | "2 D 0.040 0.007483 0.032 0.052 0.040 \n", 561 | "3 A 0.038 0.012806 0.044 0.024 0.056 \n", 562 | "0 C 0.016 0.007483 0.016 0.028 0.008 \n", 563 | "\n", 564 | " val_imp_3 \n", 565 | "1 0.372 \n", 566 | "2 0.036 \n", 567 | "3 0.028 \n", 568 | "0 0.012 " 569 | ] 570 | }, 571 | "execution_count": 5, 572 | "metadata": {}, 573 | "output_type": "execute_result" 574 | } 575 | ], 576 | "source": [ 577 | "rf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=0)\n", 578 | "\n", 579 | "dataset = Dataset(df=df, target=\"binary_target\", features=[\"A\", \"B\", \"C\", \"D\"])\n", 580 | "fi = LOFOImportance(dataset, scoring='accuracy', model=rf, cv=cv)\n", 581 | "\n", 582 | "importances = fi.get_importance()\n", 583 | "importances" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 6, 589 | "metadata": {}, 590 | "outputs": [ 591 | { 592 | "data": { 593 | "application/vnd.jupyter.widget-view+json": { 594 | "model_id": "8c0f86bcbf3147ed8944d3da1e96aa6c", 595 | "version_major": 2, 596 | "version_minor": 0 597 | }, 598 | "text/plain": [ 599 | "HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))" 600 | ] 601 | }, 602 | "metadata": {}, 603 | "output_type": "display_data" 604 | }, 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "\n" 610 | ] 611 | }, 612 | { 613 | "data": { 614 | "text/html": [ 615 | "
\n", 616 | "\n", 629 | "\n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | "
featureimportance_meanimportance_stdval_imp_0val_imp_1val_imp_2val_imp_3
1B0.2105790.0068480.2184650.1996960.2109720.213183
3A0.0111460.0067220.0023820.0076260.0144470.020127
2D0.0088920.0053670.0002550.0085820.0128590.013872
0C-0.0001710.004295-0.001984-0.0057860.0011560.005931
\n", 685 | "
" 686 | ], 687 | "text/plain": [ 688 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n", 689 | "1 B 0.210579 0.006848 0.218465 0.199696 0.210972 \n", 690 | "3 A 0.011146 0.006722 0.002382 0.007626 0.014447 \n", 691 | "2 D 0.008892 0.005367 0.000255 0.008582 0.012859 \n", 692 | "0 C -0.000171 0.004295 -0.001984 -0.005786 0.001156 \n", 693 | "\n", 694 | " val_imp_3 \n", 695 | "1 0.213183 \n", 696 | "3 0.020127 \n", 697 | "2 0.013872 \n", 698 | "0 0.005931 " 699 | ] 700 | }, 701 | "execution_count": 6, 702 | "metadata": {}, 703 | "output_type": "execute_result" 704 | } 705 | ], 706 | "source": [ 707 | "df = generate_unstructured_test_data(10000)\n", 708 | "\n", 709 | "dataset = Dataset(df=df, target=\"binary_target\", features=[\"A\", \"B\", \"C\", \"D\"])\n", 710 | "fi = LOFOImportance(dataset, 'roc_auc')\n", 711 | "\n", 712 | "importances = fi.get_importance()\n", 713 | "importances" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 7, 719 | "metadata": {}, 720 | "outputs": [ 721 | { 722 | "data": { 723 | "application/vnd.jupyter.widget-view+json": { 724 | "model_id": "0bebf9dcd42d41cb862a45b1678d6807", 725 | "version_major": 2, 726 | "version_minor": 0 727 | }, 728 | "text/plain": [ 729 | "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))" 730 | ] 731 | }, 732 | "metadata": {}, 733 | "output_type": "display_data" 734 | }, 735 | { 736 | "name": "stdout", 737 | "output_type": "stream", 738 | "text": [ 739 | "\n" 740 | ] 741 | }, 742 | { 743 | "data": { 744 | "text/html": [ 745 | "
\n", 746 | "\n", 759 | "\n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | "
featureimportance_meanimportance_stdval_imp_0val_imp_1val_imp_2val_imp_3
0E0.5350000.0045610.5327260.5293860.5417210.536167
2B0.3004750.0030540.2995460.3012270.2963280.304798
4D0.0472030.0011250.0485360.0479440.0467060.045625
3A0.0386830.0013770.0381910.0408310.0370310.038679
1C-0.0006930.000305-0.000725-0.000689-0.000249-0.001109
\n", 825 | "
" 826 | ], 827 | "text/plain": [ 828 | " feature importance_mean importance_std val_imp_0 val_imp_1 val_imp_2 \\\n", 829 | "0 E 0.535000 0.004561 0.532726 0.529386 0.541721 \n", 830 | "2 B 0.300475 0.003054 0.299546 0.301227 0.296328 \n", 831 | "4 D 0.047203 0.001125 0.048536 0.047944 0.046706 \n", 832 | "3 A 0.038683 0.001377 0.038191 0.040831 0.037031 \n", 833 | "1 C -0.000693 0.000305 -0.000725 -0.000689 -0.000249 \n", 834 | "\n", 835 | " val_imp_3 \n", 836 | "0 0.536167 \n", 837 | "2 0.304798 \n", 838 | "4 0.045625 \n", 839 | "3 0.038679 \n", 840 | "1 -0.001109 " 841 | ] 842 | }, 843 | "execution_count": 7, 844 | "metadata": {}, 845 | "output_type": "execute_result" 846 | } 847 | ], 848 | "source": [ 849 | "dataset = Dataset(df=df, target=\"target\", features=[\"A\", \"B\", \"C\", \"D\", \"E\"])\n", 850 | "fi = LOFOImportance(dataset, scorer, n_jobs=-1)\n", 851 | "\n", 852 | "importances = fi.get_importance()\n", 853 | "importances" 854 | ] 855 | }, 856 | { 857 | "cell_type": "code", 858 | "execution_count": 8, 859 | "metadata": {}, 860 | "outputs": [ 861 | { 862 | "data": { 863 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAekAAAHSCAYAAADIczP5AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAYI0lEQVR4nO3df7DddZ3f8dcnCRCWX62AFoGY4CDIEsKPm1SYsgZYRUdFC4zKug4w66BLUTvtSHHakaVOu+2uXaxoBWe74DK4OtphxtrW+mPJLGgVLjboCBiEDZhqazarIbIJmx/v/pFrDCE/Tticcz733sdj5g7n1z3f9/3cy33me873nNuqKgBAf+aMewAAYPdEGgA6JdIA0CmRBoBOiTQAdEqkAaBT88Y9wK6OOeaYWrhw4bjHAICRePDBB/+qqo7d3XXdRXrhwoWZnJwc9xgAMBKttSf3dJ2HuwGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0Cn5o17gFFoN7VxjwDANFY31li2a08aADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6NW/YG2itbU3yvZ0u+mxV/bthbxcApruhRzrJxqo6cwTbAYAZZRSRBoDp5fbnnl1+z/Idp1esWDGyMUbxnPShrbWVO328bdcbtNauaa1NttYm165dO4KRAKB/raqGu4HWflFVhw96+4mJiZqcnDywM9zUDuj9ATC71I3Da2Vr7cGqmtjddY7uBoBOiTQAdGoUB44d2lpbudP5L1fVDSPYLgBMa0OPdFXNHfY2AGAm8nA3AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE7NG/cAo1A31rhHAID9Zk8aADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOzRv3AKPQbmrjHgFmvbqxxj0CTDv2pAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFNDjXRrbWtrbWVr7aHW2ndaa+cNc3sAMJPMG/L9b6yqM5OktXZxkt9P8uohbxMAZoRhR3pnRyb52Qi3B4zb7b86ufye5UmSFStWjGUUmI6GHelDW2srk8xPclySC3d3o9baNUmuSZIFCxYMeSQAmB5aVQ3vzlv7RVUdPnX63CR/nOT02stGJyYmanJy8sDOcVM7oPcH7L+6cXi/a2A6a609WFUTu7tuZEd3V9X/SnJMkmNHtU0AmM5GFunW2qlJ5iZZN6ptAsB0NqrnpJOkJbmyqrYOeZsAMCMMNdJVNXeY9w8AM5l3HAOATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQqXnjHmAU6sYa9wgAsN/sSQNAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0Kl54x5gFNpNbdwj7FA31rhHAGCasCcNAJ0SaQDolEgDQKdEGgA6JdIA0Kl9Rrpt99uttQ9NnV/QWls2/NEAYHYbZE/6PyU5N8kVU+c3JPnE0CYCAJIM9jrpf1hVZ7fW/neSVNXPWmsHD3kuAJj1BtmT3txam5ukkqS1dmySbUOdCgAYKNIfS3J3khe31v5NkvuS/NuhTgUA7P3h7tbanCR/meT6JBclaUneUlWPjGA2AJjV9hrpqtrWWvtEVZ2V5NERzQQAZLCHu7/eWrustdbPX6kAgFlgkEi/O8nnkzzbWnu6tbahtfb0kOcCgFlvny/BqqojRjEIAPBc+4x0a+03dnd5Vf3FgR8HAPilQd7M5AM7nZ6fZFmSB5NcOJSJAIAkgz3c/aadz7fWTkzy0X19Xmtta5LvJTkoyZYkf5rk5qryRigAMIBB9qR3tSbJKwe43caqOjNJWmsvTvKZJEcmufEFbBMAZp1BnpO+JVNvCZrtR4OfmeQ7+7ORqvppa+2aJA+01n6vqmqfnzRDLV++PCtWrBj3GABMA4PsSU/udHpLkj+rqm/s74aq6omp9wB/cZL/t/N1UwG/JkkWLFiwv3cNADPSIJH+e1X1H3e+oLX2/l0v+7uoqk8l+VSSTExMzOi9bHvRAAxqkDczuXI3l121vxtqrZ2UZGuSn+7v5wLAbLTHPenW2hVJfivJotbaF3e66ogkf70/G5n685a3Jvn4bH4+GgD2x94e7v5mkp8kOSbJf9jp8g1JvjvAfR/aWluZX70E684kf/QC5wSAWWePka6qJ5M8meTcF3LHVTX3hQ4FAAzwnHRr7VWttQdaa79orf1ta22rP7ABAMM3yIFjH09yRZLHkhya5F1JPjHMoQCAwSKdqvphkrlVtbWqbk/yuuGOBQAM8jrpv2mtHZxkZWvtD7L9YLKB4g4AvHCDxPadU7e7LskzSU5MctkwhwIABvsrWE+21g5NclxV3TSCmQCADHZ095uSrEzy5anzZ+7y5iYAwBAM8nD37yVZluTnSVJVK5MsGuJMAEAGi/Tmqlq/y2Xe2hMAhmyQo7u/31r7rSRzW2snJ3lftr9lKAAwRHvck26t3Tl18vEkv57k2SR/luTpJP90+KMBwOy2tz3pc1prL03ytiQX5Ll/ZOPXkmwa5mAAMNvtLdK3Jvl6kpOSTO50ecv256RPGuJcADDr7fHh7qr6WFW9MsmfVNVJO30sqiqBBoAh2+fR3VX1u6MYBAB4Lu/BDQCdGuQlWNNe3ehl3QBMP/akAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDo1LxxDzAK7aY28m3WjTXybQIws9iTBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATg090q21t7TWqrV26rC3BQAzySj2pK9Ict/UfwGAAQ010q21w5P8oyS/k+Ttw9xWV25Pli9fPu4pAJjmhr0n/eYkX66qVUnWtdbO2d2NWmvXtNYmW2uTa9euHfJIADA9DDvSVyT57NTpz2YPD3lX1aeqaqKqJo499tghjzQCVycrVqwY9xQATHPzhnXHrbUXJbkwyeLWWiWZm6Raax+oqhrWdgFgphjmnvTlSe6sqpdV1cKqOjHJXyY5f4jbBIAZY5iRviLJ3btc9l/iKG8AGMjQHu6uqgt2c9nHhrU9AJhpvOMYAHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE7NG/cAo1A31rhHAID9Zk8aADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOzfxItzbuCQDgBZn5kQaAaUqkAaBTIg0AnRJpAOiUSANAp0QaADol0gDQKZEGgE6JNAB0SqQBoFMiDQCdEmkA6JRIA0CnRBoAOiXSANApkQaATok0AHRq6JFurf2D1tpnW2uPt9YebK3999baK4a9XQCY7uYN885bay3J3Uk+XVVvn7psSZKXJFk1zG0DwHQ37D3pC5Jsrqpbf3lBVT1UVfcOebs7LB/VhgDgABt2pE9P8uC+btRau6a1Ntlam1y7du2QRwKA6aGLA8eq6lNVNVFVE8cee+wBve8VB/TeAGB0hh3p7yc5Z8jbAIAZadiR/vMkh7TWrvnlBa21M1pr5w95uwAw7Q010lVVSf5xkt+cegnW95P8fpL/O8ztAsBMMNSXYCVJVf04yVuHvR0AmGm6OHAMAHg+kQaATok0AHRKpAGgUyINAJ0SaQDolEgDQKdEGgA6JdIA0CmRBoBOiTQAdEqkAaBTIg0AnRJpAOjU0P9UJQDPt3nz5qxZsyabNm0a9yiMyPz583PCCSfkoIMOGvhzRBpgDNasWZMjjjgiCxcuTGtt3OMwZFWVdevWZc2aNVm0aNHAnzfzH+6uGvcEAM+zadOmHH300QI9S7TWcvTRR+/3IyczP9IAnRLo2eWFfL9FGgA6JdIAPWjtwH4M4LzzzhvyF/Vcq1evzmc+85mRbnO6E2mAWeqb3/zmyLa1ZcsWkX4BRBpgljr88MOTJCtWrMirX/3qvPnNb85JJ52UG264IXfddVeWLVuWxYsX5/HHH0+SXHXVVXnPe96TiYmJvOIVr8iXvvSlJNsPgrv66quzePHinHXWWbnnnnuSJHfccUcuueSSXHjhhbnoootyww035N57782ZZ56Zm2++OatXr87555+fs88+O2efffaOfzSsWLEiy5cvz+WXX55TTz0173jHO1JTBwE/8MADOe+887JkyZIsW7YsGzZsyNatW/OBD3wgS5cuzRlnnJHbbrttj1/zoF/r2rVrc9lll2Xp0qVZunRpvvGNbyRJ7r///px77rk566yzct555+UHP/jBjq/10ksvzete97qcfPLJuf766w/MN6mquvo455xzCmCme/jhh597wfbXohy4jwEcdthhVVV1zz331FFHHVU//vGPa9OmTfXSl760PvShD1VV1Uc/+tF6//vfX1VVV155ZV188cW1devWWrVqVR1//PG1cePG+shHPlJXX311VVU98sgjdeKJJ9bGjRvr9ttvr+OPP77WrVu3YztveMMbdmz/mWeeqY0bN1ZV1apVq+qXv//vueeeOvLII+tHP/pRbd26tV71qlfVvffeW88++2wtWrSo7r///qqqWr9+fW3evLluu+22+vCHP1xVVZs2bapzzjmnnnjiid1+zYN+rVdccUXde++9VVX15JNP1qmnnvqcbVZVffWrX61LL720qqpuv/32WrRoUf385z+vjRs31oIFC+qpp5563vaf932vqiSTtYcmep00AFm6dGmOO+64JMnLX/7yvPa1r02SLF68eMeecZK89a1vzZw5c3LyySfnpJNOyqOPPpr77rsv733ve5Mkp556al72spdl1apVSZLXvOY1edGLXrTbbW7evDnXXXddVq5cmblz5+74nCRZtmxZTjjhhCTJmWeemdWrV+eoo47Kcccdl6VLlyZJjjzyyCTJV77ylXz3u9/NF77whSTJ+vXr89hjj+3x9ciDfK1f+9rX8vDDD+/4nKeffjq/+MUvsn79+lx55ZV57LHH0lrL5s2bd9zmoosuylFHHZUkOe200/Lkk0/mxBNP3Nuy75NIA5BDDjlkx+k5c+bsOD9nzpxs2bJlx3W7voxoXy8rOuyww/Z43c0335yXvOQleeihh7Jt27bMnz9/t/PMnTv3OTPsqqpyyy235OKLL97rLLu77z19rdu2bcu3vvWt58yUJNddd10uuOCC3H333Vm9enWWL1/+gmYelOekARjY5z//+Wzbti2PP/54nnjiiZxyyik5//zzc9dddyVJVq1alaeeeiqnnHLK8z73iCOOyIYNG3acX79+fY477rjMmTMnd955Z7Zu3brXbZ9yyin5yU9+kgceeCBJsmHDhmzZsiUXX3xxPvnJT+7Yq121alWeeeaZv9PX+drXvja33HLLjvMrV67cMfPxxx+fZPvz0MMm0gA9ONDPSg/JggULsmzZsrz+9a/Prbfemvnz5+faa6/Ntm3bsnjx4rztbW/LHXfc8Zy9yl8644wzMnfu3CxZsiQ333xzrr322nz605/OkiVL8uijj+51rztJDj744Hzuc5/Le9/73ixZsiSvec1rsmnTprzrXe/KaaedlrPPPjunn3563v3ud/+d92I/9rGPZXJyMmeccUZOO+203HrrrUmS66+/Ph/84Adz1llnHZA95X1p1dnbZk5MTNTk5OS4xwAYqkceeSSvfOUrxz3Gfrnqqqvyxje+MZdffvm4R5m2dvd9b609WFUTu7u9PWkA6JQDxwAYyCiegz1Qvve97+Wd73zncy475JBD8u1vf3tME70wIg3AjLN48eIdB3tNZx7uBhiT3o4JYrheyPdbpAHGYP78+Vm3bp1QzxJVlXXr1j3vddf74uFugDE44YQTsmbNmqxdu3bcozAi8+fP3/EuaoMSaYAxOOigg/b4tpXwSx7uBoBOiTQAdEqkAaBT3b0taGttbZInD/DdHpPkrw7wfc5U1mpw1mpw1mpw1mpwM2WtXlZVx+7uiu4iPQyttck9vS8qz2WtBmetBmetBmetBjcb1srD3QDQKZEGgE7Nlkh/atwDTCPWanDWanDWanDWanAzfq1mxXPSADAdzZY9aQCYdmZUpFtrr2ut/aC19sPW2g27uf6Q1trnpq7/dmtt4ein7MMAa/UbrbXvtNa2tNYuH8eMvRhgrf5Za+3h1tp3W2tfb629bBxz9mCAtXpPa+17rbWVrbX7WmunjWPOHuxrrXa63WWttWqtzeijmPdmgJ+rq1pra6d+rla21t41jjmHoqpmxEeSuUkeT3JSkoOTPJTktF1uc22SW6dOvz3J58Y9d8drtTDJGUn+NMnl456587W6IMmvTZ3+XT9Xe12rI3c6fUmSL4977l7Xaup2RyT5iyTfSjIx7rl7XaskVyX5+LhnHcbHTNqTXpbkh1X1RFX9bZLPJnnzLrd5c5JPT53+QpKLWmtthDP2Yp9rVVWrq+q7SbaNY8CODLJW91TV30yd/VaS/fszNzPHIGv19E5nD0syWw+KGeT3VZJ8OMm/T7JplMN1ZtC1mpFmUqSPT/Kjnc6vmbpst7epqi1J1ic5eiTT9WWQtWK7/V2r30nyP4Y6Ub8GWqvW2j9prT2e5A+SvG9Es/Vmn2vVWjs7yYlV9d9GOViHBv1/8LKpp5y+0Fo7cTSjDd9MijSMVWvtt5NMJPnDcc/Ss6r6RFW9PMm/SPKvxj1Pj1prc5L8UZJ/Pu5Zpon/mmRhVZ2R5Kv51SOm095MivT/SbLzv55OmLpst7dprc1LclSSdSOZri+DrBXbDbRWrbXfTPIvk1xSVc+OaLbe7O/P1WeTvGWoE/VrX2t1RJLTk6xora1O8qokX5ylB4/t8+eqqtbt9P/dHyc5Z0SzDd1MivQDSU5urS1qrR2c7QeGfXGX23wxyZVTpy9P8uc1ddTBLDPIWrHdPteqtXZWktuyPdA/HcOMvRhkrU7e6ewbkjw2wvl6ste1qqr1VXVMVS2sqoXZfqzDJVU1OZ5xx2qQn6vjdjp7SZJHRjjfUM0b9wAHSlVtaa1dl+R/ZvvRgH9SVd9vrf3rJJNV9cUk/znJna21Hyb562z/Zs86g6xVa21pkruT/P0kb2qt3VRVvz7GscdiwJ+rP0xyeJLPTx2H+FRVXTK2ocdkwLW6bupRh81JfpZf/aN5VhlwrcjAa/W+1tolSbZk++/2q8Y28AHmHccAoFMz6eFuAJhRRBoAOiXSANApkQaATok0AHRKpAGgUyINAJ0SaQDo1P8HOXhARHlxcfMAAAAASUVORK5CYII=\n", 864 | "text/plain": [ 865 | "
" 866 | ] 867 | }, 868 | "metadata": { 869 | "needs_background": "light" 870 | }, 871 | "output_type": "display_data" 872 | } 873 | ], 874 | "source": [ 875 | "%matplotlib inline\n", 876 | "plot_importance(importances)" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": 9, 882 | "metadata": {}, 883 | "outputs": [ 884 | { 885 | "name": "stdout", 886 | "output_type": "stream", 887 | "text": [ 888 | "Automatically grouped features by correlation:\n", 889 | "1 ['D', 'D2']\n" 890 | ] 891 | }, 892 | { 893 | "data": { 894 | "application/vnd.jupyter.widget-view+json": { 895 | "model_id": "1b33b416ef064bb7acd791d8345ae7fd", 896 | "version_major": 2, 897 | "version_minor": 0 898 | }, 899 | "text/plain": [ 900 | "HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))" 901 | ] 902 | }, 903 | "metadata": {}, 904 | "output_type": "display_data" 905 | }, 906 | { 907 | "name": "stdout", 908 | "output_type": "stream", 909 | "text": [ 910 | "\n" 911 | ] 912 | }, 913 | { 914 | "data": { 915 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAfcAAAHSCAYAAADxFIKiAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAS20lEQVR4nO3cf4zkd33f8dcbn5NcDVhtTK5Rod4EUZGUwxC2PxWaJUU0kVOTKCgCpZWpQKe2IfyRqOKkVkW0f2BKE6khbekpaXCIKqOmQXLjhEKTjKKI0PYMNifqlmKw27RqIQ5yOefa2O6nf+w6Xm/2fN899vb7nfc+HpJ1M7Pf/c773prz82Z2bmqMEQCgj+fMPQAAcLjEHQCaEXcAaEbcAaAZcQeAZsQdAJo5MfcAh+Wmm24aGxsbh3rOxx57LDfccMOhnrMje5rOrqaxp2nsabquu7r33nt/e4zxgr23t4n7xsZGzp8/f6jnXK1W2draOtRzdmRP09nVNPY0jT1N13VXVfXwfrd7WR4AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZk7MPQAArKtb3vXRPHrp8STJ877lbL7ywB2//7UbT16f+9/5ulnmEncAuEqPXno8D91xa5Lk9J1nf/9ykmycvWeusbwsDwDdiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7AFylh9/zPZOPraprOMkziTsANCPuANCMuANAM+IOAM2IOwA0I+4A0MyJuQe4nKp6MsmFXTfdNca4Y655AGBdLDbuSS6NMV4x9xAAsG68LA8AzSz5mfvJqrpv1/V3jzE+tPuAqjqT5EySnDp1KqvV6lAHuHjx4qGfsyN7ms6uprGnaexpumu5q93n3XsfG2fvueyx11KNMY7kjg6qqi6OMZ479fjNzc1x/vz5Q51htVpla2vrUM/ZkT1NZ1fT2NM09jTdtdpVVeWpjp6+83Qu3P70W8U2zt6Th+64dd9jD/H+7x1jbO693cvyANCMuANAM+v0M/ePjDHOzjYNAKyJxcZ9jHHd3DMAwDrysjwANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A8BVuvkdvzj52KP8uHdxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZk7MPQAArLONs/ckSZ73LU9fTpIbT14/10jiDgBX66E7bt117dbLHnfUvCwPAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADRzYu4BAPb6oV95LM95+HS+8sAdufHk9bn/na+beyRYK565A4vz2OPbvz50x6159NLj8w4Da0jcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHFqWqJt0GXJ64A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNLDLuVfVkVd1XVfdX1Ser6s/PPRMArIsTcw9wGZfGGK9Ikqr6S0neneQ75h0JANbDIp+57/H8JF+eewgAWBdLfeZ+sqruS/J1Sb4xyXfud1BVnUlyJklOnTqV1Wp1qENcvHjx0M/ZkT1NZ1cHs3tX9vYHeTxNd9x2tdS4735Z/s8l+dmqetkYY+w+aIxxLsm5JNnc3BxbW1uHOsRqtcphn7Mje5rOrg5ma2sr+cg9T1/mGTyepjtuu1r8y/JjjN9MclOSF8w9CwCsg8XHvapemuS6JI/MPQsArIOlviz/1M/ck6SS3D7GeHLOgQBgXSwy7mOM6+aeAQDW1eJflgcADkbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxBxZlz6dMX/Y24PLEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3YLE2zt6TG09eP/cYsHZOzD0AwF4f+K4bsrV1Ye4xYG155g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPua+L0nadzy7s+OvcYAKwBcV8jj156fO4RAFgD4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+K+BqoqSfLwe75n5kkAWAfiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0MwV415VT1bVfVX1maq6v6p+tKr2/b6qeklV/fuq+nRV/dtnOedGVV2qqk9V1QM73/PmXV//wZ1zXKiqj1fVLVf1uwOAY+jEhGMujTFekSRV9Q1J/kWS5yd55z7Hnk3yT8cYP1NV33SF8z44xnjlznm/OckvVFWNMX4myReSfMcY48tV9d1JziX5M9N+SwBwvB3oZfkxxheTnEnytnrqY9Oe6feSvHDn2C8c4LyfT/IjSd6+c/3jY4wv73z5E0+dEwC4sinP3J9hjPH5qrouyTck+V97vvxgkndU1afGGL94wFN/MslL97n9LUl+eb9vqKoz2f7LRk6dOpXVanXAu3x2Fy9ePPRzfrWWNk+yzD0tlV1NY0/T2NN0x21XB4775VTVtyV5XZJXJvlYVf1Okt/MdvBfPMYYVzrFPud8Tbbj/u37fcMY41y2X7LP5ubm2Nrauur597NarXLY5/xqLW2eZJl7Wiq7msaeprGn6Y7brg4c952fjz+Z5It7vvTaJB8fY/xWVX1fkruTvD/JL00Ie7L9l4IHdt3Py5P8VJLvHmM8ctA5AeC4OtDP3KvqBdkO9k/uE+xPJXl9Vd04xvhPSd6b5MeS/NyE824k+YdJ3rdz/Y8n+YUkf3WM8dmDzAgAx92UZ+4nq+q+JNcneSLJB5P8+N6Dxhgfq6qfS/KJqvrdbL/j/a8l+UBVvXqM8aU93/LiqvpUkq9L8pUkPzHG+MDO1/5ukq9P8k923rf3xBhj88C/OwA4hq4Y9zHGdVNPNsb4sWw/W9/tA/sc91CSk89ynrcmeevU+wUAnuYT6gCgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsR9DTz1eUE3v+OgH9cPwHEk7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4r5EbT14/9wgArIETcw/ANBduvzD3CACsCc/cAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxH1hTt95Ore866NzjwHAGhP3BXr00uNzjwDAGhN3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoR9wWpqn0vA8BBiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Azi417VX1vVY2qeuncswDAOlls3JO8Kclv7PwKAEy0yLhX1XOTfHuStyR548zjAMBaOTH3AJfx+iQfGWN8tqoeqapXjTHu3XtQVZ1JciZJTp06ldVqdahDXLx48dDPeRBz3vdBzL2ndWJX09jTNPY03XHb1VLj/qYk/2jn8l071/9A3McY55KcS5LNzc2xtbV1qEOsVqsc9jkPYs77Poi597RO7Goae5rGnqY7brtaXNyr6o8k+c4kp6tqJLkuyaiqvzXGGPNOBwDLt8Sfub8hyQfHGDePMTbGGC9K8oUkr555LgBYC0uM+5uSfHjPbf8q3jUPAJMs7mX5McZr9rntJ+aYBQDW0RKfuQMAXwVxB4BmxB0AmhF3AGhG3AGgGXEHgGbEfUF2fwCfD+MD4GqJOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuC3TjyevnHgGANXZi7gF4pgu3X5h7BADWnGfuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4v4sfvjhH557BAA4MHEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxP0yqmruEQDgqog7ADQj7gDQjLgDQDPiDgDNiDsANCPuANCMuANAM4uNe1X90aq6q6oerKp7q+qXqupPzD0XACzdibkH2E9tf4LMh5PcOcZ4485ttyQ5leSzc84GAEu3yLgneU2Sx8cY73/qhjHG/TPOAwBrY6lxf1mSe690UFWdSXImSU6dOpXVanXog1yLc3Zz8eJFe5rIrqaxp2nsabrjtqulxn2SMca5JOeSZHNzc2xtbR36fVyLc3azWq3saSK7msaeprGn6Y7brpb6hrrPJHnV3EMAwDpaatx/NcnX7rzsniSpqpdX1atnnAkA1sIi4z7GGEm+L8lrd/4p3GeSvDvJ/5x3MgBYvsX+zH2M8T+S/MDccwDAulnkM3cA4OqJOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7pex/U/tAWD9iDsANCPuANCMuANAM+IOAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLg/i/fd/L65RwCAAxN3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJoRdwBoRtwBoBlxB4BmxB0AmhF3AGhG3AGgGXEHgGbEHQCaEXcAaEbcAaAZcQeAZsQdAJqpMcbcMxyKqvpSkocP+bQ3JfntQz5nR/Y0nV1NY0/T2NN0XXd18xjjBXtvbBP3a6Gqzo8xNueeY+nsaTq7msaeprGn6Y7brrwsDwDNiDsANCPuz+7c3AOsCXuazq6msadp7Gm6Y7UrP3MHgGY8cweAZsQ9SVV9V1X956r6XFWd3efrX1tVH9r5+r+rqo2jn3J+E/b0F6rqk1X1RFW9YY4Zl2DCnn6kqv5jVX26qn6lqm6eY84lmLCrv15VF6rqvqr6jar61jnmnNuV9rTruO+vqlFVx+Zd4btNeDy9uaq+tPN4uq+q3jrHnEdijHGs/0tyXZIHk3xzkq9Jcn+Sb91zzN9M8v6dy29M8qG5517onjaSvDzJzyZ5w9wzL3hPr0nyh3Yu/43j+Hg6wK6ev+vybUk+MvfcS9zTznHPS/LrST6RZHPuuZe4pyRvTvKTc896FP955p786SSfG2N8fozxe0nuSvL6Pce8PsmdO5d/PslfrKo6whmX4Ip7GmM8NMb4dJL/N8eACzFlT782xvjdnaufSPLCI55xKabs6n/vunpDkuP4JqEp/49Kkr+f5D1J/s9RDrcgU/d0LIh78seS/Ldd139r57Z9jxljPJHk0SRffyTTLceUPXHwPb0lyS9f04mWa9KuquqHqurBJP8gyduPaLYlueKequrbkrxojHHPUQ62MFP/7H3/zo/Efr6qXnQ0ox09cYeZVNVfSbKZ5L1zz7JkY4x/PMZ4cZJ3JPk7c8+zNFX1nCQ/nuRH555lDfzrJBtjjJcn+ViefkW2HXFP/nuS3X97e+HObfseU1UnktyY5JEjmW45puyJiXuqqtcm+dtJbhtj/N8jmm1pDvqYuivJ917TiZbpSnt6XpKXJVlV1UNJ/mySu4/hm+qu+HgaYzyy68/bTyV51RHNduTEPfkPSV5SVd9UVV+T7TfM3b3nmLuT3L5z+Q1JfnXsvDvjGJmyJybsqapemeSfZTvsX5xhxqWYsquX7Lp6a5L/coTzLcWz7mmM8egY46YxxsYYYyPb7+O4bYxxfp5xZzPl8fSNu67eluSBI5zvSJ2Ye4C5jTGeqKq3Jfk32X635T8fY3ymqv5ekvNjjLuT/HSSD1bV55L8TrYfNMfKlD1V1Z9K8uEkfzjJX66qd40x/uSMYx+5iY+n9yZ5bpJ/ufO+zP86xrhttqFnMnFXb9t5lePxJF/O03/JPjYm7unYm7int1fVbUmeyPb/y98828DXmE+oA4BmvCwPAM2IOwA0I+4A0Iy4A0Az4g4AzYg7ADQj7gDQjLgDQDP/HwaXyDutlQJMAAAAAElFTkSuQmCC\n", 916 | "text/plain": [ 917 | "
" 918 | ] 919 | }, 920 | "metadata": { 921 | "needs_background": "light" 922 | }, 923 | "output_type": "display_data" 924 | } 925 | ], 926 | "source": [ 927 | "dataset = Dataset(df=df, target=\"target\", features=[\"A\", \"B\", \"C\", \"D\", \"D2\", \"E\"], \n", 928 | " auto_group_threshold=0.7)\n", 929 | "fi = LOFOImportance(dataset, scorer, n_jobs=-1)\n", 930 | "\n", 931 | "importances = fi.get_importance()\n", 932 | "importances\n", 933 | "\n", 934 | "\n", 935 | "plot_importance(importances, kind=\"box\")" 936 | ] 937 | }, 938 | { 939 | "cell_type": "code", 940 | "execution_count": 10, 941 | "metadata": {}, 942 | "outputs": [ 943 | { 944 | "name": "stderr", 945 | "output_type": "stream", 946 | "text": [ 947 | "/home/aerdem/projects/lofo-importance/lofo/lofo_importance.py:45: UserWarning: Warning: If your model is multithreaded, please initialise the numberof jobs of LOFO to be equal to 1, otherwise you may experience performance issues.\n", 948 | " warnings.warn(warning_str)\n" 949 | ] 950 | }, 951 | { 952 | "data": { 953 | "text/html": [ 954 | "
\n", 955 | "\n", 968 | "\n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | "
featureimportance_meanimportance_stdval_imp_0val_imp_1val_imp_2val_imp_3
5names0.2326980.0250972.480101e-010.1936880.2290180.260076
3B0.0026410.0014134.608295e-030.0032120.0008380.001906
2A0.0004840.0005351.256808e-03-0.0001400.0001400.000681
1D-0.0000670.000205-1.110223e-160.0001400.000000-0.000408
4interactions-0.0000710.0001550.000000e+00-0.000140-0.0002790.000136
0C-0.0001020.000203-1.396453e-040.0001400.000000-0.000408
\n", 1044 | "
" 1045 | ], 1046 | "text/plain": [ 1047 | " feature importance_mean importance_std val_imp_0 val_imp_1 \\\n", 1048 | "5 names 0.232698 0.025097 2.480101e-01 0.193688 \n", 1049 | "3 B 0.002641 0.001413 4.608295e-03 0.003212 \n", 1050 | "2 A 0.000484 0.000535 1.256808e-03 -0.000140 \n", 1051 | "1 D -0.000067 0.000205 -1.110223e-16 0.000140 \n", 1052 | "4 interactions -0.000071 0.000155 0.000000e+00 -0.000140 \n", 1053 | "0 C -0.000102 0.000203 -1.396453e-04 0.000140 \n", 1054 | "\n", 1055 | " val_imp_2 val_imp_3 \n", 1056 | "5 0.229018 0.260076 \n", 1057 | "3 0.000838 0.001906 \n", 1058 | "2 0.000140 0.000681 \n", 1059 | "1 0.000000 -0.000408 \n", 1060 | "4 -0.000279 0.000136 \n", 1061 | "0 0.000000 -0.000408 " 1062 | ] 1063 | }, 1064 | "execution_count": 10, 1065 | "metadata": {}, 1066 | "output_type": "execute_result" 1067 | } 1068 | ], 1069 | "source": [ 1070 | "from sklearn.feature_extraction.text import CountVectorizer\n", 1071 | "from lightgbm import LGBMClassifier\n", 1072 | "\n", 1073 | "df = generate_test_data(1000, text=True)\n", 1074 | "features = [\"A\", \"B\", \"C\", \"D\"]\n", 1075 | "\n", 1076 | "cv = CountVectorizer(ngram_range=(3, 3), analyzer=\"char\")\n", 1077 | "feature_groups = dict()\n", 1078 | "feature_groups[\"names\"] = cv.fit_transform(df[\"T\"])\n", 1079 | "feature_groups[\"interactions\"] = df[[\"A\", \"B\"]].values*df[[\"C\", \"D\"]].values\n", 1080 | "\n", 1081 | "dataset = Dataset(df=df, target=\"binary_target\", features=features, feature_groups=feature_groups)\n", 1082 | "\n", 1083 | "lgbm = LGBMClassifier(random_state=0, n_jobs=1)\n", 1084 | "\n", 1085 | "lofo = LOFOImportance(dataset, model=lgbm, cv=4, scoring='roc_auc', n_jobs=4)\n", 1086 | "\n", 1087 | "importances = lofo.get_importance()\n", 1088 | "importances" 1089 | ] 1090 | }, 1091 | { 1092 | "cell_type": "code", 1093 | "execution_count": null, 1094 | "metadata": {}, 1095 | "outputs": [], 1096 | "source": [] 1097 | }, 1098 | { 1099 | "cell_type": "code", 1100 | "execution_count": null, 1101 | "metadata": {}, 1102 | "outputs": [], 1103 | "source": [] 1104 | } 1105 | ], 1106 | "metadata": { 1107 | "kernelspec": { 1108 | "display_name": "Python 3", 1109 | "language": "python", 1110 | "name": "python3" 1111 | }, 1112 | "language_info": { 1113 | "codemirror_mode": { 1114 | "name": "ipython", 1115 | "version": 3 1116 | }, 1117 | "file_extension": ".py", 1118 | "mimetype": "text/x-python", 1119 | "name": "python", 1120 | "nbconvert_exporter": "python", 1121 | "pygments_lexer": "ipython3", 1122 | "version": "3.6.9" 1123 | }, 1124 | "toc": { 1125 | "colors": { 1126 | "hover_highlight": "#DAA520", 1127 | "navigate_num": "#000000", 1128 | "navigate_text": "#333333", 1129 | "running_highlight": "#FF0000", 1130 | "selected_highlight": "#FFD700", 1131 | "sidebar_border": "#EEEEEE", 1132 | "wrapper_background": "#FFFFFF" 1133 | }, 1134 | "moveMenuLeft": true, 1135 | "nav_menu": { 1136 | "height": "12px", 1137 | "width": "252px" 1138 | }, 1139 | "navigate_menu": true, 1140 | "number_sections": true, 1141 | "sideBar": true, 1142 | "threshold": 4, 1143 | "toc_cell": false, 1144 | "toc_section_display": "block", 1145 | "toc_window_display": false, 1146 | "widenNotebook": false 1147 | } 1148 | }, 1149 | "nbformat": 4, 1150 | "nbformat_minor": 2 1151 | } 1152 | --------------------------------------------------------------------------------