├── lce ├── tests │ ├── __init__.py │ └── test_lce.py ├── _version.py ├── __init__.py ├── _catboost.py ├── _lightgbm.py ├── _xgboost.py └── _lce.py ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ ├── doc_improvement.yml │ ├── feature_request.yml │ └── bug_report.yml ├── CODE_OF_CONDUCT.md ├── PULL_REQUEST_TEMPLATE.md ├── workflows │ └── lce.yml └── CONTRIBUTING.md ├── examples ├── README.rst ├── lceclassifier_iris.py ├── lceregressor_diabetes.py ├── lceclassifier_iris_cv.py └── lceregressor_missing_diabetes.py ├── doc ├── _images │ ├── Figure_LCE.png │ ├── Figure_BaggingvsBoosting.png │ └── logo_lce.svg ├── api.rst ├── _templates │ ├── function.rst │ ├── numpydoc_docstring.py │ └── class.rst ├── _static │ ├── css │ │ └── project-template.css │ └── js │ │ └── copybutton.js ├── reference.rst ├── index.rst ├── contribute.rst ├── make.bat ├── Makefile ├── conf.py └── tutorial.rst ├── requirements.txt ├── setup.cfg ├── .readthedocs.yml ├── .codecov.yml ├── .gitignore ├── .circleci └── config.yml ├── setup.py ├── logo └── logo_lce.svg ├── README.rst └── LICENSE /lce/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /lce/_version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.3.4" 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false -------------------------------------------------------------------------------- /examples/README.rst: -------------------------------------------------------------------------------- 1 | .. 
_general_examples: 2 | 3 | General examples 4 | ================ -------------------------------------------------------------------------------- /doc/_images/Figure_LCE.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LocalCascadeEnsemble/LCE/HEAD/doc/_images/Figure_LCE.png -------------------------------------------------------------------------------- /doc/_images/Figure_BaggingvsBoosting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LocalCascadeEnsemble/LCE/HEAD/doc/_images/Figure_BaggingvsBoosting.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | catboost==1.1.1 2 | hyperopt==0.2.7 3 | lightgbm==3.3.5 4 | numpy==1.23.3 5 | pandas==1.5.0 6 | scikit-learn==1.1.2 7 | xgboost==1.6.2 -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description_file = README.rst 3 | 4 | [aliases] 5 | test = pytest 6 | 7 | [tool:pytest] 8 | addopts = --doctest-modules 9 | -------------------------------------------------------------------------------- /.github/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | Code of Conduct 2 | =============== 3 | 4 | This repository follows the principles of the Python Software Foundation: https://www.python.org/psf/codeofconduct/ 5 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | formats: 2 | - none 3 | requirements_file: requirements.txt 4 | python: 5 | version: 3.8 6 | pip_install: true 7 | extra_requirements: 8 | - tests 9 | - docs -------------------------------------------------------------------------------- /lce/__init__.py: -------------------------------------------------------------------------------- 1 | from ._lce import LCEClassifier 2 | from ._lce import LCERegressor 3 | from ._version import __version__ 4 | 5 | 6 | __all__ = ["LCEClassifier", "LCERegressor", "__version__"] 7 | -------------------------------------------------------------------------------- /.codecov.yml: -------------------------------------------------------------------------------- 1 | comment: false 2 | 3 | coverage: 4 | status: 5 | project: 6 | default: 7 | target: auto 8 | threshold: 5% 9 | patch: 10 | default: 11 | target: auto 12 | threshold: 5% -------------------------------------------------------------------------------- /doc/api.rst: -------------------------------------------------------------------------------- 1 | ############# 2 | API 3 | ############# 4 | 5 | .. _APIDocumentation: 6 | 7 | This is the API documentation of LCE. 8 | 9 | .. currentmodule:: lce 10 | 11 | .. autosummary:: 12 | :toctree: generated/ 13 | :template: class.rst 14 | 15 | LCEClassifier 16 | LCERegressor -------------------------------------------------------------------------------- /doc/_templates/function.rst: -------------------------------------------------------------------------------- 1 | :mod:`{{module}}`.{{objname}} 2 | {{ underline }}==================== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autofunction:: {{ objname }} 7 | 8 | .. 
include:: {{module}}.{{objname}}.examples 9 | 10 | .. raw:: html 11 | 12 |
13 | -------------------------------------------------------------------------------- /doc/_templates/numpydoc_docstring.py: -------------------------------------------------------------------------------- 1 | {{index}} 2 | {{summary}} 3 | {{extended_summary}} 4 | {{parameters}} 5 | {{returns}} 6 | {{yields}} 7 | {{other_parameters}} 8 | {{attributes}} 9 | {{raises}} 10 | {{warns}} 11 | {{warnings}} 12 | {{see_also}} 13 | {{notes}} 14 | {{references}} 15 | {{examples}} 16 | {{methods}} 17 | -------------------------------------------------------------------------------- /doc/_static/css/project-template.css: -------------------------------------------------------------------------------- 1 | @import url("theme.css"); 2 | 3 | .highlight a { 4 | text-decoration: underline; 5 | } 6 | 7 | .deprecated p { 8 | padding: 10px 7px 10px 10px; 9 | color: #b94a48; 10 | background-color: #F3E5E5; 11 | border: 1px solid #eed3d7; 12 | } 13 | 14 | .deprecated p span.versionmodified { 15 | font-weight: bold; 16 | } 17 | -------------------------------------------------------------------------------- /doc/_templates/class.rst: -------------------------------------------------------------------------------- 1 | :mod:`{{module}}`.{{objname}} 2 | {{ underline }}============== 3 | 4 | .. currentmodule:: {{ module }} 5 | 6 | .. autoclass:: {{ objname }} 7 | :members: 8 | 9 | {% block methods %} 10 | .. automethod:: __init__ 11 | {% endblock %} 12 | 13 | .. include:: {{module}}.{{objname}}.examples 14 | 15 | .. raw:: html 16 | 17 |
18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/doc_improvement.yml: -------------------------------------------------------------------------------- 1 | name: Documentation Improvement 2 | description: Create a report to help improve the documentation. Alternatively you can just open a pull request with the suggested change. 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: Describe the issue 8 | validations: 9 | required: true 10 | - type: textarea 11 | attributes: 12 | label: Explain the improvement -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yml: -------------------------------------------------------------------------------- 1 | name: Feature Request 2 | description: Suggest an enhancement to LCE. 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: Describe the workflow you want to enable 8 | validations: 9 | required: true 10 | - type: textarea 11 | attributes: 12 | label: Describe your proposed solution 13 | validations: 14 | required: true 15 | - type: textarea 16 | attributes: 17 | label: Describe alternatives you've considered, if relevant 18 | - type: textarea 19 | attributes: 20 | label: Additional information 21 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | # Description 2 | 3 | Please include a summary of the change and which issue is fixed. List any dependencies that are required for this change. 4 | 5 | Fixes # (issue) 6 | 7 | # How Has This Been Tested? 8 | 9 | Please describe the tests that you ran to verify your changes. Please also note any relevant details of your test configuration. 
10 | 11 | 12 | # Checklist: 13 | 14 | - [ ] I have performed a self-review of my own code 15 | - [ ] I have commented my code 16 | - [ ] My changes generate no warning 17 | - [ ] My code follows the style guidelines of this project 18 | - [ ] I have made the corresponding changes to the documentation -------------------------------------------------------------------------------- /.github/workflows/lce.yml: -------------------------------------------------------------------------------- 1 | name: lce workflow 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | strategy: 8 | matrix: 9 | os: [ubuntu-latest, macos-latest, windows-latest] 10 | runs-on: ${{ matrix.os }} 11 | name: test python lce 12 | defaults: 13 | run: 14 | working-directory: ./lce 15 | steps: 16 | - uses: actions/checkout@v2 17 | - uses: actions/setup-python@v2 18 | with: 19 | python-version: 3.8 20 | - name: install dependencies 21 | run: | 22 | pip install -r ../requirements.txt 23 | pip install pytest pytest-cov 24 | - name: run tests and collect coverage 25 | run: pytest --cov=./ --cov-report=xml 26 | - name: upload coverage reports to Codecov with GitHub Action 27 | uses: codecov/codecov-action@v3 28 | -------------------------------------------------------------------------------- /examples/lceclassifier_iris.py: -------------------------------------------------------------------------------- 1 | """ 2 | ============================= 3 | LCEClassifier on Iris dataset 4 | ============================= 5 | 6 | An example of :class:`lce.LCEClassifier` 7 | """ 8 | 9 | from lce import LCEClassifier 10 | from sklearn.datasets import load_iris 11 | from sklearn.metrics import accuracy_score 12 | from sklearn.model_selection import train_test_split 13 | 14 | 15 | # Load data and generate a train/test split 16 | data = load_iris() 17 | X_train, X_test, y_train, y_test = train_test_split( 18 | data.data, data.target, random_state=0 19 | ) 20 | 21 | # Train LCEClassifier with default parameters 22 | clf = LCEClassifier(n_jobs=-1, random_state=0) 23 | clf.fit(X_train, y_train) 24 | 25 | # Make prediction and compute accuracy score 26 | y_pred = clf.predict(X_test) 27 | accuracy = accuracy_score(y_test, y_pred) 28 | print("Accuracy: {:.1f}%".format(accuracy * 100)) 29 | -------------------------------------------------------------------------------- /examples/lceregressor_diabetes.py: -------------------------------------------------------------------------------- 1 | """ 2 | ================================ 3 | LCERegressor on Diabetes dataset 4 | ================================ 5 | 6 | An example of :class:`lce.LCERegressor` 7 | """ 8 | 9 | from lce import LCERegressor 10 | from sklearn.datasets import load_diabetes 11 | from sklearn.metrics import mean_squared_error 12 | from sklearn.model_selection import train_test_split 13 | 14 | 15 | # Load data and generate a train/test split 16 | data = load_diabetes() 17 | X_train, X_test, y_train, y_test = train_test_split( 18 | data.data, data.target, random_state=0 19 | ) 20 | 21 | # Train LCERegressor with default parameters 22 | reg = LCERegressor(n_jobs=-1, random_state=0) 23 | reg.fit(X_train, y_train) 24 | 25 | # Make prediction 26 | y_pred = reg.predict(X_test) 27 | mse = mean_squared_error(y_test, y_pred) 28 | print("The mean squared error (MSE) on test set: {:.0f}".format(mse)) 29 | -------------------------------------------------------------------------------- /examples/lceclassifier_iris_cv.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | ====================================================================== 3 | LCEClassifier on Iris dataset with scikit-learn cross validation score 4 | ====================================================================== 5 | 6 | An example of :class:`lce.LCEClassifier` 7 | """ 8 | 9 | from lce import LCEClassifier 10 | from sklearn.datasets import load_iris 11 | from sklearn.model_selection import cross_val_score, train_test_split 12 | 13 | # Load data 14 | data = load_iris() 15 | X_train, X_test, y_train, y_test = train_test_split( 16 | data.data, data.target, random_state=0 17 | ) 18 | 19 | # Set LCEClassifier with default parameters 20 | clf = LCEClassifier(n_jobs=-1, random_state=0) 21 | 22 | # Compute cross-validation scores 23 | cv_scores = cross_val_score(clf, X_train, y_train, cv=3) 24 | cv_scores = [round(elem * 100, 1) for elem in cv_scores.tolist()] 25 | print("Cross-validation scores on train set: ", cv_scores) 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # scikit-learn specific 10 | doc/_build/ 11 | doc/auto_examples/ 12 | doc/modules/generated/ 13 | doc/datasets/generated/ 14 | 15 | # Distribution / packaging 16 | 17 | .Python 18 | env/ 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *,cover 53 | .hypothesis/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | 62 | # Sphinx documentation 63 | doc/_build/ 64 | doc/generated/ 65 | 66 | # PyBuilder 67 | target/ 68 | -------------------------------------------------------------------------------- /doc/reference.rst: -------------------------------------------------------------------------------- 1 | ############# 2 | Citation 3 | ############# 4 | 5 | Here are the reference papers. If you use LCE, we would appreciate citations:: 6 | 7 | @article{Fauvel-LCE, 8 | author = {Fauvel, K. and E. Fromont and V. Masson and P. Faverdin and A. Termier}, 9 | title = {{LCE: An Augmented Combination of Bagging and Boosting in Python}}, 10 | journal = {arXiv}, 11 | year = {2023} 12 | } 13 | 14 | 15 | @article{Fauvel-LCEDAMI, 16 | author = {Fauvel, K. and E. Fromont and V. Masson and P. Faverdin and A. Termier}, 17 | title = {{XEM: An Explainable-by-Design Ensemble Method for Multivariate Time Series Classification}}, 18 | journal = {Data Mining and Knowledge Discovery}, 19 | year = {2022}, 20 | volume = {36}, 21 | number = {3}, 22 | pages = {917-957} 23 | } 24 | 25 | 26 | @inproceedings{Fauvel-LCEKDD, 27 | author = {Fauvel, K. and V. Masson and E. Fromont and P. Faverdin and A. 
Termier}, 28 | title = {{Towards Sustainable Dairy Management - A Machine Learning Enhanced Method for Estrus Detection}}, 29 | booktitle = {Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining}, 30 | year = {2019} 31 | } 32 | 33 | -------------------------------------------------------------------------------- /examples/lceregressor_missing_diabetes.py: -------------------------------------------------------------------------------- 1 | """ 2 | ==================================================== 3 | LCERegressor on Diabetes dataset with missing values 4 | ==================================================== 5 | 6 | An example of :class:`lce.LCERegressor` 7 | """ 8 | 9 | import numpy as np 10 | from lce import LCERegressor 11 | from sklearn.datasets import load_diabetes 12 | from sklearn.metrics import mean_squared_error 13 | from sklearn.model_selection import train_test_split 14 | 15 | 16 | # Load data and generate a train/test split 17 | data = load_diabetes() 18 | X_train, X_test, y_train, y_test = train_test_split( 19 | data.data, data.target, random_state=0 20 | ) 21 | 22 | # Input 20% of missing values per variable in the train set 23 | np.random.seed(0) 24 | m = 0.2 25 | for j in range(0, X_train.shape[1]): 26 | sub = np.random.choice(X_train.shape[0], int(X_train.shape[0] * m)) 27 | X_train[sub, j] = np.nan 28 | 29 | # Train LCERegressor with default parameters 30 | reg = LCERegressor(n_jobs=-1, random_state=0) 31 | reg.fit(X_train, y_train) 32 | 33 | # Make prediction 34 | y_pred = reg.predict(X_test) 35 | mse = mean_squared_error(y_test, y_pred) 36 | print("The mean squared error (MSE) on test set: {:.0f}".format(mse)) 37 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | jobs: 4 | build: 5 | docker: 6 | - image: circleci/python:3.10 7 | working_directory: ~/repo 8 | steps: 9 | - checkout 10 | - run: 11 | name: install dependencies 12 | command: | 13 | wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh 14 | chmod +x miniconda.sh && ./miniconda.sh -b -p ~/miniconda 15 | export PATH="~/miniconda/bin:$PATH" 16 | conda update --yes --quiet conda 17 | conda create -n testenv --yes --quiet python=3.10 18 | source activate testenv 19 | pip install catboost==1.1.1 hyperopt==0.2.7 lightgbm==3.3.5 numpy==1.23.3 numpydoc pandas==1.5.0 scikit-learn==1.1.2 sphinx sphinx-gallery sphinx_rtd_theme pillow pytest pytest-cov xgboost==1.6.2 20 | pip install . 21 | cd doc 22 | make html 23 | - store_artifacts: 24 | path: doc/_build/html/ 25 | destination: doc 26 | - store_artifacts: 27 | path: ~/log.txt 28 | - run: ls -ltrh doc/_build/html 29 | filters: 30 | branches: 31 | ignore: gh-pages 32 | 33 | 34 | workflows: 35 | version: 2 36 | workflow: 37 | jobs: 38 | - build 39 | -------------------------------------------------------------------------------- /doc/index.rst: -------------------------------------------------------------------------------- 1 | 2 | .. image:: _images/logo_lce.svg 3 | :align: center 4 | :width: 40% 5 | 6 | | 7 | 8 | Welcome to the documentation of LCE! 9 | ==================================== 10 | 11 | | **Local Cascade Ensemble (LCE)** is a *high-performing*, *scalable* and *user-friendly* machine learning method for the general tasks of **Classification** and **Regression**. 
12 | | In particular, LCE: 13 | 14 | - Enhances the prediction performance of Random Forest and XGBoost by combining their strengths and adopting a complementary diversification approach 15 | - Supports parallel processing to ensure scalability 16 | - Handles missing data by design 17 | - Adopts the scikit-learn API for ease of use 18 | - Adheres to scikit-learn conventions to allow interaction with scikit-learn pipelines and model selection tools 19 | - Is released as open source and commercially usable - Apache 2.0 license 20 | 21 | .. toctree:: 22 | :maxdepth: 1 23 | :hidden: 24 | :caption: Tutorial 25 | 26 | tutorial 27 | 28 | .. toctree:: 29 | :maxdepth: 2 30 | :hidden: 31 | :caption: Documentation 32 | 33 | api 34 | 35 | .. toctree:: 36 | :maxdepth: 2 37 | :hidden: 38 | :caption: Contribute 39 | 40 | contribute 41 | 42 | .. toctree:: 43 | :maxdepth: 2 44 | :hidden: 45 | :caption: Reference 46 | 47 | reference 48 | -------------------------------------------------------------------------------- /doc/contribute.rst: -------------------------------------------------------------------------------- 1 | ################# 2 | Contribute to LCE 3 | ################# 4 | 5 | 6 | There are multiple ways to participate in LCE; for instance, school projects for students, research questions for academics, performance maximization through collaboration for professionals: 7 | 8 | - Add a star to the LCE GitHub repository. It seems insignificant, but it is key for LCE referencing and visibility 9 | - Answer queries on the issue tracker, investigate bugs, and review other developers’ pull requests to make sure the existing version is running as expected 10 | - Develop new test cases to make the codebase more robust 11 | - Extend LCE capabilities by adding new features (e.g., modeling flexibility, faithful explainability-by-design, GPU support). For more detailed information about new feature ideas, please refer to the article `"LCE: The Most Powerful Machine Learning Method?" <https://towardsdatascience.com/lce-the-most-powerful-machine-learning-method-e8ea77f317d6?source=friends_link&sk=c8911ad03dd1e0e3fd02a17835609737>`_ 12 | - Design tutorials in various media outlets, targeting different audiences, to make more people discover LCE 13 | 14 | For organizations, it is possible to sponsor the project to cover the expenses needed to develop LCE along with the highest standards (e.g., professional services like a robust continuous integration infrastructure, workshop expenses). 15 | -------------------------------------------------------------------------------- /.github/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | There are multiple ways to participate in LCE; for instance, school projects for students, research questions for academics, performance maximization through collaboration for professionals: 5 | 6 | - Add a star to the LCE GitHub repository. It seems insignificant, but it is key for LCE referencing and visibility 7 | - Answer queries on the issue tracker, investigate bugs, and review other developers’ pull requests to make sure the existing version is running as expected 8 | - Develop new test cases to make the codebase more robust 9 | - Extend LCE capabilities by adding new features (e.g., modeling flexibility, faithful explainability-by-design, GPU support). 
For more detailed information about new feature ideas, please refer to the article ["LCE: The Most Powerful Machine Learning Method?"](https://towardsdatascience.com/lce-the-most-powerful-machine-learning-method-e8ea77f317d6?source=friends_link&sk=c8911ad03dd1e0e3fd02a17835609737) 10 | - Design tutorials in various media outlets, targeting different audiences, to make more people discover LCE 11 | 12 | For organizations, it is possible to sponsor the project to cover the expenses needed to develop LCE along with the highest standards (e.g., professional services like a robust continuous integration infrastructure, workshop expenses). 13 | 14 | 15 | Contributing to related projects 16 | -------------------------------- 17 | 18 | LCE thrives in an ecosystem of several related projects, which may also have relevant issues to work on. These projects include: 19 | 20 | - [hyperopt](https://github.com/hyperopt/hyperopt/issues) 21 | - [numpy](https://github.com/numpy/numpy/issues) 22 | - [pandas](https://github.com/pandas-dev/pandas/issues) 23 | - [scikit-learn](https://github.com/scikit-learn/scikit-learn/issues) 24 | - [xgboost](https://github.com/dmlc/xgboost/issues) 25 | 26 | 27 | Code of Conduct 28 | --------------- 29 | 30 | This repository follows the principles of the Python Software Foundation: https://www.python.org/psf/codeofconduct/. -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | import codecs 4 | import os 5 | 6 | from setuptools import find_packages, setup 7 | 8 | # get __version__ from _version.py 9 | ver_file = os.path.join("lce", "_version.py") 10 | with open(ver_file) as f: 11 | exec(f.read()) 12 | 13 | DISTNAME = "lcensemble" 14 | DESCRIPTION = "Local Cascade Ensemble package" 15 | with codecs.open("README.rst", encoding="utf-8-sig") as f: 16 | LONG_DESCRIPTION = f.read() 17 | LONG_DESCRIPTION_TYPE = "text/x-rst" 18 | MAINTAINER = "Kevin Fauvel" 19 | MAINTAINER_EMAIL = "kfauvel.lce@gmail.com" 20 | URL = "https://lce.readthedocs.io/en/latest/" 21 | LICENSE = "Apache-2.0" 22 | DOWNLOAD_URL = "https://github.com/LocalCascadeEnsemble/LCE" 23 | VERSION = __version__ 24 | INSTALL_REQUIRES = [ 25 | "catboost==1.1.1", 26 | "hyperopt==0.2.7", 27 | "lightgbm==3.3.5", 28 | "numpy==1.23.3", 29 | "pandas==1.5.0", 30 | "scikit-learn==1.1.2", 31 | "xgboost==1.6.2", 32 | ] 33 | PROJECT_URLS = { 34 | "Documentation": "https://lce.readthedocs.io/en/latest/", 35 | } 36 | CLASSIFIERS = [ 37 | "License :: OSI Approved :: Apache Software License", 38 | "Programming Language :: Python :: 3", 39 | "Programming Language :: Python :: 3.8", 40 | "Programming Language :: Python :: 3.9", 41 | "Programming Language :: Python :: 3.10", 42 | "Operating System :: OS Independent", 43 | ] 44 | EXTRAS_REQUIRE = { 45 | "tests": ["pytest", "pytest-cov"], 46 | "docs": ["sphinx", "sphinx-gallery", "sphinx_rtd_theme", "numpydoc", "pillow"], 47 | } 48 | 49 | setup( 50 | name=DISTNAME, 51 | python_requires=">=3.8", 52 | maintainer=MAINTAINER, 53 | maintainer_email=MAINTAINER_EMAIL, 54 | description=DESCRIPTION, 55 | license=LICENSE, 56 | url=URL, 57 | version=VERSION, 58 | download_url=DOWNLOAD_URL, 59 | project_urls=PROJECT_URLS, 60 | long_description=LONG_DESCRIPTION, 61 | long_description_content_type=LONG_DESCRIPTION_TYPE, 62 | zip_safe=False, 63 | classifiers=CLASSIFIERS, 64 | packages=find_packages(), 65 | install_requires=INSTALL_REQUIRES, 66 | 
extras_require=EXTRAS_REQUIRE, 67 | ) 68 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.yml: -------------------------------------------------------------------------------- 1 | name: Bug Report 2 | description: Create a report to help reproduce and correct the bug. 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: Describe the bug 8 | description: > 9 | A concise description of the bug. 10 | validations: 11 | required: true 12 | 13 | - type: textarea 14 | attributes: 15 | label: Observed Results 16 | description: | 17 | Please paste or describe the results you observe. If you observe an error, please paste the error message including the **full traceback** of the exception. 18 | validations: 19 | required: true 20 | 21 | - type: textarea 22 | attributes: 23 | label: Code to Reproduce 24 | description: | 25 | Please add a code example that can reproduce the error when running it. Be as succinct as possible, **do not depend on external data files**: instead you can use [sklearn.datasets.make_regression](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html), [sklearn.datasets.make_classification](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html) or generate synthetic data using `numpy.random`. 26 | 27 | Crafting a code example requires some effort on your side but it really helps to quickly reproduce the problem and analyze its cause without any ambiguity. Ambiguous bug reports tend to be slower to fix because they will require more effort and discussions to pin-point the precise conditions necessary to reproduce the problem. 28 | placeholder: | 29 | ``` 30 | Sample code to reproduce the problem 31 | ``` 32 | validations: 33 | required: true 34 | - type: textarea 35 | attributes: 36 | label: Expected Results 37 | description: > 38 | Please paste or describe the expected results. 39 | placeholder: > 40 | Example: No error is thrown. 41 | validations: 42 | required: true 43 | - type: textarea 44 | attributes: 45 | label: Version 46 | render: shell 47 | description: | 48 | Please run the following and paste the output below. 49 | ```python 50 | import lce; lce.__version__ 51 | ``` 52 | validations: 53 | required: true 54 | - type: markdown 55 | attributes: 56 | value: > 57 | Thanks for contributing 🎉! 58 | -------------------------------------------------------------------------------- /logo/logo_lce.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | image/svg+xml 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /doc/_images/logo_lce.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | image/svg+xml 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /doc/_static/js/copybutton.js: -------------------------------------------------------------------------------- 1 | $(document).ready(function() { 2 | /* Add a [>>>] button on the top-right corner of code samples to hide 3 | * the >>> and ... prompts and the output and thus make the code 4 | * copyable. 
*/ 5 | var div = $('.highlight-python .highlight,' + 6 | '.highlight-python3 .highlight,' + 7 | '.highlight-pycon .highlight,' + 8 | '.highlight-default .highlight') 9 | var pre = div.find('pre'); 10 | 11 | // get the styles from the current theme 12 | pre.parent().parent().css('position', 'relative'); 13 | var hide_text = 'Hide the prompts and output'; 14 | var show_text = 'Show the prompts and output'; 15 | var border_width = pre.css('border-top-width'); 16 | var border_style = pre.css('border-top-style'); 17 | var border_color = pre.css('border-top-color'); 18 | var button_styles = { 19 | 'cursor':'pointer', 'position': 'absolute', 'top': '0', 'right': '0', 20 | 'border-color': border_color, 'border-style': border_style, 21 | 'border-width': border_width, 'color': border_color, 'text-size': '75%', 22 | 'font-family': 'monospace', 'padding-left': '0.2em', 'padding-right': '0.2em', 23 | 'border-radius': '0 3px 0 0' 24 | } 25 | 26 | // create and add the button to all the code blocks that contain >>> 27 | div.each(function(index) { 28 | var jthis = $(this); 29 | if (jthis.find('.gp').length > 0) { 30 | var button = $('>>>'); 31 | button.css(button_styles) 32 | button.attr('title', hide_text); 33 | button.data('hidden', 'false'); 34 | jthis.prepend(button); 35 | } 36 | // tracebacks (.gt) contain bare text elements that need to be 37 | // wrapped in a span to work with .nextUntil() (see later) 38 | jthis.find('pre:has(.gt)').contents().filter(function() { 39 | return ((this.nodeType == 3) && (this.data.trim().length > 0)); 40 | }).wrap(''); 41 | }); 42 | 43 | // define the behavior of the button when it's clicked 44 | $('.copybutton').click(function(e){ 45 | e.preventDefault(); 46 | var button = $(this); 47 | if (button.data('hidden') === 'false') { 48 | // hide the code output 49 | button.parent().find('.go, .gp, .gt').hide(); 50 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'hidden'); 51 | button.css('text-decoration', 'line-through'); 52 | button.attr('title', show_text); 53 | button.data('hidden', 'true'); 54 | } else { 55 | // show the code output 56 | button.parent().find('.go, .gp, .gt').show(); 57 | button.next('pre').find('.gt').nextUntil('.gp, .go').css('visibility', 'visible'); 58 | button.css('text-decoration', 'none'); 59 | button.attr('title', hide_text); 60 | button.data('hidden', 'false'); 61 | } 62 | }); 63 | }); 64 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | 2 | .. raw:: html 3 | 4 |

5 | (raw HTML block of the README, lines 5-30: content not preserved) 30 |
31 | 32 | | **Local Cascade Ensemble (LCE)** is a *high-performing*, *scalable* and *user-friendly* machine learning method for the general tasks of **Classification** and **Regression**. 33 | | In particular, LCE: 34 | 35 | - Enhances the prediction performance of Random Forest and XGBoost by combining their strengths and adopting a complementary diversification approach 36 | - Supports parallel processing to ensure scalability 37 | - Handles missing data by design 38 | - Adopts the scikit-learn API for ease of use 39 | - Adheres to scikit-learn conventions to allow interaction with scikit-learn pipelines and model selection tools 40 | - Is released as open source and commercially usable - Apache 2.0 license 41 | 42 | 43 | Getting Started 44 | =============== 45 | 46 | This section presents a quick start tutorial showing snippets for you to try out LCE. 47 | 48 | Installation 49 | ------------ 50 | 51 | You can install LCE from `PyPI `_ with ``pip``:: 52 | 53 | pip install lcensemble 54 | 55 | Or ``conda``:: 56 | 57 | conda install -c conda-forge lcensemble 58 | 59 | 60 | First Example on Iris Dataset 61 | ----------------------------- 62 | 63 | LCEClassifier accuracy on an Iris test set: 64 | 65 | .. code-block:: python 66 | 67 | from lce import LCEClassifier 68 | from sklearn.datasets import load_iris 69 | from sklearn.metrics import accuracy_score 70 | from sklearn.model_selection import train_test_split 71 | 72 | 73 | # Load data and generate a train/test split 74 | data = load_iris() 75 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0) 76 | 77 | # Train LCEClassifier with default parameters 78 | clf = LCEClassifier(n_jobs=-1, random_state=0) 79 | clf.fit(X_train, y_train) 80 | 81 | # Make prediction and compute accuracy score 82 | y_pred = clf.predict(X_test) 83 | accuracy = accuracy_score(y_test, y_pred) 84 | print("Accuracy: {:.1f}%".format(accuracy*100)) 85 | 86 | .. code-block:: 87 | 88 | Accuracy: 97.4% 89 | 90 | 91 | Documentation 92 | ============= 93 | 94 | LCE documentation, including API documentation and general examples, can be found `here <https://lce.readthedocs.io/en/latest/>`_. 95 | 96 | 97 | Contribute to LCE 98 | ================= 99 | 100 | Your valuable contribution will help make this package more powerful and better for the community. 101 | There are multiple ways to participate; check out this `page `_! 102 | 103 | 104 | Reference Papers 105 | ================ 106 | 107 | LCE originated from research at `Inria, France `_. 108 | Here are the reference papers: 109 | 110 | .. [1] Fauvel, K., E. Fromont, V. Masson, P. Faverdin and A. Termier. LCE: An Augmented Combination of Bagging and Boosting in Python. arXiv, 2023 111 | 112 | .. [2] Fauvel, K., E. Fromont, V. Masson, P. Faverdin and A. Termier. XEM: An Explainable-by-Design Ensemble Method for Multivariate Time Series Classification. Data Mining and Knowledge Discovery, 36(3):917–957, 2022 113 | 114 | .. [3] Fauvel, K., V. Masson, E. Fromont, P. Faverdin and A. Termier. Towards Sustainable Dairy Management - A Machine Learning Enhanced Method for Estrus Detection. In Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining, 2019 115 | 116 | If you use LCE, we would appreciate citations. 117 | 118 | 119 | Contact 120 | ======= 121 | 122 | If you have any questions, you can contact me here: `Kevin Fauvel `_. 
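To illustrate the scikit-learn compatibility claimed in the feature list above, here is a minimal sketch (not part of the original repository examples) that places ``LCEClassifier`` inside a scikit-learn ``Pipeline`` and evaluates it with ``cross_val_score``; the scaling step and the 3-fold split are illustrative choices, not requirements of LCE:

.. code-block:: python

    from lce import LCEClassifier
    from sklearn.datasets import load_iris
    from sklearn.model_selection import cross_val_score, train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    # Load data and generate a train/test split
    data = load_iris()
    X_train, X_test, y_train, y_test = train_test_split(
        data.data, data.target, random_state=0
    )

    # LCEClassifier follows the scikit-learn estimator API, so it can be
    # used as the final step of a Pipeline like any other estimator
    pipeline = Pipeline(
        [
            ("scaler", StandardScaler()),
            ("lce", LCEClassifier(n_jobs=-1, random_state=0)),
        ]
    )

    # Cross-validate the whole pipeline on the train set
    scores = cross_val_score(pipeline, X_train, y_train, cv=3)
    print("Mean cross-validation accuracy: {:.1f}%".format(scores.mean() * 100))

The same pattern should carry over to ``LCERegressor`` and to scikit-learn model selection tools such as ``GridSearchCV``, since both estimators adhere to scikit-learn conventions.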
-------------------------------------------------------------------------------- /doc/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | goto end 41 | ) 42 | 43 | if "%1" == "clean" ( 44 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 45 | del /q /s %BUILDDIR%\* 46 | goto end 47 | ) 48 | 49 | 50 | %SPHINXBUILD% 2> nul 51 | if errorlevel 9009 ( 52 | echo. 53 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 54 | echo.installed, then set the SPHINXBUILD environment variable to point 55 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 56 | echo.may add the Sphinx directory to PATH. 57 | echo. 58 | echo.If you don't have Sphinx installed, grab it from 59 | echo.http://sphinx-doc.org/ 60 | exit /b 1 61 | ) 62 | 63 | if "%1" == "html" ( 64 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 65 | if errorlevel 1 exit /b 1 66 | echo. 67 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 68 | goto end 69 | ) 70 | 71 | if "%1" == "dirhtml" ( 72 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 73 | if errorlevel 1 exit /b 1 74 | echo. 75 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 76 | goto end 77 | ) 78 | 79 | if "%1" == "singlehtml" ( 80 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 81 | if errorlevel 1 exit /b 1 82 | echo. 83 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 84 | goto end 85 | ) 86 | 87 | if "%1" == "pickle" ( 88 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 89 | if errorlevel 1 exit /b 1 90 | echo. 91 | echo.Build finished; now you can process the pickle files. 92 | goto end 93 | ) 94 | 95 | if "%1" == "json" ( 96 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 97 | if errorlevel 1 exit /b 1 98 | echo. 99 | echo.Build finished; now you can process the JSON files. 
100 | goto end 101 | ) 102 | 103 | if "%1" == "htmlhelp" ( 104 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 105 | if errorlevel 1 exit /b 1 106 | echo. 107 | echo.Build finished; now you can run HTML Help Workshop with the ^ 108 | .hhp project file in %BUILDDIR%/htmlhelp. 109 | goto end 110 | ) 111 | 112 | if "%1" == "qthelp" ( 113 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 114 | if errorlevel 1 exit /b 1 115 | echo. 116 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 117 | .qhcp project file in %BUILDDIR%/qthelp, like this: 118 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\project-template.qhcp 119 | echo.To view the help file: 120 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\project-template.ghc 121 | goto end 122 | ) 123 | 124 | if "%1" == "devhelp" ( 125 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished. 129 | goto end 130 | ) 131 | 132 | if "%1" == "epub" ( 133 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 134 | if errorlevel 1 exit /b 1 135 | echo. 136 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 137 | goto end 138 | ) 139 | 140 | if "%1" == "latex" ( 141 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 142 | if errorlevel 1 exit /b 1 143 | echo. 144 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 145 | goto end 146 | ) 147 | 148 | if "%1" == "latexpdf" ( 149 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 150 | cd %BUILDDIR%/latex 151 | make all-pdf 152 | cd %BUILDDIR%/.. 153 | echo. 154 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 155 | goto end 156 | ) 157 | 158 | if "%1" == "latexpdfja" ( 159 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 160 | cd %BUILDDIR%/latex 161 | make all-pdf-ja 162 | cd %BUILDDIR%/.. 163 | echo. 164 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 165 | goto end 166 | ) 167 | 168 | if "%1" == "text" ( 169 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 170 | if errorlevel 1 exit /b 1 171 | echo. 172 | echo.Build finished. The text files are in %BUILDDIR%/text. 173 | goto end 174 | ) 175 | 176 | if "%1" == "man" ( 177 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 178 | if errorlevel 1 exit /b 1 179 | echo. 180 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 181 | goto end 182 | ) 183 | 184 | if "%1" == "texinfo" ( 185 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 186 | if errorlevel 1 exit /b 1 187 | echo. 188 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 189 | goto end 190 | ) 191 | 192 | if "%1" == "gettext" ( 193 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 194 | if errorlevel 1 exit /b 1 195 | echo. 196 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 197 | goto end 198 | ) 199 | 200 | if "%1" == "changes" ( 201 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 202 | if errorlevel 1 exit /b 1 203 | echo. 204 | echo.The overview file is in %BUILDDIR%/changes. 205 | goto end 206 | ) 207 | 208 | if "%1" == "linkcheck" ( 209 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 210 | if errorlevel 1 exit /b 1 211 | echo. 212 | echo.Link check complete; look for any errors in the above output ^ 213 | or in %BUILDDIR%/linkcheck/output.txt. 
214 | goto end 215 | ) 216 | 217 | if "%1" == "doctest" ( 218 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 219 | if errorlevel 1 exit /b 1 220 | echo. 221 | echo.Testing of doctests in the sources finished, look at the ^ 222 | results in %BUILDDIR%/doctest/output.txt. 223 | goto end 224 | ) 225 | 226 | if "%1" == "xml" ( 227 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 228 | if errorlevel 1 exit /b 1 229 | echo. 230 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 231 | goto end 232 | ) 233 | 234 | if "%1" == "pseudoxml" ( 235 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 236 | if errorlevel 1 exit /b 1 237 | echo. 238 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 239 | goto end 240 | ) 241 | 242 | :end 243 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " devhelp to make HTML files and a Devhelp project" 34 | @echo " epub to make an epub" 35 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 36 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 37 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 38 | @echo " text to make text files" 39 | @echo " man to make manual pages" 40 | @echo " texinfo to make Texinfo files" 41 | @echo " info to make Texinfo files and run them through makeinfo" 42 | @echo " gettext to make PO message catalogs" 43 | @echo " changes to make an overview of all changed/added/deprecated items" 44 | @echo " xml to make Docutils-native XML files" 45 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 46 | @echo " linkcheck to check all external links for integrity" 47 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 48 | 49 | clean: 50 | -rm -rf $(BUILDDIR)/* 51 | -rm -rf auto_examples/ 52 | -rm -rf generated/* 53 | -rm -rf modules/generated/* 54 | 55 | html: 56 | # These two lines make the build a bit more lengthy, and the 57 | # the embedding of images more robust 58 | rm -rf $(BUILDDIR)/html/_images 59 | #rm -rf _build/doctrees/ 60 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 63 | 64 | dirhtml: 65 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 66 | @echo 67 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 68 | 69 | singlehtml: 70 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 71 | @echo 72 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 73 | 74 | pickle: 75 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 76 | @echo 77 | @echo "Build finished; now you can process the pickle files." 78 | 79 | json: 80 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 81 | @echo 82 | @echo "Build finished; now you can process the JSON files." 83 | 84 | htmlhelp: 85 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 86 | @echo 87 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 88 | ".hhp project file in $(BUILDDIR)/htmlhelp." 89 | 90 | qthelp: 91 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 92 | @echo 93 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 94 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 95 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/project-template.qhcp" 96 | @echo "To view the help file:" 97 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/project-template.qhc" 98 | 99 | devhelp: 100 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 101 | @echo 102 | @echo "Build finished." 
103 | @echo "To view the help file:" 104 | @echo "# mkdir -p $$HOME/.local/share/devhelp/project-template" 105 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/project-template" 106 | @echo "# devhelp" 107 | 108 | epub: 109 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 110 | @echo 111 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 112 | 113 | latex: 114 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 115 | @echo 116 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 117 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 118 | "(use \`make latexpdf' here to do that automatically)." 119 | 120 | latexpdf: 121 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 122 | @echo "Running LaTeX files through pdflatex..." 123 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 124 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 125 | 126 | latexpdfja: 127 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 128 | @echo "Running LaTeX files through platex and dvipdfmx..." 129 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 130 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 131 | 132 | text: 133 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 134 | @echo 135 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 136 | 137 | man: 138 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 139 | @echo 140 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 141 | 142 | texinfo: 143 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 144 | @echo 145 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 146 | @echo "Run \`make' in that directory to run these through makeinfo" \ 147 | "(use \`make info' here to do that automatically)." 148 | 149 | info: 150 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 151 | @echo "Running Texinfo files through makeinfo..." 152 | make -C $(BUILDDIR)/texinfo info 153 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 154 | 155 | gettext: 156 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 157 | @echo 158 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 159 | 160 | changes: 161 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 162 | @echo 163 | @echo "The overview file is in $(BUILDDIR)/changes." 164 | 165 | linkcheck: 166 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 167 | @echo 168 | @echo "Link check complete; look for any errors in the above output " \ 169 | "or in $(BUILDDIR)/linkcheck/output.txt." 170 | 171 | doctest: 172 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 173 | @echo "Testing of doctests in the sources finished, look at the " \ 174 | "results in $(BUILDDIR)/doctest/output.txt." 175 | 176 | xml: 177 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 178 | @echo 179 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 180 | 181 | pseudoxml: 182 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 183 | @echo 184 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 
185 | -------------------------------------------------------------------------------- /lce/_catboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials 3 | from sklearn.metrics import check_scoring 4 | from catboost import CatBoostClassifier, CatBoostRegressor 5 | 6 | 7 | def catboost_opt_classifier( 8 | X, 9 | y, 10 | n_iter=10, 11 | metric="accuracy", 12 | n_estimators=(10, 50, 100), 13 | max_depth=(3, 6, 9), 14 | learning_rate=(0.01, 0.1, 0.3, 0.5), 15 | colsample_bylevel=(1.0,), 16 | reg_lambda=(0.1, 1.0, 5.0), 17 | n_jobs=None, 18 | random_state=None, 19 | ): 20 | """ 21 | Get CatBoost model with the best hyperparameters configuration. 22 | 23 | Parameters 24 | ---------- 25 | X : array-like of shape (n_samples, n_features) 26 | The training input samples. 27 | 28 | y : array-like of shape (n_samples,) 29 | The class labels. 30 | 31 | n_iter: int, default=10 32 | Number of iterations to set the hyperparameters of the base classifier (CatBoost) 33 | in Hyperopt. 34 | 35 | metric: string, default="accuracy" 36 | The score of the base classifier (CatBoost) optimized by Hyperopt. Supported metrics 37 | are the ones from `scikit-learn `_. 38 | 39 | n_estimators : tuple, default=(10, 50, 100) 40 | The number of estimators for the base learner. The tuple provided is 41 | the search space used for the hyperparameter optimization (Hyperopt). 42 | 43 | max_depth : tuple, default=(3, 6, 9) 44 | Maximum tree depth for the base learner. The tuple provided is the search 45 | space used for the hyperparameter optimization (Hyperopt). 46 | 47 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5) 48 | `learning_rate` of the base learner. The tuple provided is the search space used for the 49 | hyperparameter optimization (Hyperopt). 50 | 51 | colsample_bylevel : tuple, default=(1.0,) 52 | Subsample ratio of columns for each level. Subsampling occurs 53 | once for every new depth level reached in a tree. Columns are subsampled 54 | from the set of columns chosen for the current tree. The tuple provided is the search 55 | space used for the hyperparameter optimization (Hyperopt). 56 | 57 | reg_lambda : tuple, default=(0.1, 1.0, 5.0) 58 | `reg_lambda` / `l2_leaf_reg` of CatBoost. The tuple provided is the search 59 | space used for the hyperparameter optimization (Hyperopt). 60 | 61 | n_jobs : int, default=None 62 | The number of jobs to run in parallel. 63 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors. 64 | 65 | random_state : int, RandomState instance or None, default=None 66 | Controls the randomness of the base learner CatBoost and 67 | the Hyperopt algorithm. 68 | 69 | Returns 70 | ------- 71 | model: object 72 | CatBoost model with the best configuration and fitted on the input data. 
73 | """ 74 | # Parameters 75 | classes, y = np.unique(y, return_inverse=True) 76 | 77 | space = { 78 | "n_estimators": hp.choice("n_estimators", n_estimators), 79 | "depth": hp.choice("max_depth", max_depth), 80 | "learning_rate": hp.choice("learning_rate", learning_rate), 81 | "colsample_bylevel": hp.choice("colsample_bylevel", colsample_bylevel), 82 | "reg_lambda": hp.choice("reg_lambda", reg_lambda), 83 | "thread_count": n_jobs, 84 | "random_state": random_state, 85 | } 86 | 87 | # Get best configuration 88 | def p_model(params): 89 | clf = CatBoostClassifier(**params, verbose=False) 90 | clf.fit(X, y) 91 | scorer = check_scoring(clf, scoring=metric) 92 | return scorer(clf, X, y) 93 | 94 | global best 95 | best = -np.inf 96 | 97 | def f(params): 98 | global best 99 | perf = p_model(params) 100 | if perf > best: 101 | best = perf 102 | return {"loss": -best, "status": STATUS_OK} 103 | 104 | rstate = np.random.default_rng(random_state) 105 | best_config = fmin( 106 | fn=f, 107 | space=space, 108 | algo=tpe.suggest, 109 | max_evals=n_iter, 110 | trials=Trials(), 111 | rstate=rstate, 112 | verbose=0, 113 | ) 114 | 115 | # Fit best model 116 | final_params = { 117 | "n_estimators": n_estimators[best_config["n_estimators"]], 118 | "depth": max_depth[best_config["max_depth"]], 119 | "learning_rate": learning_rate[best_config["learning_rate"]], 120 | "colsample_bylevel": colsample_bylevel[best_config["colsample_bylevel"]], 121 | "reg_lambda": reg_lambda[best_config["reg_lambda"]], 122 | "thread_count": n_jobs, 123 | "random_state": random_state, 124 | } 125 | clf = CatBoostClassifier(**final_params, verbose=False) 126 | return clf.fit(X, y) 127 | 128 | 129 | def catboost_opt_regressor( 130 | X, 131 | y, 132 | n_iter=10, 133 | metric="neg_mean_squared_error", 134 | n_estimators=(10, 50, 100), 135 | max_depth=(3, 6, 9), 136 | learning_rate=(0.01, 0.1, 0.3, 0.5), 137 | colsample_bylevel=(1.0,), 138 | reg_lambda=(0.1, 1.0, 5.0), 139 | n_jobs=None, 140 | random_state=None, 141 | ): 142 | """ 143 | Get CatBoost model with the best hyperparameters configuration. 144 | 145 | Parameters 146 | ---------- 147 | X : array-like of shape (n_samples, n_features) 148 | The training input samples. 149 | 150 | y : array-like of shape (n_samples,) 151 | The target values (real numbers). 152 | 153 | n_iter: int, default=10 154 | Number of iterations to set the hyperparameters of the base regressor (CatBoost) 155 | in Hyperopt. 156 | 157 | metric: string, default="neg_mean_squared_error" 158 | The score of the base regressor (CatBoost) optimized by Hyperopt. Supported metrics 159 | are the ones from `scikit-learn `_. 160 | 161 | n_estimators : tuple, default=(10, 50, 100) 162 | The number of estimators for the base learner. The tuple provided is 163 | the search space used for the hyperparameter optimization (Hyperopt). 164 | 165 | max_depth : tuple, default=(3, 6, 9) 166 | Maximum tree depth for the base learner. The tuple provided is the search 167 | space used for the hyperparameter optimization (Hyperopt). 168 | 169 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5) 170 | `learning_rate` of the base learner. The tuple provided is the search space used for the 171 | hyperparameter optimization (Hyperopt). 172 | 173 | colsample_bylevel : tuple, default=(1.0,) 174 | Subsample ratio of columns for each level. Subsampling occurs 175 | once for every new depth level reached in a tree. Columns are subsampled 176 | from the set of columns chosen for the current tree. 
The tuple provided is the search 177 | space used for the hyperparameter optimization (Hyperopt). 178 | 179 | reg_lambda : tuple, default=(0.1, 1.0, 5.0) 180 | `reg_lambda` / `l2_leaf_reg` of CatBoost. The tuple provided is the search 181 | space used for the hyperparameter optimization (Hyperopt). 182 | 183 | n_jobs : int, default=None 184 | The number of jobs to run in parallel. 185 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors. 186 | 187 | random_state : int, RandomState instance or None, default=None 188 | Controls the randomness of the base learner CatBoost and 189 | the Hyperopt algorithm. 190 | 191 | Returns 192 | ------- 193 | model: object 194 | CatBoost model with the best configuration and fitted on the input data. 195 | """ 196 | space = { 197 | "n_estimators": hp.choice("n_estimators", n_estimators), 198 | "depth": hp.choice("max_depth", max_depth), 199 | "learning_rate": hp.choice("learning_rate", learning_rate), 200 | "colsample_bylevel": hp.choice("colsample_bylevel", colsample_bylevel), 201 | "reg_lambda": hp.choice("reg_lambda", reg_lambda), 202 | "thread_count": n_jobs, 203 | "random_state": random_state, 204 | } 205 | 206 | # Get best configuration 207 | def p_model(params): 208 | reg = CatBoostRegressor(**params, verbose=False) 209 | reg.fit(X, y) 210 | scorer = check_scoring(reg, scoring=metric) 211 | return scorer(reg, X, y) 212 | 213 | global best 214 | best = -np.inf 215 | 216 | def f(params): 217 | global best 218 | perf = p_model(params) 219 | if perf > best: 220 | best = perf 221 | return {"loss": -best, "status": STATUS_OK} 222 | 223 | rstate = np.random.default_rng(random_state) 224 | best_config = fmin( 225 | fn=f, 226 | space=space, 227 | algo=tpe.suggest, 228 | max_evals=n_iter, 229 | trials=Trials(), 230 | rstate=rstate, 231 | verbose=0, 232 | ) 233 | 234 | # Fit best model 235 | final_params = { 236 | "n_estimators": n_estimators[best_config["n_estimators"]], 237 | "depth": max_depth[best_config["max_depth"]], 238 | "learning_rate": learning_rate[best_config["learning_rate"]], 239 | "colsample_bylevel": colsample_bylevel[best_config["colsample_bylevel"]], 240 | "reg_lambda": reg_lambda[best_config["reg_lambda"]], 241 | "thread_count": n_jobs, 242 | "random_state": random_state, 243 | } 244 | reg = CatBoostRegressor(**final_params, verbose=False) 245 | return reg.fit(X, y) 246 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | Copyright 2023, Kevin FAUVEL 179 | 180 | Licensed under the Apache License, Version 2.0 (the "License"); 181 | you may not use this file except in compliance with the License. 182 | You may obtain a copy of the License at 183 | 184 | http://www.apache.org/licenses/LICENSE-2.0 185 | 186 | Unless required by applicable law or agreed to in writing, software 187 | distributed under the License is distributed on an "AS IS" BASIS, 188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 189 | See the License for the specific language governing permissions and 190 | limitations under the License. -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # project-template documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Jan 18 14:44:12 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 
14 | 15 | import sys 16 | import os 17 | 18 | import sphinx_gallery 19 | import sphinx_rtd_theme 20 | 21 | # Add to sys.path the top-level directory where the package is located. 22 | sys.path.insert(0, os.path.abspath("..")) 23 | 24 | # If extensions (or modules to document with autodoc) are in another directory, 25 | # add these directories to sys.path here. If the directory is relative to the 26 | # documentation root, use os.path.abspath to make it absolute, like shown here. 27 | # sys.path.insert(0, os.path.abspath('.')) 28 | 29 | # -- General configuration ------------------------------------------------ 30 | 31 | # If your documentation needs a minimal Sphinx version, state it here. 32 | # needs_sphinx = '1.0' 33 | 34 | # Add any Sphinx extension module names here, as strings. They can be 35 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 36 | # ones. 37 | extensions = [ 38 | "sphinx.ext.autodoc", 39 | "sphinx.ext.autosummary", 40 | "sphinx.ext.doctest", 41 | "sphinx.ext.intersphinx", 42 | "sphinx.ext.viewcode", 43 | "numpydoc", 44 | "sphinx_gallery.gen_gallery", 45 | ] 46 | 47 | # this is needed for some reason... 48 | # see https://github.com/numpy/numpydoc/issues/69 49 | numpydoc_show_class_members = False 50 | 51 | # pngmath / imgmath compatibility layer for different sphinx versions 52 | import sphinx 53 | from distutils.version import LooseVersion 54 | 55 | if LooseVersion(sphinx.__version__) < LooseVersion("1.4"): 56 | extensions.append("sphinx.ext.pngmath") 57 | else: 58 | extensions.append("sphinx.ext.imgmath") 59 | 60 | autodoc_default_flags = ["members", "inherited-members"] 61 | 62 | # Add any paths that contain templates here, relative to this directory. 63 | templates_path = ["_templates"] 64 | 65 | # generate autosummary even if no references 66 | autosummary_generate = True 67 | 68 | # The suffix of source filenames. 69 | source_suffix = ".rst" 70 | 71 | # The encoding of source files. 72 | # source_encoding = 'utf-8-sig' 73 | 74 | # Generate the plots for the gallery 75 | plot_gallery = True 76 | 77 | # The master toctree document. 78 | master_doc = "index" 79 | 80 | # General information about the project. 81 | project = "LCE" 82 | copyright = "2023, Kevin Fauvel" 83 | 84 | # The version info for the project you're documenting, acts as replacement for 85 | # |version| and |release|, also used in various other places throughout the 86 | # built documents. 87 | # 88 | # The short X.Y version. 89 | from lce import __version__ 90 | 91 | version = __version__ 92 | # The full version, including alpha/beta/rc tags. 93 | release = __version__ 94 | 95 | # The language for content autogenerated by Sphinx. Refer to documentation 96 | # for a list of supported languages. 97 | # language = None 98 | 99 | # There are two options for replacing |today|: either, you set today to some 100 | # non-false value, then it is used: 101 | # today = '' 102 | # Else, today_fmt is used as the format for a strftime call. 103 | # today_fmt = '%B %d, %Y' 104 | 105 | # List of patterns, relative to source directory, that match files and 106 | # directories to ignore when looking for source files. 107 | exclude_patterns = ["_build", "_templates"] 108 | 109 | # The reST default role (used for this markup: `text`) to use for all 110 | # documents. 111 | # default_role = None 112 | 113 | # If true, '()' will be appended to :func: etc. cross-reference text. 
114 | # add_function_parentheses = True 115 | 116 | # If true, the current module name will be prepended to all description 117 | # unit titles (such as .. function::). 118 | # add_module_names = True 119 | 120 | # If true, sectionauthor and moduleauthor directives will be shown in the 121 | # output. They are ignored by default. 122 | # show_authors = False 123 | 124 | # The name of the Pygments (syntax highlighting) style to use. 125 | pygments_style = "sphinx" 126 | 127 | # Custom style 128 | html_style = "css/project-template.css" 129 | 130 | # A list of ignored prefixes for module index sorting. 131 | # modindex_common_prefix = [] 132 | 133 | # If true, keep warnings as "system message" paragraphs in the built documents. 134 | # keep_warnings = False 135 | 136 | 137 | # -- Options for HTML output ---------------------------------------------- 138 | 139 | # The theme to use for HTML and HTML Help pages. See the documentation for 140 | # a list of builtin themes. 141 | html_theme = "sphinx_rtd_theme" 142 | 143 | # Theme options are theme-specific and customize the look and feel of a theme 144 | # further. For a list of options available for each theme, see the 145 | # documentation. 146 | # html_theme_options = {} 147 | 148 | # Add any paths that contain custom themes here, relative to this directory. 149 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 150 | 151 | # The name for this set of Sphinx documents. If None, it defaults to 152 | # " v documentation". 153 | # html_title = None 154 | 155 | # A shorter title for the navigation bar. Default is the same as html_title. 156 | # html_short_title = None 157 | 158 | # The name of an image file (relative to this directory) to place at the top 159 | # of the sidebar. 160 | # html_logo = None 161 | 162 | # The name of an image file (within the static path) to use as favicon of the 163 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 164 | # pixels large. 165 | # html_favicon = None 166 | 167 | # Add any paths that contain custom static files (such as style sheets) here, 168 | # relative to this directory. They are copied after the builtin static files, 169 | # so a file named "default.css" will overwrite the builtin "default.css". 170 | html_static_path = ["_static"] 171 | 172 | html_logo = "../logo/logo_lce.svg" 173 | html_theme_options = { 174 | "logo_only": True, 175 | "display_version": False, 176 | } 177 | 178 | # Add any extra paths that contain custom files (such as robots.txt or 179 | # .htaccess) here, relative to this directory. These files are copied 180 | # directly to the root of the documentation. 181 | # html_extra_path = [] 182 | 183 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 184 | # using the given strftime format. 185 | # html_last_updated_fmt = '%b %d, %Y' 186 | 187 | # If true, SmartyPants will be used to convert quotes and dashes to 188 | # typographically correct entities. 189 | # html_use_smartypants = True 190 | 191 | # Custom sidebar templates, maps document names to template names. 192 | # html_sidebars = {} 193 | 194 | # Additional templates that should be rendered to pages, maps page names to 195 | # template names. 196 | # html_additional_pages = {} 197 | 198 | # If false, no module index is generated. 199 | # html_domain_indices = True 200 | 201 | # If false, no index is generated. 202 | # html_use_index = True 203 | 204 | # If true, the index is split into individual pages for each letter. 
205 | # html_split_index = False 206 | 207 | # If true, links to the reST sources are added to the pages. 208 | # html_show_sourcelink = True 209 | 210 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 211 | # html_show_sphinx = True 212 | 213 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 214 | # html_show_copyright = True 215 | 216 | # If true, an OpenSearch description file will be output, and all pages will 217 | # contain a tag referring to it. The value of this option must be the 218 | # base URL from which the finished HTML is served. 219 | # html_use_opensearch = '' 220 | 221 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 222 | # html_file_suffix = None 223 | 224 | # Output file base name for HTML help builder. 225 | # htmlhelp_basename = 'project-templatedoc' 226 | htmlhelp_basename = "lcedoc" 227 | 228 | 229 | # -- Options for LaTeX output --------------------------------------------- 230 | 231 | latex_elements = { 232 | # The paper size ('letterpaper' or 'a4paper'). 233 | #'papersize': 'letterpaper', 234 | # The font size ('10pt', '11pt' or '12pt'). 235 | #'pointsize': '10pt', 236 | # Additional stuff for the LaTeX preamble. 237 | #'preamble': '', 238 | } 239 | 240 | # Grouping the document tree into LaTeX files. List of tuples 241 | # (source start file, target name, title, 242 | # author, documentclass [howto, manual, or own class]). 243 | latex_documents = [ 244 | ("index", "LCE.tex", "LCE Documentation", "Kevin Fauvel", "manual"), 245 | ] 246 | 247 | # The name of an image file (relative to this directory) to place at the top of 248 | # the title page. 249 | # latex_logo = None 250 | 251 | # For "manual" documents, if this is true, then toplevel headings are parts, 252 | # not chapters. 253 | # latex_use_parts = False 254 | 255 | # If true, show page references after internal links. 256 | # latex_show_pagerefs = False 257 | 258 | # If true, show URL addresses after external links. 259 | # latex_show_urls = False 260 | 261 | # Documents to append as an appendix to all manuals. 262 | # latex_appendices = [] 263 | 264 | # If false, no module index is generated. 265 | # latex_domain_indices = True 266 | 267 | 268 | # -- Options for manual page output --------------------------------------- 269 | 270 | # One entry per manual page. List of tuples 271 | # (source start file, name, description, authors, manual section). 272 | man_pages = [("index", "LCE", "LCE Documentation", ["Kevin Fauvel"], 1)] 273 | 274 | # If true, show URL addresses after external links. 275 | # man_show_urls = False 276 | 277 | 278 | # -- Options for Texinfo output ------------------------------------------- 279 | 280 | # Grouping the document tree into Texinfo files. List of tuples 281 | # (source start file, target name, title, author, 282 | # dir menu entry, description, category) 283 | texinfo_documents = [ 284 | ( 285 | "index", 286 | "LCE", 287 | "LCE Documentation", 288 | "Kevin Fauvel", 289 | "LCE", 290 | "Local Cascade Ensemble.", 291 | "Miscellaneous", 292 | ), 293 | ] 294 | 295 | # Documents to append as an appendix to all manuals. 296 | # texinfo_appendices = [] 297 | 298 | # If false, no module index is generated. 299 | # texinfo_domain_indices = True 300 | 301 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 302 | # texinfo_show_urls = 'footnote' 303 | 304 | # If true, do not generate a @detailmenu in the "Top" node's menu. 
305 | # texinfo_no_detailmenu = False 306 | 307 | 308 | # Example configuration for intersphinx: refer to the Python standard library. 309 | # intersphinx configuration 310 | intersphinx_mapping = { 311 | "python": ("https://docs.python.org/{.major}".format(sys.version_info), None), 312 | "numpy": ("https://docs.scipy.org/doc/numpy/", None), 313 | "pandas": ("https://pandas.pydata.org/", None), 314 | "sklearn": ("http://scikit-learn.org/stable", None), 315 | } 316 | 317 | # sphinx-gallery configuration 318 | sphinx_gallery_conf = { 319 | "doc_module": "LCE", 320 | "backreferences_dir": os.path.join("generated"), 321 | "reference_url": {"LCE": None}, 322 | } 323 | 324 | 325 | def setup(app): 326 | # a copy button to copy snippet of code from the documentation 327 | app.add_js_file("js/copybutton.js") 328 | -------------------------------------------------------------------------------- /lce/tests/test_lce.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.datasets import ( 3 | load_breast_cancer, 4 | load_diabetes, 5 | load_iris, 6 | make_regression, 7 | ) 8 | from sklearn.utils.estimator_checks import check_estimator 9 | import unittest 10 | import warnings 11 | 12 | from .._lce import LCEClassifier, LCERegressor 13 | 14 | 15 | class Test(unittest.TestCase): 16 | """Tests of LCE""" 17 | 18 | def test_classifier_params(self): 19 | # Load Iris dataset 20 | data = load_iris() 21 | 22 | # max_depth 23 | with self.assertRaises(ValueError): 24 | LCEClassifier(max_depth=-1).fit(data.data, data.target) 25 | with self.assertRaises(ValueError): 26 | LCEClassifier(max_depth=1.1).fit(data.data, data.target) 27 | 28 | # min_samples_leaf 29 | with self.assertRaises(ValueError): 30 | LCEClassifier(min_samples_leaf=0).fit(data.data, data.target) 31 | with self.assertRaises(ValueError): 32 | LCEClassifier(min_samples_leaf=1.1).fit(data.data, data.target) 33 | with self.assertRaises(ValueError): 34 | LCEClassifier(min_samples_leaf="a").fit(data.data, data.target) 35 | with warnings.catch_warnings(): 36 | LCEClassifier(min_samples_leaf=0.3).fit(data.data, data.target) 37 | 38 | # n_iter 39 | with self.assertRaises(ValueError): 40 | LCEClassifier(n_iter=-1).fit(data.data, data.target) 41 | with self.assertRaises(ValueError): 42 | LCEClassifier(n_iter=1.1).fit(data.data, data.target) 43 | 44 | # verbose 45 | with self.assertRaises(ValueError): 46 | LCEClassifier(verbose=-1).fit(data.data, data.target) 47 | with self.assertRaises(ValueError): 48 | LCEClassifier(verbose=1.1).fit(data.data, data.target) 49 | with warnings.catch_warnings(): 50 | LCEClassifier(verbose=1).fit(data.data, data.target) 51 | 52 | def test_classifier(self): 53 | # Load Breast Cancer dataset 54 | data = load_breast_cancer() 55 | 56 | # Fit and predict (base learner: CatBoost) 57 | with warnings.catch_warnings(): 58 | clf = LCEClassifier( 59 | n_estimators=3, 60 | max_depth=50, 61 | min_samples_leaf=1, 62 | base_learner="catboost", 63 | random_state=0, 64 | verbose=1, 65 | ).fit(data.data, data.target) 66 | clf.predict(data.data) 67 | 68 | # Fit and predict (base learner: LightGBM) 69 | with warnings.catch_warnings(): 70 | clf = LCEClassifier( 71 | n_estimators=3, 72 | max_depth=50, 73 | min_samples_leaf=1, 74 | base_learner="lightgbm", 75 | random_state=0, 76 | verbose=1, 77 | ).fit(data.data, data.target) 78 | clf.predict(data.data) 79 | 80 | # Fit and predict (base learner: XGBoost) 81 | with warnings.catch_warnings(): 82 | clf = LCEClassifier( 83 | 
n_estimators=3, 84 | max_depth=50, 85 | min_samples_leaf=1, 86 | base_learner="xgboost", 87 | random_state=0, 88 | verbose=1, 89 | ).fit(data.data, data.target) 90 | clf.predict(data.data) 91 | 92 | 93 | def test_classifier_missing(self): 94 | # Load Iris dataset 95 | data = load_iris() 96 | 97 | # Input 2% of missing values per variable (base learner: CatBoost) 98 | np.random.seed(0) 99 | m = 0.02 100 | for j in range(0, data.data.shape[1]): 101 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 102 | temp = data.data 103 | temp[sub, j] = np.nan 104 | 105 | with warnings.catch_warnings(): 106 | clf = LCEClassifier( 107 | n_estimators=3, 108 | max_depth=50, 109 | min_samples_leaf=1, 110 | base_learner="catboost", 111 | random_state=0, 112 | verbose=1, 113 | ).fit(temp, data.target) 114 | clf.predict(temp) 115 | 116 | # Input 2% of missing values per variable (base learner: LightGBM) 117 | np.random.seed(0) 118 | m = 0.02 119 | for j in range(0, data.data.shape[1]): 120 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 121 | temp = data.data 122 | temp[sub, j] = np.nan 123 | 124 | with warnings.catch_warnings(): 125 | clf = LCEClassifier( 126 | n_estimators=3, 127 | max_depth=50, 128 | min_samples_leaf=1, 129 | base_learner="lightgbm", 130 | random_state=0, 131 | verbose=1, 132 | ).fit(temp, data.target) 133 | clf.predict(temp) 134 | 135 | # Input 2% of missing values per variable (base learner: XGBoost) 136 | np.random.seed(0) 137 | m = 0.02 138 | for j in range(0, data.data.shape[1]): 139 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 140 | temp = data.data 141 | temp[sub, j] = np.nan 142 | 143 | with warnings.catch_warnings(): 144 | clf = LCEClassifier( 145 | n_estimators=3, 146 | max_depth=50, 147 | min_samples_leaf=1, 148 | base_learner="xgboost", 149 | random_state=0, 150 | verbose=1, 151 | ).fit(temp, data.target) 152 | clf.predict(temp) 153 | 154 | # Input 20% of missing values per variable (base learner: XGBoost) 155 | np.random.seed(0) 156 | m = 0.2 157 | for j in range(0, data.data.shape[1]): 158 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 159 | temp = data.data 160 | temp[sub, j] = np.nan 161 | 162 | with warnings.catch_warnings(): 163 | clf = LCEClassifier( 164 | n_estimators=3, max_depth=50, min_samples_leaf=1, base_learner="xgboost", random_state=0 165 | ).fit(temp, data.target) 166 | clf.predict(temp) 167 | 168 | # Input 60% of missing values per variable (base learner: XGBoost) 169 | np.random.seed(0) 170 | m = 0.6 171 | for j in range(0, data.data.shape[1]): 172 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 173 | temp = data.data 174 | temp[sub, j] = np.nan 175 | 176 | with warnings.catch_warnings(): 177 | clf = LCEClassifier( 178 | n_estimators=3, max_depth=50, min_samples_leaf=1, base_learner="xgboost", random_state=0 179 | ).fit(temp, data.target) 180 | clf.predict(temp) 181 | 182 | # Input 100% of missing values per variable (base learner: XGBoost) 183 | np.random.seed(0) 184 | m = 1.0 185 | for j in range(0, data.data.shape[1]): 186 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 187 | temp = data.data 188 | temp[sub, j] = np.nan 189 | 190 | with warnings.catch_warnings(): 191 | clf = LCEClassifier( 192 | n_estimators=3, max_depth=50, min_samples_leaf=1, base_learner="xgboost", random_state=0 193 | ).fit(temp, data.target) 194 | clf.predict(temp) 195 | 196 | def test_classifier_sklearn_estimator(self): 197 | # 
scikit-learn check estimator 198 | assert check_estimator(LCEClassifier()) == None 199 | 200 | def test_regressor_params(self): 201 | # Load Diabetes dataset 202 | data = load_diabetes() 203 | 204 | # max_depth 205 | with self.assertRaises(ValueError): 206 | LCERegressor(max_depth=-1).fit(data.data, data.target) 207 | with self.assertRaises(ValueError): 208 | LCERegressor(max_depth=1.1).fit(data.data, data.target) 209 | 210 | # min_samples_leaf 211 | with self.assertRaises(ValueError): 212 | LCERegressor(min_samples_leaf=0).fit(data.data, data.target) 213 | with self.assertRaises(ValueError): 214 | LCERegressor(min_samples_leaf=1.1).fit(data.data, data.target) 215 | with self.assertRaises(ValueError): 216 | LCERegressor(min_samples_leaf="a").fit(data.data, data.target) 217 | with warnings.catch_warnings(): 218 | LCERegressor(min_samples_leaf=0.3).fit(data.data, data.target) 219 | 220 | # n_iter 221 | with self.assertRaises(ValueError): 222 | LCERegressor(n_iter=-1).fit(data.data, data.target) 223 | with self.assertRaises(ValueError): 224 | LCERegressor(n_iter=1.1).fit(data.data, data.target) 225 | 226 | # verbose 227 | with self.assertRaises(ValueError): 228 | LCERegressor(verbose=-1).fit(data.data, data.target) 229 | with self.assertRaises(ValueError): 230 | LCERegressor(verbose=1.1).fit(data.data, data.target) 231 | with warnings.catch_warnings(): 232 | LCERegressor(verbose=1).fit(data.data, data.target) 233 | 234 | def test_regressor(self): 235 | # Load dataset 236 | n_samples, n_features = 100, 20 237 | rng = np.random.RandomState(0) 238 | X, y = make_regression(n_samples, n_features, random_state=rng) 239 | 240 | # Fit and predict (base learner: CatBoost) 241 | with warnings.catch_warnings(): 242 | reg = LCERegressor( 243 | n_estimators=3, 244 | max_depth=50, 245 | min_samples_leaf=1, 246 | base_learner="catboost", 247 | random_state=0, 248 | verbose=1, 249 | ).fit(X, y) 250 | reg.predict(X) 251 | 252 | # Fit and predict (base learner: LightGBM) 253 | with warnings.catch_warnings(): 254 | reg = LCERegressor( 255 | n_estimators=3, 256 | max_depth=50, 257 | min_samples_leaf=1, 258 | base_learner="lightgbm", 259 | random_state=0, 260 | verbose=1, 261 | ).fit(X, y) 262 | reg.predict(X) 263 | 264 | # Fit and predict (base learner: XGBoost) 265 | with warnings.catch_warnings(): 266 | reg = LCERegressor( 267 | n_estimators=3, 268 | max_depth=50, 269 | min_samples_leaf=1, 270 | base_learner="xgboost", 271 | random_state=0, 272 | verbose=1, 273 | ).fit(X, y) 274 | reg.predict(X) 275 | 276 | def test_regressor_missing(self): 277 | # Load Diabetes dataset 278 | data = load_diabetes() 279 | 280 | # Input 2% of missing values per variable (base learner: CatBoost) 281 | np.random.seed(0) 282 | m = 0.02 283 | for j in range(0, data.data.shape[1]): 284 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 285 | temp = data.data 286 | temp[sub, j] = np.nan 287 | 288 | with warnings.catch_warnings(): 289 | reg = LCERegressor( 290 | n_estimators=3, 291 | max_depth=50, 292 | min_samples_leaf=1, 293 | base_learner="catboost", 294 | random_state=0, 295 | verbose=1, 296 | ).fit(temp, data.target) 297 | reg.predict(temp) 298 | 299 | # Input 2% of missing values per variable (base learner: LightGBM) 300 | np.random.seed(0) 301 | m = 0.02 302 | for j in range(0, data.data.shape[1]): 303 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 304 | temp = data.data 305 | temp[sub, j] = np.nan 306 | 307 | with warnings.catch_warnings(): 308 | reg = LCERegressor( 309 | 
n_estimators=3, 310 | max_depth=50, 311 | min_samples_leaf=1, 312 | base_learner="lightgbm", 313 | random_state=0, 314 | verbose=1, 315 | ).fit(temp, data.target) 316 | reg.predict(temp) 317 | 318 | # Input 2% of missing values per variable (base learner: XGBoost) 319 | np.random.seed(0) 320 | m = 0.02 321 | for j in range(0, data.data.shape[1]): 322 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 323 | temp = data.data 324 | temp[sub, j] = np.nan 325 | 326 | with warnings.catch_warnings(): 327 | reg = LCERegressor( 328 | n_estimators=3, 329 | max_depth=50, 330 | min_samples_leaf=1, 331 | base_learner="xgboost", 332 | random_state=0, 333 | verbose=1, 334 | ).fit(temp, data.target) 335 | reg.predict(temp) 336 | 337 | # Input 20% of missing values per variable (base learner: XGBoost) 338 | np.random.seed(0) 339 | m = 0.2 340 | for j in range(0, data.data.shape[1]): 341 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 342 | temp = data.data 343 | temp[sub, j] = np.nan 344 | 345 | with warnings.catch_warnings(): 346 | reg = LCERegressor( 347 | n_estimators=3, max_depth=50, min_samples_leaf=1, base_learner="xgboost", random_state=0 348 | ).fit(temp, data.target) 349 | reg.predict(temp) 350 | 351 | # Input 60% of missing values per variable (base learner: XGBoost) 352 | np.random.seed(0) 353 | m = 0.6 354 | for j in range(0, data.data.shape[1]): 355 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 356 | temp = data.data 357 | temp[sub, j] = np.nan 358 | 359 | with warnings.catch_warnings(): 360 | reg = LCERegressor( 361 | n_estimators=3, max_depth=50, min_samples_leaf=1, base_learner="xgboost", random_state=0 362 | ).fit(temp, data.target) 363 | reg.predict(temp) 364 | 365 | # Input 100% of missing values per variable (base learner: XGBoost) 366 | np.random.seed(0) 367 | m = 1.0 368 | for j in range(0, data.data.shape[1]): 369 | sub = np.random.choice(data.data.shape[0], int(data.data.shape[0] * m)) 370 | temp = data.data 371 | temp[sub, j] = np.nan 372 | 373 | with warnings.catch_warnings(): 374 | reg = LCERegressor( 375 | n_estimators=3, max_depth=50, min_samples_leaf=1, base_learner="xgboost", random_state=0 376 | ).fit(temp, data.target) 377 | reg.predict(temp) 378 | 379 | def test_regressor_sklearn_estimator(self): 380 | # scikit-learn check estimator 381 | assert check_estimator(LCERegressor()) == None 382 | -------------------------------------------------------------------------------- /doc/tutorial.rst: -------------------------------------------------------------------------------- 1 | LCE Presentation 2 | ================ 3 | As shown in “Why Do Tree-Based Models still Outperform Deep Learning on Tabular Data?” [8]_, **the widely used tree-based models remain the state-of-the-art machine learning methods in many cases**. 4 | **Local Cascade Ensemble (LCE)** [7]_ proposes to combine the strengths of the top performing tree-based ensemble methods - Random Forest [3]_ and eXtreme Gradient Boosting (XGBoost) [4]_, 5 | and integrates a supplementary diversification approach which enables it to be **a better generalizing predictor**. 6 | 7 | Overview 8 | -------- 9 | The construction of an ensemble method involves combining accurate and diverse individual predictors. 10 | There are **two complementary ways** to generate diverse predictors: *(i)* by **changing the training data distribution** and *(ii)* by **learning different parts of the training data**. 
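As a toy illustration of these two approaches (not taken from the LCE implementation), the snippet below builds a bootstrap replicate, which changes the training data distribution, and a node-style split, which isolates different parts of the training data:

.. code-block:: python

    import numpy as np

    rng = np.random.default_rng(0)
    X = np.arange(10)  # toy set of sample indices

    # (i) Changing the training data distribution: a bootstrap replicate
    replicate = rng.choice(X, size=X.size, replace=True)

    # (ii) Learning different parts of the training data: a decision-node split
    left, right = X[X < 5], X[X >= 5]

    print("bootstrap replicate:", replicate)
    print("local parts:", left, right)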
11 | 12 | **LCE adopts these two diversification approaches.** 13 | First, *(i)* LCE combines the two well-known methods that modify the distribution of the original training data with complementary effects on the bias-variance trade-off: bagging [2]_ (variance reduction) and boosting [11]_ (bias reduction). 14 | Then, *(ii)* LCE learns different parts of the training data to capture new relationships that cannot be discovered globally based on a divide-and-conquer strategy (a decision tree). 15 | Before detailing how LCE combines these methods, we introduce the key concepts behind them that will be used in the explanation of LCE. 16 | 17 | Concepts 18 | -------- 19 | The bias-variance trade-off defines the capacity of the learning algorithm to generalize beyond the training set. 20 | The *bias* is the component of the prediction error that results from systematic errors of the learning algorithm. 21 | A high bias means that the learning algorithm is not able to capture the underlying structure of the training set (underfitting). 22 | The *variance* measures the sensitivity of the learning algorithm to changes in the training set. 23 | A high variance means that the algorithm is learning too closely the training set (overfitting). 24 | The objective is to minimize both the bias and variance. *Bagging* has a main effect on variance reduction; it is a method for generating multiple versions of a predictor (bootstrap replicates) and using these to get an aggregated predictor. 25 | The current state-of-the-art method that employs bagging is Random Forest [3]_. 26 | Whereas, *boosting* has a main effect on bias reduction; it is a method for iteratively learning weak predictors and adding them to create a final strong one. 27 | After a weak learner is added, the data weights are readjusted, allowing future weak learners to focus more on the examples that previous weak learners mispredicted. 28 | The current state-of-the-art method that uses boosting is XGBoost [4]_. 29 | The following Figure illustrates the difference between bagging and boosting methods. 30 | 31 | 32 | .. image:: _images/Figure_BaggingvsBoosting.png 33 | :width: 90% 34 | :align: center 35 | 36 | 37 | LCE 38 | --- 39 | LCE combines a boosting-bagging approach to handle the bias-variance trade-off faced by machine learning models; in addition, it adopts a divide-and-conquer approach to individualize predictor errors on different parts of the training data. 40 | LCE is represented in the following Figure. 41 | 42 | .. image:: _images/Figure_LCE.png 43 | :width: 90% 44 | :align: center 45 | 46 | 47 | Specifically, LCE is based on cascade generalization: it uses a set of predictors sequentially, and adds new attributes to the input dataset at each stage. 48 | The new attributes are derived from the output given by a predictor (e.g., class probabilities for a classifier), called a base learner. 49 | LCE applies cascade generalization locally following a divide-and-conquer strategy - a decision tree, and reduces bias across a decision tree through the use of boosting-based predictors as base learners. 50 | The current best performing state-of-the-art boosting algorithm is adopted as base learner by default (XGBoost, e.g., XGB¹°, XGB¹¹ in above Figure). 51 | CatBoost [10]_ and LightGBM [9]_ can also be chosen as base learner. 52 | When growing the tree, boosting is propagated down the tree by adding the output of the base learner at each decision node as new attributes to the dataset (e.g., XGB¹°(D¹) in above Figure). 
53 | Prediction outputs indicate the ability of the base learner to correctly predict a sample. 54 | At the next tree level, the outputs added to the dataset are exploited by the base learner as a weighting scheme to focus more on previously mispredicted samples. 55 | Then, the overfitting generated by the boosted decision tree is mitigated by the use of bagging. 56 | Bagging provides variance reduction by creating multiple predictors from random sampling with replacement of the original dataset (e.g., D¹, D² in above Figure). 57 | Finally, trees are aggregated with a simple majority vote. 58 | In order to be applied as a predictor, LCE stores, in each node, the model generated by the base learner. 59 | 60 | Missing Data 61 | ------------ 62 | LCE natively handles missing data. 63 | Similar to XGBoost, LCE excludes missing values for the split and uses block propagation. 64 | During a node split, block propagation sends all samples with missing data to the side of the decision node with less errors. 65 | 66 | Hyperparameters 67 | --------------- 68 | The hyperparameters of LCE are the classical ones in tree-based learning (e.g., ``max_depth``, ``max_features``, ``n_estimators``). 69 | Moreover, LCE learns a specific XGBoost model at each node of a tree, and it only requires the ranges of XGBoost hyperparameters to be specified. 70 | Then, the hyperparameters of each XGBoost model are automatically set by Hyperopt [1]_, a sequential model-based optimization using a tree of Parzen estimators algorithm. 71 | Hyperopt chooses the next hyperparameters from both the previous choices and a tree-based optimization algorithm. 72 | Tree of Parzen estimators meets or exceeds grid search and random search performance for hyperparameters setting. 73 | The full list of LCE hyperparameters is available in its :ref:`API documentation `. 74 | 75 | Published Results 76 | ----------------- 77 | LCE has been initially designed for a specific application in [6]_, and then evaluated on the public UCI datasets [5]_ in [7]_. 78 | Results show that LCE obtains on average a better prediction performance than the state-of-the-art classifiers, including Random Forest and XGBoost. 79 | For a comparison between LCE, Random Forest and XGBoost on different public datasets, using the public implementations of the aforementioned algorithms, please refer to the article published in Towards Data Science `"LCE: The Most Powerful Machine Learning Method?" `_. 80 | 81 | 82 | References 83 | ---------- 84 | .. [1] Bergstra, J., R. Bardenet, Y. Bengio and B. Kégl. Algorithms for Hyper-Parameter Optimization. In Proceedings of the 24th International Conference on Neural Information Processing Systems, 2011 85 | .. [2] Breiman, L. Bagging Predictors. Machine Learning, 24(2):123–140, 1996 86 | .. [3] Breiman, L. Random Forests. Machine Learning, 45(1):5–32, 2001 87 | .. [4] Chen, T. and C. Guestrin. XGBoost: A Scalable Tree Boosting System. In Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 2016 88 | .. [5] Dua, D. and C. Graff. UCI Machine Learning Repository, 2017 89 | .. [6] Fauvel, K., V. Masson, E. Fromont, P. Faverdin and A. Termier. Towards Sustainable Dairy Management - A Machine Learning Enhanced Method for Estrus Detection. In Proceedings of the 25th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, 2019 90 | .. [7] Fauvel, K., E. Fromont, V. Masson, P. Faverdin and A. Termier. 
XEM: An Explainable-by-Design Ensemble Method for Multivariate Time Series Classification. Data Mining and Knowledge Discovery, 36(3):917–957, 2022 91 | .. [8] Grinsztajn, L., E. Oyallon and G. Varoquaux. Why Do Tree-Based Models still Outperform Deep Learning on Typical Tabular Data? In Proceedings of the 36th Conference on Neural Information Processing Systems Datasets and Benchmarks Track, 2022 92 | .. [9] Ke, G., Q. Meng, T. Finley, T. Wang, W. Chen, W. Ma, Q. Ye and T. Liu. LightGBM: A Highly Efficient Gradient Boosting Decision Tree. In Proceedings of the 31st International Conference on Neural Information Processing Systems, 2017 93 | .. [10] Prokhorenkova, L., G. Gusev, A. Vorobev, A. Dorogush and A. Gulin. CatBoost: Unbiased Boosting with Categorical Features. In Proceedings of the 32nd International Conference on Neural Information Processing Systems, 2018 94 | .. [11] Schapire, R. The Strength of Weak Learnability. Machine Learning, 5(2):197–227, 1990 95 | 96 | 97 | 98 | Installation 99 | ============ 100 | 101 | You can install LCE from `PyPI `_ with ``pip``:: 102 | 103 | pip install lcensemble 104 | 105 | Or ``conda``:: 106 | 107 | conda install -c conda-forge lcensemble 108 | 109 | 110 | Code Examples 111 | ============= 112 | 113 | The following examples illustrate the use of LCE on public datasets for a classification and a regression task. 114 | They also demonstrate the compatibility of LCE with scikit-learn pipelines and model selection tools through the use of ``cross_val_score``. 115 | An example of LCE on a dataset including missing values is also shown. 116 | 117 | Classification 118 | -------------- 119 | 120 | - **Example 1: LCE on Iris Dataset** 121 | 122 | .. code-block:: python 123 | 124 | from lce import LCEClassifier 125 | from sklearn.datasets import load_iris 126 | from sklearn.metrics import accuracy_score 127 | from sklearn.model_selection import train_test_split 128 | 129 | 130 | # Load data and generate a train/test split 131 | data = load_iris() 132 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0) 133 | 134 | # Train LCEClassifier with default parameters 135 | clf = LCEClassifier(n_jobs=-1, random_state=0) 136 | clf.fit(X_train, y_train) 137 | 138 | # Make prediction and compute accuracy score 139 | y_pred = clf.predict(X_test) 140 | accuracy = accuracy_score(y_test, y_pred) 141 | print("Accuracy: {:.1f}%".format(accuracy*100)) 142 | 143 | .. code-block:: 144 | 145 | Accuracy: 97.4% 146 | 147 | 148 | - **Example 2: LCE with scikit-learn cross validation score** 149 | This example demonstrates the compatibility of LCE with scikit-learn pipelines and model selection tools through the use of ``cross_val_score``. 150 | 151 | .. code-block:: python 152 | 153 | from lce import LCEClassifier 154 | from sklearn.datasets import load_iris 155 | from sklearn.model_selection import cross_val_score, train_test_split 156 | 157 | # Load data 158 | data = load_iris() 159 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0) 160 | 161 | # Set LCEClassifier with default parameters 162 | clf = LCEClassifier(n_jobs=-1, random_state=0) 163 | 164 | # Compute cross-validation scores 165 | cv_scores = cross_val_score(clf, X_train, y_train, cv=3) 166 | cv_scores = [round(elem*100, 1) for elem in cv_scores.tolist()] 167 | print("Cross-validation scores on train set: ", cv_scores) 168 | 169 | .. 
code-block:: 170 | 171 | Cross-validation scores on train set: [94.7, 100.0, 94.6] 172 | 173 | 174 | Regression 175 | ---------- 176 | 177 | - **Example 3: LCE on Diabetes Dataset** 178 | 179 | .. code-block:: python 180 | 181 | from lce import LCERegressor 182 | from sklearn.datasets import load_diabetes 183 | from sklearn.metrics import mean_squared_error 184 | from sklearn.model_selection import train_test_split 185 | 186 | 187 | # Load data and generate a train/test split 188 | data = load_diabetes() 189 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0) 190 | 191 | # Train LCERegressor with default parameters 192 | reg = LCERegressor(n_jobs=-1, random_state=0) 193 | reg.fit(X_train, y_train) 194 | 195 | # Make prediction 196 | y_pred = reg.predict(X_test) 197 | mse = mean_squared_error(y_test, y_pred) 198 | print("The mean squared error (MSE) on test set: {:.0f}".format(mse)) 199 | 200 | .. code-block:: 201 | 202 | The mean squared error (MSE) on test set: 3761 203 | 204 | 205 | - **Example 4: LCE with missing values** 206 | This example illustrates the robustness of LCE to missing values. The Diabetes train set is modified with 20% of missing values per variable. 207 | 208 | .. code-block:: python 209 | 210 | import numpy as np 211 | from lce import LCERegressor 212 | from sklearn.datasets import load_diabetes 213 | from sklearn.metrics import mean_squared_error 214 | from sklearn.model_selection import train_test_split 215 | 216 | 217 | # Load data and generate a train/test split 218 | data = load_diabetes() 219 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0) 220 | 221 | # Input 20% of missing values per variable in the train set 222 | np.random.seed(0) 223 | m = 0.2 224 | for j in range(0, X_train.shape[1]): 225 | sub = np.random.choice(X_train.shape[0], int(X_train.shape[0]*m)) 226 | X_train[sub, j] = np.nan 227 | 228 | # Train LCERegressor with default parameters 229 | reg = LCERegressor(n_jobs=-1, random_state=0) 230 | reg.fit(X_train, y_train) 231 | 232 | # Make prediction 233 | y_pred = reg.predict(X_test) 234 | mse = mean_squared_error(y_test, y_pred) 235 | print("The mean squared error (MSE) on test set: {:.0f}".format(mse)) 236 | 237 | .. code-block:: 238 | 239 | The mean squared error (MSE) on test set: 3895 240 | 241 | 242 | Python Source Files 243 | ------------------- 244 | 245 | 246 | .. raw:: html 247 | 248 |
249 | 250 | .. only:: html 251 | 252 | .. figure:: _images/logo_lce.svg 253 | :alt: LCEClassifier on Iris dataset 254 | 255 | :ref:`sphx_glr_auto_examples_lceclassifier_iris.py` 256 | 257 | .. raw:: html 258 | 259 |
260 | 261 | .. toctree:: 262 | :hidden: 263 | 264 | /auto_examples/lceclassifier_iris 265 | 266 | 267 | 268 | .. raw:: html 269 | 270 |
271 | 272 | .. only:: html 273 | 274 | .. figure:: _images/logo_lce.svg 275 | :alt: LCEClassifier on Iris dataset with scikit-learn cross validation score 276 | 277 | :ref:`sphx_glr_auto_examples_lceclassifier_iris_cv.py` 278 | 279 | .. raw:: html 280 | 281 |
282 | 283 | .. toctree:: 284 | :hidden: 285 | 286 | /auto_examples/lceclassifier_iris_cv 287 | 288 | 289 | 290 | .. raw:: html 291 | 292 |
293 | 294 | .. only:: html 295 | 296 | .. figure:: _images/logo_lce.svg 297 | :alt: LCERegressor on Diabetes dataset 298 | 299 | :ref:`sphx_glr_auto_examples_lceregressor_diabetes.py` 300 | 301 | .. raw:: html 302 | 303 |
304 | 305 | 306 | .. toctree:: 307 | :hidden: 308 | 309 | /auto_examples/lceregressor_diabetes 310 | 311 | 312 | .. raw:: html 313 | 314 |
315 | 316 | .. only:: html 317 | 318 | .. figure:: _images/logo_lce.svg 319 | :alt: LCERegressor on Diabetes dataset with missing values 320 | 321 | :ref:`sphx_glr_auto_examples_lceregressor_missing_diabetes.py` 322 | 323 | .. raw:: html 324 | 325 |
326 | 327 | 328 | .. toctree:: 329 | :hidden: 330 | 331 | /auto_examples/lceregressor_missing_diabetes 332 | 333 | 334 | 335 | .. raw:: html 336 | 337 |
338 | 339 | 340 | 341 | .. only :: html 342 | 343 | .. container:: sphx-glr-footer 344 | :class: sphx-glr-footer-gallery 345 | 346 | 347 | .. container:: sphx-glr-download sphx-glr-download-python 348 | 349 | :download:`Download all examples in Python source code: auto_examples_python.zip ` 350 | 351 | 352 | 353 | .. container:: sphx-glr-download sphx-glr-download-jupyter 354 | 355 | :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip ` 356 | 357 | 358 | .. only:: html 359 | 360 | .. rst-class:: sphx-glr-signature 361 | 362 | `Gallery generated by Sphinx-Gallery `_ 363 | -------------------------------------------------------------------------------- /lce/_lightgbm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials 3 | from sklearn.metrics import check_scoring 4 | import lightgbm as lgbm 5 | 6 | 7 | def lgbm_opt_classifier( 8 | X, 9 | y, 10 | n_iter=10, 11 | metric="accuracy", 12 | n_estimators=(10, 50, 100), 13 | max_depth=(3, 6, 9), 14 | num_leaves=(20, 50, 100, 500), 15 | learning_rate=(0.01, 0.1, 0.3, 0.5), 16 | boosting_type=("gbdt",), 17 | min_child_weight=(1, 5, 15, 100), 18 | subsample=(1.0,), 19 | subsample_for_bin=(200000,), 20 | colsample_bytree=(1.0,), 21 | reg_alpha=(0,), 22 | reg_lambda=(0.1, 1.0, 5.0), 23 | n_jobs=None, 24 | random_state=None, 25 | ): 26 | """ 27 | Get LightGBM model with the best hyperparameters configuration. 28 | 29 | Parameters 30 | ---------- 31 | X : array-like of shape (n_samples, n_features) 32 | The training input samples. 33 | 34 | y : array-like of shape (n_samples,) 35 | The class labels. 36 | 37 | n_iter: int, default=10 38 | Number of iterations to set the hyperparameters of the base classifier (LightGBM) 39 | in Hyperopt. 40 | 41 | metric: string, default="accuracy" 42 | The score of the base classifier (LightGBM) optimized by Hyperopt. Supported metrics 43 | are the ones from `scikit-learn `_. 44 | 45 | n_estimators : tuple, default=(10, 50, 100) 46 | The number of LightGBM estimators. The number of estimators of 47 | LightGBM corresponds to the number of boosting rounds. The tuple provided is 48 | the search space used for the hyperparameter optimization (Hyperopt). 49 | 50 | max_depth : tuple, default=(3, 6, 9) 51 | Maximum tree depth for LightGBM base learners. The tuple provided is the search 52 | space used for the hyperparameter optimization (Hyperopt). 53 | 54 | num_leaves : tuple, default=(20, 50, 100, 500) 55 | Maximum tree leaves. The tuple provided is the search 56 | space used for the hyperparameter optimization (Hyperopt). 57 | 58 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5) 59 | `learning_rate` of LightGBM. The tuple provided is the search space used for the 60 | hyperparameter optimization (Hyperopt). 61 | 62 | boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",) 63 | The type of boosting type to use: "dart" dropouts meet Multiple Additive 64 | Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest. 65 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt). 66 | 67 | min_child_weight : tuple, default=(1, 5, 15, 100) 68 | `min_child_weight` of LightGBM. `min_child_weight` defines the 69 | minimum sum of instance weight (hessian) needed in a child. 
If the tree 70 | partition step results in a leaf node with the sum of instance weight 71 | less than `min_child_weight`, then the building process will give up further 72 | partitioning. The larger `min_child_weight` is, the more conservative LightGBM 73 | algorithm will be. The tuple provided is the search space used for the hyperparameter 74 | optimization (Hyperopt). 75 | 76 | subsample : tuple, default=(1.0,) 77 | LightGBM subsample ratio of the training instances. Setting it to 0.5 means 78 | that LightGBM would randomly sample half of the training data prior to 79 | growing trees, and this will prevent overfitting. Subsampling will occur 80 | once in every boosting iteration. The tuple provided is the search space used for 81 | the hyperparameter optimization (Hyperopt). 82 | 83 | subsample_for_bin : tuple, default=(200000,) 84 | Number of samples for constructing bins. The tuple provided is the 85 | search space used for the hyperparameter optimization (Hyperopt). 86 | 87 | colsample_bytree : tuple, default=(1.0,) 88 | LightGBM subsample ratio of columns when constructing each tree. 89 | Subsampling occurs once for every tree constructed. The tuple provided is the search 90 | space used for the hyperparameter optimization (Hyperopt). 91 | 92 | reg_alpha : tuple, default=(0,) 93 | `reg_alpha` of LightGBM. `reg_alpha` corresponds to the L1 regularization 94 | term on the weights. Increasing this value will make LightGBM model more 95 | conservative. The tuple provided is the search space used for the hyperparameter 96 | optimization (Hyperopt). 97 | 98 | reg_lambda : tuple, default=(0.1, 1.0, 5.0) 99 | `reg_lambda` of LightGBM. `reg_lambda` corresponds to the L2 regularization 100 | term on the weights. Increasing this value will make LightGBM model more 101 | conservative. The tuple provided is the search space used for the hyperparameter 102 | optimization (Hyperopt). 103 | 104 | n_jobs : int, default=None 105 | The number of jobs to run in parallel. 106 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors. 107 | 108 | random_state : int, RandomState instance or None, default=None 109 | Controls the randomness of the base learner LightGBM and 110 | the Hyperopt algorithm. 111 | 112 | Returns 113 | ------- 114 | model: object 115 | LightGBM model with the best configuration and fitted on the input data. 
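    Examples
    --------
    Illustrative call on hypothetical toy data, narrowing part of the
    default search space; like the other helpers in this module, this
    function is used internally by LCE, so the data and settings below
    are assumptions for demonstration only::

        import numpy as np
        X = np.random.rand(30, 5)          # 30 samples, 5 features
        y = np.random.randint(0, 2, 30)    # binary class labels
        model = lgbm_opt_classifier(
            X,
            y,
            n_iter=5,
            num_leaves=(20, 50),
            learning_rate=(0.1, 0.3),
            random_state=0,
        )
        print(model.predict(X[:3]))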
116 | """ 117 | # Parameters 118 | classes, y = np.unique(y, return_inverse=True) 119 | n_classes = classes.size 120 | 121 | if n_classes == 2: 122 | objective = "binary" 123 | num_class = 1 124 | else: 125 | objective = "multiclass" 126 | num_class = n_classes 127 | 128 | space = { 129 | "n_estimators": hp.choice("n_estimators", n_estimators), 130 | "max_depth": hp.choice("max_depth", max_depth), 131 | "num_leaves": hp.choice("num_leaves", num_leaves), 132 | "learning_rate": hp.choice("learning_rate", learning_rate), 133 | "boosting_type": hp.choice("boosting_type", boosting_type), 134 | "min_child_weight": hp.choice("min_child_weight", min_child_weight), 135 | "subsample": hp.choice("subsample", subsample), 136 | "subsample_for_bin": hp.choice("subsample_for_bin", subsample_for_bin), 137 | "colsample_bytree": hp.choice("colsample_bytree", colsample_bytree), 138 | "reg_alpha": hp.choice("reg_alpha", reg_alpha), 139 | "reg_lambda": hp.choice("reg_lambda", reg_lambda), 140 | "objective": objective, 141 | "num_class": num_class, 142 | "n_jobs": n_jobs, 143 | "random_state": random_state, 144 | } 145 | 146 | # Get best configuration 147 | def p_model(params): 148 | clf = lgbm.LGBMClassifier(**params, verbose=-1) 149 | clf.fit(X, y) 150 | scorer = check_scoring(clf, scoring=metric) 151 | return scorer(clf, X, y) 152 | 153 | global best 154 | best = -np.inf 155 | 156 | def f(params): 157 | global best 158 | perf = p_model(params) 159 | if perf > best: 160 | best = perf 161 | return {"loss": -best, "status": STATUS_OK} 162 | 163 | rstate = np.random.default_rng(random_state) 164 | best_config = fmin( 165 | fn=f, 166 | space=space, 167 | algo=tpe.suggest, 168 | max_evals=n_iter, 169 | trials=Trials(), 170 | rstate=rstate, 171 | verbose=0, 172 | ) 173 | 174 | # Fit best model 175 | final_params = { 176 | "n_estimators": n_estimators[best_config["n_estimators"]], 177 | "max_depth": max_depth[best_config["max_depth"]], 178 | "num_leaves": num_leaves[best_config["num_leaves"]], 179 | "learning_rate": learning_rate[best_config["learning_rate"]], 180 | "boosting_type": boosting_type[best_config["boosting_type"]], 181 | "min_child_weight": min_child_weight[best_config["min_child_weight"]], 182 | "subsample": subsample[best_config["subsample"]], 183 | "subsample_for_bin": subsample_for_bin[best_config["subsample_for_bin"]], 184 | "colsample_bytree": colsample_bytree[best_config["colsample_bytree"]], 185 | "reg_alpha": reg_alpha[best_config["reg_alpha"]], 186 | "reg_lambda": reg_lambda[best_config["reg_lambda"]], 187 | "objective": objective, 188 | "num_class": num_class, 189 | "n_jobs": n_jobs, 190 | "random_state": random_state, 191 | } 192 | clf = lgbm.LGBMClassifier(**final_params, verbose=-1) 193 | return clf.fit(X, y) 194 | 195 | 196 | def lgbm_opt_regressor( 197 | X, 198 | y, 199 | n_iter=10, 200 | metric="neg_mean_squared_error", 201 | n_estimators=(10, 50, 100), 202 | max_depth=(3, 6, 9), 203 | num_leaves=(20, 50, 100, 500), 204 | learning_rate=(0.01, 0.1, 0.3, 0.5), 205 | boosting_type=("gbdt",), 206 | min_child_weight=(1, 5, 15, 100), 207 | subsample=(1.0,), 208 | subsample_for_bin=(200000,), 209 | colsample_bytree=(1.0,), 210 | reg_alpha=(0,), 211 | reg_lambda=(0.1, 1.0, 5.0), 212 | n_jobs=None, 213 | random_state=None, 214 | ): 215 | """ 216 | Get LightGBM model with the best hyperparameters configuration. 217 | 218 | Parameters 219 | ---------- 220 | X : array-like of shape (n_samples, n_features) 221 | The training input samples. 
222 | 223 | y : array-like of shape (n_samples,) 224 | The target values (real numbers). 225 | 226 | n_iter: int, default=10 227 | Number of iterations to set the hyperparameters of the base regressor (LightGBM) 228 | in Hyperopt. 229 | 230 | metric: string, default="neg_mean_squared_error" 231 | The score of the base regressor (LightGBM) optimized by Hyperopt. Supported metrics 232 | are the ones from `scikit-learn `_. 233 | 234 | n_estimators : tuple, default=(10, 50, 100) 235 | The number of LightGBM estimators. The number of estimators of 236 | LightGBM corresponds to the number of boosting rounds. The tuple provided is 237 | the search space used for the hyperparameter optimization (Hyperopt). 238 | 239 | max_depth : tuple, default=(3, 6, 9) 240 | Maximum tree depth for LightGBM base learners. The tuple provided is the search 241 | space used for the hyperparameter optimization (Hyperopt). 242 | 243 | num_leaves : tuple, default=(20, 50, 100, 500) 244 | Maximum tree leaves. The tuple provided is the search 245 | space used for the hyperparameter optimization (Hyperopt). 246 | 247 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5) 248 | `learning_rate` of LightGBM. The tuple provided is the search space used for the 249 | hyperparameter optimization (Hyperopt). 250 | 251 | boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",) 252 | The type of boosting type to use: "dart" dropouts meet Multiple Additive 253 | Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest. 254 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt). 255 | 256 | min_child_weight : tuple, default=(1, 5, 15, 100) 257 | `min_child_weight` of LightGBM. `min_child_weight` defines the 258 | minimum sum of instance weight (hessian) needed in a child. If the tree 259 | partition step results in a leaf node with the sum of instance weight 260 | less than `min_child_weight`, then the building process will give up further 261 | partitioning. The larger `min_child_weight` is, the more conservative LightGBM 262 | algorithm will be. The tuple provided is the search space used for the hyperparameter 263 | optimization (Hyperopt). 264 | 265 | subsample : tuple, default=(1.0,) 266 | LightGBM subsample ratio of the training instances. Setting it to 0.5 means 267 | that LightGBM would randomly sample half of the training data prior to 268 | growing trees, and this will prevent overfitting. Subsampling will occur 269 | once in every boosting iteration. The tuple provided is the search space used for 270 | the hyperparameter optimization (Hyperopt). 271 | 272 | subsample_for_bin : tuple, default=(200000,) 273 | Number of samples for constructing bins. The tuple provided is the 274 | search space used for the hyperparameter optimization (Hyperopt). 275 | 276 | colsample_bytree : tuple, default=(1.0,) 277 | LightGBM subsample ratio of columns when constructing each tree. 278 | Subsampling occurs once for every tree constructed. The tuple provided is the search 279 | space used for the hyperparameter optimization (Hyperopt). 280 | 281 | reg_alpha : tuple, default=(0,) 282 | `reg_alpha` of LightGBM. `reg_alpha` corresponds to the L1 regularization 283 | term on the weights. Increasing this value will make LightGBM model more 284 | conservative. The tuple provided is the search space used for the hyperparameter 285 | optimization (Hyperopt). 286 | 287 | reg_lambda : tuple, default=(0.1, 1.0, 5.0) 288 | `reg_lambda` of LightGBM. 
`reg_lambda` corresponds to the L2 regularization 289 | term on the weights. Increasing this value will make LightGBM model more 290 | conservative. The tuple provided is the search space used for the hyperparameter 291 | optimization (Hyperopt). 292 | 293 | n_jobs : int, default=None 294 | The number of jobs to run in parallel. 295 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors. 296 | 297 | random_state : int, RandomState instance or None, default=None 298 | Controls the randomness of the base learner LightGBM and 299 | the Hyperopt algorithm. 300 | 301 | Returns 302 | ------- 303 | model: object 304 | LightGBM model with the best configuration and fitted on the input data. 305 | """ 306 | space = { 307 | "n_estimators": hp.choice("n_estimators", n_estimators), 308 | "max_depth": hp.choice("max_depth", max_depth), 309 | "num_leaves": hp.choice("num_leaves", num_leaves), 310 | "learning_rate": hp.choice("learning_rate", learning_rate), 311 | "boosting_type": hp.choice("boosting_type", boosting_type), 312 | "min_child_weight": hp.choice("min_child_weight", min_child_weight), 313 | "subsample": hp.choice("subsample", subsample), 314 | "subsample_for_bin": hp.choice("subsample_for_bin", subsample_for_bin), 315 | "colsample_bytree": hp.choice("colsample_bytree", colsample_bytree), 316 | "reg_alpha": hp.choice("reg_alpha", reg_alpha), 317 | "reg_lambda": hp.choice("reg_lambda", reg_lambda), 318 | "objective": "regression", 319 | "n_jobs": n_jobs, 320 | "random_state": random_state, 321 | } 322 | 323 | # Get best configuration 324 | def p_model(params): 325 | reg = lgbm.LGBMRegressor(**params, verbose=-1) 326 | reg.fit(X, y) 327 | scorer = check_scoring(reg, scoring=metric) 328 | return scorer(reg, X, y) 329 | 330 | global best 331 | best = -np.inf 332 | 333 | def f(params): 334 | global best 335 | perf = p_model(params) 336 | if perf > best: 337 | best = perf 338 | return {"loss": -best, "status": STATUS_OK} 339 | 340 | rstate = np.random.default_rng(random_state) 341 | best_config = fmin( 342 | fn=f, 343 | space=space, 344 | algo=tpe.suggest, 345 | max_evals=n_iter, 346 | trials=Trials(), 347 | rstate=rstate, 348 | verbose=0, 349 | ) 350 | 351 | # Fit best model 352 | final_params = { 353 | "n_estimators": n_estimators[best_config["n_estimators"]], 354 | "max_depth": max_depth[best_config["max_depth"]], 355 | "num_leaves": num_leaves[best_config["num_leaves"]], 356 | "learning_rate": learning_rate[best_config["learning_rate"]], 357 | "boosting_type": boosting_type[best_config["boosting_type"]], 358 | "min_child_weight": min_child_weight[best_config["min_child_weight"]], 359 | "subsample": subsample[best_config["subsample"]], 360 | "subsample_for_bin": subsample_for_bin[best_config["subsample_for_bin"]], 361 | "colsample_bytree": colsample_bytree[best_config["colsample_bytree"]], 362 | "reg_alpha": reg_alpha[best_config["reg_alpha"]], 363 | "reg_lambda": reg_lambda[best_config["reg_lambda"]], 364 | "objective": "regression", 365 | "n_jobs": n_jobs, 366 | "random_state": random_state, 367 | } 368 | reg = lgbm.LGBMRegressor(**final_params, verbose=-1) 369 | return reg.fit(X, y) 370 | -------------------------------------------------------------------------------- /lce/_xgboost.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials 3 | from sklearn.metrics import check_scoring 4 | from sklearn.preprocessing import OneHotEncoder 5 | import xgboost as xgb 6 | 7 | 8 | 
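# Editor's sketch (not part of the original module): the optimizers in
# lce/_lightgbm.py and in this file follow the same Hyperopt pattern -- each
# candidate tuple is wrapped in hp.choice, the objective maximizes a
# scikit-learn scorer, and fmin returns *indices* into those tuples, which is
# why the best configuration is mapped back to concrete values before the
# final fit. A minimal, self-contained illustration (the toy objective below
# is an assumption made only to keep the sketch runnable):
#
#     import numpy as np
#     from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
#
#     depths = (3, 6, 9)
#     space = {"max_depth": hp.choice("max_depth", depths)}
#
#     def objective(params):
#         # the objective receives the chosen *value* (3, 6 or 9)
#         return {"loss": -params["max_depth"], "status": STATUS_OK}
#
#     best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10,
#                 trials=Trials(), rstate=np.random.default_rng(0), verbose=0)
#     best_depth = depths[best["max_depth"]]  # fmin returns an index, not a value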
def xgb_opt_classifier( 9 | X, 10 | y, 11 | n_iter=10, 12 | metric="accuracy", 13 | n_estimators=(10, 50, 100), 14 | max_depth=(3, 6, 9), 15 | learning_rate=(0.01, 0.1, 0.3, 0.5), 16 | booster=("gbtree",), 17 | gamma=(0, 1, 10), 18 | min_child_weight=(1, 5, 15, 100), 19 | subsample=(1.0,), 20 | colsample_bytree=(1.0,), 21 | colsample_bylevel=(1.0,), 22 | colsample_bynode=(1.0,), 23 | reg_alpha=(0,), 24 | reg_lambda=(0.1, 1.0, 5.0), 25 | n_jobs=None, 26 | random_state=None, 27 | ): 28 | """ 29 | Get XGBoost model with the best hyperparameters configuration. 30 | 31 | Parameters 32 | ---------- 33 | X : array-like of shape (n_samples, n_features) 34 | The training input samples. 35 | 36 | y : array-like of shape (n_samples,) 37 | The class labels. 38 | 39 | n_iter: int, default=10 40 | Number of iterations to set the hyperparameters of the base classifier (XGBoost) 41 | in Hyperopt. 42 | 43 | metric: string, default="accuracy" 44 | The score of the base classifier (XGBoost) optimized by Hyperopt. Supported metrics 45 | are the ones from `scikit-learn `_. 46 | 47 | n_estimators : tuple, default=(10, 50, 100) 48 | The number of XGBoost estimators. The number of estimators of 49 | XGBoost corresponds to the number of boosting rounds. The tuple provided is 50 | the search space used for the hyperparameter optimization (Hyperopt). 51 | 52 | max_depth : tuple, default=(3, 6, 9) 53 | Maximum tree depth for XGBoost base learners. The tuple provided is the search 54 | space used for the hyperparameter optimization (Hyperopt). 55 | 56 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5) 57 | `learning_rate` of XGBoost. The learning rate corresponds to the 58 | step size shrinkage used in update to prevent overfitting. After each 59 | boosting step, the learning rate shrinks the feature weights to make the boosting 60 | process more conservative. The tuple provided is the search space used for the 61 | hyperparameter optimization (Hyperopt). 62 | 63 | booster : ("dart", "gblinear", "gbtree"), default=("gbtree",) 64 | The type of booster to use. "gbtree" and "dart" use tree based models 65 | while "gblinear" uses linear functions. The tuple provided is the search space used 66 | for the hyperparameter optimization (Hyperopt). 67 | 68 | gamma : tuple, default=(0, 1, 10) 69 | `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction 70 | required to make a further partition on a leaf node of the tree. 71 | The larger `gamma` is, the more conservative XGBoost algorithm will be. 72 | The tuple provided is the search space used for the hyperparameter optimization 73 | (Hyperopt). 74 | 75 | min_child_weight : tuple, default=(1, 5, 15, 100) 76 | `min_child_weight` of XGBoost. `min_child_weight` defines the 77 | minimum sum of instance weight (hessian) needed in a child. If the tree 78 | partition step results in a leaf node with the sum of instance weight 79 | less than `min_child_weight`, then the building process will give up further 80 | partitioning. The larger `min_child_weight` is, the more conservative XGBoost 81 | algorithm will be. The tuple provided is the search space used for the hyperparameter 82 | optimization (Hyperopt). 83 | 84 | subsample : tuple, default=(1.0,) 85 | XGBoost subsample ratio of the training instances. Setting it to 0.5 means 86 | that XGBoost would randomly sample half of the training data prior to 87 | growing trees, and this will prevent overfitting. Subsampling will occur 88 | once in every boosting iteration. 
The tuple provided is the search space used for 89 | the hyperparameter optimization (Hyperopt). 90 | 91 | colsample_bytree : tuple, default=(1.0,) 92 | XGBoost subsample ratio of columns when constructing each tree. 93 | Subsampling occurs once for every tree constructed. The tuple provided is the search 94 | space used for the hyperparameter optimization (Hyperopt). 95 | 96 | colsample_bylevel : tuple, default=(1.0,) 97 | XGBoost subsample ratio of columns for each level. Subsampling occurs 98 | once for every new depth level reached in a tree. Columns are subsampled 99 | from the set of columns chosen for the current tree. The tuple provided is the search 100 | space used for the hyperparameter optimization (Hyperopt). 101 | 102 | colsample_bynode : tuple, default=(1.0,) 103 | XGBoost subsample ratio of columns for each node (split). Subsampling 104 | occurs once every time a new split is evaluated. Columns are subsampled 105 | from the set of columns chosen for the current level. The tuple provided is the search 106 | space used for the hyperparameter optimization (Hyperopt). 107 | 108 | reg_alpha : tuple, default=(0,) 109 | `reg_alpha` of XGBoost. `reg_alpha` corresponds to the L1 regularization 110 | term on the weights. Increasing this value will make XGBoost model more 111 | conservative. The tuple provided is the search space used for the hyperparameter 112 | optimization (Hyperopt). 113 | 114 | reg_lambda : tuple, default=(0.1, 1.0, 5.0) 115 | `reg_lambda` of XGBoost. `reg_lambda` corresponds to the L2 regularization 116 | term on the weights. Increasing this value will make XGBoost model more 117 | conservative. The tuple provided is the search space used for the hyperparameter 118 | optimization (Hyperopt). 119 | 120 | n_jobs : int, default=None 121 | The number of jobs to run in parallel. 122 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors. 123 | 124 | random_state : int, RandomState instance or None, default=None 125 | Controls the randomness of the base learner XGBoost and 126 | the Hyperopt algorithm. 127 | 128 | Returns 129 | ------- 130 | model: object 131 | XGBoost model with the best configuration and fitted on the input data. 
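    Examples
    --------
    A minimal usage sketch (illustrative only; the Iris data and the small
    ``n_iter`` below are assumptions made for the example)::

        from sklearn.datasets import load_iris
        from lce._xgboost import xgb_opt_classifier

        X, y = load_iris(return_X_y=True)
        clf = xgb_opt_classifier(X, y, n_iter=5, random_state=0)
        print(clf.predict(X[:5]))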
132 | """ 133 | # Parameters 134 | classes, y = np.unique(y, return_inverse=True) 135 | n_classes = classes.size 136 | 137 | space = { 138 | "n_estimators": hp.choice("n_estimators", n_estimators), 139 | "max_depth": hp.choice("max_depth", max_depth), 140 | "learning_rate": hp.choice("learning_rate", learning_rate), 141 | "booster": hp.choice("booster", booster), 142 | "gamma": hp.choice("gamma", gamma), 143 | "min_child_weight": hp.choice("min_child_weight", min_child_weight), 144 | "subsample": hp.choice("subsample", subsample), 145 | "colsample_bytree": hp.choice("colsample_bytree", colsample_bytree), 146 | "colsample_bylevel": hp.choice("colsample_bylevel", colsample_bylevel), 147 | "colsample_bynode": hp.choice("colsample_bynode", colsample_bynode), 148 | "reg_alpha": hp.choice("reg_alpha", reg_alpha), 149 | "reg_lambda": hp.choice("reg_lambda", reg_lambda), 150 | "objective": "multi:softprob", 151 | "num_class": n_classes, 152 | "n_jobs": n_jobs, 153 | "random_state": random_state, 154 | } 155 | 156 | # Get best configuration 157 | def p_model(params): 158 | clf = xgb.XGBClassifier(**params, use_label_encoder=False, verbosity=0) 159 | clf.fit(X, y) 160 | if n_classes == 2: 161 | onehot_encoder = OneHotEncoder(sparse=False) 162 | y_score = onehot_encoder.fit_transform(y.reshape(len(y), 1)) 163 | else: 164 | y_score = y 165 | scorer = check_scoring(clf, scoring=metric) 166 | return scorer(clf, X, y_score) 167 | 168 | global best 169 | best = -np.inf 170 | 171 | def f(params): 172 | global best 173 | perf = p_model(params) 174 | if perf > best: 175 | best = perf 176 | return {"loss": -best, "status": STATUS_OK} 177 | 178 | rstate = np.random.default_rng(random_state) 179 | best_config = fmin( 180 | fn=f, 181 | space=space, 182 | algo=tpe.suggest, 183 | max_evals=n_iter, 184 | trials=Trials(), 185 | rstate=rstate, 186 | verbose=0, 187 | ) 188 | 189 | # Fit best model 190 | final_params = { 191 | "n_estimators": n_estimators[best_config["n_estimators"]], 192 | "max_depth": max_depth[best_config["max_depth"]], 193 | "learning_rate": learning_rate[best_config["learning_rate"]], 194 | "booster": booster[best_config["booster"]], 195 | "gamma": gamma[best_config["gamma"]], 196 | "min_child_weight": min_child_weight[best_config["min_child_weight"]], 197 | "subsample": subsample[best_config["subsample"]], 198 | "colsample_bytree": colsample_bytree[best_config["colsample_bytree"]], 199 | "colsample_bylevel": colsample_bylevel[best_config["colsample_bylevel"]], 200 | "colsample_bynode": colsample_bynode[best_config["colsample_bynode"]], 201 | "reg_alpha": reg_alpha[best_config["reg_alpha"]], 202 | "reg_lambda": reg_lambda[best_config["reg_lambda"]], 203 | "objective": "multi:softprob", 204 | "num_class": n_classes, 205 | "n_jobs": n_jobs, 206 | "random_state": random_state, 207 | } 208 | clf = xgb.XGBClassifier(**final_params, use_label_encoder=False, verbosity=0) 209 | return clf.fit(X, y) 210 | 211 | 212 | def xgb_opt_regressor( 213 | X, 214 | y, 215 | n_iter=10, 216 | metric="neg_mean_squared_error", 217 | n_estimators=(10, 50, 100), 218 | max_depth=(3, 6, 9), 219 | learning_rate=(0.01, 0.1, 0.3, 0.5), 220 | booster=("gbtree",), 221 | gamma=(0, 1, 10), 222 | min_child_weight=(1, 5, 15, 100), 223 | subsample=(1.0,), 224 | colsample_bytree=(1.0,), 225 | colsample_bylevel=(1.0,), 226 | colsample_bynode=(1.0,), 227 | reg_alpha=(0,), 228 | reg_lambda=(0.1, 1.0, 5.0), 229 | n_jobs=None, 230 | random_state=None, 231 | ): 232 | """ 233 | Get XGBoost model with the best hyperparameters configuration. 
234 | 235 | Parameters 236 | ---------- 237 | X : array-like of shape (n_samples, n_features) 238 | The training input samples. 239 | 240 | y : array-like of shape (n_samples,) 241 | The target values (real numbers). 242 | 243 | n_iter: int, default=10 244 | Number of iterations to set the hyperparameters of the base regressor (XGBoost) 245 | in Hyperopt. 246 | 247 | metric: string, default="neg_mean_squared_error" 248 | The score of the base regressor (XGBoost) optimized by Hyperopt. Supported metrics 249 | are the ones from `scikit-learn `_. 250 | 251 | n_estimators : tuple, default=(10, 50, 100) 252 | The number of XGBoost estimators. The number of estimators of 253 | XGBoost corresponds to the number of boosting rounds. The tuple provided is 254 | the search space used for the hyperparameter optimization (Hyperopt). 255 | 256 | max_depth : tuple, default=(3, 6, 9) 257 | Maximum tree depth for XGBoost base learners. The tuple provided is the search 258 | space used for the hyperparameter optimization (Hyperopt). 259 | 260 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5) 261 | `learning_rate` of XGBoost. The learning rate corresponds to the 262 | step size shrinkage used in update to prevent overfitting. After each 263 | boosting step, the learning rate shrinks the feature weights to make the boosting 264 | process more conservative. The tuple provided is the search space used for the 265 | hyperparameter optimization (Hyperopt). 266 | 267 | booster : ("dart", "gblinear", "gbtree"), default=("gbtree",) 268 | The type of booster to use. "gbtree" and "dart" use tree based models 269 | while "gblinear" uses linear functions. The tuple provided is the search space used 270 | for the hyperparameter optimization (Hyperopt). 271 | 272 | gamma : tuple, default=(0, 1, 10) 273 | `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction 274 | required to make a further partition on a leaf node of the tree. 275 | The larger `gamma` is, the more conservative XGBoost algorithm will be. 276 | The tuple provided is the search space used for the hyperparameter optimization 277 | (Hyperopt). 278 | 279 | min_child_weight : tuple, default=(1, 5, 15, 100) 280 | `min_child_weight` of XGBoost. `min_child_weight` defines the 281 | minimum sum of instance weight (hessian) needed in a child. If the tree 282 | partition step results in a leaf node with the sum of instance weight 283 | less than `min_child_weight`, then the building process will give up further 284 | partitioning. The larger `min_child_weight` is, the more conservative XGBoost 285 | algorithm will be. The tuple provided is the search space used for the hyperparameter 286 | optimization (Hyperopt). 287 | 288 | subsample : tuple, default=(1.0,) 289 | XGBoost subsample ratio of the training instances. Setting it to 0.5 means 290 | that XGBoost would randomly sample half of the training data prior to 291 | growing trees, and this will prevent overfitting. Subsampling will occur 292 | once in every boosting iteration. The tuple provided is the search space used for 293 | the hyperparameter optimization (Hyperopt). 294 | 295 | colsample_bytree : tuple, default=(1.0,) 296 | XGBoost subsample ratio of columns when constructing each tree. 297 | Subsampling occurs once for every tree constructed. The tuple provided is the search 298 | space used for the hyperparameter optimization (Hyperopt). 299 | 300 | colsample_bylevel : tuple, default=(1.0,) 301 | XGBoost subsample ratio of columns for each level. 
Subsampling occurs 302 | once for every new depth level reached in a tree. Columns are subsampled 303 | from the set of columns chosen for the current tree. The tuple provided is the search 304 | space used for the hyperparameter optimization (Hyperopt). 305 | 306 | colsample_bynode : tuple, default=(1.0,) 307 | XGBoost subsample ratio of columns for each node (split). Subsampling 308 | occurs once every time a new split is evaluated. Columns are subsampled 309 | from the set of columns chosen for the current level. The tuple provided is the search 310 | space used for the hyperparameter optimization (Hyperopt). 311 | 312 | reg_alpha : tuple, default=(0,) 313 | `reg_alpha` of XGBoost. `reg_alpha` corresponds to the L1 regularization 314 | term on the weights. Increasing this value will make XGBoost model more 315 | conservative. The tuple provided is the search space used for the hyperparameter 316 | optimization (Hyperopt). 317 | 318 | reg_lambda : tuple, default=(0.1, 1.0, 5.0) 319 | `reg_lambda` of XGBoost. `reg_lambda` corresponds to the L2 regularization 320 | term on the weights. Increasing this value will make XGBoost model more 321 | conservative. The tuple provided is the search space used for the hyperparameter 322 | optimization (Hyperopt). 323 | 324 | n_jobs : int, default=None 325 | The number of jobs to run in parallel. 326 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors. 327 | 328 | random_state : int, RandomState instance or None, default=None 329 | Controls the randomness of the base learner XGBoost and 330 | the Hyperopt algorithm. 331 | 332 | Returns 333 | ------- 334 | model: object 335 | XGBoost model with the best configuration and fitted on the input data. 336 | """ 337 | space = { 338 | "n_estimators": hp.choice("n_estimators", n_estimators), 339 | "max_depth": hp.choice("max_depth", max_depth), 340 | "learning_rate": hp.choice("learning_rate", learning_rate), 341 | "booster": hp.choice("booster", booster), 342 | "gamma": hp.choice("gamma", gamma), 343 | "min_child_weight": hp.choice("min_child_weight", min_child_weight), 344 | "subsample": hp.choice("subsample", subsample), 345 | "colsample_bytree": hp.choice("colsample_bytree", colsample_bytree), 346 | "colsample_bylevel": hp.choice("colsample_bylevel", colsample_bylevel), 347 | "colsample_bynode": hp.choice("colsample_bynode", colsample_bynode), 348 | "reg_alpha": hp.choice("reg_alpha", reg_alpha), 349 | "reg_lambda": hp.choice("reg_lambda", reg_lambda), 350 | "objective": "reg:squarederror", 351 | "n_jobs": n_jobs, 352 | "random_state": random_state, 353 | } 354 | 355 | # Get best configuration 356 | def p_model(params): 357 | reg = xgb.XGBRegressor(**params, verbosity=0) 358 | reg.fit(X, y) 359 | scorer = check_scoring(reg, scoring=metric) 360 | return scorer(reg, X, y) 361 | 362 | global best 363 | best = -np.inf 364 | 365 | def f(params): 366 | global best 367 | perf = p_model(params) 368 | if perf > best: 369 | best = perf 370 | return {"loss": -best, "status": STATUS_OK} 371 | 372 | rstate = np.random.default_rng(random_state) 373 | best_config = fmin( 374 | fn=f, 375 | space=space, 376 | algo=tpe.suggest, 377 | max_evals=n_iter, 378 | trials=Trials(), 379 | rstate=rstate, 380 | verbose=0, 381 | ) 382 | 383 | # Fit best model 384 | final_params = { 385 | "n_estimators": n_estimators[best_config["n_estimators"]], 386 | "max_depth": max_depth[best_config["max_depth"]], 387 | "learning_rate": learning_rate[best_config["learning_rate"]], 388 | "booster": booster[best_config["booster"]], 389 
| "gamma": gamma[best_config["gamma"]], 390 | "min_child_weight": min_child_weight[best_config["min_child_weight"]], 391 | "subsample": subsample[best_config["subsample"]], 392 | "colsample_bytree": colsample_bytree[best_config["colsample_bytree"]], 393 | "colsample_bylevel": colsample_bylevel[best_config["colsample_bylevel"]], 394 | "colsample_bynode": colsample_bynode[best_config["colsample_bynode"]], 395 | "reg_alpha": reg_alpha[best_config["reg_alpha"]], 396 | "reg_lambda": reg_lambda[best_config["reg_lambda"]], 397 | "objective": "reg:squarederror", 398 | "n_jobs": n_jobs, 399 | "random_state": random_state, 400 | } 401 | reg = xgb.XGBRegressor(**final_params, verbosity=0) 402 | return reg.fit(X, y) 403 | -------------------------------------------------------------------------------- /lce/_lce.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numbers 3 | import numpy as np 4 | from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin 5 | from sklearn.ensemble import BaggingClassifier, BaggingRegressor 6 | from sklearn.preprocessing import LabelEncoder 7 | from sklearn.utils.multiclass import check_classification_targets 8 | from sklearn.utils.validation import check_X_y, check_array, check_is_fitted 9 | 10 | from ._lcetree import LCETreeClassifier, LCETreeRegressor 11 | 12 | 13 | class LCEClassifier(ClassifierMixin, BaseEstimator): 14 | """ 15 | A **Local Cascade Ensemble (LCE) classifier**. LCEClassifier is **compatible with scikit-learn**; 16 | it passes the `check_estimator `_. 17 | Therefore, it can interact with scikit-learn pipelines and model selection tools. 18 | 19 | 20 | Parameters 21 | ---------- 22 | n_estimators : int, default=10 23 | The number of trees in the ensemble. 24 | 25 | bootstrap : bool, default=True 26 | Whether bootstrap samples are used when building trees. If False, the 27 | whole dataset is used to build each tree. 28 | 29 | criterion : {"gini", "entropy"}, default="gini" 30 | The function to measure the quality of a split. Supported criteria are 31 | "gini" for the Gini impurity and "entropy" for the information gain. 32 | 33 | splitter : {"best", "random"}, default="best" 34 | The strategy used to choose the split at each node. Supported strategies 35 | are "best" to choose the best split and "random" to choose the best random 36 | split. 37 | 38 | max_depth : int, default=2 39 | The maximum depth of a tree. 40 | 41 | max_features : int, float or {"auto", "sqrt", "log"}, default=None 42 | The number of features to consider when looking for the best split: 43 | 44 | - If int, then consider `max_features` features at each split. 45 | - If float, then `max_features` is a fraction and 46 | `round(max_features * n_features)` features are considered at each 47 | split. 48 | - If "auto", then `max_features=sqrt(n_features)`. 49 | - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto"). 50 | - If "log2", then `max_features=log2(n_features)`. 51 | - If None, then `max_features=n_features`. 52 | 53 | Note: the search for a split does not stop until at least one 54 | valid partition of the node samples is found, even if it requires to 55 | effectively inspect more than ``max_features`` features. 56 | 57 | max_samples : int or float, default=1.0 58 | The number of samples to draw from X to train each base estimator 59 | (with replacement by default, see ``bootstrap`` for more details). 60 | 61 | - If int, then draw `max_samples` samples. 
62 | - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0.0, 1.0]`. 63 | 64 | min_samples_leaf : int or float, default=1 65 | The minimum number of samples required to be at a leaf node. 66 | A split point at any depth will only be considered if it leaves at 67 | least ``min_samples_leaf`` training samples in each of the left and 68 | right branches. 69 | 70 | - If int, then consider `min_samples_leaf` as the minimum number. 71 | - If float, then `min_samples_leaf` is a fraction and 72 | `ceil(min_samples_leaf * n_samples)` are the minimum 73 | number of samples for each node. 74 | 75 | n_iter: int, default=10 76 | Number of iterations to set the hyperparameters of each node base 77 | classifier in Hyperopt. 78 | 79 | metric: string, default="accuracy" 80 | The score of the base classifier optimized by Hyperopt. Supported metrics 81 | are the ones from `scikit-learn `_. 82 | 83 | base_learner : {"catboost", "lightgbm", "xgboost"}, default="xgboost" 84 | The base classifier trained in each node of a tree. 85 | 86 | base_n_estimators : tuple, default=(10, 50, 100) 87 | The number of estimators of the base learner. The tuple provided is 88 | the search space used for the hyperparameter optimization (Hyperopt). 89 | 90 | base_max_depth : tuple, default=(3, 6, 9) 91 | Maximum tree depth for base learners. The tuple provided is the search 92 | space used for the hyperparameter optimization (Hyperopt). 93 | 94 | base_num_leaves : tuple, default=(20, 50, 100, 500) 95 | Maximum tree leaves (applicable to LightGBM only). The tuple provided is the search 96 | space used for the hyperparameter optimization (Hyperopt). 97 | 98 | base_learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5) 99 | `learning_rate` of the base learner. The tuple provided is the search space used for the 100 | hyperparameter optimization (Hyperopt). 101 | 102 | base_booster : ("dart", "gblinear", "gbtree"), default=("gbtree",) 103 | The type of booster to use (applicable to XGBoost only). "gbtree" and "dart" use tree based models 104 | while "gblinear" uses linear functions. The tuple provided is the search space used 105 | for the hyperparameter optimization (Hyperopt). 106 | 107 | base_boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",) 108 | The type of boosting type to use (applicable to LightGBM only): "dart" dropouts meet Multiple Additive 109 | Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest. 110 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt). 111 | 112 | base_gamma : tuple, default=(0, 1, 10) 113 | `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction 114 | required to make a further partition on a leaf node of the tree. 115 | The larger `gamma` is, the more conservative XGBoost algorithm will be. 116 | The tuple provided is the search space used for the hyperparameter optimization 117 | (Hyperopt). 118 | 119 | base_min_child_weight : tuple, default=(1, 5, 15, 100) 120 | `min_child_weight` of base learner (applicable to LightGBM and XGBoost only). `min_child_weight` defines the 121 | minimum sum of instance weight (hessian) needed in a child. If the tree 122 | partition step results in a leaf node with the sum of instance weight 123 | less than `min_child_weight`, then the building process will give up further 124 | partitioning. The larger `min_child_weight` is, the more conservative the base learner 125 | algorithm will be. 
The tuple provided is the search space used for the hyperparameter 126 | optimization (Hyperopt). 127 | 128 | base_subsample : tuple, default=(1.0,) 129 | Base learner subsample ratio of the training instances (applicable to LightGBM and XGBoost only). 130 | Setting it to 0.5 means that the base learner would randomly sample half of the training data prior to 131 | growing trees, and this will prevent overfitting. Subsampling will occur 132 | once in every boosting iteration. The tuple provided is the search space used for 133 | the hyperparameter optimization (Hyperopt). 134 | 135 | base_subsample_for_bin : tuple, default=(200000,) 136 | Number of samples for constructing bins (applicable to LightGBM only). The tuple provided is the 137 | search space used for the hyperparameter optimization (Hyperopt). 138 | 139 | base_colsample_bytree : tuple, default=(1.0,) 140 | Base learner subsample ratio of columns when constructing each tree (applicable to LightGBM and XGBoost only). 141 | Subsampling occurs once for every tree constructed. The tuple provided is the search 142 | space used for the hyperparameter optimization (Hyperopt). 143 | 144 | base_colsample_bylevel : tuple, default=(1.0,) 145 | Subsample ratio of columns for each level (applicable to CatBoost and XGBoost only). Subsampling occurs 146 | once for every new depth level reached in a tree. Columns are subsampled 147 | from the set of columns chosen for the current tree. The tuple provided is the search 148 | space used for the hyperparameter optimization (Hyperopt). 149 | 150 | base_colsample_bynode : tuple, default=(1.0,) 151 | Subsample ratio of columns for each node split (applicable to XGBoost only). Subsampling 152 | occurs once every time a new split is evaluated. Columns are subsampled 153 | from the set of columns chosen for the current level. The tuple provided is the search 154 | space used for the hyperparameter optimization (Hyperopt). 155 | 156 | base_reg_alpha : tuple, default=(0,) 157 | `reg_alpha` of the base learner (applicable to LightGBM and XGBoost only). 158 | `reg_alpha` corresponds to the L1 regularization term on the weights. 159 | Increasing this value will make the base learner more conservative. 160 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt). 161 | 162 | base_reg_lambda : tuple, default=(0.1, 1.0, 5.0) 163 | `reg_lambda` of the base learner. `reg_lambda` corresponds to the L2 regularization term 164 | on the weights. Increasing this value will make the base learner more 165 | conservative. The tuple provided is the search space used for the hyperparameter 166 | optimization (Hyperopt). 167 | 168 | n_jobs : int, default=None 169 | The number of jobs to run in parallel. 170 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors. 171 | 172 | random_state : int, RandomState instance or None, default=None 173 | Controls the randomness of the bootstrapping of the samples used 174 | when building trees (if ``bootstrap=True``), the sampling of the 175 | features to consider when looking for the best split at each node 176 | (if ``max_features < n_features``), the base classifier and 177 | the Hyperopt algorithm. 178 | 179 | verbose : int, default=0 180 | Controls the verbosity when fitting. 181 | 182 | Attributes 183 | ---------- 184 | base_estimator_ : LCETreeClassifier 185 | The child estimator template used to create the collection of fitted 186 | sub-estimators. 
187 | 188 | estimators_ : list of LCETreeClassifier 189 | The collection of fitted sub-estimators. 190 | 191 | classes_ : ndarray of shape (n_classes,) or a list of such arrays 192 | The classes labels. 193 | 194 | n_classes_ : int 195 | The number of classes. 196 | 197 | n_features_in_ : int 198 | The number of features when ``fit`` is performed. 199 | 200 | encoder_ : LabelEncoder 201 | The encoder to have target labels with value between 0 and n_classes-1. 202 | 203 | Notes 204 | ----- 205 | The default values for the parameters controlling the size of the trees 206 | (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and 207 | unpruned trees which can potentially be very large on some data sets. To 208 | reduce memory consumption, the complexity and size of the trees should be 209 | controlled by setting those parameter values. 210 | 211 | The features are always randomly permuted at each split. Therefore, 212 | the best found split may vary, even with the same training data, 213 | ``max_features=n_features`` and ``bootstrap=False``, if the improvement 214 | of the criterion is identical for several splits enumerated during the 215 | search of the best split. To obtain a deterministic behaviour during 216 | fitting, ``random_state`` has to be fixed. 217 | 218 | References 219 | ---------- 220 | .. [1] Fauvel, K., E. Fromont, V. Masson, P. Faverdin and A. Termier. "XEM: An Explainable-by-Design Ensemble Method for Multivariate Time Series Classification", Data Mining and Knowledge Discovery, 36(3):917-957, 2022. https://hal.inria.fr/hal-03599214/document 221 | """ 222 | 223 | def __init__( 224 | self, 225 | n_estimators=10, 226 | bootstrap=True, 227 | criterion="gini", 228 | splitter="best", 229 | max_depth=2, 230 | max_features=None, 231 | max_samples=1.0, 232 | min_samples_leaf=1, 233 | n_iter=10, 234 | metric="accuracy", 235 | base_learner="xgboost", 236 | base_n_estimators=(10, 50, 100), 237 | base_max_depth=(3, 6, 9), 238 | base_num_leaves=(20, 50, 100, 500), 239 | base_learning_rate=(0.01, 0.1, 0.3, 0.5), 240 | base_booster=("gbtree",), 241 | base_boosting_type=("gbdt",), 242 | base_gamma=(0, 1, 10), 243 | base_min_child_weight=(1, 5, 15, 100), 244 | base_subsample=(1.0,), 245 | base_subsample_for_bin=(200000,), 246 | base_colsample_bytree=(1.0,), 247 | base_colsample_bylevel=(1.0,), 248 | base_colsample_bynode=(1.0,), 249 | base_reg_alpha=(0,), 250 | base_reg_lambda=(0.1, 1.0, 5.0), 251 | n_jobs=None, 252 | random_state=None, 253 | verbose=0, 254 | ): 255 | self.n_estimators = n_estimators 256 | self.bootstrap = bootstrap 257 | self.criterion = criterion 258 | self.splitter = splitter 259 | self.max_depth = max_depth 260 | self.max_features = max_features 261 | self.max_samples = max_samples 262 | self.min_samples_leaf = min_samples_leaf 263 | self.n_iter = n_iter 264 | self.metric = metric 265 | self.base_learner = base_learner 266 | self.base_n_estimators = base_n_estimators 267 | self.base_max_depth = base_max_depth 268 | self.base_num_leaves = base_num_leaves 269 | self.base_learning_rate = base_learning_rate 270 | self.base_booster = base_booster 271 | self.base_boosting_type = base_boosting_type 272 | self.base_gamma = base_gamma 273 | self.base_min_child_weight = base_min_child_weight 274 | self.base_subsample = base_subsample 275 | self.base_subsample_for_bin = base_subsample_for_bin 276 | self.base_colsample_bytree = base_colsample_bytree 277 | self.base_colsample_bylevel = base_colsample_bylevel 278 | self.base_colsample_bynode = base_colsample_bynode 
279 |         self.base_reg_alpha = base_reg_alpha
280 |         self.base_reg_lambda = base_reg_lambda
281 |         self.n_jobs = n_jobs
282 |         self.random_state = random_state
283 |         self.verbose = verbose
284 |
285 |     def _generate_estimator(self):
286 |         """Generate an estimator."""
287 |         est = LCETreeClassifier()
288 |         est.n_classes_in = self.n_classes_
289 |         est.criterion = self.criterion
290 |         est.splitter = self.splitter
291 |         est.max_depth = self.max_depth
292 |         est.max_features = self.max_features
293 |         est.min_samples_leaf = self.min_samples_leaf
294 |         est.n_iter = self.n_iter
295 |         est.metric = self.metric
296 |         est.base_learner = self.base_learner
297 |         est.base_n_estimators = self.base_n_estimators
298 |         est.base_max_depth = self.base_max_depth
299 |         est.base_num_leaves = self.base_num_leaves
300 |         est.base_learning_rate = self.base_learning_rate
301 |         est.base_booster = self.base_booster
302 |         est.base_boosting_type = self.base_boosting_type
303 |         est.base_gamma = self.base_gamma
304 |         est.base_min_child_weight = self.base_min_child_weight
305 |         est.base_subsample = self.base_subsample
306 |         est.base_subsample_for_bin = self.base_subsample_for_bin
307 |         est.base_colsample_bytree = self.base_colsample_bytree
308 |         est.base_colsample_bylevel = self.base_colsample_bylevel
309 |         est.base_colsample_bynode = self.base_colsample_bynode
310 |         est.base_reg_alpha = self.base_reg_alpha
311 |         est.base_reg_lambda = self.base_reg_lambda
312 |         est.n_jobs = self.n_jobs
313 |         est.random_state = self.random_state
314 |         est.verbose = self.verbose
315 |         return est
316 |
317 |     def _more_tags(self):
318 |         """Update scikit-learn estimator tags."""
319 |         return {"allow_nan": True, "requires_y": True}
320 |
321 |     def _validate_extra_parameters(self, X):
322 |         """Validate parameters not already validated by methods employed."""
323 |         # Validate max_depth
324 |         if isinstance(self.max_depth, numbers.Integral):
325 |             if not (0 <= self.max_depth):
326 |                 raise ValueError(
327 |                     "max_depth must be greater than or equal to 0, "
328 |                     "got {0}.".format(self.max_depth)
329 |                 )
330 |         else:
331 |             raise ValueError("max_depth must be int")
332 |
333 |         # Validate min_samples_leaf
334 |         if isinstance(self.min_samples_leaf, numbers.Integral):
335 |             if not 1 <= self.min_samples_leaf:
336 |                 raise ValueError(
337 |                     "min_samples_leaf must be at least 1 "
338 |                     "or in (0, 0.5], got %s" % self.min_samples_leaf
339 |                 )
340 |         elif isinstance(self.min_samples_leaf, float):
341 |             if not 0.0 < self.min_samples_leaf <= 0.5:
342 |                 raise ValueError(
343 |                     "min_samples_leaf must be at least 1 "
344 |                     "or in (0, 0.5], got %s" % self.min_samples_leaf
345 |                 )
346 |             self.min_samples_leaf = int(math.ceil(self.min_samples_leaf * X.shape[0]))
347 |         else:
348 |             raise ValueError("min_samples_leaf must be int or float")
349 |
350 |         # Validate n_iter
351 |         if isinstance(self.n_iter, numbers.Integral):
352 |             if self.n_iter <= 0:
353 |                 raise ValueError(
354 |                     "n_iter must be greater than 0, " "got {0}.".format(self.n_iter)
355 |                 )
356 |         else:
357 |             raise ValueError("n_iter must be int")
358 |
359 |         # Validate verbose
360 |         if isinstance(self.verbose, numbers.Integral):
361 |             if self.verbose < 0:
362 |                 raise ValueError(
363 |                     "verbose must be greater than or equal to 0, "
364 |                     "got {0}.".format(self.verbose)
365 |                 )
366 |         else:
367 |             raise ValueError("verbose must be int")
368 |
369 |     def fit(self, X, y):
370 |         """
371 |         Build a forest of LCE trees from the training set (X, y).
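        A quick usage sketch for the estimator as a whole (illustrative only;
        the Iris data and the small settings below are assumptions)::

            from sklearn.datasets import load_iris
            from lce import LCEClassifier

            X, y = load_iris(return_X_y=True)
            clf = LCEClassifier(n_estimators=2, random_state=0).fit(X, y)
            print(clf.predict(X[:5]))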
372 | 373 | Parameters 374 | ---------- 375 | X : array-like of shape (n_samples, n_features) 376 | The training input samples. 377 | 378 | y : array-like of shape (n_samples,) 379 | The class labels. 380 | 381 | Returns 382 | ------- 383 | self : object 384 | """ 385 | X, y = check_X_y(X, y, force_all_finite="allow-nan") 386 | check_classification_targets(y) 387 | self._validate_extra_parameters(X) 388 | self.n_features_in_ = X.shape[1] 389 | self.X_ = True 390 | self.y_ = True 391 | self.classes_, y = np.unique(y, return_inverse=True) 392 | self.n_classes_ = self.classes_.size 393 | self.encoder_ = LabelEncoder() 394 | self.encoder_.fit(self.classes_) 395 | self.base_estimator_ = self._generate_estimator() 396 | self.estimators_ = BaggingClassifier( 397 | base_estimator=self.base_estimator_, 398 | n_estimators=self.n_estimators, 399 | bootstrap=self.bootstrap, 400 | max_samples=self.max_samples, 401 | n_jobs=self.n_jobs, 402 | random_state=self.random_state, 403 | ) 404 | self.estimators_.fit(X, y) 405 | return self 406 | 407 | def predict(self, X): 408 | """ 409 | Predict class for X. 410 | The predicted class of an input sample is computed as the class with 411 | the highest mean predicted probability. 412 | 413 | Parameters 414 | ---------- 415 | X : array-like of shape (n_samples, n_features) 416 | The training input samples. 417 | 418 | Returns 419 | ------- 420 | y : ndarray of shape (n_samples,) 421 | The predicted classes. 422 | """ 423 | check_is_fitted(self, ["X_", "y_"]) 424 | X = check_array(X, force_all_finite="allow-nan") 425 | predictions = self.estimators_.predict(X) 426 | return self.encoder_.inverse_transform(predictions) 427 | 428 | def predict_proba(self, X): 429 | """ 430 | Predict class probabilities for X. 431 | The predicted class probabilities of an input sample are computed as 432 | the mean predicted class probabilities of the base estimators in the 433 | ensemble. 434 | 435 | Parameters 436 | ---------- 437 | X : array-like of shape (n_samples, n_features) 438 | The training input samples. 439 | 440 | Returns 441 | ------- 442 | y : ndarray of shape (n_samples,) 443 | The class probabilities of the input samples. The order of the 444 | classes corresponds to that in the attribute ``classes_``. 445 | """ 446 | check_is_fitted(self, ["X_", "y_"]) 447 | X = check_array(X, force_all_finite="allow-nan") 448 | return self.estimators_.predict_proba(X) 449 | 450 | def set_params(self, **params): 451 | """ 452 | Set the parameters of the estimator. 453 | 454 | Parameters 455 | ---------- 456 | **params : dict 457 | Estimator parameters. 458 | 459 | Returns 460 | ------- 461 | self : object 462 | """ 463 | if not params: 464 | return self 465 | 466 | for key, value in params.items(): 467 | if hasattr(self, key): 468 | setattr(self, key, value) 469 | 470 | return self 471 | 472 | 473 | class LCERegressor(RegressorMixin, BaseEstimator): 474 | """ 475 | A **Local Cascade Ensemble (LCE) regressor**. LCERegressor is **compatible with scikit-learn**; 476 | it passes the `check_estimator `_. 477 | Therefore, it can interact with scikit-learn pipelines and model selection tools. 478 | 479 | 480 | Parameters 481 | ---------- 482 | n_estimators : int, default=10 483 | The number of trees in the ensemble. 484 | 485 | bootstrap : bool, default=True 486 | Whether bootstrap samples are used when building trees. If False, the 487 | whole dataset is used to build each tree. 
488 |
489 |     criterion : {"squared_error", "friedman_mse", "absolute_error", "poisson"}, default="squared_error"
490 |         The function to measure the quality of a split. Supported criteria are "squared_error" for
491 |         the mean squared error, which is equal to variance reduction as feature selection
492 |         criterion and minimizes the L2 loss using the mean of each terminal node,
493 |         "friedman_mse", which uses mean squared error with Friedman's improvement score
494 |         for potential splits, "absolute_error" for the mean absolute error, which
495 |         minimizes the L1 loss using the median of each terminal node, and "poisson"
496 |         which uses reduction in Poisson deviance to find splits.
497 |
498 |     splitter : {"best", "random"}, default="best"
499 |         The strategy used to choose the split at each node. Supported strategies
500 |         are "best" to choose the best split and "random" to choose the best random
501 |         split.
502 |
503 |     max_depth : int, default=2
504 |         The maximum depth of a tree.
505 |
506 |     max_features : int, float or {"auto", "sqrt", "log2"}, default=None
507 |         The number of features to consider when looking for the best split:
508 |
509 |         - If int, then consider `max_features` features at each split.
510 |         - If float, then `max_features` is a fraction and
511 |           `round(max_features * n_features)` features are considered at each
512 |           split.
513 |         - If "auto", then `max_features=sqrt(n_features)`.
514 |         - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto").
515 |         - If "log2", then `max_features=log2(n_features)`.
516 |         - If None, then `max_features=n_features`.
517 |
518 |         Note: the search for a split does not stop until at least one
519 |         valid partition of the node samples is found, even if it requires to
520 |         effectively inspect more than ``max_features`` features.
521 |
522 |     max_samples : int or float, default=1.0
523 |         The number of samples to draw from X to train each base estimator
524 |         (with replacement by default, see ``bootstrap`` for more details).
525 |
526 |         - If int, then draw `max_samples` samples.
527 |         - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0.0, 1.0]`.
528 |
529 |     min_samples_leaf : int or float, default=1
530 |         The minimum number of samples required to be at a leaf node.
531 |         A split point at any depth will only be considered if it leaves at
532 |         least ``min_samples_leaf`` training samples in each of the left and
533 |         right branches.
534 |
535 |         - If int, then consider `min_samples_leaf` as the minimum number.
536 |         - If float, then `min_samples_leaf` is a fraction and
537 |           `ceil(min_samples_leaf * n_samples)` are the minimum
538 |           number of samples for each node.
539 |
540 |     n_iter: int, default=10
541 |         Number of iterations to set the hyperparameters of each node base
542 |         regressor in Hyperopt.
543 |
544 |     metric: string, default="neg_mean_squared_error"
545 |         The score of the base regressor optimized by Hyperopt. Supported metrics
546 |         are the ones from `scikit-learn `_.
547 |
548 |     base_learner : {"catboost", "lightgbm", "xgboost"}, default="xgboost"
549 |         The base regressor trained in each node of a tree.
550 |
551 |     base_n_estimators : tuple, default=(10, 50, 100)
552 |         The number of estimators of the base learner. The tuple provided is
553 |         the search space used for the hyperparameter optimization (Hyperopt).
554 |
555 |     base_max_depth : tuple, default=(3, 6, 9)
556 |         Maximum tree depth for base learners.
The tuple provided is the search 557 | space used for the hyperparameter optimization (Hyperopt). 558 | 559 | base_num_leaves : tuple, default=(20, 50, 100, 500) 560 | Maximum tree leaves (applicable to LightGBM only). The tuple provided is the search 561 | space used for the hyperparameter optimization (Hyperopt). 562 | 563 | base_learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5) 564 | `learning_rate` of the base learner. The tuple provided is the search space used for the 565 | hyperparameter optimization (Hyperopt). 566 | 567 | base_booster : ("dart", "gblinear", "gbtree"), default=("gbtree",) 568 | The type of booster to use (applicable to XGBoost only). "gbtree" and "dart" use tree based models 569 | while "gblinear" uses linear functions. The tuple provided is the search space used 570 | for the hyperparameter optimization (Hyperopt). 571 | 572 | base_boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",) 573 | The type of boosting type to use (applicable to LightGBM only): "dart" dropouts meet Multiple Additive 574 | Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest. 575 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt). 576 | 577 | base_gamma : tuple, default=(0, 1, 10) 578 | `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction 579 | required to make a further partition on a leaf node of the tree. 580 | The larger `gamma` is, the more conservative XGBoost algorithm will be. 581 | The tuple provided is the search space used for the hyperparameter optimization 582 | (Hyperopt). 583 | 584 | base_min_child_weight : tuple, default=(1, 5, 15, 100) 585 | `min_child_weight` of base learner (applicable to LightGBM and XGBoost only). `min_child_weight` defines the 586 | minimum sum of instance weight (hessian) needed in a child. If the tree 587 | partition step results in a leaf node with the sum of instance weight 588 | less than `min_child_weight`, then the building process will give up further 589 | partitioning. The larger `min_child_weight` is, the more conservative the base learner 590 | algorithm will be. The tuple provided is the search space used for the hyperparameter 591 | optimization (Hyperopt). 592 | 593 | base_subsample : tuple, default=(1.0,) 594 | Base learner subsample ratio of the training instances (applicable to LightGBM and XGBoost only). 595 | Setting it to 0.5 means that the base learner would randomly sample half of the training data prior to 596 | growing trees, and this will prevent overfitting. Subsampling will occur 597 | once in every boosting iteration. The tuple provided is the search space used for 598 | the hyperparameter optimization (Hyperopt). 599 | 600 | base_subsample_for_bin : tuple, default=(200000,) 601 | Number of samples for constructing bins (applicable to LightGBM only). The tuple provided is the 602 | search space used for the hyperparameter optimization (Hyperopt). 603 | 604 | base_colsample_bytree : tuple, default=(1.0,) 605 | Base learner subsample ratio of columns when constructing each tree (applicable to LightGBM and XGBoost only). 606 | Subsampling occurs once for every tree constructed. The tuple provided is the search 607 | space used for the hyperparameter optimization (Hyperopt). 608 | 609 | base_colsample_bylevel : tuple, default=(1.0,) 610 | Subsample ratio of columns for each level (applicable to CatBoost and XGBoost only). Subsampling occurs 611 | once for every new depth level reached in a tree. 
Columns are subsampled 612 | from the set of columns chosen for the current tree. The tuple provided is the search 613 | space used for the hyperparameter optimization (Hyperopt). 614 | 615 | base_colsample_bynode : tuple, default=(1.0,) 616 | Subsample ratio of columns for each node split (applicable to XGBoost only). Subsampling 617 | occurs once every time a new split is evaluated. Columns are subsampled 618 | from the set of columns chosen for the current level. The tuple provided is the search 619 | space used for the hyperparameter optimization (Hyperopt). 620 | 621 | base_reg_alpha : tuple, default=(0,) 622 | `reg_alpha` of the base learner (applicable to LightGBM and XGBoost only). 623 | `reg_alpha` corresponds to the L1 regularization term on the weights. 624 | Increasing this value will make the base learner more conservative. 625 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt). 626 | 627 | base_reg_lambda : tuple, default=(0.1, 1.0, 5.0) 628 | `reg_lambda` of the base learner. `reg_lambda` corresponds to the L2 regularization term 629 | on the weights. Increasing this value will make the base learner more 630 | conservative. The tuple provided is the search space used for the hyperparameter 631 | optimization (Hyperopt). 632 | 633 | n_jobs : int, default=None 634 | The number of jobs to run in parallel. 635 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors. 636 | 637 | random_state : int, RandomState instance or None, default=None 638 | Controls the randomness of the bootstrapping of the samples used 639 | when building trees (if ``bootstrap=True``), the sampling of the 640 | features to consider when looking for the best split at each node 641 | (if ``max_features < n_features``), the base classifier (XGBoost) and 642 | the Hyperopt algorithm. 643 | 644 | verbose : int, default=0 645 | Controls the verbosity when fitting. 646 | 647 | Attributes 648 | ---------- 649 | base_estimator_ : LCETreeRegressor 650 | The child estimator template used to create the collection of fitted 651 | sub-estimators. 652 | 653 | estimators_ : list of LCETreeRegressor 654 | The collection of fitted sub-estimators. 655 | 656 | n_features_in_ : int 657 | The number of features when ``fit`` is performed. 658 | 659 | Notes 660 | ----- 661 | The default values for the parameters controlling the size of the trees 662 | (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and 663 | unpruned trees which can potentially be very large on some data sets. To 664 | reduce memory consumption, the complexity and size of the trees should be 665 | controlled by setting those parameter values. 666 | 667 | The features are always randomly permuted at each split. Therefore, 668 | the best found split may vary, even with the same training data, 669 | ``max_features=n_features`` and ``bootstrap=False``, if the improvement 670 | of the criterion is identical for several splits enumerated during the 671 | search of the best split. To obtain a deterministic behaviour during 672 | fitting, ``random_state`` has to be fixed. 
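    Examples
    --------
    A minimal usage sketch (illustrative only; the diabetes data, the split and
    the hyperparameter values below are assumptions)::

        from sklearn.datasets import load_diabetes
        from sklearn.model_selection import train_test_split
        from lce import LCERegressor

        X, y = load_diabetes(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
        reg = LCERegressor(n_estimators=2, n_jobs=-1, random_state=0)
        reg.fit(X_train, y_train)
        print(reg.score(X_test, y_test))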
673 | """ 674 | 675 | def __init__( 676 | self, 677 | n_estimators=10, 678 | bootstrap=True, 679 | criterion="squared_error", 680 | splitter="best", 681 | max_depth=2, 682 | max_features=None, 683 | max_samples=1.0, 684 | min_samples_leaf=1, 685 | metric="neg_mean_squared_error", 686 | n_iter=10, 687 | base_learner="xgboost", 688 | base_n_estimators=(10, 50, 100), 689 | base_max_depth=(3, 6, 9), 690 | base_num_leaves=(20, 50, 100, 500), 691 | base_learning_rate=(0.01, 0.1, 0.3, 0.5), 692 | base_booster=("gbtree",), 693 | base_boosting_type=("gbdt",), 694 | base_gamma=(0, 1, 10), 695 | base_min_child_weight=(1, 5, 15, 100), 696 | base_subsample=(1.0,), 697 | base_subsample_for_bin=(200000,), 698 | base_colsample_bytree=(1.0,), 699 | base_colsample_bylevel=(1.0,), 700 | base_colsample_bynode=(1.0,), 701 | base_reg_alpha=(0,), 702 | base_reg_lambda=(0.1, 1.0, 5.0), 703 | n_jobs=None, 704 | random_state=None, 705 | verbose=0, 706 | ): 707 | self.n_estimators = n_estimators 708 | self.bootstrap = bootstrap 709 | self.criterion = criterion 710 | self.splitter = splitter 711 | self.max_depth = max_depth 712 | self.max_features = max_features 713 | self.max_samples = max_samples 714 | self.min_samples_leaf = min_samples_leaf 715 | self.n_iter = n_iter 716 | self.metric = metric 717 | self.base_learner = base_learner 718 | self.base_n_estimators = base_n_estimators 719 | self.base_max_depth = base_max_depth 720 | self.base_num_leaves = base_num_leaves 721 | self.base_learning_rate = base_learning_rate 722 | self.base_booster = base_booster 723 | self.base_boosting_type = base_boosting_type 724 | self.base_gamma = base_gamma 725 | self.base_min_child_weight = base_min_child_weight 726 | self.base_subsample = base_subsample 727 | self.base_subsample_for_bin = base_subsample_for_bin 728 | self.base_colsample_bytree = base_colsample_bytree 729 | self.base_colsample_bylevel = base_colsample_bylevel 730 | self.base_colsample_bynode = base_colsample_bynode 731 | self.base_reg_alpha = base_reg_alpha 732 | self.base_reg_lambda = base_reg_lambda 733 | self.n_jobs = n_jobs 734 | self.random_state = random_state 735 | self.verbose = verbose 736 | 737 | def _generate_estimator(self): 738 | """Generate an estimator.""" 739 | est = LCETreeRegressor() 740 | est.criterion = self.criterion 741 | est.splitter = self.splitter 742 | est.max_depth = self.max_depth 743 | est.max_features = self.max_features 744 | est.min_samples_leaf = self.min_samples_leaf 745 | est.n_iter = self.n_iter 746 | est.metric = self.metric 747 | est.base_learner = self.base_learner 748 | est.base_n_estimators = self.base_n_estimators 749 | est.base_max_depth = self.base_max_depth 750 | est.base_num_leaves = self.base_num_leaves 751 | est.base_learning_rate = self.base_learning_rate 752 | est.base_booster = self.base_booster 753 | est.base_boosting_type = self.base_boosting_type 754 | est.base_gamma = self.base_gamma 755 | est.base_min_child_weight = self.base_min_child_weight 756 | est.base_subsample = self.base_subsample 757 | est.base_subsample_for_bin = self.base_subsample_for_bin 758 | est.base_colsample_bytree = self.base_colsample_bytree 759 | est.base_colsample_bylevel = self.base_colsample_bylevel 760 | est.base_colsample_bynode = self.base_colsample_bynode 761 | est.base_reg_alpha = self.base_reg_alpha 762 | est.base_reg_lambda = self.base_reg_lambda 763 | est.n_jobs = self.n_jobs 764 | est.random_state = self.random_state 765 | est.verbose = self.verbose 766 | return est 767 | 768 | def _more_tags(self): 769 | """Update scikit-learn
estimator tags.""" 770 | return {"allow_nan": True, "requires_y": True} 771 | 772 | def _validate_extra_parameters(self, X): 773 | """Validate parameters not already validated by methods employed.""" 774 | # Validate max_depth 775 | if isinstance(self.max_depth, numbers.Integral): 776 | if not (0 <= self.max_depth): 777 | raise ValueError( 778 | "max_depth must be greater than or equal to 0, " 779 | "got {0}.".format(self.max_depth) 780 | ) 781 | else: 782 | raise ValueError("max_depth must be int") 783 | 784 | # Validate min_samples_leaf 785 | if isinstance(self.min_samples_leaf, numbers.Integral): 786 | if not 1 <= self.min_samples_leaf: 787 | raise ValueError( 788 | "min_samples_leaf must be at least 1 " 789 | "or in (0, 0.5], got %s" % self.min_samples_leaf 790 | ) 791 | elif isinstance(self.min_samples_leaf, float): 792 | if not 0.0 < self.min_samples_leaf <= 0.5: 793 | raise ValueError( 794 | "min_samples_leaf must be at least 1 " 795 | "or in (0, 0.5], got %s" % self.min_samples_leaf 796 | ) 797 | self.min_samples_leaf = int(math.ceil(self.min_samples_leaf * X.shape[0])) 798 | else: 799 | raise ValueError("min_samples_leaf must be int or float") 800 | 801 | # Validate n_iter 802 | if isinstance(self.n_iter, numbers.Integral): 803 | if self.n_iter <= 0: 804 | raise ValueError( 805 | "n_iter must be greater than 0, " "got {0}.".format(self.n_iter) 806 | ) 807 | else: 808 | raise ValueError("n_iter must be int") 809 | 810 | # Validate verbose 811 | if isinstance(self.verbose, numbers.Integral): 812 | if self.verbose < 0: 813 | raise ValueError( 814 | "verbose must be greater than or equal to 0, " 815 | "got {0}.".format(self.verbose) 816 | ) 817 | else: 818 | raise ValueError("verbose must be int") 819 | 820 | def fit(self, X, y): 821 | """ 822 | Build a forest of LCE trees from the training set (X, y). 823 | 824 | Parameters 825 | ---------- 826 | X : array-like of shape (n_samples, n_features) 827 | The training input samples. 828 | 829 | y : array-like of shape (n_samples,) 830 | The target values (real numbers). 831 | 832 | Returns 833 | ------- 834 | self : object 835 | """ 836 | X, y = check_X_y(X, y, y_numeric=True, force_all_finite="allow-nan") 837 | self._validate_extra_parameters(X) 838 | self.n_features_in_ = X.shape[1] 839 | self.X_ = True 840 | self.y_ = True 841 | self.base_estimator_ = self._generate_estimator() 842 | self.estimators_ = BaggingRegressor( 843 | base_estimator=self.base_estimator_, 844 | n_estimators=self.n_estimators, 845 | bootstrap=self.bootstrap, 846 | max_samples=self.max_samples, 847 | n_jobs=self.n_jobs, 848 | random_state=self.random_state, 849 | ) 850 | self.estimators_.fit(X, y) 851 | return self 852 | 853 | def predict(self, X): 854 | """ 855 | Predict regression target for X. 856 | The predicted regression target of an input sample is computed as the 857 | mean predicted regression targets of the trees in the forest. 858 | 859 | Parameters 860 | ---------- 861 | X : array-like of shape (n_samples, n_features) 862 | The input samples. 863 | 864 | Returns 865 | ------- 866 | y : ndarray of shape (n_samples,) 867 | The predicted values. 868 | """ 869 | check_is_fitted(self, ["X_", "y_"]) 870 | X = check_array(X, force_all_finite="allow-nan") 871 | return self.estimators_.predict(X) 872 | 873 | def set_params(self, **params): 874 | """ 875 | Set the parameters of the estimator. 876 | 877 | Parameters 878 | ---------- 879 | **params : dict 880 | Estimator parameters.
881 | 882 | Returns 883 | ------- 884 | self : object 885 | """ 886 | if not params: 887 | return self 888 | 889 | for key, value in params.items(): 890 | if hasattr(self, key): 891 | setattr(self, key, value) 892 | 893 | return self 894 | --------------------------------------------------------------------------------
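Editor's note: the `base_*` parameters documented in the LCERegressor docstring each supply a tuple that Hyperopt treats as the candidate values for the corresponding base-learner hyperparameter, optimized independently at each node of an LCE tree. The sketch below is not part of the repository; it only illustrates how a custom search space can be passed at construction time. The dataset (scikit-learn's diabetes set) and all numeric values are illustrative assumptions.

# Minimal usage sketch (illustrative values, not repository code).
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from lce import LCERegressor

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

reg = LCERegressor(
    n_estimators=2,              # number of LCE trees in the bagging ensemble
    base_learner="xgboost",      # XGBoost-specific tuples below then apply
    base_n_estimators=(10, 50),  # Hyperopt picks one candidate per node
    base_learning_rate=(0.05, 0.1),
    base_gamma=(0, 1),
    n_iter=3,                    # Hyperopt evaluations per node
    n_jobs=-1,
    random_state=0,
)
reg.fit(X_train, y_train)
print(reg.predict(X_test[:5]))

Tuples that the docstring marks as applicable to a specific library (e.g. `base_num_leaves` for LightGBM) only take effect when that library is selected via `base_learner`.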
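Editor's note: because `_more_tags` advertises `allow_nan` and both `fit` and `predict` validate their inputs with `force_all_finite="allow-nan"`, LCERegressor accepts missing values directly. The short sketch below (again an illustration, not repository code; the masking ratio and parameter values are assumptions) exercises that path together with `set_params`, which, as implemented above, only updates attributes that already exist on the estimator and silently ignores unknown keys.

# Missing-value sketch (illustrative values, not repository code).
import numpy as np
from sklearn.datasets import load_diabetes
from lce import LCERegressor

X, y = load_diabetes(return_X_y=True)
rng = np.random.RandomState(0)
mask = rng.rand(*X.shape) < 0.1        # knock out roughly 10% of the entries
X_missing = X.copy()
X_missing[mask] = np.nan

reg = LCERegressor(n_estimators=2, n_iter=3, random_state=0)
reg.set_params(max_depth=1, verbose=0)  # existing attributes are updated in place
reg.fit(X_missing, y)                   # NaN entries pass check_X_y(..., force_all_finite="allow-nan")
print(reg.predict(X_missing[:5]))       # predictions average the bagged LCE trees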