`_.
80 |
81 |
95 |
96 |
97 |
98 | Installation
99 | ============
100 |
101 | You can install LCE from `PyPI `_ with ``pip``::
102 |
103 | pip install lcensemble
104 |
105 | Or ``conda``::
106 |
107 | conda install -c conda-forge lcensemble
108 |
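To quickly verify the installation, both estimators can be imported (an illustrative check, not part of the original instructions):

.. code-block:: python

    # A successful install exposes the two LCE estimators
    from lce import LCEClassifier, LCERegressor
    print(LCEClassifier(), LCERegressor())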
109 |
110 | Code Examples
111 | =============
112 |
113 | The following examples illustrate the use of LCE on public datasets for classification and regression tasks.
114 | They also demonstrate the compatibility of LCE with scikit-learn pipelines and model selection tools through the use of ``cross_val_score``.
115 | An example of LCE on a dataset containing missing values is also shown.
116 |
117 | Classification
118 | --------------
119 |
120 | - **Example 1: LCE on Iris Dataset**
121 |
122 | .. code-block:: python
123 |
124 | from lce import LCEClassifier
125 | from sklearn.datasets import load_iris
126 | from sklearn.metrics import accuracy_score
127 | from sklearn.model_selection import train_test_split
128 |
129 |
130 | # Load data and generate a train/test split
131 | data = load_iris()
132 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0)
133 |
134 | # Train LCEClassifier with default parameters
135 | clf = LCEClassifier(n_jobs=-1, random_state=0)
136 | clf.fit(X_train, y_train)
137 |
138 | # Make prediction and compute accuracy score
139 | y_pred = clf.predict(X_test)
140 | accuracy = accuracy_score(y_test, y_pred)
141 | print("Accuracy: {:.1f}%".format(accuracy*100))
142 |
143 | .. code-block::
144 |
145 | Accuracy: 97.4%
146 |
147 |
148 | - **Example 2: LCE with scikit-learn cross-validation score**
149 | This example demonstrates the compatibility of LCE with scikit-learn pipelines and model selection tools through the use of ``cross_val_score``.
150 |
151 | .. code-block:: python
152 |
153 | from lce import LCEClassifier
154 | from sklearn.datasets import load_iris
155 | from sklearn.model_selection import cross_val_score, train_test_split
156 |
157 | # Load data
158 | data = load_iris()
159 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0)
160 |
161 | # Set LCEClassifier with default parameters
162 | clf = LCEClassifier(n_jobs=-1, random_state=0)
163 |
164 | # Compute cross-validation scores
165 | cv_scores = cross_val_score(clf, X_train, y_train, cv=3)
166 | cv_scores = [round(elem*100, 1) for elem in cv_scores.tolist()]
167 | print("Cross-validation scores on train set: ", cv_scores)
168 |
169 | .. code-block::
170 |
171 | Cross-validation scores on train set: [94.7, 100.0, 94.6]
172 |
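Because LCE follows the scikit-learn estimator API, it can also be embedded in a scikit-learn ``Pipeline``. The following sketch (an illustrative addition, not one of the original examples) chains a ``StandardScaler`` with ``LCEClassifier`` and evaluates the pipeline with ``cross_val_score``:

.. code-block:: python

    from lce import LCEClassifier
    from sklearn.datasets import load_iris
    from sklearn.model_selection import cross_val_score
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Build a pipeline with LCE as the final estimator
    X, y = load_iris(return_X_y=True)
    pipe = make_pipeline(StandardScaler(), LCEClassifier(n_jobs=-1, random_state=0))

    # The pipeline behaves like any scikit-learn estimator
    scores = cross_val_score(pipe, X, y, cv=3)
    print("Cross-validation scores: ", [round(s * 100, 1) for s in scores.tolist()])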
173 |
174 | Regression
175 | ----------
176 |
177 | - **Example 3: LCE on Diabetes Dataset**
178 |
179 | .. code-block:: python
180 |
181 | from lce import LCERegressor
182 | from sklearn.datasets import load_diabetes
183 | from sklearn.metrics import mean_squared_error
184 | from sklearn.model_selection import train_test_split
185 |
186 |
187 | # Load data and generate a train/test split
188 | data = load_diabetes()
189 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0)
190 |
191 | # Train LCERegressor with default parameters
192 | reg = LCERegressor(n_jobs=-1, random_state=0)
193 | reg.fit(X_train, y_train)
194 |
195 | # Make prediction
196 | y_pred = reg.predict(X_test)
197 | mse = mean_squared_error(y_test, y_pred)
198 | print("The mean squared error (MSE) on test set: {:.0f}".format(mse))
199 |
200 | .. code-block::
201 |
202 | The mean squared error (MSE) on test set: 3761
203 |
204 |
205 | - **Example 4: LCE with missing values**
206 | This example illustrates the robustness of LCE to missing values. The Diabetes training set is modified so that each variable contains 20% missing values.
207 |
208 | .. code-block:: python
209 |
210 | import numpy as np
211 | from lce import LCERegressor
212 | from sklearn.datasets import load_diabetes
213 | from sklearn.metrics import mean_squared_error
214 | from sklearn.model_selection import train_test_split
215 |
216 |
217 | # Load data and generate a train/test split
218 | data = load_diabetes()
219 | X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0)
220 |
221 | # Inject 20% of missing values per variable into the train set
222 | np.random.seed(0)
223 | m = 0.2
224 | for j in range(0, X_train.shape[1]):
225 | sub = np.random.choice(X_train.shape[0], int(X_train.shape[0]*m))
226 | X_train[sub, j] = np.nan
227 |
228 | # Train LCERegressor with default parameters
229 | reg = LCERegressor(n_jobs=-1, random_state=0)
230 | reg.fit(X_train, y_train)
231 |
232 | # Make prediction
233 | y_pred = reg.predict(X_test)
234 | mse = mean_squared_error(y_test, y_pred)
235 | print("The mean squared error (MSE) on test set: {:.0f}".format(mse))
236 |
237 | .. code-block::
238 |
239 | The mean squared error (MSE) on test set: 3895
240 |
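The examples above use the default hyperparameters. Since LCE exposes its parameters through the standard scikit-learn interface, they can also be tuned with model selection tools such as ``GridSearchCV``. The sketch below is illustrative only and assumes that ``LCERegressor`` exposes the same ``n_estimators`` and ``max_depth`` parameters as ``LCEClassifier``:

.. code-block:: python

    from lce import LCERegressor
    from sklearn.datasets import load_diabetes
    from sklearn.model_selection import GridSearchCV

    # Tune the ensemble size and the depth of the LCE trees with a small grid
    X, y = load_diabetes(return_X_y=True)
    param_grid = {"n_estimators": [5, 10], "max_depth": [1, 2]}
    search = GridSearchCV(LCERegressor(n_jobs=-1, random_state=0), param_grid, cv=3)
    search.fit(X, y)
    print("Best parameters: ", search.best_params_)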
241 |
242 | Python Source Files
243 | -------------------
244 |
245 |
250 | .. only:: html
251 |
252 | .. figure:: _images/logo_lce.svg
253 | :alt: LCEClassifier on Iris dataset
254 |
255 | :ref:`sphx_glr_auto_examples_lceclassifier_iris.py`
256 |
261 | .. toctree::
262 | :hidden:
263 |
264 | /auto_examples/lceclassifier_iris
265 |
266 |
267 |
272 | .. only:: html
273 |
274 | .. figure:: _images/logo_lce.svg
275 | :alt: LCEClassifier on Iris dataset with scikit-learn cross-validation score
276 |
277 | :ref:`sphx_glr_auto_examples_lceclassifier_iris_cv.py`
278 |
283 | .. toctree::
284 | :hidden:
285 |
286 | /auto_examples/lceclassifier_iris_cv
287 |
288 |
289 |
294 | .. only:: html
295 |
296 | .. figure:: _images/logo_lce.svg
297 | :alt: LCERegressor on Diabetes dataset
298 |
299 | :ref:`sphx_glr_auto_examples_lceregressor_diabetes.py`
300 |
306 | .. toctree::
307 | :hidden:
308 |
309 | /auto_examples/lceregressor_diabetes
310 |
311 |
316 | .. only:: html
317 |
318 | .. figure:: _images/logo_lce.svg
319 | :alt: LCERegressor on Diabetes dataset with missing values
320 |
321 | :ref:`sphx_glr_auto_examples_lceregressor_missing_diabetes.py`
322 |
328 | .. toctree::
329 | :hidden:
330 |
331 | /auto_examples/lceregressor_missing_diabetes
332 |
333 |
334 |
341 | .. only:: html
342 |
343 | .. container:: sphx-glr-footer
344 | :class: sphx-glr-footer-gallery
345 |
346 |
347 | .. container:: sphx-glr-download sphx-glr-download-python
348 |
349 | :download:`Download all examples in Python source code: auto_examples_python.zip `
350 |
351 |
352 |
353 | .. container:: sphx-glr-download sphx-glr-download-jupyter
354 |
355 | :download:`Download all examples in Jupyter notebooks: auto_examples_jupyter.zip `
356 |
357 |
358 | .. only:: html
359 |
360 | .. rst-class:: sphx-glr-signature
361 |
362 | `Gallery generated by Sphinx-Gallery `_
363 |
--------------------------------------------------------------------------------
/lce/_lightgbm.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
3 | from sklearn.metrics import check_scoring
4 | import lightgbm as lgbm
5 |
6 |
7 | def lgbm_opt_classifier(
8 | X,
9 | y,
10 | n_iter=10,
11 | metric="accuracy",
12 | n_estimators=(10, 50, 100),
13 | max_depth=(3, 6, 9),
14 | num_leaves=(20, 50, 100, 500),
15 | learning_rate=(0.01, 0.1, 0.3, 0.5),
16 | boosting_type=("gbdt",),
17 | min_child_weight=(1, 5, 15, 100),
18 | subsample=(1.0,),
19 | subsample_for_bin=(200000,),
20 | colsample_bytree=(1.0,),
21 | reg_alpha=(0,),
22 | reg_lambda=(0.1, 1.0, 5.0),
23 | n_jobs=None,
24 | random_state=None,
25 | ):
26 | """
27 | Get a LightGBM model with the best hyperparameter configuration.
28 |
29 | Parameters
30 | ----------
31 | X : array-like of shape (n_samples, n_features)
32 | The training input samples.
33 |
34 | y : array-like of shape (n_samples,)
35 | The class labels.
36 |
37 | n_iter: int, default=10
38 | Number of iterations to set the hyperparameters of the base classifier (LightGBM)
39 | in Hyperopt.
40 |
41 | metric: string, default="accuracy"
42 | The score of the base classifier (LightGBM) optimized by Hyperopt. Supported metrics
43 | are the ones from `scikit-learn `_.
44 |
45 | n_estimators : tuple, default=(10, 50, 100)
46 | The number of LightGBM estimators. The number of estimators of
47 | LightGBM corresponds to the number of boosting rounds. The tuple provided is
48 | the search space used for the hyperparameter optimization (Hyperopt).
49 |
50 | max_depth : tuple, default=(3, 6, 9)
51 | Maximum tree depth for LightGBM base learners. The tuple provided is the search
52 | space used for the hyperparameter optimization (Hyperopt).
53 |
54 | num_leaves : tuple, default=(20, 50, 100, 500)
55 | Maximum tree leaves. The tuple provided is the search
56 | space used for the hyperparameter optimization (Hyperopt).
57 |
58 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5)
59 | `learning_rate` of LightGBM. The tuple provided is the search space used for the
60 | hyperparameter optimization (Hyperopt).
61 |
62 | boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",)
63 | The type of boosting to use: "dart" Dropouts meet Multiple Additive
64 | Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest.
65 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).
66 |
67 | min_child_weight : tuple, default=(1, 5, 15, 100)
68 | `min_child_weight` of LightGBM. `min_child_weight` defines the
69 | minimum sum of instance weight (hessian) needed in a child. If the tree
70 | partition step results in a leaf node with the sum of instance weight
71 | less than `min_child_weight`, then the building process will give up further
72 | partitioning. The larger `min_child_weight` is, the more conservative LightGBM
73 | algorithm will be. The tuple provided is the search space used for the hyperparameter
74 | optimization (Hyperopt).
75 |
76 | subsample : tuple, default=(1.0,)
77 | LightGBM subsample ratio of the training instances. Setting it to 0.5 means
78 | that LightGBM would randomly sample half of the training data prior to
79 | growing trees, and this will prevent overfitting. Subsampling will occur
80 | once in every boosting iteration. The tuple provided is the search space used for
81 | the hyperparameter optimization (Hyperopt).
82 |
83 | subsample_for_bin : tuple, default=(200000,)
84 | Number of samples for constructing bins. The tuple provided is the
85 | search space used for the hyperparameter optimization (Hyperopt).
86 |
87 | colsample_bytree : tuple, default=(1.0,)
88 | LightGBM subsample ratio of columns when constructing each tree.
89 | Subsampling occurs once for every tree constructed. The tuple provided is the search
90 | space used for the hyperparameter optimization (Hyperopt).
91 |
92 | reg_alpha : tuple, default=(0,)
93 | `reg_alpha` of LightGBM. `reg_alpha` corresponds to the L1 regularization
94 | term on the weights. Increasing this value will make LightGBM model more
95 | conservative. The tuple provided is the search space used for the hyperparameter
96 | optimization (Hyperopt).
97 |
98 | reg_lambda : tuple, default=(0.1, 1.0, 5.0)
99 | `reg_lambda` of LightGBM. `reg_lambda` corresponds to the L2 regularization
100 | term on the weights. Increasing this value will make LightGBM model more
101 | conservative. The tuple provided is the search space used for the hyperparameter
102 | optimization (Hyperopt).
103 |
104 | n_jobs : int, default=None
105 | The number of jobs to run in parallel.
106 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors.
107 |
108 | random_state : int, RandomState instance or None, default=None
109 | Controls the randomness of the base learner LightGBM and
110 | the Hyperopt algorithm.
111 |
112 | Returns
113 | -------
114 | model: object
115 | LightGBM model with the best configuration and fitted on the input data.
116 | """
117 | # Parameters
118 | classes, y = np.unique(y, return_inverse=True)
119 | n_classes = classes.size
120 |
121 | if n_classes == 2:
122 | objective = "binary"
123 | num_class = 1
124 | else:
125 | objective = "multiclass"
126 | num_class = n_classes
127 |
128 | space = {
129 | "n_estimators": hp.choice("n_estimators", n_estimators),
130 | "max_depth": hp.choice("max_depth", max_depth),
131 | "num_leaves": hp.choice("num_leaves", num_leaves),
132 | "learning_rate": hp.choice("learning_rate", learning_rate),
133 | "boosting_type": hp.choice("boosting_type", boosting_type),
134 | "min_child_weight": hp.choice("min_child_weight", min_child_weight),
135 | "subsample": hp.choice("subsample", subsample),
136 | "subsample_for_bin": hp.choice("subsample_for_bin", subsample_for_bin),
137 | "colsample_bytree": hp.choice("colsample_bytree", colsample_bytree),
138 | "reg_alpha": hp.choice("reg_alpha", reg_alpha),
139 | "reg_lambda": hp.choice("reg_lambda", reg_lambda),
140 | "objective": objective,
141 | "num_class": num_class,
142 | "n_jobs": n_jobs,
143 | "random_state": random_state,
144 | }
145 |
146 | # Get best configuration
147 | def p_model(params):
148 | clf = lgbm.LGBMClassifier(**params, verbose=-1)
149 | clf.fit(X, y)
150 | scorer = check_scoring(clf, scoring=metric)
151 | return scorer(clf, X, y)
152 |
153 | global best
154 | best = -np.inf
155 |
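    # Hyperopt's fmin minimizes the returned loss, so the objective below
    # reports the negative of the best score observed so far.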
156 | def f(params):
157 | global best
158 | perf = p_model(params)
159 | if perf > best:
160 | best = perf
161 | return {"loss": -best, "status": STATUS_OK}
162 |
163 | rstate = np.random.default_rng(random_state)
164 | best_config = fmin(
165 | fn=f,
166 | space=space,
167 | algo=tpe.suggest,
168 | max_evals=n_iter,
169 | trials=Trials(),
170 | rstate=rstate,
171 | verbose=0,
172 | )
173 |
174 | # Fit best model
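    # hp.choice returns the index of the chosen option, so each entry of
    # best_config is mapped back to its value in the original search-space
    # tuples before the final model is fitted.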
175 | final_params = {
176 | "n_estimators": n_estimators[best_config["n_estimators"]],
177 | "max_depth": max_depth[best_config["max_depth"]],
178 | "num_leaves": num_leaves[best_config["num_leaves"]],
179 | "learning_rate": learning_rate[best_config["learning_rate"]],
180 | "boosting_type": boosting_type[best_config["boosting_type"]],
181 | "min_child_weight": min_child_weight[best_config["min_child_weight"]],
182 | "subsample": subsample[best_config["subsample"]],
183 | "subsample_for_bin": subsample_for_bin[best_config["subsample_for_bin"]],
184 | "colsample_bytree": colsample_bytree[best_config["colsample_bytree"]],
185 | "reg_alpha": reg_alpha[best_config["reg_alpha"]],
186 | "reg_lambda": reg_lambda[best_config["reg_lambda"]],
187 | "objective": objective,
188 | "num_class": num_class,
189 | "n_jobs": n_jobs,
190 | "random_state": random_state,
191 | }
192 | clf = lgbm.LGBMClassifier(**final_params, verbose=-1)
193 | return clf.fit(X, y)
194 |
195 |
196 | def lgbm_opt_regressor(
197 | X,
198 | y,
199 | n_iter=10,
200 | metric="neg_mean_squared_error",
201 | n_estimators=(10, 50, 100),
202 | max_depth=(3, 6, 9),
203 | num_leaves=(20, 50, 100, 500),
204 | learning_rate=(0.01, 0.1, 0.3, 0.5),
205 | boosting_type=("gbdt",),
206 | min_child_weight=(1, 5, 15, 100),
207 | subsample=(1.0,),
208 | subsample_for_bin=(200000,),
209 | colsample_bytree=(1.0,),
210 | reg_alpha=(0,),
211 | reg_lambda=(0.1, 1.0, 5.0),
212 | n_jobs=None,
213 | random_state=None,
214 | ):
215 | """
216 | Get a LightGBM model with the best hyperparameter configuration.
217 |
218 | Parameters
219 | ----------
220 | X : array-like of shape (n_samples, n_features)
221 | The training input samples.
222 |
223 | y : array-like of shape (n_samples,)
224 | The target values (real numbers).
225 |
226 | n_iter: int, default=10
227 | Number of iterations to set the hyperparameters of the base regressor (LightGBM)
228 | in Hyperopt.
229 |
230 | metric: string, default="neg_mean_squared_error"
231 | The score of the base regressor (LightGBM) optimized by Hyperopt. Supported metrics
232 | are the ones from `scikit-learn `_.
233 |
234 | n_estimators : tuple, default=(10, 50, 100)
235 | The number of LightGBM estimators. The number of estimators of
236 | LightGBM corresponds to the number of boosting rounds. The tuple provided is
237 | the search space used for the hyperparameter optimization (Hyperopt).
238 |
239 | max_depth : tuple, default=(3, 6, 9)
240 | Maximum tree depth for LightGBM base learners. The tuple provided is the search
241 | space used for the hyperparameter optimization (Hyperopt).
242 |
243 | num_leaves : tuple, default=(20, 50, 100, 500)
244 | Maximum tree leaves. The tuple provided is the search
245 | space used for the hyperparameter optimization (Hyperopt).
246 |
247 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5)
248 | `learning_rate` of LightGBM. The tuple provided is the search space used for the
249 | hyperparameter optimization (Hyperopt).
250 |
251 | boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",)
252 | The type of boosting to use: "dart" Dropouts meet Multiple Additive
253 | Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest.
254 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).
255 |
256 | min_child_weight : tuple, default=(1, 5, 15, 100)
257 | `min_child_weight` of LightGBM. `min_child_weight` defines the
258 | minimum sum of instance weight (hessian) needed in a child. If the tree
259 | partition step results in a leaf node with the sum of instance weight
260 | less than `min_child_weight`, then the building process will give up further
261 | partitioning. The larger `min_child_weight` is, the more conservative LightGBM
262 | algorithm will be. The tuple provided is the search space used for the hyperparameter
263 | optimization (Hyperopt).
264 |
265 | subsample : tuple, default=(1.0,)
266 | LightGBM subsample ratio of the training instances. Setting it to 0.5 means
267 | that LightGBM would randomly sample half of the training data prior to
268 | growing trees, and this will prevent overfitting. Subsampling will occur
269 | once in every boosting iteration. The tuple provided is the search space used for
270 | the hyperparameter optimization (Hyperopt).
271 |
272 | subsample_for_bin : tuple, default=(200000,)
273 | Number of samples for constructing bins. The tuple provided is the
274 | search space used for the hyperparameter optimization (Hyperopt).
275 |
276 | colsample_bytree : tuple, default=(1.0,)
277 | LightGBM subsample ratio of columns when constructing each tree.
278 | Subsampling occurs once for every tree constructed. The tuple provided is the search
279 | space used for the hyperparameter optimization (Hyperopt).
280 |
281 | reg_alpha : tuple, default=(0,)
282 | `reg_alpha` of LightGBM. `reg_alpha` corresponds to the L1 regularization
283 | term on the weights. Increasing this value will make LightGBM model more
284 | conservative. The tuple provided is the search space used for the hyperparameter
285 | optimization (Hyperopt).
286 |
287 | reg_lambda : tuple, default=(0.1, 1.0, 5.0)
288 | `reg_lambda` of LightGBM. `reg_lambda` corresponds to the L2 regularization
289 | term on the weights. Increasing this value will make LightGBM model more
290 | conservative. The tuple provided is the search space used for the hyperparameter
291 | optimization (Hyperopt).
292 |
293 | n_jobs : int, default=None
294 | The number of jobs to run in parallel.
295 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors.
296 |
297 | random_state : int, RandomState instance or None, default=None
298 | Controls the randomness of the base learner LightGBM and
299 | the Hyperopt algorithm.
300 |
301 | Returns
302 | -------
303 | model: object
304 | LightGBM model with the best configuration and fitted on the input data.
305 | """
306 | space = {
307 | "n_estimators": hp.choice("n_estimators", n_estimators),
308 | "max_depth": hp.choice("max_depth", max_depth),
309 | "num_leaves": hp.choice("num_leaves", num_leaves),
310 | "learning_rate": hp.choice("learning_rate", learning_rate),
311 | "boosting_type": hp.choice("boosting_type", boosting_type),
312 | "min_child_weight": hp.choice("min_child_weight", min_child_weight),
313 | "subsample": hp.choice("subsample", subsample),
314 | "subsample_for_bin": hp.choice("subsample_for_bin", subsample_for_bin),
315 | "colsample_bytree": hp.choice("colsample_bytree", colsample_bytree),
316 | "reg_alpha": hp.choice("reg_alpha", reg_alpha),
317 | "reg_lambda": hp.choice("reg_lambda", reg_lambda),
318 | "objective": "regression",
319 | "n_jobs": n_jobs,
320 | "random_state": random_state,
321 | }
322 |
323 | # Get best configuration
324 | def p_model(params):
325 | reg = lgbm.LGBMRegressor(**params, verbose=-1)
326 | reg.fit(X, y)
327 | scorer = check_scoring(reg, scoring=metric)
328 | return scorer(reg, X, y)
329 |
330 | global best
331 | best = -np.inf
332 |
333 | def f(params):
334 | global best
335 | perf = p_model(params)
336 | if perf > best:
337 | best = perf
338 | return {"loss": -best, "status": STATUS_OK}
339 |
340 | rstate = np.random.default_rng(random_state)
341 | best_config = fmin(
342 | fn=f,
343 | space=space,
344 | algo=tpe.suggest,
345 | max_evals=n_iter,
346 | trials=Trials(),
347 | rstate=rstate,
348 | verbose=0,
349 | )
350 |
351 | # Fit best model
352 | final_params = {
353 | "n_estimators": n_estimators[best_config["n_estimators"]],
354 | "max_depth": max_depth[best_config["max_depth"]],
355 | "num_leaves": num_leaves[best_config["num_leaves"]],
356 | "learning_rate": learning_rate[best_config["learning_rate"]],
357 | "boosting_type": boosting_type[best_config["boosting_type"]],
358 | "min_child_weight": min_child_weight[best_config["min_child_weight"]],
359 | "subsample": subsample[best_config["subsample"]],
360 | "subsample_for_bin": subsample_for_bin[best_config["subsample_for_bin"]],
361 | "colsample_bytree": colsample_bytree[best_config["colsample_bytree"]],
362 | "reg_alpha": reg_alpha[best_config["reg_alpha"]],
363 | "reg_lambda": reg_lambda[best_config["reg_lambda"]],
364 | "objective": "regression",
365 | "n_jobs": n_jobs,
366 | "random_state": random_state,
367 | }
368 | reg = lgbm.LGBMRegressor(**final_params, verbose=-1)
369 | return reg.fit(X, y)
370 |
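
if __name__ == "__main__":
    # Illustrative usage sketch (not part of the original module): tune and fit
    # a LightGBM classifier on a small public dataset with the helper above.
    from sklearn.datasets import load_iris

    X_demo, y_demo = load_iris(return_X_y=True)
    demo_model = lgbm_opt_classifier(X_demo, y_demo, n_iter=3, random_state=0)
    print(demo_model.predict(X_demo[:5]))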
--------------------------------------------------------------------------------
/lce/_xgboost.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
3 | from sklearn.metrics import check_scoring
4 | from sklearn.preprocessing import OneHotEncoder
5 | import xgboost as xgb
6 |
7 |
8 | def xgb_opt_classifier(
9 | X,
10 | y,
11 | n_iter=10,
12 | metric="accuracy",
13 | n_estimators=(10, 50, 100),
14 | max_depth=(3, 6, 9),
15 | learning_rate=(0.01, 0.1, 0.3, 0.5),
16 | booster=("gbtree",),
17 | gamma=(0, 1, 10),
18 | min_child_weight=(1, 5, 15, 100),
19 | subsample=(1.0,),
20 | colsample_bytree=(1.0,),
21 | colsample_bylevel=(1.0,),
22 | colsample_bynode=(1.0,),
23 | reg_alpha=(0,),
24 | reg_lambda=(0.1, 1.0, 5.0),
25 | n_jobs=None,
26 | random_state=None,
27 | ):
28 | """
29 | Get an XGBoost model with the best hyperparameter configuration.
30 |
31 | Parameters
32 | ----------
33 | X : array-like of shape (n_samples, n_features)
34 | The training input samples.
35 |
36 | y : array-like of shape (n_samples,)
37 | The class labels.
38 |
39 | n_iter: int, default=10
40 | Number of iterations to set the hyperparameters of the base classifier (XGBoost)
41 | in Hyperopt.
42 |
43 | metric: string, default="accuracy"
44 | The score of the base classifier (XGBoost) optimized by Hyperopt. Supported metrics
45 | are the ones from `scikit-learn `_.
46 |
47 | n_estimators : tuple, default=(10, 50, 100)
48 | The number of XGBoost estimators. The number of estimators of
49 | XGBoost corresponds to the number of boosting rounds. The tuple provided is
50 | the search space used for the hyperparameter optimization (Hyperopt).
51 |
52 | max_depth : tuple, default=(3, 6, 9)
53 | Maximum tree depth for XGBoost base learners. The tuple provided is the search
54 | space used for the hyperparameter optimization (Hyperopt).
55 |
56 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5)
57 | `learning_rate` of XGBoost. The learning rate corresponds to the
58 | step size shrinkage used in update to prevent overfitting. After each
59 | boosting step, the learning rate shrinks the feature weights to make the boosting
60 | process more conservative. The tuple provided is the search space used for the
61 | hyperparameter optimization (Hyperopt).
62 |
63 | booster : ("dart", "gblinear", "gbtree"), default=("gbtree",)
64 | The type of booster to use. "gbtree" and "dart" use tree based models
65 | while "gblinear" uses linear functions. The tuple provided is the search space used
66 | for the hyperparameter optimization (Hyperopt).
67 |
68 | gamma : tuple, default=(0, 1, 10)
69 | `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction
70 | required to make a further partition on a leaf node of the tree.
71 | The larger `gamma` is, the more conservative XGBoost algorithm will be.
72 | The tuple provided is the search space used for the hyperparameter optimization
73 | (Hyperopt).
74 |
75 | min_child_weight : tuple, default=(1, 5, 15, 100)
76 | `min_child_weight` of XGBoost. `min_child_weight` defines the
77 | minimum sum of instance weight (hessian) needed in a child. If the tree
78 | partition step results in a leaf node with the sum of instance weight
79 | less than `min_child_weight`, then the building process will give up further
80 | partitioning. The larger `min_child_weight` is, the more conservative XGBoost
81 | algorithm will be. The tuple provided is the search space used for the hyperparameter
82 | optimization (Hyperopt).
83 |
84 | subsample : tuple, default=(1.0,)
85 | XGBoost subsample ratio of the training instances. Setting it to 0.5 means
86 | that XGBoost would randomly sample half of the training data prior to
87 | growing trees, and this will prevent overfitting. Subsampling will occur
88 | once in every boosting iteration. The tuple provided is the search space used for
89 | the hyperparameter optimization (Hyperopt).
90 |
91 | colsample_bytree : tuple, default=(1.0,)
92 | XGBoost subsample ratio of columns when constructing each tree.
93 | Subsampling occurs once for every tree constructed. The tuple provided is the search
94 | space used for the hyperparameter optimization (Hyperopt).
95 |
96 | colsample_bylevel : tuple, default=(1.0,)
97 | XGBoost subsample ratio of columns for each level. Subsampling occurs
98 | once for every new depth level reached in a tree. Columns are subsampled
99 | from the set of columns chosen for the current tree. The tuple provided is the search
100 | space used for the hyperparameter optimization (Hyperopt).
101 |
102 | colsample_bynode : tuple, default=(1.0,)
103 | XGBoost subsample ratio of columns for each node (split). Subsampling
104 | occurs once every time a new split is evaluated. Columns are subsampled
105 | from the set of columns chosen for the current level. The tuple provided is the search
106 | space used for the hyperparameter optimization (Hyperopt).
107 |
108 | reg_alpha : tuple, default=(0,)
109 | `reg_alpha` of XGBoost. `reg_alpha` corresponds to the L1 regularization
110 | term on the weights. Increasing this value will make XGBoost model more
111 | conservative. The tuple provided is the search space used for the hyperparameter
112 | optimization (Hyperopt).
113 |
114 | reg_lambda : tuple, default=(0.1, 1.0, 5.0)
115 | `reg_lambda` of XGBoost. `reg_lambda` corresponds to the L2 regularization
116 | term on the weights. Increasing this value will make XGBoost model more
117 | conservative. The tuple provided is the search space used for the hyperparameter
118 | optimization (Hyperopt).
119 |
120 | n_jobs : int, default=None
121 | The number of jobs to run in parallel.
122 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors.
123 |
124 | random_state : int, RandomState instance or None, default=None
125 | Controls the randomness of the base learner XGBoost and
126 | the Hyperopt algorithm.
127 |
128 | Returns
129 | -------
130 | model: object
131 | XGBoost model with the best configuration and fitted on the input data.
132 | """
133 | # Parameters
134 | classes, y = np.unique(y, return_inverse=True)
135 | n_classes = classes.size
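    # XGBoost is configured with the multi-class softprob objective for every
    # problem; for a binary task num_class is simply set to 2.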
136 |
137 | space = {
138 | "n_estimators": hp.choice("n_estimators", n_estimators),
139 | "max_depth": hp.choice("max_depth", max_depth),
140 | "learning_rate": hp.choice("learning_rate", learning_rate),
141 | "booster": hp.choice("booster", booster),
142 | "gamma": hp.choice("gamma", gamma),
143 | "min_child_weight": hp.choice("min_child_weight", min_child_weight),
144 | "subsample": hp.choice("subsample", subsample),
145 | "colsample_bytree": hp.choice("colsample_bytree", colsample_bytree),
146 | "colsample_bylevel": hp.choice("colsample_bylevel", colsample_bylevel),
147 | "colsample_bynode": hp.choice("colsample_bynode", colsample_bynode),
148 | "reg_alpha": hp.choice("reg_alpha", reg_alpha),
149 | "reg_lambda": hp.choice("reg_lambda", reg_lambda),
150 | "objective": "multi:softprob",
151 | "num_class": n_classes,
152 | "n_jobs": n_jobs,
153 | "random_state": random_state,
154 | }
155 |
156 | # Get best configuration
157 | def p_model(params):
158 | clf = xgb.XGBClassifier(**params, use_label_encoder=False, verbosity=0)
159 | clf.fit(X, y)
160 | if n_classes == 2:
161 | onehot_encoder = OneHotEncoder(sparse=False)
162 | y_score = onehot_encoder.fit_transform(y.reshape(len(y), 1))
163 | else:
164 | y_score = y
165 | scorer = check_scoring(clf, scoring=metric)
166 | return scorer(clf, X, y_score)
167 |
168 | global best
169 | best = -np.inf
170 |
171 | def f(params):
172 | global best
173 | perf = p_model(params)
174 | if perf > best:
175 | best = perf
176 | return {"loss": -best, "status": STATUS_OK}
177 |
178 | rstate = np.random.default_rng(random_state)
179 | best_config = fmin(
180 | fn=f,
181 | space=space,
182 | algo=tpe.suggest,
183 | max_evals=n_iter,
184 | trials=Trials(),
185 | rstate=rstate,
186 | verbose=0,
187 | )
188 |
189 | # Fit best model
190 | final_params = {
191 | "n_estimators": n_estimators[best_config["n_estimators"]],
192 | "max_depth": max_depth[best_config["max_depth"]],
193 | "learning_rate": learning_rate[best_config["learning_rate"]],
194 | "booster": booster[best_config["booster"]],
195 | "gamma": gamma[best_config["gamma"]],
196 | "min_child_weight": min_child_weight[best_config["min_child_weight"]],
197 | "subsample": subsample[best_config["subsample"]],
198 | "colsample_bytree": colsample_bytree[best_config["colsample_bytree"]],
199 | "colsample_bylevel": colsample_bylevel[best_config["colsample_bylevel"]],
200 | "colsample_bynode": colsample_bynode[best_config["colsample_bynode"]],
201 | "reg_alpha": reg_alpha[best_config["reg_alpha"]],
202 | "reg_lambda": reg_lambda[best_config["reg_lambda"]],
203 | "objective": "multi:softprob",
204 | "num_class": n_classes,
205 | "n_jobs": n_jobs,
206 | "random_state": random_state,
207 | }
208 | clf = xgb.XGBClassifier(**final_params, use_label_encoder=False, verbosity=0)
209 | return clf.fit(X, y)
210 |
211 |
212 | def xgb_opt_regressor(
213 | X,
214 | y,
215 | n_iter=10,
216 | metric="neg_mean_squared_error",
217 | n_estimators=(10, 50, 100),
218 | max_depth=(3, 6, 9),
219 | learning_rate=(0.01, 0.1, 0.3, 0.5),
220 | booster=("gbtree",),
221 | gamma=(0, 1, 10),
222 | min_child_weight=(1, 5, 15, 100),
223 | subsample=(1.0,),
224 | colsample_bytree=(1.0,),
225 | colsample_bylevel=(1.0,),
226 | colsample_bynode=(1.0,),
227 | reg_alpha=(0,),
228 | reg_lambda=(0.1, 1.0, 5.0),
229 | n_jobs=None,
230 | random_state=None,
231 | ):
232 | """
233 | Get an XGBoost model with the best hyperparameter configuration.
234 |
235 | Parameters
236 | ----------
237 | X : array-like of shape (n_samples, n_features)
238 | The training input samples.
239 |
240 | y : array-like of shape (n_samples,)
241 | The target values (real numbers).
242 |
243 | n_iter: int, default=10
244 | Number of iterations to set the hyperparameters of the base regressor (XGBoost)
245 | in Hyperopt.
246 |
247 | metric: string, default="neg_mean_squared_error"
248 | The score of the base regressor (XGBoost) optimized by Hyperopt. Supported metrics
249 | are the ones from `scikit-learn `_.
250 |
251 | n_estimators : tuple, default=(10, 50, 100)
252 | The number of XGBoost estimators. The number of estimators of
253 | XGBoost corresponds to the number of boosting rounds. The tuple provided is
254 | the search space used for the hyperparameter optimization (Hyperopt).
255 |
256 | max_depth : tuple, default=(3, 6, 9)
257 | Maximum tree depth for XGBoost base learners. The tuple provided is the search
258 | space used for the hyperparameter optimization (Hyperopt).
259 |
260 | learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5)
261 | `learning_rate` of XGBoost. The learning rate corresponds to the
262 | step size shrinkage used in update to prevent overfitting. After each
263 | boosting step, the learning rate shrinks the feature weights to make the boosting
264 | process more conservative. The tuple provided is the search space used for the
265 | hyperparameter optimization (Hyperopt).
266 |
267 | booster : ("dart", "gblinear", "gbtree"), default=("gbtree",)
268 | The type of booster to use. "gbtree" and "dart" use tree based models
269 | while "gblinear" uses linear functions. The tuple provided is the search space used
270 | for the hyperparameter optimization (Hyperopt).
271 |
272 | gamma : tuple, default=(0, 1, 10)
273 | `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction
274 | required to make a further partition on a leaf node of the tree.
275 | The larger `gamma` is, the more conservative XGBoost algorithm will be.
276 | The tuple provided is the search space used for the hyperparameter optimization
277 | (Hyperopt).
278 |
279 | min_child_weight : tuple, default=(1, 5, 15, 100)
280 | `min_child_weight` of XGBoost. `min_child_weight` defines the
281 | minimum sum of instance weight (hessian) needed in a child. If the tree
282 | partition step results in a leaf node with the sum of instance weight
283 | less than `min_child_weight`, then the building process will give up further
284 | partitioning. The larger `min_child_weight` is, the more conservative XGBoost
285 | algorithm will be. The tuple provided is the search space used for the hyperparameter
286 | optimization (Hyperopt).
287 |
288 | subsample : tuple, default=(1.0,)
289 | XGBoost subsample ratio of the training instances. Setting it to 0.5 means
290 | that XGBoost would randomly sample half of the training data prior to
291 | growing trees, and this will prevent overfitting. Subsampling will occur
292 | once in every boosting iteration. The tuple provided is the search space used for
293 | the hyperparameter optimization (Hyperopt).
294 |
295 | colsample_bytree : tuple, default=(1.0,)
296 | XGBoost subsample ratio of columns when constructing each tree.
297 | Subsampling occurs once for every tree constructed. The tuple provided is the search
298 | space used for the hyperparameter optimization (Hyperopt).
299 |
300 | colsample_bylevel : tuple, default=(1.0,)
301 | XGBoost subsample ratio of columns for each level. Subsampling occurs
302 | once for every new depth level reached in a tree. Columns are subsampled
303 | from the set of columns chosen for the current tree. The tuple provided is the search
304 | space used for the hyperparameter optimization (Hyperopt).
305 |
306 | colsample_bynode : tuple, default=(1.0,)
307 | XGBoost subsample ratio of columns for each node (split). Subsampling
308 | occurs once every time a new split is evaluated. Columns are subsampled
309 | from the set of columns chosen for the current level. The tuple provided is the search
310 | space used for the hyperparameter optimization (Hyperopt).
311 |
312 | reg_alpha : tuple, default=(0,)
313 | `reg_alpha` of XGBoost. `reg_alpha` corresponds to the L1 regularization
314 | term on the weights. Increasing this value will make XGBoost model more
315 | conservative. The tuple provided is the search space used for the hyperparameter
316 | optimization (Hyperopt).
317 |
318 | reg_lambda : tuple, default=(0.1, 1.0, 5.0)
319 | `reg_lambda` of XGBoost. `reg_lambda` corresponds to the L2 regularization
320 | term on the weights. Increasing this value will make XGBoost model more
321 | conservative. The tuple provided is the search space used for the hyperparameter
322 | optimization (Hyperopt).
323 |
324 | n_jobs : int, default=None
325 | The number of jobs to run in parallel.
326 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors.
327 |
328 | random_state : int, RandomState instance or None, default=None
329 | Controls the randomness of the base learner XGBoost and
330 | the Hyperopt algorithm.
331 |
332 | Returns
333 | -------
334 | model: object
335 | XGBoost model with the best configuration and fitted on the input data.
336 | """
337 | space = {
338 | "n_estimators": hp.choice("n_estimators", n_estimators),
339 | "max_depth": hp.choice("max_depth", max_depth),
340 | "learning_rate": hp.choice("learning_rate", learning_rate),
341 | "booster": hp.choice("booster", booster),
342 | "gamma": hp.choice("gamma", gamma),
343 | "min_child_weight": hp.choice("min_child_weight", min_child_weight),
344 | "subsample": hp.choice("subsample", subsample),
345 | "colsample_bytree": hp.choice("colsample_bytree", colsample_bytree),
346 | "colsample_bylevel": hp.choice("colsample_bylevel", colsample_bylevel),
347 | "colsample_bynode": hp.choice("colsample_bynode", colsample_bynode),
348 | "reg_alpha": hp.choice("reg_alpha", reg_alpha),
349 | "reg_lambda": hp.choice("reg_lambda", reg_lambda),
350 | "objective": "reg:squarederror",
351 | "n_jobs": n_jobs,
352 | "random_state": random_state,
353 | }
354 |
355 | # Get best configuration
356 | def p_model(params):
357 | reg = xgb.XGBRegressor(**params, verbosity=0)
358 | reg.fit(X, y)
359 | scorer = check_scoring(reg, scoring=metric)
360 | return scorer(reg, X, y)
361 |
362 | global best
363 | best = -np.inf
364 |
365 | def f(params):
366 | global best
367 | perf = p_model(params)
368 | if perf > best:
369 | best = perf
370 | return {"loss": -best, "status": STATUS_OK}
371 |
372 | rstate = np.random.default_rng(random_state)
373 | best_config = fmin(
374 | fn=f,
375 | space=space,
376 | algo=tpe.suggest,
377 | max_evals=n_iter,
378 | trials=Trials(),
379 | rstate=rstate,
380 | verbose=0,
381 | )
382 |
383 | # Fit best model
384 | final_params = {
385 | "n_estimators": n_estimators[best_config["n_estimators"]],
386 | "max_depth": max_depth[best_config["max_depth"]],
387 | "learning_rate": learning_rate[best_config["learning_rate"]],
388 | "booster": booster[best_config["booster"]],
389 | "gamma": gamma[best_config["gamma"]],
390 | "min_child_weight": min_child_weight[best_config["min_child_weight"]],
391 | "subsample": subsample[best_config["subsample"]],
392 | "colsample_bytree": colsample_bytree[best_config["colsample_bytree"]],
393 | "colsample_bylevel": colsample_bylevel[best_config["colsample_bylevel"]],
394 | "colsample_bynode": colsample_bynode[best_config["colsample_bynode"]],
395 | "reg_alpha": reg_alpha[best_config["reg_alpha"]],
396 | "reg_lambda": reg_lambda[best_config["reg_lambda"]],
397 | "objective": "reg:squarederror",
398 | "n_jobs": n_jobs,
399 | "random_state": random_state,
400 | }
401 | reg = xgb.XGBRegressor(**final_params, verbosity=0)
402 | return reg.fit(X, y)
403 |
--------------------------------------------------------------------------------
/lce/_lce.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numbers
3 | import numpy as np
4 | from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
5 | from sklearn.ensemble import BaggingClassifier, BaggingRegressor
6 | from sklearn.preprocessing import LabelEncoder
7 | from sklearn.utils.multiclass import check_classification_targets
8 | from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
9 |
10 | from ._lcetree import LCETreeClassifier, LCETreeRegressor
11 |
12 |
13 | class LCEClassifier(ClassifierMixin, BaseEstimator):
14 | """
15 | A **Local Cascade Ensemble (LCE) classifier**. LCEClassifier is **compatible with scikit-learn**;
16 | it passes the `check_estimator `_.
17 | Therefore, it can interact with scikit-learn pipelines and model selection tools.
18 |
19 |
20 | Parameters
21 | ----------
22 | n_estimators : int, default=10
23 | The number of trees in the ensemble.
24 |
25 | bootstrap : bool, default=True
26 | Whether bootstrap samples are used when building trees. If False, the
27 | whole dataset is used to build each tree.
28 |
29 | criterion : {"gini", "entropy"}, default="gini"
30 | The function to measure the quality of a split. Supported criteria are
31 | "gini" for the Gini impurity and "entropy" for the information gain.
32 |
33 | splitter : {"best", "random"}, default="best"
34 | The strategy used to choose the split at each node. Supported strategies
35 | are "best" to choose the best split and "random" to choose the best random
36 | split.
37 |
38 | max_depth : int, default=2
39 | The maximum depth of a tree.
40 |
41 | max_features : int, float or {"auto", "sqrt", "log2"}, default=None
42 | The number of features to consider when looking for the best split:
43 |
44 | - If int, then consider `max_features` features at each split.
45 | - If float, then `max_features` is a fraction and
46 | `round(max_features * n_features)` features are considered at each
47 | split.
48 | - If "auto", then `max_features=sqrt(n_features)`.
49 | - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto").
50 | - If "log2", then `max_features=log2(n_features)`.
51 | - If None, then `max_features=n_features`.
52 |
53 | Note: the search for a split does not stop until at least one
54 | valid partition of the node samples is found, even if it requires to
55 | effectively inspect more than ``max_features`` features.
56 |
57 | max_samples : int or float, default=1.0
58 | The number of samples to draw from X to train each base estimator
59 | (with replacement by default, see ``bootstrap`` for more details).
60 |
61 | - If int, then draw `max_samples` samples.
62 | - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0.0, 1.0]`.
63 |
64 | min_samples_leaf : int or float, default=1
65 | The minimum number of samples required to be at a leaf node.
66 | A split point at any depth will only be considered if it leaves at
67 | least ``min_samples_leaf`` training samples in each of the left and
68 | right branches.
69 |
70 | - If int, then consider `min_samples_leaf` as the minimum number.
71 | - If float, then `min_samples_leaf` is a fraction and
72 | `ceil(min_samples_leaf * n_samples)` are the minimum
73 | number of samples for each node.
74 |
75 | n_iter: int, default=10
76 | Number of iterations to set the hyperparameters of each node base
77 | classifier in Hyperopt.
78 |
79 | metric: string, default="accuracy"
80 | The score of the base classifier optimized by Hyperopt. Supported metrics
81 | are the ones from `scikit-learn `_.
82 |
83 | base_learner : {"catboost", "lightgbm", "xgboost"}, default="xgboost"
84 | The base classifier trained in each node of a tree.
85 |
86 | base_n_estimators : tuple, default=(10, 50, 100)
87 | The number of estimators of the base learner. The tuple provided is
88 | the search space used for the hyperparameter optimization (Hyperopt).
89 |
90 | base_max_depth : tuple, default=(3, 6, 9)
91 | Maximum tree depth for base learners. The tuple provided is the search
92 | space used for the hyperparameter optimization (Hyperopt).
93 |
94 | base_num_leaves : tuple, default=(20, 50, 100, 500)
95 | Maximum tree leaves (applicable to LightGBM only). The tuple provided is the search
96 | space used for the hyperparameter optimization (Hyperopt).
97 |
98 | base_learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5)
99 | `learning_rate` of the base learner. The tuple provided is the search space used for the
100 | hyperparameter optimization (Hyperopt).
101 |
102 | base_booster : ("dart", "gblinear", "gbtree"), default=("gbtree",)
103 | The type of booster to use (applicable to XGBoost only). "gbtree" and "dart" use tree based models
104 | while "gblinear" uses linear functions. The tuple provided is the search space used
105 | for the hyperparameter optimization (Hyperopt).
106 |
107 | base_boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",)
108 | The type of boosting to use (applicable to LightGBM only): "dart" Dropouts meet Multiple Additive
109 | Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest.
110 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).
111 |
112 | base_gamma : tuple, default=(0, 1, 10)
113 | `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction
114 | required to make a further partition on a leaf node of the tree.
115 | The larger `gamma` is, the more conservative XGBoost algorithm will be.
116 | The tuple provided is the search space used for the hyperparameter optimization
117 | (Hyperopt).
118 |
119 | base_min_child_weight : tuple, default=(1, 5, 15, 100)
120 | `min_child_weight` of base learner (applicable to LightGBM and XGBoost only). `min_child_weight` defines the
121 | minimum sum of instance weight (hessian) needed in a child. If the tree
122 | partition step results in a leaf node with the sum of instance weight
123 | less than `min_child_weight`, then the building process will give up further
124 | partitioning. The larger `min_child_weight` is, the more conservative the base learner
125 | algorithm will be. The tuple provided is the search space used for the hyperparameter
126 | optimization (Hyperopt).
127 |
128 | base_subsample : tuple, default=(1.0,)
129 | Base learner subsample ratio of the training instances (applicable to LightGBM and XGBoost only).
130 | Setting it to 0.5 means that the base learner would randomly sample half of the training data prior to
131 | growing trees, and this will prevent overfitting. Subsampling will occur
132 | once in every boosting iteration. The tuple provided is the search space used for
133 | the hyperparameter optimization (Hyperopt).
134 |
135 | base_subsample_for_bin : tuple, default=(200000,)
136 | Number of samples for constructing bins (applicable to LightGBM only). The tuple provided is the
137 | search space used for the hyperparameter optimization (Hyperopt).
138 |
139 | base_colsample_bytree : tuple, default=(1.0,)
140 | Base learner subsample ratio of columns when constructing each tree (applicable to LightGBM and XGBoost only).
141 | Subsampling occurs once for every tree constructed. The tuple provided is the search
142 | space used for the hyperparameter optimization (Hyperopt).
143 |
144 | base_colsample_bylevel : tuple, default=(1.0,)
145 | Subsample ratio of columns for each level (applicable to CatBoost and XGBoost only). Subsampling occurs
146 | once for every new depth level reached in a tree. Columns are subsampled
147 | from the set of columns chosen for the current tree. The tuple provided is the search
148 | space used for the hyperparameter optimization (Hyperopt).
149 |
150 | base_colsample_bynode : tuple, default=(1.0,)
151 | Subsample ratio of columns for each node split (applicable to XGBoost only). Subsampling
152 | occurs once every time a new split is evaluated. Columns are subsampled
153 | from the set of columns chosen for the current level. The tuple provided is the search
154 | space used for the hyperparameter optimization (Hyperopt).
155 |
156 | base_reg_alpha : tuple, default=(0,)
157 | `reg_alpha` of the base learner (applicable to LightGBM and XGBoost only).
158 | `reg_alpha` corresponds to the L1 regularization term on the weights.
159 | Increasing this value will make the base learner more conservative.
160 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).
161 |
162 | base_reg_lambda : tuple, default=(0.1, 1.0, 5.0)
163 | `reg_lambda` of the base learner. `reg_lambda` corresponds to the L2 regularization term
164 | on the weights. Increasing this value will make the base learner more
165 | conservative. The tuple provided is the search space used for the hyperparameter
166 | optimization (Hyperopt).
167 |
168 | n_jobs : int, default=None
169 | The number of jobs to run in parallel.
170 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors.
171 |
172 | random_state : int, RandomState instance or None, default=None
173 | Controls the randomness of the bootstrapping of the samples used
174 | when building trees (if ``bootstrap=True``), the sampling of the
175 | features to consider when looking for the best split at each node
176 | (if ``max_features < n_features``), the base classifier and
177 | the Hyperopt algorithm.
178 |
179 | verbose : int, default=0
180 | Controls the verbosity when fitting.
181 |
182 | Attributes
183 | ----------
184 | base_estimator_ : LCETreeClassifier
185 | The child estimator template used to create the collection of fitted
186 | sub-estimators.
187 |
188 | estimators_ : list of LCETreeClassifier
189 | The collection of fitted sub-estimators.
190 |
191 | classes_ : ndarray of shape (n_classes,) or a list of such arrays
192 | The class labels.
193 |
194 | n_classes_ : int
195 | The number of classes.
196 |
197 | n_features_in_ : int
198 | The number of features when ``fit`` is performed.
199 |
200 | encoder_ : LabelEncoder
201 | The encoder used to map target labels to values between 0 and n_classes-1.
202 |
203 | Notes
204 | -----
205 | The default values for the parameters controlling the size of the trees
206 | (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
207 | unpruned trees which can potentially be very large on some data sets. To
208 | reduce memory consumption, the complexity and size of the trees should be
209 | controlled by setting those parameter values.
210 |
211 | The features are always randomly permuted at each split. Therefore,
212 | the best found split may vary, even with the same training data,
213 | ``max_features=n_features`` and ``bootstrap=False``, if the improvement
214 | of the criterion is identical for several splits enumerated during the
215 | search of the best split. To obtain a deterministic behaviour during
216 | fitting, ``random_state`` has to be fixed.
217 |
218 | References
219 | ----------
220 | .. [1] Fauvel, K., E. Fromont, V. Masson, P. Faverdin and A. Termier. "XEM: An Explainable-by-Design Ensemble Method for Multivariate Time Series Classification", Data Mining and Knowledge Discovery, 36(3):917-957, 2022. https://hal.inria.fr/hal-03599214/document
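
    Examples
    --------
    An illustrative check (added here, not part of the original docstring): since
    LCEClassifier follows the scikit-learn estimator API, it can be validated with
    scikit-learn's ``check_estimator``.

    >>> from sklearn.utils.estimator_checks import check_estimator
    >>> from lce import LCEClassifier
    >>> check_estimator(LCEClassifier())  # doctest: +SKIP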
221 | """
222 |
223 | def __init__(
224 | self,
225 | n_estimators=10,
226 | bootstrap=True,
227 | criterion="gini",
228 | splitter="best",
229 | max_depth=2,
230 | max_features=None,
231 | max_samples=1.0,
232 | min_samples_leaf=1,
233 | n_iter=10,
234 | metric="accuracy",
235 | base_learner="xgboost",
236 | base_n_estimators=(10, 50, 100),
237 | base_max_depth=(3, 6, 9),
238 | base_num_leaves=(20, 50, 100, 500),
239 | base_learning_rate=(0.01, 0.1, 0.3, 0.5),
240 | base_booster=("gbtree",),
241 | base_boosting_type=("gbdt",),
242 | base_gamma=(0, 1, 10),
243 | base_min_child_weight=(1, 5, 15, 100),
244 | base_subsample=(1.0,),
245 | base_subsample_for_bin=(200000,),
246 | base_colsample_bytree=(1.0,),
247 | base_colsample_bylevel=(1.0,),
248 | base_colsample_bynode=(1.0,),
249 | base_reg_alpha=(0,),
250 | base_reg_lambda=(0.1, 1.0, 5.0),
251 | n_jobs=None,
252 | random_state=None,
253 | verbose=0,
254 | ):
255 | self.n_estimators = n_estimators
256 | self.bootstrap = bootstrap
257 | self.criterion = criterion
258 | self.splitter = splitter
259 | self.max_depth = max_depth
260 | self.max_features = max_features
261 | self.max_samples = max_samples
262 | self.min_samples_leaf = min_samples_leaf
263 | self.n_iter = n_iter
264 | self.metric = metric
265 | self.base_learner = base_learner
266 | self.base_n_estimators = base_n_estimators
267 | self.base_max_depth = base_max_depth
268 | self.base_num_leaves = base_num_leaves
269 | self.base_learning_rate = base_learning_rate
270 | self.base_booster = base_booster
271 | self.base_boosting_type = base_boosting_type
272 | self.base_gamma = base_gamma
273 | self.base_min_child_weight = base_min_child_weight
274 | self.base_subsample = base_subsample
275 | self.base_subsample_for_bin = base_subsample_for_bin
276 | self.base_colsample_bytree = base_colsample_bytree
277 | self.base_colsample_bylevel = base_colsample_bylevel
278 | self.base_colsample_bynode = base_colsample_bynode
279 | self.base_reg_alpha = base_reg_alpha
280 | self.base_reg_lambda = base_reg_lambda
281 | self.n_jobs = n_jobs
282 | self.random_state = random_state
283 | self.verbose = verbose
284 |
285 | def _generate_estimator(self):
286 | """Generate an estimator."""
287 | est = LCETreeClassifier()
288 | est.n_classes_in = self.n_classes_
289 | est.criterion = self.criterion
290 | est.splitter = self.splitter
291 | est.max_depth = self.max_depth
292 | est.max_features = self.max_features
293 | est.min_samples_leaf = self.min_samples_leaf
294 | est.n_iter = self.n_iter
295 | est.metric = self.metric
296 | est.base_learner = self.base_learner
297 | est.base_n_estimators = self.base_n_estimators
298 | est.base_max_depth = self.base_max_depth
299 | est.base_num_leaves = self.base_num_leaves
300 | est.base_learning_rate = self.base_learning_rate
301 | est.base_booster = self.base_booster
302 | est.base_boosting_type = self.base_boosting_type
303 | est.base_gamma = self.base_gamma
304 | est.base_min_child_weight = self.base_min_child_weight
305 | est.base_subsample = self.base_subsample
306 | est.base_subsample_for_bin = self.base_subsample_for_bin
307 | est.base_colsample_bytree = self.base_colsample_bytree
308 | est.base_colsample_bylevel = self.base_colsample_bylevel
309 | est.base_colsample_bynode = self.base_colsample_bynode
310 | est.base_reg_alpha = self.base_reg_alpha
311 | est.base_reg_lambda = self.base_reg_lambda
312 | est.n_jobs = self.n_jobs
313 | est.random_state = self.random_state
314 | est.verbose = self.verbose
315 | return est
316 |
317 | def _more_tags(self):
318 | """Update scikit-learn estimator tags."""
319 | return {"allow_nan": True, "requires_y": True}
320 |
321 | def _validate_extra_parameters(self, X):
322 | """Validate parameters not already validated by methods employed."""
323 | # Validate max_depth
324 | if isinstance(self.max_depth, numbers.Integral):
325 | if not (0 <= self.max_depth):
326 | raise ValueError(
327 | "max_depth must be greater than or equal to 0, "
328 | "got {0}.".format(self.max_depth)
329 | )
330 | else:
331 | raise ValueError("max_depth must be int")
332 |
333 | # Validate min_samples_leaf
334 | if isinstance(self.min_samples_leaf, numbers.Integral):
335 | if not 1 <= self.min_samples_leaf:
336 | raise ValueError(
337 | "min_samples_leaf must be at least 1 "
338 | "or in (0, 0.5], got %s" % self.min_samples_leaf
339 | )
340 | elif isinstance(self.min_samples_leaf, float):
341 | if not 0.0 < self.min_samples_leaf <= 0.5:
342 | raise ValueError(
343 | "min_samples_leaf must be at least 1 "
344 | "or in (0, 0.5], got %s" % self.min_samples_leaf
345 | )
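# A float in (0, 0.5] is interpreted as a fraction of the training set,
# e.g. min_samples_leaf=0.1 on 150 samples gives ceil(0.1 * 150) = 15.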
346 | self.min_samples_leaf = int(math.ceil(self.min_samples_leaf * X.shape[0]))
347 | else:
348 | raise ValueError("min_samples_leaf must be int or float")
349 |
350 | # Validate n_iter
351 | if isinstance(self.n_iter, numbers.Integral):
352 | if self.n_iter <= 0:
353 | raise ValueError(
354 | "n_iter must be greater than 0, " "got {0}.".format(self.n_iter)
355 | )
356 | else:
357 | raise ValueError("n_iter must be int")
358 |
359 | # Validate verbose
360 | if isinstance(self.verbose, numbers.Integral):
361 | if self.verbose < 0:
362 | raise ValueError(
363 | "verbose must be greater than or equal to 0, "
364 | "got {0}.".format(self.verbose)
365 | )
366 | else:
367 | raise ValueError("verbose must be int")
368 |
369 | def fit(self, X, y):
370 | """
371 | Build a forest of LCE trees from the training set (X, y).
372 |
373 | Parameters
374 | ----------
375 | X : array-like of shape (n_samples, n_features)
376 | The training input samples.
377 |
378 | y : array-like of shape (n_samples,)
379 | The class labels.
380 |
381 | Returns
382 | -------
383 | self : object
384 | """
385 | X, y = check_X_y(X, y, force_all_finite="allow-nan")
386 | check_classification_targets(y)
387 | self._validate_extra_parameters(X)
388 | self.n_features_in_ = X.shape[1]
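# Lightweight fitted-state flags: predict and predict_proba only check
# that X_ and y_ exist (via check_is_fitted); the training data itself
# is not stored on the estimator.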
389 | self.X_ = True
390 | self.y_ = True
391 | self.classes_, y = np.unique(y, return_inverse=True)
392 | self.n_classes_ = self.classes_.size
393 | self.encoder_ = LabelEncoder()
394 | self.encoder_.fit(self.classes_)
395 | self.base_estimator_ = self._generate_estimator()
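# Bagging of LCE trees: each LCETreeClassifier fits a gradient-boosting
# base learner in every node, and the bagger averages the per-tree
# class probabilities.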
396 | self.estimators_ = BaggingClassifier(
397 | base_estimator=self.base_estimator_,
398 | n_estimators=self.n_estimators,
399 | bootstrap=self.bootstrap,
400 | max_samples=self.max_samples,
401 | n_jobs=self.n_jobs,
402 | random_state=self.random_state,
403 | )
404 | self.estimators_.fit(X, y)
405 | return self
406 |
407 | def predict(self, X):
408 | """
409 | Predict class for X.
410 | The predicted class of an input sample is computed as the class with
411 | the highest mean predicted probability.
412 |
413 | Parameters
414 | ----------
415 | X : array-like of shape (n_samples, n_features)
416 | The input samples.
417 |
418 | Returns
419 | -------
420 | y : ndarray of shape (n_samples,)
421 | The predicted classes.
422 | """
423 | check_is_fitted(self, ["X_", "y_"])
424 | X = check_array(X, force_all_finite="allow-nan")
425 | predictions = self.estimators_.predict(X)
426 | return self.encoder_.inverse_transform(predictions)
427 |
428 | def predict_proba(self, X):
429 | """
430 | Predict class probabilities for X.
431 | The predicted class probabilities of an input sample are computed as
432 | the mean predicted class probabilities of the base estimators in the
433 | ensemble.
434 |
435 | Parameters
436 | ----------
437 | X : array-like of shape (n_samples, n_features)
438 | The input samples.
439 |
440 | Returns
441 | -------
442 | p : ndarray of shape (n_samples, n_classes)
443 | The class probabilities of the input samples. The order of the
444 | classes corresponds to that in the attribute ``classes_``.
445 | """
446 | check_is_fitted(self, ["X_", "y_"])
447 | X = check_array(X, force_all_finite="allow-nan")
448 | return self.estimators_.predict_proba(X)
449 |
450 | def set_params(self, **params):
451 | """
452 | Set the parameters of the estimator.
453 |
454 | Parameters
455 | ----------
456 | **params : dict
457 | Estimator parameters.
458 |
459 | Returns
460 | -------
461 | self : object
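
Examples
--------
A minimal sketch. Note that, in this implementation, only keys matching
an existing attribute are applied; unknown keys are silently ignored
rather than raising an error.

>>> clf = LCEClassifier().set_params(n_estimators=5, max_depth=1)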
462 | """
463 | if not params:
464 | return self
465 |
466 | for key, value in params.items():
467 | if hasattr(self, key):
468 | setattr(self, key, value)
469 |
470 | return self
471 |
472 |
473 | class LCERegressor(RegressorMixin, BaseEstimator):
474 | """
475 | A **Local Cascade Ensemble (LCE) regressor**. LCERegressor is **compatible with scikit-learn**;
476 | it passes the `check_estimator `_.
477 | Therefore, it can interact with scikit-learn pipelines and model selection tools.
478 |
479 |
480 | Parameters
481 | ----------
482 | n_estimators : int, default=10
483 | The number of trees in the ensemble.
484 |
485 | bootstrap : bool, default=True
486 | Whether bootstrap samples are used when building trees. If False, the
487 | whole dataset is used to build each tree.
488 |
489 | criterion : {"squared_error", "friedman_mse", "absolute_error", "poisson"}, default="squared_error"
490 | The function to measure the quality of a split. Supported criteria are "squared_error" for
491 | the mean squared error, which is equal to variance reduction as feature selection
492 | criterion and minimizes the L2 loss using the mean of each terminal node,
493 | "friedman_mse", which uses mean squared error with Friedman's improvement score
494 | for potential splits, "absolute_error" for the mean absolute error, which
495 | minimizes the L1 loss using the median of each terminal node, and "poisson"
496 | which uses reduction in Poisson deviance to find splits.
497 |
498 | splitter : {"best", "random"}, default="best"
499 | The strategy used to choose the split at each node. Supported strategies
500 | are "best" to choose the best split and "random" to choose the best random
501 | split.
502 |
503 | max_depth : int, default=2
504 | The maximum depth of a tree.
505 |
506 | max_features : int, float or {"auto", "sqrt", "log2"}, default=None
507 | The number of features to consider when looking for the best split:
508 |
509 | - If int, then consider `max_features` features at each split.
510 | - If float, then `max_features` is a fraction and
511 | `round(max_features * n_features)` features are considered at each
512 | split.
513 | - If "auto", then `max_features=sqrt(n_features)`.
514 | - If "sqrt", then `max_features=sqrt(n_features)` (same as "auto").
515 | - If "log2", then `max_features=log2(n_features)`.
516 | - If None, then `max_features=n_features`.
517 |
518 | Note: the search for a split does not stop until at least one
519 | valid partition of the node samples is found, even if it requires to
520 | effectively inspect more than ``max_features`` features.
521 |
522 | max_samples : int or float, default=1.0
523 | The number of samples to draw from X to train each base estimator
524 | (with replacement by default, see ``bootstrap`` for more details).
525 |
526 | - If int, then draw `max_samples` samples.
527 | - If float, then draw `max_samples * X.shape[0]` samples. Thus, `max_samples` should be in the interval `(0.0, 1.0]`.
528 |
529 | min_samples_leaf : int or float, default=1
530 | The minimum number of samples required to be at a leaf node.
531 | A split point at any depth will only be considered if it leaves at
532 | least ``min_samples_leaf`` training samples in each of the left and
533 | right branches.
534 |
535 | - If int, then consider `min_samples_leaf` as the minimum number.
536 | - If float, then `min_samples_leaf` is a fraction and
537 | `ceil(min_samples_leaf * n_samples)` are the minimum
538 | number of samples for each node.
539 |
540 | n_iter : int, default=10
541 | Number of iterations to set the hyperparameters of each node base
542 | regressor in Hyperopt.
543 |
544 | metric : str, default="neg_mean_squared_error"
545 | The score of the base regressor optimized by Hyperopt. Supported metrics
546 | are the ones from `scikit-learn `_.
547 |
548 | base_learner : {"catboost", "lightgbm", "xgboost"}, default="xgboost"
549 | The base regressor trained in each node of a tree.
550 |
551 | base_n_estimators : tuple, default=(10, 50, 100)
552 | The number of estimators of the base learner. The tuple provided is
553 | the search space used for the hyperparameter optimization (Hyperopt).
554 |
555 | base_max_depth : tuple, default=(3, 6, 9)
556 | Maximum tree depth for base learners. The tuple provided is the search
557 | space used for the hyperparameter optimization (Hyperopt).
558 |
559 | base_num_leaves : tuple, default=(20, 50, 100, 500)
560 | Maximum tree leaves (applicable to LightGBM only). The tuple provided is the search
561 | space used for the hyperparameter optimization (Hyperopt).
562 |
563 | base_learning_rate : tuple, default=(0.01, 0.1, 0.3, 0.5)
564 | `learning_rate` of the base learner. The tuple provided is the search space used for the
565 | hyperparameter optimization (Hyperopt).
566 |
567 | base_booster : ("dart", "gblinear", "gbtree"), default=("gbtree",)
568 | The type of booster to use (applicable to XGBoost only). "gbtree" and "dart" use tree based models
569 | while "gblinear" uses linear functions. The tuple provided is the search space used
570 | for the hyperparameter optimization (Hyperopt).
571 |
572 | base_boosting_type : ("dart", "gbdt", "rf"), default=("gbdt",)
573 | The type of boosting to use (applicable to LightGBM only): "dart" Dropouts meet Multiple Additive
574 | Regression Trees; "gbdt" traditional Gradient Boosting Decision Tree; "rf" Random Forest.
575 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).
576 |
577 | base_gamma : tuple, default=(0, 1, 10)
578 | `gamma` of XGBoost. `gamma` corresponds to the minimum loss reduction
579 | required to make a further partition on a leaf node of the tree.
580 | The larger `gamma` is, the more conservative the XGBoost algorithm will be.
581 | The tuple provided is the search space used for the hyperparameter optimization
582 | (Hyperopt).
583 |
584 | base_min_child_weight : tuple, default=(1, 5, 15, 100)
585 | `min_child_weight` of base learner (applicable to LightGBM and XGBoost only). `min_child_weight` defines the
586 | minimum sum of instance weight (hessian) needed in a child. If the tree
587 | partition step results in a leaf node with the sum of instance weight
588 | less than `min_child_weight`, then the building process will give up further
589 | partitioning. The larger `min_child_weight` is, the more conservative the base learner
590 | algorithm will be. The tuple provided is the search space used for the hyperparameter
591 | optimization (Hyperopt).
592 |
593 | base_subsample : tuple, default=(1.0,)
594 | Base learner subsample ratio of the training instances (applicable to LightGBM and XGBoost only).
595 | Setting it to 0.5 means that the base learner would randomly sample half of the training data prior to
596 | growing trees, which can help prevent overfitting. Subsampling will occur
597 | once in every boosting iteration. The tuple provided is the search space used for
598 | the hyperparameter optimization (Hyperopt).
599 |
600 | base_subsample_for_bin : tuple, default=(200000,)
601 | Number of samples for constructing bins (applicable to LightGBM only). The tuple provided is the
602 | search space used for the hyperparameter optimization (Hyperopt).
603 |
604 | base_colsample_bytree : tuple, default=(1.0,)
605 | Base learner subsample ratio of columns when constructing each tree (applicable to LightGBM and XGBoost only).
606 | Subsampling occurs once for every tree constructed. The tuple provided is the search
607 | space used for the hyperparameter optimization (Hyperopt).
608 |
609 | base_colsample_bylevel : tuple, default=(1.0,)
610 | Subsample ratio of columns for each level (applicable to CatBoost and XGBoost only). Subsampling occurs
611 | once for every new depth level reached in a tree. Columns are subsampled
612 | from the set of columns chosen for the current tree. The tuple provided is the search
613 | space used for the hyperparameter optimization (Hyperopt).
614 |
615 | base_colsample_bynode : tuple, default=(1.0,)
616 | Subsample ratio of columns for each node split (applicable to XGBoost only). Subsampling
617 | occurs once every time a new split is evaluated. Columns are subsampled
618 | from the set of columns chosen for the current level. The tuple provided is the search
619 | space used for the hyperparameter optimization (Hyperopt).
620 |
621 | base_reg_alpha : tuple, default=(0,)
622 | `reg_alpha` of the base learner (applicable to LightGBM and XGBoost only).
623 | `reg_alpha` corresponds to the L1 regularization term on the weights.
624 | Increasing this value will make the base learner more conservative.
625 | The tuple provided is the search space used for the hyperparameter optimization (Hyperopt).
626 |
627 | base_reg_lambda : tuple, default=(0.1, 1.0, 5.0)
628 | `reg_lambda` of the base learner. `reg_lambda` corresponds to the L2 regularization term
629 | on the weights. Increasing this value will make the base learner more
630 | conservative. The tuple provided is the search space used for the hyperparameter
631 | optimization (Hyperopt).
632 |
633 | n_jobs : int, default=None
634 | The number of jobs to run in parallel.
635 | ``n_jobs=None`` means 1. ``n_jobs=-1`` means using all processors.
636 |
637 | random_state : int, RandomState instance or None, default=None
638 | Controls the randomness of the bootstrapping of the samples used
639 | when building trees (if ``bootstrap=True``), the sampling of the
640 | features to consider when looking for the best split at each node
641 | (if ``max_features < n_features``), the base regressor and
642 | the Hyperopt algorithm.
643 |
644 | verbose : int, default=0
645 | Controls the verbosity when fitting.
646 |
647 | Attributes
648 | ----------
649 | base_estimator_ : LCETreeRegressor
650 | The child estimator template used to create the collection of fitted
651 | sub-estimators.
652 |
653 | estimators_ : BaggingRegressor
654 | The fitted bagging ensemble of LCE trees (LCETreeRegressor).
655 |
656 | n_features_in_ : int
657 | The number of features when ``fit`` is performed.
658 |
659 | Notes
660 | -----
661 | The parameters controlling the size of the trees (e.g. ``max_depth``,
662 | ``min_samples_leaf``) determine model complexity: increasing ``max_depth``
663 | in particular can produce very large trees on some data sets. To
664 | reduce memory consumption, the complexity and size of the trees should be
665 | controlled by setting those parameter values.
666 |
667 | The features are always randomly permuted at each split. Therefore,
668 | the best found split may vary, even with the same training data,
669 | ``max_features=n_features`` and ``bootstrap=False``, if the improvement
670 | of the criterion is identical for several splits enumerated during the
671 | search of the best split. To obtain a deterministic behaviour during
672 | fitting, ``random_state`` has to be fixed.
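
Examples
--------
A minimal, illustrative sketch on synthetic data, assuming ``LCERegressor``
is importable from ``lce`` as in the classification examples.
``make_regression`` is used purely as a stand-in dataset, and the custom
``base_n_estimators`` tuple only illustrates a Hyperopt search space;
expected outputs are omitted since exact scores vary.

>>> from lce import LCERegressor
>>> from sklearn.datasets import make_regression
>>> from sklearn.model_selection import train_test_split
>>> X, y = make_regression(n_samples=100, n_features=4, random_state=0)
>>> X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
>>> reg = LCERegressor(n_estimators=5, base_n_estimators=(10, 50),
...                    n_jobs=-1, random_state=0)
>>> _ = reg.fit(X_train, y_train)
>>> y_pred = reg.predict(X_test)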
673 | """
674 |
675 | def __init__(
676 | self,
677 | n_estimators=10,
678 | bootstrap=True,
679 | criterion="squared_error",
680 | splitter="best",
681 | max_depth=2,
682 | max_features=None,
683 | max_samples=1.0,
684 | min_samples_leaf=1,
685 | metric="neg_mean_squared_error",
686 | n_iter=10,
687 | base_learner="xgboost",
688 | base_n_estimators=(10, 50, 100),
689 | base_max_depth=(3, 6, 9),
690 | base_num_leaves=(20, 50, 100, 500),
691 | base_learning_rate=(0.01, 0.1, 0.3, 0.5),
692 | base_booster=("gbtree",),
693 | base_boosting_type=("gbdt",),
694 | base_gamma=(0, 1, 10),
695 | base_min_child_weight=(1, 5, 15, 100),
696 | base_subsample=(1.0,),
697 | base_subsample_for_bin=(200000,),
698 | base_colsample_bytree=(1.0,),
699 | base_colsample_bylevel=(1.0,),
700 | base_colsample_bynode=(1.0,),
701 | base_reg_alpha=(0,),
702 | base_reg_lambda=(0.1, 1.0, 5.0),
703 | n_jobs=None,
704 | random_state=None,
705 | verbose=0,
706 | ):
707 | self.n_estimators = n_estimators
708 | self.bootstrap = bootstrap
709 | self.criterion = criterion
710 | self.splitter = splitter
711 | self.max_depth = max_depth
712 | self.max_features = max_features
713 | self.max_samples = max_samples
714 | self.min_samples_leaf = min_samples_leaf
715 | self.n_iter = n_iter
716 | self.metric = metric
717 | self.base_learner = base_learner
718 | self.base_n_estimators = base_n_estimators
719 | self.base_max_depth = base_max_depth
720 | self.base_num_leaves = base_num_leaves
721 | self.base_learning_rate = base_learning_rate
722 | self.base_booster = base_booster
723 | self.base_boosting_type = base_boosting_type
724 | self.base_gamma = base_gamma
725 | self.base_min_child_weight = base_min_child_weight
726 | self.base_subsample = base_subsample
727 | self.base_subsample_for_bin = base_subsample_for_bin
728 | self.base_colsample_bytree = base_colsample_bytree
729 | self.base_colsample_bylevel = base_colsample_bylevel
730 | self.base_colsample_bynode = base_colsample_bynode
731 | self.base_reg_alpha = base_reg_alpha
732 | self.base_reg_lambda = base_reg_lambda
733 | self.n_jobs = n_jobs
734 | self.random_state = random_state
735 | self.verbose = verbose
736 |
737 | def _generate_estimator(self):
738 | """Generate an estimator."""
739 | est = LCETreeRegressor()
740 | est.criterion = self.criterion
741 | est.splitter = self.splitter
742 | est.max_depth = self.max_depth
743 | est.max_features = self.max_features
744 | est.min_samples_leaf = self.min_samples_leaf
745 | est.n_iter = self.n_iter
746 | est.metric = self.metric
747 | est.base_learner = self.base_learner
748 | est.base_n_estimators = self.base_n_estimators
749 | est.base_max_depth = self.base_max_depth
750 | est.base_num_leaves = self.base_num_leaves
751 | est.base_learning_rate = self.base_learning_rate
752 | est.base_booster = self.base_booster
753 | est.base_boosting_type = self.base_boosting_type
754 | est.base_gamma = self.base_gamma
755 | est.base_min_child_weight = self.base_min_child_weight
756 | est.base_subsample = self.base_subsample
757 | est.base_subsample_for_bin = self.base_subsample_for_bin
758 | est.base_colsample_bytree = self.base_colsample_bytree
759 | est.base_colsample_bylevel = self.base_colsample_bylevel
760 | est.base_colsample_bynode = self.base_colsample_bynode
761 | est.base_reg_alpha = self.base_reg_alpha
762 | est.base_reg_lambda = self.base_reg_lambda
763 | est.n_jobs = self.n_jobs
764 | est.random_state = self.random_state
765 | est.verbose = self.verbose
766 | return est
767 |
768 | def _more_tags(self):
769 | """Update scikit-learn estimator tags."""
770 | return {"allow_nan": True, "requires_y": True}
771 |
772 | def _validate_extra_parameters(self, X):
773 | """Validate parameters not already validated by methods employed."""
774 | # Validate max_depth
775 | if isinstance(self.max_depth, numbers.Integral):
776 | if not (0 <= self.max_depth):
777 | raise ValueError(
778 | "max_depth must be greater than or equal to 0, "
779 | "got {0}.".format(self.max_depth)
780 | )
781 | else:
782 | raise ValueError("max_depth must be int")
783 |
784 | # Validate min_samples_leaf
785 | if isinstance(self.min_samples_leaf, numbers.Integral):
786 | if not 1 <= self.min_samples_leaf:
787 | raise ValueError(
788 | "min_samples_leaf must be at least 1 "
789 | "or in (0, 0.5], got %s" % self.min_samples_leaf
790 | )
791 | elif isinstance(self.min_samples_leaf, float):
792 | if not 0.0 < self.min_samples_leaf <= 0.5:
793 | raise ValueError(
794 | "min_samples_leaf must be at least 1 "
795 | "or in (0, 0.5], got %s" % self.min_samples_leaf
796 | )
797 | self.min_samples_leaf = int(math.ceil(self.min_samples_leaf * X.shape[0]))
798 | else:
799 | raise ValueError("min_samples_leaf must be int or float")
800 |
801 | # Validate n_iter
802 | if isinstance(self.n_iter, numbers.Integral):
803 | if self.n_iter <= 0:
804 | raise ValueError(
805 | "n_iter must be greater than 0, " "got {0}.".format(self.n_iter)
806 | )
807 | else:
808 | raise ValueError("n_iter must be int")
809 |
810 | # Validate verbose
811 | if isinstance(self.verbose, numbers.Integral):
812 | if self.verbose < 0:
813 | raise ValueError(
814 | "verbose must be greater than or equal to 0, "
815 | "got {0}.".format(self.verbose)
816 | )
817 | else:
818 | raise ValueError("verbose must be int")
819 |
820 | def fit(self, X, y):
821 | """
822 | Build a forest of LCE trees from the training set (X, y).
823 |
824 | Parameters
825 | ----------
826 | X : array-like of shape (n_samples, n_features)
827 | The training input samples.
828 |
829 | y : array-like of shape (n_samples,)
830 | The target values (real numbers).
831 |
832 | Returns
833 | -------
834 | self : object
835 | """
836 | X, y = check_X_y(X, y, y_numeric=True, force_all_finite="allow-nan")
837 | self._validate_extra_parameters(X)
838 | self.n_features_in_ = X.shape[1]
839 | self.X_ = True
840 | self.y_ = True
841 | self.base_estimator_ = self._generate_estimator()
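# Bagging of LCE trees; BaggingRegressor averages the per-tree predictions.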
842 | self.estimators_ = BaggingRegressor(
843 | base_estimator=self.base_estimator_,
844 | n_estimators=self.n_estimators,
845 | bootstrap=self.bootstrap,
846 | max_samples=self.max_samples,
847 | n_jobs=self.n_jobs,
848 | random_state=self.random_state,
849 | )
850 | self.estimators_.fit(X, y)
851 | return self
852 |
853 | def predict(self, X):
854 | """
855 | Predict regression target for X.
856 | The predicted regression target of an input sample is computed as the
857 | mean predicted regression targets of the trees in the forest.
858 |
859 | Parameters
860 | ----------
861 | X : array-like of shape (n_samples, n_features)
862 | The input samples.
863 |
864 | Returns
865 | -------
866 | y : ndarray of shape (n_samples,)
867 | The predicted values.
868 | """
869 | check_is_fitted(self, ["X_", "y_"])
870 | X = check_array(X, force_all_finite="allow-nan")
871 | return self.estimators_.predict(X)
872 |
873 | def set_params(self, **params):
874 | """
875 | Set the parameters of the estimator.
876 |
877 | Parameters
878 | ----------
879 | **params : dict
880 | Estimator parameters.
881 |
882 | Returns
883 | -------
884 | self : object
885 | """
886 | if not params:
887 | return self
888 |
889 | for key, value in params.items():
890 | if hasattr(self, key):
891 | setattr(self, key, value)
892 |
893 | return self
894 |
--------------------------------------------------------------------------------