├── .github
│   └── workflows
│       └── ci.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .readthedocs.yaml
├── CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── backbone_learn
│   ├── __init__.py
│   ├── backbone
│   │   ├── __init__ .py
│   │   ├── backbone_base.py
│   │   ├── backbone_clustering.py
│   │   ├── backbone_decision_tree.py
│   │   ├── backbone_sparse_regression.py
│   │   ├── backbone_supervised.py
│   │   ├── backbone_unsupervised.py
│   │   ├── subproblem_costructor.py
│   │   └── subproblem_feature_selector.py
│   ├── exact_solvers
│   │   ├── __init__.py
│   │   ├── benders_oct_decision_tree.py
│   │   ├── exact_solver_base.py
│   │   ├── lobnb_regression.py
│   │   ├── lobnb_regression_model.py
│   │   └── mio_clustering.py
│   ├── heuristic_solvers
│   │   ├── __init__.py
│   │   ├── cart_decision_tree.py
│   │   ├── heauristic_solver_base.py
│   │   ├── kmeans_solver.py
│   │   └── lasso_regression.py
│   ├── screen_selectors
│   │   ├── __init__.py
│   │   ├── linear_regression_selector.py
│   │   ├── pearson_correlation_selector.py
│   │   ├── screen_selector_base.py
│   │   └── variable_selector.py
│   └── utils
│       ├── __init__ .py
│       └── utils.py
├── docs
│   ├── Makefile
│   ├── api_docs
│   │   ├── backbone_learn.exact_solvers.rst
│   │   ├── backbone_learn.heuristic_solvers.rst
│   │   ├── backbone_learn.rst
│   │   └── backbone_learn.screen_selectors.rst
│   ├── conf.py
│   ├── contribution_guide
│   │   ├── CONDUCT.rst
│   │   └── CONTRIBUTING.rst
│   ├── developer_guide
│   │   ├── custom_implementation_guide.rst
│   │   └── example_usage_customized_backbone.rst
│   ├── getting_started
│   │   ├── clustering.rst
│   │   ├── decision_trees.rst
│   │   ├── installation.rst
│   │   └── sparse_regression.rst
│   ├── index.rst
│   ├── make.bat
│   ├── modules.rst
│   ├── requirements.txt
│   └── user_guide
│       ├── backbone_framework.rst
│       ├── indicators_explanation.rst
│       └── overview.rst
├── examples
│   ├── clustering_toy_example.ipynb
│   ├── decision_tree_toy_example.ipynb
│   └── sparse_regression_toy_example.ipynb
├── experiments
│   ├── __init__.py
│   ├── benchmark_clustering.py
│   ├── benchmark_decision_tree.py
│   ├── benchmark_sparse_regression.py
│   ├── clustering_results.json
│   ├── decision_tree_results.json
│   ├── sparse_regression_results.json
│   └── utils.py
├── poetry.lock
├── pyproject.toml
├── references.md
└── tests
    ├── __init__.py
    ├── test_backbone
    │   ├── test_backbone_clustering.py
    │   ├── test_backbone_decision_tree.py
    │   ├── test_backbone_sparse_regression.py
    │   ├── test_subproblem_constructor.py
    │   └── test_subproblem_selector.py
    ├── test_exact_solvers
    │   ├── test_benders_oct_decision_tree.py
    │   ├── test_lobnb_regression.py
    │   └── test_mio_clustering.py
    ├── test_heuristic_solvers
    │   ├── test_cart_decision_tree.py
    │   ├── test_kmeans.py
    │   └── test_sparse_regression.py
    ├── test_screen_selectors
    │   ├── test_linear_regression.py
    │   └── test_pearson_correlation.py
    └── test_utils.py
/.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI workflow 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | test: 11 | 12 | runs-on: ubuntu-latest 13 | 14 | strategy: 15 | matrix: 16 | python-version: ['3.9', '3.10', '3.11'] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | 26 | - name: Install Poetry 27 | run: | 28 | pip install poetry 29 | 30 | - name: Install dependencies with Poetry 31 | run: | 32 | poetry install 33 | 34 | - name: Install odtlearn from GitHub with Poetry Environment 35 | run: | 36 | poetry run pip install git+https://github.com/D3M-Research-Group/odtlearn.git#egg=odtlearn 37 | - name: Install pytest-cov within Poetry
Environment 38 | run: | 39 | poetry run pip install pytest-cov 40 | 41 | - name: Run pytest using Poetry 42 | run: | 43 | poetry run pytest --cov=backbone_learn --cov-report=xml 44 | 45 | - name: Upload coverage to Codecov 46 | uses: codecov/codecov-action@v3 47 | with: 48 | token: ${{ secrets.CODECOV_TOKEN }} 49 | file: ./coverage.xml 50 | flags: unittests 51 | name: codecov-umbrella 52 | fail_ci_if_error: false 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | .DS_Store 6 | draft/ 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | cover/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | .pybuilder/ 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | # For a library or package, you might want to ignore these files since the code is 88 | # intended to run in multiple environments; otherwise, check them in: 89 | # .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # poetry 99 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 100 | # This is especially recommended for binary packages to ensure reproducibility, and is more 101 | # commonly ignored for libraries. 102 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 103 | #poetry.lock 104 | 105 | # pdm 106 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 107 | #pdm.lock 108 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 109 | # in version control. 110 | # https://pdm.fming.dev/#use-with-ide 111 | .pdm.toml 112 | 113 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 114 | __pypackages__/ 115 | 116 | # Celery stuff 117 | celerybeat-schedule 118 | celerybeat.pid 119 | 120 | # SageMath parsed files 121 | *.sage.py 122 | 123 | # Environments 124 | .env 125 | .venv 126 | env/ 127 | venv/ 128 | ENV/ 129 | env.bak/ 130 | venv.bak/ 131 | 132 | # Spyder project settings 133 | .spyderproject 134 | .spyproject 135 | 136 | # Rope project settings 137 | .ropeproject 138 | 139 | # mkdocs documentation 140 | /site 141 | 142 | # mypy 143 | .mypy_cache/ 144 | .dmypy.json 145 | dmypy.json 146 | 147 | # Pyre type checker 148 | .pyre/ 149 | 150 | # pytype static type analyzer 151 | .pytype/ 152 | 153 | # Cython debug symbols 154 | cython_debug/ 155 | 156 | # PyCharm 157 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 158 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 159 | # and can be added to the global gitignore or merged into this file. For a more nuclear 160 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 161 | #.idea/ 162 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-byte-order-marker 6 | - id: check-merge-conflict 7 | - id: check-symlinks 8 | - id: check-toml 9 | - id: check-yaml 10 | - id: detect-private-key 11 | - id: end-of-file-fixer 12 | - id: mixed-line-ending 13 | - id: trailing-whitespace 14 | - repo: https://github.com/psf/black 15 | rev: 22.8.0 16 | hooks: 17 | - id: black 18 | args: [--line-length=100, --exclude=""] 19 | 20 | # this is not technically always safe but usually is 21 | # use comments `# isort: off` and `# isort: on` to disable/re-enable isort 22 | - repo: https://github.com/pycqa/isort 23 | rev: 5.12.0 24 | hooks: 25 | - id: isort 26 | args: [--line-length=100, --profile=black] 27 | 28 | # this is slightly dangerous because python imports have side effects 29 | # and this tool removes unused imports, which may be providing 30 | # necessary side effects for the code to run 31 | - repo: https://github.com/PyCQA/autoflake 32 | rev: v1.6.1 33 | hooks: 34 | - id: autoflake 35 | args: 36 | - "--in-place" 37 | - "--expand-star-imports" 38 | - "--remove-duplicate-keys" 39 | - "--remove-unused-variables" 40 | - "--remove-all-unused-imports" 41 | 42 | # https://github.com/openai/evals/blob/main/.pre-commit-config.yaml 43 | # https://github.com/run-llama/llama_index/blob/main/.pre-commit-config.yaml 44 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 2 | version: 2 3 | 4 | # Set the OS, Python version and other tools you might need 5 | build: 6 | os: ubuntu-22.04 7 | tools: 8 | python: "3.9" 9 | 10 | python: 11 | install: 12 | - requirements: docs/requirements.txt 13 | 14 | 15 | # Build documentation in the "docs/" directory with Sphinx 16 | sphinx: 17 | configuration: docs/conf.py 18 | -------------------------------------------------------------------------------- /CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct for BackboneLearn 
2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | 26 | ## Our Responsibilities 27 | 28 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 29 | 30 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 31 | 32 | ## Scope 33 | 34 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 35 | 36 | ## Enforcement 37 | 38 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at chziakas@gmail.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. 39 | 40 | This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org), version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 41 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to BackboneLearn 2 | 3 | ## Introduction 4 | 5 | Thank you for considering a contribution to BackboneLearn! 6 | This project is dedicated to advancing the field of mixed integer optimization in machine learning. Contributions in various forms, such as new methods, bug fixes, or documentation enhancements, are all highly valued. 
7 | 8 | ## How to Contribute 9 | 10 | ### Step 1: Understanding the Project 11 | 12 | Explore the [BackboneLearn documentation](README.md) to understand the project's scope, functionalities, and architecture. We encourage future contributors to review our research paper and other key open-source libraries. 13 | 14 | ### Step 2: Setting Up 15 | 16 | **Fork and Clone the Repository:** 17 | 18 | * Fork the BackboneLearn repository on GitHub. 19 | * Clone your fork to your local environment. 20 | 21 | **Environment Setup:** 22 | 23 | * Install Python 3.9, if not already installed. 24 | * Set up your development environment using Poetry for dependency management: 25 | 26 | ```bash 27 | pip install poetry 28 | poetry install 29 | ``` 30 | * Utilize pre-commit hooks to maintain code quality: 31 | 32 | ```bash 33 | pre-commit install 34 | ``` 35 | 36 | ### Step 3: Making Changes 37 | 38 | **Finding or Creating an Issue:** 39 | 40 | * Check the [GitHub issues](https://github.com/chziakas/backbone-learn/issues) for existing tasks or bugs. 41 | * You can also create new issues to propose features or improvements. 42 | 43 | **Creating a Branch:** 44 | 45 | * Create a new branch in your fork for your changes. 46 | 47 | **Developing:** 48 | 49 | * Implement your changes, ensuring they adhere to the project's coding standards. 50 | * Write or update tests using Pytest. 51 | * Run and pass all tests before submission. 52 | 53 | ### Step 4: Submitting Your Contribution 54 | 55 | **Committing and Pushing Changes:** 56 | 57 | * Commit your changes with clear messages. 58 | * Push your changes to your fork on GitHub. 59 | 60 | **Creating a Pull Request:** 61 | 62 | * Open a pull request against the main BackboneLearn repository. 63 | * Describe your changes in detail and link to any relevant issues. 64 | 65 | **Code Review and GitHub Actions:** 66 | 67 | * Engage in the code review process. 68 | * Your code will be automatically checked by GitHub Actions. 69 | * Make necessary changes based on feedback. 70 | 71 | ## Reporting Issues and Suggestions 72 | 73 | **Check Existing Issues:** 74 | 75 | Before reporting, search the GitHub issues to make sure it hasn't already been reported. 76 | 77 | **Create New Issue:** 78 | 79 | Navigate to 'Issues' in the BackboneLearn repository and click 'New Issue'. 80 | 81 | **Fill the Template and Submit:** 82 | 83 | Provide a detailed title and description. Include steps to reproduce, expected and actual behavior, code snippets, or screenshots if applicable. After completing the form, click 'Submit new issue'. 84 | 85 | ## Custom Implementation Guide 86 | 87 | To build customized backbone algorithms for BackboneLearn, follow these steps: 88 | 89 | ### Custom Screening Method 90 | 91 | **Extend `ScreenSelectorBase` in `'backbone_learn/screen_selectors'`.** 92 | 93 | **Implement the `calculate_utilities` Method:** 94 | 95 | * Compute utilities or importances for each feature based on your criteria. 96 | * Features with the lowest scores may be considered for elimination. 97 | * The number of features to keep is `alpha * n_features`. 98 | 99 | ### Custom Heuristic Method 100 | 101 | **Extend `HeuristicSolverBase` in `'backbone_learn/heuristic_solvers'`.** 102 | **Implement `fit` and `get_relevant_features`:** 103 | 104 | * Train a model within each subproblem efficiently. 105 | * Fit a sparse model to the data inputs of the subproblem. 106 | * Identify and extract relevant features. 107 | * Define necessary parameters in the `init` or `fit` method. A minimal sketch follows below.
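For orientation, here is a minimal sketch of such a heuristic solver. It is illustrative only: `CustomHeuristicSolver` and its `lasso_alpha` parameter are hypothetical, and the sketch implements `get_relevant_variables(threshold)`, the method name the backbone classes in this repository actually call (the guide above refers to `get_relevant_features`); check the actual `HeuristicSolverBase` signature and adapt accordingly.

```python
import numpy as np
from sklearn.linear_model import Lasso

# Base class lives in backbone_learn/heuristic_solvers/heauristic_solver_base.py
from backbone_learn.heuristic_solvers.heauristic_solver_base import HeuristicSolverBase


class CustomHeuristicSolver(HeuristicSolverBase):
    """Illustrative solver: fits a Lasso model on each subproblem and reports
    the local indices of coefficients whose magnitude exceeds a threshold."""

    def __init__(self, lasso_alpha: float = 0.1, **kwargs):
        self.lasso_alpha = lasso_alpha  # hypothetical solver-specific parameter
        self._model = None

    def fit(self, X: np.ndarray, y: np.ndarray, random_state: int = 0) -> None:
        # Called once per subproblem per iteration, so it must stay cheap.
        self._model = Lasso(alpha=self.lasso_alpha, random_state=random_state)
        self._model.fit(X, y)

    def get_relevant_variables(self, threshold: float) -> np.ndarray:
        # Subproblem-local indices of the features deemed relevant; the
        # backbone maps these back to global feature indices.
        return np.where(np.abs(self._model.coef_) > threshold)[0]
```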
108 | 109 | ### Custom Exact Solver 110 | 111 | **Extend `ExactSolverBase` in `'backbone_learn/exact_solvers'`.** 112 | **Implement `fit` and `predict` Methods:** 113 | 114 | * Apply `fit` to the reduced backbone set. 115 | * Use a method with optimality guarantees. 116 | * Ensure the model can be used for prediction. 117 | 118 | ### Custom Backbone Algorithm 119 | 120 | **Extend `BackboneSupervised` or `BackboneUnsupervised` in `'backbone_learn/backbone'`.** 121 | **Implement `set_solvers`:** 122 | 123 | * Add customized screen selector, heuristic solver, and exact solver. 124 | * Optionally define a screen selector and heuristic solver. 125 | * Pass parameters manually. 126 | 127 | ### Example Usage for Customized Backbone Algorithm 128 | 129 | ```python 130 | class CustomBackboneAlgorithm(BackboneSupervised): 131 | def set_solvers(self, **kwargs): 132 | self.screen_selector = CustomScreenSelector(**kwargs) 133 | self.heuristic_solver = CustomHeuristicSolver(**kwargs) 134 | self.exact_solver = CustomExactSolver(**kwargs) 135 | 136 | # Initialize with custom parameters 137 | backbone_algorithm = CustomBackboneAlgorithm(alpha=0.5, beta=0.3, num_subproblems=3, **kwargs) 138 | 139 | # Fit the model 140 | backbone_algorithm.fit(X, y) 141 | 142 | # Make predictions 143 | predictions = backbone_algorithm.predict(X_new) 144 | 145 | ``` 146 | 147 | ## Questions or Issues? 148 | 149 | If you encounter any issues or have questions, please open a discussion issue on GitHub. 150 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | # BackboneLearn: A Library for Scaling Mixed-Integer Optimization-Based Machine Learning 4 | 5 | [![Paper](https://img.shields.io/badge/arXiv-2211.11747-red)](https://arxiv.org/abs/2311.13695) 6 | [![API](https://img.shields.io/website-up-down-green-red/http/monip.org.svg)](http://backbone-learn.readthedocs.io) 7 | [![PyPI package](https://badge.fury.io/py/backbone-learn.svg)](https://pypi.org/project/backbone-learn/) 8 | [![codecov](https://codecov.io/github/chziakas/backbone-learn/graph/badge.svg?token=XB32XVKLSB)](https://app.codecov.io/gh/chziakas/backbone-learn) 9 | [![Codacy Badge](https://app.codacy.com/project/badge/Grade/6c042e62d2fa48d09bf58095c170410b)](https://app.codacy.com/gh/chziakas/backbone-learn/dashboard?utm_source=gh&utm_medium=referral&utm_content=&utm_campaign=Badge_grade) 10 | 11 |
12 | 13 | ## Overview 14 | 15 | *BackboneLearn* is an open-source software package and framework for scaling mixed integer optimization (MIO) problems with indicator variables to high-dimensional problems. This optimization paradigm can naturally be used to formulate fundamental problems in interpretable supervised learning (e.g., sparse regression and decision trees), in unsupervised learning (e.g., clustering), and beyond; *BackboneLearn* solves the aforementioned problems faster than exact methods and with higher accuracy than commonly used heuristics, while also scaling to large problem sizes. The package is built in Python and is user-friendly and easily extensible: users can directly implement a backbone algorithm for their MIO problem at hand. 16 | 17 | ### What do we mean by indicators? 18 | 19 | Indicators are binary variables that are part of the MIO problem we use to train the aforementioned models. 20 | 21 | * Sparse regression: Each regression coefficient (and the corresponding feature) is paired with an indicator, which is 1 if the coefficient is nonzero and 0 otherwise. 22 | * Decision trees: An indicator corresponds to a feature in a decision tree node, being nonzero if that feature is chosen for branching at that node. 23 | * Clustering: An indicator represents whether a pair of data points is in the same cluster, being nonzero if they are clustered together. 24 | 25 | ## BackboneLearn 26 | 27 | The backbone framework, upon which *BackboneLearn* is built, operates in two phases: we first extract a “backbone set” of potentially “relevant indicators” (i.e., indicators that are nonzero in the optimal solution) by solving a number of specially chosen, tractable subproblems; we then use traditional techniques to solve a reduced problem to optimality or near-optimality, considering only the backbone indicators. A screening step often precedes the first phase, to discard indicators that are almost surely irrelevant. For more details, check the paper by Bertsimas and Digalakis Jr (2022).
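In simplified pseudocode, the two phases look roughly like this. This is a sketch, not the library's exact API: solver interfaces are abbreviated, and subproblems are drawn uniformly at random here, whereas the actual implementation in `backbone_learn/backbone/backbone_base.py` samples features with probabilities derived from utility scores.

```python
import numpy as np

def backbone_fit(X, y, screen_selector, heuristic_solver, exact_solver,
                 num_subproblems=5, num_iterations=10, beta=0.5, seed=0):
    """Simplified sketch of the two-phase backbone framework."""
    rng = np.random.default_rng(seed)

    # Optional screening step: discard indicators that are almost surely irrelevant.
    if screen_selector is not None:
        X = screen_selector.select(X, y)

    # Phase 1: build the backbone set by solving tractable subproblems heuristically.
    n_sub = int(np.ceil(beta * X.shape[1]))
    backbone = set()
    for _ in range(num_iterations):
        for _ in range(num_subproblems):
            features = sorted(rng.choice(X.shape[1], size=n_sub, replace=False))
            heuristic_solver.fit(X[:, features], y)
            # Map subproblem-local indices back to global feature indices.
            backbone |= {features[i] for i in heuristic_solver.get_relevant_variables()}

    # Phase 2: solve the reduced problem to (near-)optimality over the backbone only.
    exact_solver.fit(X[:, sorted(backbone)], y)
    return exact_solver
```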
28 | 29 | ## Getting Started 30 | 31 | ### Installation 32 | 33 | Install BackboneLearn using pip: 34 | ```bash 35 | pip install backbone-learn 36 | ``` 37 | Note: For ODT implementations, please follow the instructions described in the library to compile CBC from source using coinbrew: https://github.com/D3M-Research-Group/odtlearn#cbc-binaries 38 | 39 | ### Example Usage 40 | Here are some example usages of BackboneLearn for different tasks; the corresponding notebooks are in 'examples': 41 | 42 | #### Sparse Regression with L0BnB Model 43 | 44 | ```python 45 | from backbone_learn.backbone.backbone_sparse_regression import BackboneSparseRegression 46 | # Initialize BackboneSparseRegression 47 | backbone = BackboneSparseRegression(alpha=0.5, beta=0.5, num_subproblems=5, num_iterations=1, lambda_2=0.001, max_nonzeros=10) 48 | # Fit the model 49 | backbone.fit(X, y) 50 | # Make predictions 51 | y_pred = backbone.predict(X) 52 | ``` 53 | 54 | #### Decision Trees with BendersOCT Model 55 | 56 | ```python 57 | from backbone_learn.backbone.backbone_decision_tree import BackboneDecisionTree 58 | # Initialize BackboneDecisionTree 59 | backbone = BackboneDecisionTree(alpha=0.5, beta=0.5, num_subproblems=5, num_iterations=1, depth=3, _lambda=0.5) 60 | # Fit the model 61 | backbone.fit(X, y) 62 | # Make predictions 63 | y_pred = backbone.predict(X) 64 | ``` 65 | 66 | #### Clustering with MIO Formulation Model 67 | 68 | ```python 69 | from backbone_learn.backbone.backbone_clustering import BackboneClustering 70 | # Initialize BackboneClustering 71 | backbone = BackboneClustering(beta=1.0, num_subproblems=5, num_iterations=1, n_clusters=5) 72 | # Fit the model 73 | backbone.fit(X) 74 | # Make predictions 75 | y_pred = backbone.predict(X) 76 | ``` 77 | 78 | ### Running Simulations 79 | 80 | To run benchmark simulations, use the following commands and adjust the hyper-parameters for each script in 'experiments': 81 | 82 | ```bash 83 | # Install Poetry dependencies 84 | poetry install 85 | # Activate Poetry shell (optional but recommended) 86 | poetry shell 87 | # Run benchmark simulations for decision trees, sparse regression, and clustering 88 | python experiments/benchmark_decision_tree.py 89 | python experiments/benchmark_sparse_regression.py 90 | python experiments/benchmark_clustering.py 91 | ``` 92 | 93 | ## Custom Implementation Guide 94 | 95 | Follow these steps to implement your backbone algorithms in *BackboneLearn*: 96 | 97 | ### Custom Screening Method 98 | 99 | * Extend `ScreenSelectorBase` in 'backbone_learn/screen_selectors'. 100 | * Implement the `calculate_utilities` method, which computes utilities (or importances) for each feature based on your screening criteria. Features with the lowest scores may be candidates for elimination. The number of features to keep is defined as a fraction of the total features (`alpha * n_features`). 101 | 102 | ### Custom Heuristic Method 103 | 104 | * Extend `HeuristicSolverBase` in 'backbone_learn/heuristic_solvers'. 105 | * Implement `fit` and `get_relevant_features`. The `fit` method trains a model within each subproblem, which needs to be highly efficient. This function fits a sparse model (regression, trees, etc.) to the data inputs of the subproblem. The `get_relevant_features` method identifies and extracts all features deemed relevant by the model. Be sure to add any necessary parameters in the `init` or `fit` method and pass them to the backbone. The fraction of features to include in the feature set for each subproblem is defined by 'beta'.
The number of subproblems and iterations are defined by 'num_subproblems' and 'num_iterations', respectively. 106 | 107 | ### Custom Exact Solver 108 | 109 | * Extend `ExactSolverBase` in 'backbone_learn/exact_solvers'. 110 | * Implement `fit` and `predict` methods. The `fit` function is applied to the reduced backbone set and does not need to be efficient; instead, it should use a method with optimality guarantees. The model parameters can again be defined in `init` and passed in the backbone class. The `predict` function ensures that the model can be used for prediction. 111 | 112 | ### Custom Backbone Algorithm 113 | 114 | * Extend `BackboneSupervised` for supervised algorithms or `BackboneUnsupervised` for unsupervised algorithms in 'backbone_learn/backbone'. The supervised algorithms perform feature selection, while the unsupervised algorithms perform data selection. 115 | * Implement `set_solvers`. Add your customized screen selector, heuristic solver, and exact solver as class instances. It's optional to define a screen selector and heuristic solver. We recommend passing parameters for each solver manually. See examples in `backbone_learn/backbone/backbone_decision_tree`. 116 | 117 | ### Example Usage for Customized Backbone Algorithm 118 | 119 | Here's an example of how you can create a custom Backbone algorithm for a supervised method: 120 | 121 | ```python 122 | class CustomBackboneAlgorithm(BackboneSupervised): 123 | def set_solvers(self, **kwargs): 124 | # Init screen selector or set None to skip 125 | self.screen_selector = CustomScreenSelector(**kwargs) 126 | # Init heuristic solver or set None to skip 127 | self.heuristic_solver = CustomHeuristicSolver(**kwargs) 128 | # Init exact solver 129 | self.exact_solver = CustomExactSolver(**kwargs) 130 | ``` 131 | 132 | Here's how you can use the custom Backbone algorithm: 133 | 134 | ```python 135 | # Initialize with custom parameters 136 | backbone_algorithm = CustomBackboneAlgorithm(alpha=0.5, beta=0.3, num_subproblems=3, **kwargs) 137 | 138 | # Fit the model 139 | backbone_algorithm.fit(X, y) 140 | 141 | # Make predictions 142 | predictions = backbone_algorithm.predict(X_new) 143 | ``` 144 | 145 | ## Contributing 146 | 147 | We welcome contributions to BackboneLearn! Whether it's improving the code, fixing bugs, or enhancing documentation, every contribution is valuable. Please refer to our [CONTRIBUTING.md](CONTRIBUTING.md) for detailed information on how you can contribute. 148 | 149 | ## Code of Conduct 150 | 151 | In our commitment to fostering an inclusive and respectful community, we expect all participants to adhere to our Code of Conduct. Please read the [Code of Conduct](CONDUCT.md) to understand the expectations for all those who interact within the BackboneLearn community. 152 | 153 | ## Citing BackboneLearn 154 | 155 | If you find BackboneLearn useful in your research, please consider citing the following papers.
156 | 157 | Paper 1 (Toolkit): 158 | 159 | > @article{digalakis2023backbonelearn, 160 | > title={BackboneLearn: A Library for Scaling Mixed-Integer Optimization-Based Machine Learning}, 161 | > author={Vassilis Digalakis Jr and Christos Ziakas}, 162 | > year={2023}, 163 | > eprint={2311.13695}, 164 | > archivePrefix={arXiv}, 165 | > primaryClass={cs.LG} 166 | > } 167 | 168 | Paper 2 (Methodology): 169 | 170 | > @article{bertsimas2022backbone, 171 | > title={The backbone method for ultra-high dimensional sparse machine learning}, 172 | > author={Bertsimas, Dimitris and Digalakis Jr, Vassilis}, 173 | > journal={Machine Learning}, 174 | > volume={111}, 175 | > number={6}, 176 | > pages={2161--2212}, 177 | > year={2022}, 178 | > publisher={Springer} 179 | > } 180 | 181 | ## License 182 | 183 | [MIT License](LICENSE) 184 | -------------------------------------------------------------------------------- /backbone_learn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chziakas/backbone-learn/576749c7a1ffa1e57ea4c018a88a052d20fad2e2/backbone_learn/__init__.py -------------------------------------------------------------------------------- /backbone_learn/backbone/__init__ .py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chziakas/backbone-learn/576749c7a1ffa1e57ea4c018a88a052d20fad2e2/backbone_learn/backbone/__init__ .py -------------------------------------------------------------------------------- /backbone_learn/backbone/backbone_base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import logging 5 | from abc import ABC, abstractmethod 6 | from typing import List, Tuple 7 | 8 | import numpy as np 9 | 10 | from .subproblem_costructor import SubproblemConstructor 11 | 12 | logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s") 13 | 14 | 15 | class BackboneBase(ABC): 16 | """ 17 | Base class for backbone algorithms. 18 | 19 | Attributes: 20 | beta (float): Proportion of screened variables to use in subproblems. 21 | num_subproblems (int): The number of subproblems to create. 22 | threshold (float): Threshold for determining feature significance. 23 | num_iterations (int): Number of iterations. 24 | screen_selector (ScreenSelectorBase): An instance of a screen selector class. 25 | exact_solver (ExactSolverBase): An instance of an exact solver class. 26 | heuristic_solver (HeuristicSolverBase): An instance of a heuristic solver class. 27 | variables_exact_idx (List[int]): Indices of variables selected for the exact solver. 28 | """ 29 | 30 | def __init__( 31 | self, 32 | beta: float = 0.5, 33 | num_subproblems: int = 5, 34 | threshold: float = 0.001, 35 | num_iterations: int = 10, 36 | **kwargs, 37 | ): 38 | """ 39 | Initializes the BackboneBase with specific parameters. 40 | 41 | Args: 42 | beta (float): Proportion of screened variables to use in subproblems. 43 | num_subproblems (int): The number of subproblems to create. 44 | threshold (float, optional): Threshold for determining feature significance. 45 | num_iterations (int, optional): Number of iterations.
46 | """ 47 | self.beta = beta 48 | self.num_subproblems = num_subproblems 49 | self.threshold = threshold 50 | self.num_iterations = num_iterations 51 | self.screen_selector = None 52 | self.exact_solver = None 53 | self.heuristic_solver = None 54 | self.variables_exact_idx = None 55 | self.n_samples_backbone = None 56 | self.init_parameters = kwargs 57 | self.set_solvers(**kwargs) 58 | 59 | @abstractmethod 60 | def set_solvers(self, **kwargs): 61 | """ 62 | Initialize and set the solvers of the Backbone algorithm. 63 | """ 64 | 65 | @abstractmethod 66 | def preprocessing_backbone(self, X_selected: np.ndarray) -> np.ndarray: 67 | """Preprocess data specific to the learning method during backbone construction.""" 68 | 69 | @abstractmethod 70 | def set_utilities(self, X: np.ndarray) -> np.ndarray: 71 | """Set utilities based on the learning method.""" 72 | 73 | @abstractmethod 74 | def utilize_variables( 75 | self, X_selected: np.ndarray, variables_exact_idx: List[int] 76 | ) -> np.ndarray: 77 | """Utilize variables based on the learning method.""" 78 | 79 | @abstractmethod 80 | def preprocessing_predict(self, X: np.ndarray) -> np.ndarray: 81 | """Preprocess data for making predictions based on the learning method.""" 82 | 83 | @abstractmethod 84 | def get_relevant_variables( 85 | self, feature_idx: List[int], threshold: float = None 86 | ) -> List[Tuple[int, int]]: 87 | """ 88 | Abstract method to retrieve relevant variables based on the learning type. 89 | 90 | This method should be implemented in subclasses to handle the specifics of 91 | identifying relevant variables in both supervised and unsupervised learning contexts. 92 | 93 | Args: 94 | feature_idx (List[int]): List of feature indices to consider. 95 | threshold (float, optional): A threshold value used for variable selection, 96 | applicable in some supervised learning scenarios. 97 | 98 | Returns: 99 | List[Tuple[int, int]]: A list of tuples where each tuple represents a pair of indices 100 | of relevant variables. The structure of these tuples and what they 101 | represent may vary between supervised and unsupervised learning. 102 | """ 103 | 104 | def fit(self, X: np.ndarray, y: np.ndarray = None): 105 | """ 106 | Run the backbone method using the specified screen selector, exact solver, and heuristic solver. 107 | 108 | Args: 109 | X (np.ndarray): Feature dataset. 110 | y (np.ndarray): Target values. 111 | """ 112 | X_selected, utilities = self._perform_screen_selection(X, y) 113 | self.variables_exact_idx = self._construct_backbone(X_selected, utilities, y) 114 | self._fit_exact_solver(X_selected, y) 115 | 116 | def _perform_screen_selection( 117 | self, X: np.ndarray, y: np.ndarray 118 | ) -> Tuple[np.ndarray, np.ndarray]: 119 | """ 120 | Perform screen selection if a screen selector is provided. 121 | 122 | Args: 123 | X (np.ndarray): Feature dataset. 124 | y (np.ndarray): Target values. 125 | 126 | Returns: 127 | Tuple[np.ndarray, np.ndarray]: The selected features and their utilities. 128 | """ 129 | if self.screen_selector: 130 | logging.info("Screen selection started.") 131 | X_selected = self.screen_selector.select(X, y) 132 | utilities = self.screen_selector.utilities[self.screen_selector.indices_keep] 133 | logging.info( 134 | f"Number of variables included in the heuristic solver: {X_selected.shape[1]}" 135 | ) 136 | else: 137 | logging.info("Screen selection skipped. 
Using all features.") 138 | X_selected = X 139 | utilities = self.set_utilities(X) 140 | 141 | return X_selected, utilities 142 | 143 | def _construct_backbone( 144 | self, X_selected: np.ndarray, utilities: np.ndarray, y: np.ndarray = None 145 | ) -> List: 146 | """ 147 | Construct the backbone using a heuristic solver, or return all features if the heuristic solver is not provided. 148 | 149 | Args: 150 | X_selected (np.ndarray): Selected feature dataset after screen selection. 151 | utilities (np.ndarray): Utilities of the selected features. 152 | y (np.ndarray): Target values. 153 | 154 | Returns: 155 | List: The indices of the variables selected by the backbone or all feature indices if no heuristic solver is provided. 156 | """ 157 | if not self.heuristic_solver: 158 | logging.info("Heuristic solver not provided. Using all features for the exact solver.") 159 | return list(range(X_selected.shape[1])) 160 | 161 | X_selected = self.preprocessing_backbone(X_selected) 162 | logging.info( 163 | f"""Backbone construction with heuristic solver started for iterations:{self.num_iterations}, 164 | subproblems:{self.num_subproblems} , and beta:{self.beta}""" 165 | ) 166 | backbone_sets = [] 167 | for i in range(self.num_iterations): 168 | logging.info(f"Iteration {i + 1} started.") 169 | constructor = SubproblemConstructor(utilities, self.beta, self.num_subproblems) 170 | subproblems = constructor.construct_subproblems() 171 | for sub, feature_idx in enumerate(subproblems): 172 | feature_idx.sort() 173 | subset = X_selected[:, feature_idx] 174 | self.n_samples_backbone = len(feature_idx) 175 | self.heuristic_solver.__init__(**self.init_parameters) 176 | subset = self.preprocessing_backbone(subset) 177 | self.heuristic_solver.fit(subset, y, random_state=sub) 178 | rel_variables_global = self.get_relevant_variables(feature_idx, self.threshold) 179 | backbone_sets.append(rel_variables_global) 180 | logging.info(f"Iteration {i + 1} completed.") 181 | backbone_set = self.build_backbone_set(backbone_sets) 182 | if self.screen_selector is None: 183 | logging.info(f"Backbone set idx: {backbone_set}") 184 | else: 185 | logging.info(f"Backbone set idx: {self.screen_selector.indices_keep[backbone_set]}") 186 | return backbone_set 187 | 188 | def _fit_exact_solver(self, X_selected: np.ndarray, y: np.ndarray): 189 | """ 190 | Fit the exact solver with the selected variables from the backbone. 191 | 192 | Args: 193 | X_selected (np.ndarray): Selected feature dataset after screen selection. 194 | y (np.ndarray): Target values. 195 | """ 196 | # logging.info(f"Backbone set constructed with variables: {self.variables_exact_idx}") 197 | X_selected_exact = self.utilize_variables(X_selected, self.variables_exact_idx) 198 | logging.info( 199 | f"Number of variables included in the exact solver: {X_selected_exact.shape[1]}" 200 | ) 201 | self.exact_solver.fit(X_selected_exact, y) 202 | logging.info("Exact problem solved.") 203 | 204 | def predict(self, X: np.ndarray) -> np.ndarray: 205 | """ 206 | Makes predictions using the fitted exact solver model. 207 | 208 | Args: 209 | X (np.ndarray): The input feature matrix. 210 | 211 | Returns: 212 | np.ndarray: Predicted values. 
213 | """ 214 | X_pred = self.preprocessing_predict(X) 215 | return self.exact_solver.predict(X_pred) 216 | -------------------------------------------------------------------------------- /backbone_learn/backbone/backbone_clustering.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from ..exact_solvers.mio_clustering import MIOClustering 5 | from ..heuristic_solvers.kmeans_solver import KMeansSolver 6 | from .backbone_unsupervised import BackboneUnsupervised 7 | 8 | 9 | class BackboneClustering(BackboneUnsupervised): 10 | """ 11 | Specific implementation of the Backbone method for clustering. 12 | 13 | This class uses K-means for heuristic solving and retains MIO optimzer for exact solving. 14 | No screen selector is used in this approach, as K-means is considered efficient for feature selection. 15 | 16 | Inherits from: 17 | BackboneBase (ABC): The abstract base class for backbone algorithms. 18 | """ 19 | 20 | def set_solvers(self, n_clusters: int = 10, time_limit: int = 1000): 21 | """ 22 | Initializes the clustering method with specified components. 23 | 24 | Args: 25 | n_clusters (int, optional): Number of clusters for K-means. Defaults to 10. 26 | time_limit (int): Time limit for the optimization process. 27 | """ 28 | self.screen_selector = None # No screen selector for this clustering approach 29 | self.heuristic_solver = KMeansSolver(n_clusters=n_clusters) 30 | self.exact_solver = MIOClustering(n_clusters=n_clusters, time_limit=time_limit) 31 | -------------------------------------------------------------------------------- /backbone_learn/backbone/backbone_decision_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from ..exact_solvers.benders_oct_decision_tree import BendersOCTDecisionTree 5 | from ..heuristic_solvers.cart_decision_tree import CARTDecisionTree 6 | from ..screen_selectors.pearson_correlation_selector import PearsonCorrelationSelector 7 | from .backbone_supervised import BackboneSupervised 8 | 9 | 10 | class BackboneDecisionTree(BackboneSupervised): 11 | """ 12 | Specific implementation of the Backbone method for sparse regression. 13 | 14 | This class combines Pearson correlation for feature screening, L0BnB for exact solving, and Lasso for heuristic solving to construct a sparse regression model. 15 | 16 | Inherits from: 17 | BackboneBase (ABC): The abstract base class for backbone algorithms. 18 | """ 19 | 20 | def set_solvers( 21 | self, 22 | alpha=0.5, 23 | depth=3, 24 | time_limit=1000, 25 | _lambda=0.5, 26 | num_threads=None, 27 | obj_mode="acc", 28 | n_bins=2, 29 | is_data_fit=False, 30 | ): 31 | """ 32 | Initializes the sparse regression method with specified components. 33 | 34 | Args: 35 | alpha (float): Proportion of features to retain after screening. Defaults to 0.5. 36 | depth (int, optional): Depth of BendersOCT tree. Defaults to 3. 37 | time_limit (int): Time limit for the optimization process. 38 | _lambda (float): Regularization parameter. 39 | num_threads (int or None): Number of threads for parallel processing. 40 | obj_mode (str): Objective mode, e.g., 'acc' for accuracy. 41 | n_bins (int): Number of bins for KBinsDiscretizer. Defaults to 2. 
42 | is_data_fit (bool): Whether data are in the format required for OCT 43 | """ 44 | self.screen_selector = PearsonCorrelationSelector(alpha) 45 | self.exact_solver = BendersOCTDecisionTree( 46 | depth=depth, 47 | time_limit=time_limit, 48 | _lambda=_lambda, 49 | num_threads=num_threads, 50 | obj_mode=obj_mode, 51 | n_bins=n_bins, 52 | is_data_fit=is_data_fit, 53 | ) 54 | self.heuristic_solver = CARTDecisionTree() 55 | -------------------------------------------------------------------------------- /backbone_learn/backbone/backbone_sparse_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from ..exact_solvers.lobnb_regression import L0BnBRegression 5 | from ..heuristic_solvers.lasso_regression import LassoRegression 6 | from ..screen_selectors.pearson_correlation_selector import PearsonCorrelationSelector 7 | from .backbone_supervised import BackboneSupervised 8 | 9 | 10 | class BackboneSparseRegression(BackboneSupervised): 11 | """ 12 | Specific implementation of the Backbone method for sparse regression. 13 | 14 | This class combines Pearson correlation for feature screening, L0BnB for exact solving, and Lasso for heuristic solving to construct a sparse regression model. 15 | 16 | Inherits from: 17 | BackboneBase (ABC): The abstract base class for backbone algorithms. 18 | """ 19 | 20 | def set_solvers( 21 | self, alpha: float = 0.5, lambda_2: float = 0.001, max_nonzeros: int = 10, time_limit=1000 22 | ): 23 | """ 24 | Initializes the sparse regression method with specified components. 25 | 26 | Args: 27 | alpha (float): Proportion of features to retain after screening. 28 | lambda_2 (float, optional): Regularization parameter lambda_2 in the L0BnB model, controlling the trade-off 29 | between the model's complexity and fit. Defaults to 0.001. 30 | max_nonzeros (int, optional): Maximum number of non-zero coefficients the model is allowed to have, 31 | enforcing sparsity. Defaults to 10. 32 | time_limit (int): Time limit for the optimization process. 33 | """ 34 | self.screen_selector = PearsonCorrelationSelector(alpha) 35 | self.exact_solver = L0BnBRegression(lambda_2, max_nonzeros, time_limit) 36 | self.heuristic_solver = LassoRegression() 37 | -------------------------------------------------------------------------------- /backbone_learn/backbone/backbone_supervised.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from typing import List 5 | 6 | import numpy as np 7 | 8 | from ..utils.utils import Utils 9 | from .backbone_base import BackboneBase 10 | 11 | 12 | class BackboneSupervised(BackboneBase): 13 | """ 14 | Implementation for supervised learning specific operations. 15 | """ 16 | 17 | def preprocessing_backbone(self, X_selected: np.ndarray) -> np.ndarray: 18 | """ 19 | Perform preprocessing specific to supervised learning during backbone construction. 20 | 21 | Args: 22 | X_selected (np.ndarray): The selected feature dataset after screen selection. 23 | 24 | Returns: 25 | np.ndarray: The preprocessed dataset, which is the same as input for supervised learning. 26 | """ 27 | return X_selected 28 | 29 | def set_utilities(self, X: np.ndarray) -> np.ndarray: 30 | """ 31 | Set utilities for supervised learning, typically one for each feature. 
32 | 33 | Args: 34 | X (np.ndarray): The feature dataset. 35 | 36 | Returns: 37 | np.ndarray: An array of utilities, one for each feature. 38 | """ 39 | return np.ones(X.shape[1]) 40 | 41 | def utilize_variables( 42 | self, X_selected: np.ndarray, variables_exact_idx: List[int] 43 | ) -> np.ndarray: 44 | """ 45 | Utilize selected variables in the dataset after processing the backbone sets. 46 | 47 | Args: 48 | X_selected (np.ndarray): The selected feature dataset after screen selection. 49 | variables_exact_idx (List[int]): List of indices for variables selected by the backbone. 50 | 51 | Returns: 52 | np.ndarray: Dataset with only the selected variables. 53 | """ 54 | return X_selected[:, variables_exact_idx] 55 | 56 | def preprocessing_predict(self, X: np.ndarray) -> np.ndarray: 57 | """ 58 | Preprocess the dataset before making predictions in supervised learning. 59 | 60 | Args: 61 | X (np.ndarray): The input feature matrix. 62 | 63 | Returns: 64 | np.ndarray: Preprocessed dataset for prediction. 65 | """ 66 | 67 | if self.screen_selector is not None: 68 | X = X[:, self.screen_selector.indices_keep] 69 | if self.heuristic_solver is not None: 70 | X = X[:, self.variables_exact_idx] 71 | return X 72 | 73 | def get_relevant_variables(self, feature_idx: List[int], threshold: float = None) -> List[int]: 74 | """ 75 | Implements the retrieval of relevant variables for supervised learning. 76 | 77 | In supervised learning, this typically involves identifying variables or features 78 | that are relevant based on a certain threshold or criterion. 79 | 80 | Args: 81 | feature_idx (List[int]): List of feature indices to consider. 82 | threshold (float, optional): Threshold value for variable selection. 83 | 84 | Returns: 85 | List[int]: A list of indices representing the relevant variables. 86 | """ 87 | # Get relevant variables from the heuristic solver 88 | rel_variables_local = self.heuristic_solver.get_relevant_variables(threshold) 89 | 90 | # Return the global indices of the relevant variables 91 | return [feature_idx[idx] for idx in rel_variables_local] 92 | 93 | def build_backbone_set(self, backbone_sets): 94 | """ 95 | Merges a list of backbone sets into a single list, removes duplicates, and sorts the backbone list. 96 | 97 | Args: 98 | backbone_sets (list of list of int): The list of lists of backbone sets 99 | 100 | Returns: 101 | list: A backbone set sorted with unique elements. 102 | """ 103 | 104 | return Utils.merge_lists_and_sort(backbone_sets) 105 | -------------------------------------------------------------------------------- /backbone_learn/backbone/backbone_unsupervised.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import numpy as np 4 | 5 | from ..utils.utils import Utils 6 | from .backbone_base import BackboneBase 7 | 8 | 9 | class BackboneUnsupervised(BackboneBase): 10 | """ 11 | Implementation for unsupervised learning specific operations. 12 | """ 13 | 14 | def preprocessing_backbone(self, X_selected: np.ndarray) -> np.ndarray: 15 | """ 16 | Perform preprocessing specific to unsupervised learning during backbone construction. 17 | This typically involves transposing the dataset. 18 | 19 | Args: 20 | X_selected (np.ndarray): The selected feature dataset after screen selection. 21 | 22 | Returns: 23 | np.ndarray: The transposed dataset for unsupervised learning. 
24 | """ 25 | return X_selected.transpose() 26 | 27 | def set_utilities(self, X: np.ndarray) -> np.ndarray: 28 | """ 29 | Set utilities for unsupervised learning, typically one for each sample. 30 | 31 | Args: 32 | X (np.ndarray): The feature dataset. 33 | 34 | Returns: 35 | np.ndarray: An array of utilities, one for each sample. 36 | """ 37 | return np.ones(X.shape[0]) 38 | 39 | def utilize_variables( 40 | self, X_selected: np.ndarray, variables_exact_idx: List[int] 41 | ) -> np.ndarray: 42 | """ 43 | Utilize selected variables in the dataset after processing the backbone sets in unsupervised learning. 44 | In unsupervised learning, the entire dataset is often used as is. 45 | 46 | Args: 47 | X_selected (np.ndarray): The selected feature dataset after screen selection. 48 | variables_exact_idx (List[int]): List of indices for variables selected by the backbone (unused in unsupervised). 49 | 50 | Returns: 51 | np.ndarray: Dataset with all features, as variable selection is typically not performed in unsupervised learning. 52 | """ 53 | if self.heuristic_solver is not None: 54 | self.exact_solver.ls_pairs_diff_cluster = variables_exact_idx 55 | return X_selected 56 | 57 | def preprocessing_predict(self, X: np.ndarray) -> np.ndarray: 58 | """ 59 | Preprocess the dataset before making predictions in unsupervised learning. 60 | Typically, the entire dataset is used as is. 61 | 62 | Args: 63 | X (np.ndarray): The input feature matrix. 64 | 65 | Returns: 66 | np.ndarray: The original dataset, as preprocessing is typically not required for predictions in unsupervised learning. 67 | """ 68 | return X 69 | 70 | def get_relevant_variables( 71 | self, feature_idx: List[int], threshold: float = None 72 | ) -> List[Tuple[int, int]]: 73 | """ 74 | Implements the retrieval of relevant variables for unsupervised learning. 75 | 76 | In unsupervised learning, this method identifies pairs of variables that 77 | are considered relevant based on the learning model used. 78 | 79 | Args: 80 | feature_idx (List[int]): List of feature indices to consider. 81 | 82 | Returns: 83 | List[Tuple[int, int]]: A list of tuples, where each tuple contains a pair of indices 84 | representing relevant variable pairs. 85 | """ 86 | rel_variables_local = self.heuristic_solver.get_relevant_variables() 87 | return rel_variables_local 88 | 89 | def build_backbone_set(self, backbone_sets) -> List: 90 | """ 91 | Find tuples that are common to all backbone sets 92 | 93 | Args: 94 | backbone_sets (list of list of int): The list of lists of backbone sets 95 | 96 | Returns: 97 | list: A backbone set with the tuples 98 | """ 99 | excluded_pairs = Utils.merge_lists_and_sort(backbone_sets) 100 | num_points = self.n_samples_backbone 101 | self.exact_solver.ls_pairs_same_cluster = Utils.generate_index_pairs( 102 | num_points, excluded_pairs 103 | ) 104 | 105 | return Utils.find_common_tuples(backbone_sets) 106 | -------------------------------------------------------------------------------- /backbone_learn/backbone/subproblem_costructor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from typing import List 5 | 6 | import numpy as np 7 | 8 | from .subproblem_feature_selector import SubproblemFeatureSelector 9 | 10 | 11 | class SubproblemConstructor: 12 | """ 13 | Manages the construction of all subproblems. 
14 | """ 15 | 16 | def __init__(self, utilities: np.ndarray, beta: float, num_subproblems: int): 17 | """ 18 | Initializes the SubproblemConstructor with given parameters. 19 | 20 | Args: 21 | utilities (np.ndarray): Array of feature utilities. 22 | beta (float): Proportion of screened features to use in each subproblem. 23 | num_subproblems (int): Number of subproblems to create. 24 | """ 25 | self._utilities = utilities 26 | self._num_features = utilities.shape[0] 27 | self._beta = beta 28 | self._num_features_subproblem = int(np.ceil(beta * self._num_features)) 29 | self._num_subproblems = num_subproblems 30 | 31 | @property 32 | def utilities(self) -> np.ndarray: 33 | """Returns the utilities of the features.""" 34 | return self._utilities 35 | 36 | @property 37 | def num_features(self) -> int: 38 | """Returns the total number of features.""" 39 | return self._num_features 40 | 41 | @property 42 | def beta(self) -> float: 43 | """Returns the beta proportion for subproblem feature selection.""" 44 | return self._beta 45 | 46 | @property 47 | def num_features_subproblem(self) -> int: 48 | """Returns the number of features for each subproblem.""" 49 | return self._num_features_subproblem 50 | 51 | @property 52 | def num_subproblems(self) -> int: 53 | """Returns the number of subproblems to create.""" 54 | return self._num_subproblems 55 | 56 | def construct_subproblems(self) -> List[List[int]]: 57 | """ 58 | Constructs and returns all subproblems. 59 | 60 | Returns: 61 | List[List[int]]: A list containing the indices of selected features for each subproblem. 62 | """ 63 | subproblems = [] 64 | selector = SubproblemFeatureSelector(self.utilities, self.num_features_subproblem) 65 | for _ in range(self.num_subproblems): 66 | subproblems.append(selector.select()) 67 | 68 | return subproblems 69 | -------------------------------------------------------------------------------- /backbone_learn/backbone/subproblem_feature_selector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from typing import List 5 | 6 | import numpy as np 7 | 8 | from ..screen_selectors.variable_selector import VariableSelector 9 | 10 | 11 | class SubproblemFeatureSelector(VariableSelector): 12 | """ 13 | Selects features for a single subproblem based on utilities. 14 | """ 15 | 16 | def __init__(self, utilities: np.ndarray, num_features_to_select: int): 17 | """ 18 | Initializes the SubproblemFeatureSelector with given parameters. 19 | 20 | Args: 21 | utilities (np.ndarray): Array of feature utilities. 22 | num_features_to_select (int): Number of features to select for the subproblem. 
23 | """ 24 | self._utilities = utilities 25 | self._num_features_to_select = num_features_to_select 26 | self._probability_distribution = self.compute_probability_distribution(utilities) 27 | 28 | @property 29 | def utilities(self) -> np.ndarray: 30 | """Returns the utilities of the features.""" 31 | return self._utilities 32 | 33 | @property 34 | def probability_distribution(self) -> np.ndarray: 35 | """Returns the probability distribution for feature selection.""" 36 | return self._probability_distribution 37 | 38 | @property 39 | def num_features_to_select(self) -> int: 40 | """Returns the number of features to select for a subproblem.""" 41 | return self._num_features_to_select 42 | 43 | @staticmethod 44 | def compute_probability_distribution(utilities: np.ndarray) -> np.ndarray: 45 | """ 46 | Computes the probability distribution for selecting features. 47 | 48 | Args: 49 | utilities (np.ndarray): Array of feature utilities. 50 | 51 | Returns: 52 | np.ndarray: Normalized probability distribution based on utilities. 53 | """ 54 | normalized_utilities = utilities / np.max(utilities) 55 | exp_utilities = np.exp(normalized_utilities + 1) 56 | probability_distribution = exp_utilities / exp_utilities.sum() 57 | return probability_distribution 58 | 59 | def select(self) -> List[int]: 60 | """ 61 | Samples a subset of features based on computed probability distribution. 62 | 63 | Returns: 64 | List[int]: Indices of the selected features. 65 | """ 66 | selected_features_idx = np.random.choice( 67 | len(self.utilities), 68 | size=self.num_features_to_select, 69 | replace=False, 70 | p=self._probability_distribution, 71 | ) 72 | 73 | return selected_features_idx.tolist() 74 | -------------------------------------------------------------------------------- /backbone_learn/exact_solvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chziakas/backbone-learn/576749c7a1ffa1e57ea4c018a88a052d20fad2e2/backbone_learn/exact_solvers/__init__.py -------------------------------------------------------------------------------- /backbone_learn/exact_solvers/benders_oct_decision_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | from odtlearn.flow_oct import BendersOCT 6 | from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder 7 | 8 | from .exact_solver_base import ExactSolverBase 9 | 10 | 11 | class BendersOCTDecisionTree(ExactSolverBase): 12 | """ 13 | Implements the BendersOCT model for Optimal Classification Trees. 14 | 15 | Attributes: 16 | model (BendersOCT): The BendersOCT model. 17 | accuracy_score (float): The accuracy score of the trained model. 18 | est_X (KBinsDiscretizer): The KBinsDiscretizer instance for discretizing features. 19 | enc (OneHotEncoder): The OneHotEncoder instance for encoding categorical variables. 20 | """ 21 | 22 | def __init__( 23 | self, 24 | depth=3, 25 | time_limit=1000, 26 | _lambda=0.5, 27 | num_threads=None, 28 | obj_mode="acc", 29 | n_bins=2, 30 | is_data_fit=False, 31 | ): 32 | """ 33 | Initializes the BendersOCTDecisionTree with default or specified values. 34 | 35 | Args: 36 | depth (int): Maximum depth of the tree. 37 | time_limit (int): Time limit for the optimization process. 38 | _lambda (float): Regularization parameter. 39 | num_threads (int or None): Number of threads for parallel processing. 
40 | obj_mode (str): Objective mode, e.g., 'acc' for accuracy. 41 | n_bins (int): Number of bins for KBinsDiscretizer. 42 | is_data_fit (bool): Whether data are in the format required for OCT 43 | """ 44 | super().__init__() 45 | self._model = BendersOCT( 46 | solver="cbc", 47 | depth=depth, 48 | time_limit=time_limit, 49 | _lambda=_lambda, 50 | num_threads=num_threads, 51 | obj_mode=obj_mode, 52 | ) 53 | self.accuracy_score = None 54 | self.est_X = KBinsDiscretizer(n_bins=n_bins, encode="ordinal", strategy="quantile") 55 | self.enc = OneHotEncoder(handle_unknown="error", drop="if_binary") 56 | self.is_data_fit = is_data_fit 57 | 58 | @property 59 | def model(self): 60 | """ 61 | Returns the BendersOCT model instance. 62 | """ 63 | return self._model 64 | 65 | def preprocess_features(self, X): 66 | """ 67 | Transforms the features using already fitted KBinsDiscretizer and OneHotEncoder. 68 | 69 | Args: 70 | X (np.ndarray or pd.DataFrame): The input features (test data). 71 | 72 | Returns: 73 | np.ndarray: Transformed features. 74 | """ 75 | X_bin = self.est_X.transform(X) 76 | return self.enc.transform(X_bin).toarray() 77 | 78 | def fit_preprocessors(self, X_train): 79 | """ 80 | Fits preprocessors to the training data. 81 | 82 | Args: 83 | X_train (np.ndarray or pd.DataFrame): The training features. 84 | """ 85 | self.est_X.fit(X_train) 86 | X_bin = self.est_X.transform(X_train) 87 | self.enc.fit(X_bin) 88 | 89 | def fit(self, X, y) -> None: 90 | """ 91 | Fits the BendersOCT model to the preprocessed data. 92 | 93 | Args: 94 | X (np.ndarray or pd.DataFrame): The input features. 95 | y (np.ndarray or pd.Series): The target variable. 96 | """ 97 | if self.is_data_fit: 98 | X_preprocessed = X 99 | else: 100 | self.fit_preprocessors(X) 101 | X_preprocessed = self.preprocess_features(X) 102 | self._model.fit(X_preprocessed, y) 103 | 104 | def predict(self, X) -> np.ndarray: 105 | """ 106 | Predicts using the fitted BendersOCT model. 107 | 108 | Args: 109 | X (np.ndarray or pd.DataFrame): The input features. 110 | 111 | Returns: 112 | np.ndarray: Predicted values. 113 | """ 114 | if self.is_data_fit: 115 | X_preprocessed = X 116 | else: 117 | X_preprocessed = self.preprocess_features(X) 118 | return self._model.predict(X_preprocessed) 119 | -------------------------------------------------------------------------------- /backbone_learn/exact_solvers/exact_solver_base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from abc import ABC, abstractmethod 5 | 6 | import numpy as np 7 | 8 | 9 | class ExactSolverBase(ABC): 10 | """ 11 | Abstract class for solving subproblems in various contexts. 12 | 13 | This class provides a framework for defining solvers that can fit models to data and make predictions. 14 | Derived classes need to implement the `fit` and `predict` methods according to the specifics of the solver. 15 | """ 16 | 17 | @property 18 | def model(self): 19 | """ 20 | This property should be implemented by subclasses to return the model instance used in the exact approach. 21 | """ 22 | return self._model 23 | 24 | @abstractmethod 25 | def fit(self, X: np.ndarray, y: np.ndarray): 26 | """ 27 | Fits a model to the given data. 28 | 29 | This method should be implemented to solve a subproblem using the input data matrix X and the target vector y. 30 | It should fit a model based on the specific algorithm implemented in the derived class. 
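A minimal subclass sketch (illustrative only; `SomeExactModel` is a hypothetical optimizer, not part of this library):

    class MySolver(ExactSolverBase):
        def fit(self, X, y):
            self._model = SomeExactModel().fit(X, y)  # any method with optimality guarantees
        def predict(self, X):
            return self._model.predict(X)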
31 | 32 | Args: 33 | X (np.ndarray): The input feature matrix. 34 | y (np.ndarray): The target vector. 35 | 36 | Returns: 37 | None: The method should fit the model to the data, with the results stored internally within the class instance. 38 | """ 39 | 40 | @abstractmethod 41 | def predict(self, X: np.ndarray) -> np.ndarray: 42 | """ 43 | Makes predictions using the fitted model. 44 | 45 | This method should be implemented to provide predictions based on the model fitted using the `fit` method. 46 | It should process the input feature matrix X and return predictions. 47 | 48 | Args: 49 | X (np.ndarray): The input feature matrix for which predictions are to be made. 50 | 51 | Returns: 52 | np.ndarray: An array of predictions made by the model. 53 | """ 54 | -------------------------------------------------------------------------------- /backbone_learn/exact_solvers/lobnb_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | 6 | from .exact_solver_base import ExactSolverBase 7 | from .lobnb_regression_model import L0BnBRegressionModel 8 | 9 | 10 | class L0BnBRegression(ExactSolverBase): 11 | """ 12 | Implements a regression solver using the L0BnB method for feature selection. 13 | 14 | This class is designed to provide an easy-to-use interface for the L0BnB regression model, allowing for fitting and predictions on datasets. 15 | 16 | Attributes: 17 | model (L0BnBRegressionModel): An instance of the L0BnBRegressionModel class. 18 | """ 19 | 20 | def __init__(self, lambda_2: float = 0.01, max_nonzeros: int = 10, time_limit: int = 1000): 21 | """ 22 | Initializes the L0BnBRegression with specified parameters for the L0BnB optimization process. 23 | 24 | Args: 25 | lambda_2 (float): Regularization parameter lambda_2 for the L0BnB model. 26 | max_nonzeros (int): Maximum number of non-zero coefficients allowed in the model. 27 | time_limit (int): Time limit for the optimization process. 28 | """ 29 | self._model = L0BnBRegressionModel(lambda_2, max_nonzeros, time_limit) 30 | 31 | def fit(self, X: np.ndarray, y: np.ndarray) -> None: 32 | """ 33 | Fits the L0BnB regression model to the provided data. 34 | 35 | Args: 36 | X (np.ndarray): The feature matrix for the regression model. 37 | y (np.ndarray): The target values for the regression model. 38 | """ 39 | self._model.fit(X, y) 40 | 41 | def predict(self, X: np.ndarray) -> np.ndarray: 42 | """ 43 | Generates predictions using the fitted L0BnB regression model. 44 | 45 | This method delegates the prediction task to the `predict` method of the `L0BnBRegressionModel` instance. It requires the model to be already fitted. 46 | 47 | Args: 48 | X (np.ndarray): The feature matrix for which predictions are to be made. 49 | 50 | Returns: 51 | np.ndarray: Predicted values based on the fitted model. 52 | 53 | """ 54 | 55 | return self._model.predict(X) 56 | -------------------------------------------------------------------------------- /backbone_learn/exact_solvers/lobnb_regression_model.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | from l0bnb import fit_path 6 | 7 | 8 | class L0BnBRegressionModel: 9 | """ 10 | A wrapper class for the L0BnB regression model, facilitating model fitting and predictions. 
11 | 12 | This class serves as a convenient interface for the L0BnB feature selection method, specifically tailored for regression tasks. It stores the regression coefficients and intercept after fitting the model to the provided data. 13 | 14 | Attributes: 15 | coefficients (np.ndarray or None): Coefficients of the regression model. None before the model is fitted. 16 | intercept (float or None): Intercept of the regression model. None before the model is fitted. 17 | lambda_2 (float): Regularization parameter lambda_2 in the L0BnB model, controlling the trade-off between 18 | the model's complexity and fit. 19 | max_nonzeros (int): Maximum number of non-zero coefficients the model is allowed to have, enforcing sparsity. 20 | """ 21 | 22 | def __init__(self, lambda_2: float, max_nonzeros: int, time_limit: int): 23 | """ 24 | Initializes the L0BnBRegressionModel with specified parameters for L0BnB optimization. 25 | 26 | Args: 27 | lambda_2 (float): Regularization parameter for the L0BnB optimization process. 28 | max_nonzeros (int): Constraint on the maximum number of non-zero coefficients in the model. 29 | time_limit (int): Time limit for the optimization process. 30 | """ 31 | self.coefficients = None 32 | self.intercept = None 33 | self.lambda_2 = lambda_2 34 | self.max_nonzeros = max_nonzeros 35 | self.time_limit = time_limit 36 | 37 | def fit( 38 | self, 39 | X: np.ndarray, 40 | y: np.ndarray, 41 | ) -> None: 42 | """ 43 | Fits the L0BnB regression model to the given dataset. 44 | 45 | The method uses the L0BnB algorithm to find a sparse set of coefficients that best fit the data. 46 | 47 | Args: 48 | X (np.ndarray): The feature matrix for the regression model. 49 | y (np.ndarray): The target values for the regression model. 50 | 51 | """ 52 | 53 | solutions = fit_path( 54 | X, y, lambda_2=self.lambda_2, max_nonzeros=self.max_nonzeros, time_limit=self.time_limit 55 | ) 56 | if solutions: 57 | selected_model = solutions[-1] 58 | self.coefficients = selected_model["B"] 59 | self.intercept = selected_model["B0"] 60 | 61 | def predict(self, X: np.ndarray) -> np.ndarray: 62 | """ 63 | Generates predictions for the given input features using the fitted L0BnB regression model. 64 | 65 | Args: 66 | X (np.ndarray): The feature matrix for making predictions. 67 | 68 | Returns: 69 | np.ndarray: The predicted values. 70 | 71 | """ 72 | return np.dot(X, self.coefficients) + self.intercept 73 | -------------------------------------------------------------------------------- /backbone_learn/exact_solvers/mio_clustering.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from typing import List, Optional, Tuple 5 | 6 | import numpy as np 7 | from pulp import PULP_CBC_CMD, LpBinary, LpMinimize, LpProblem, LpVariable, lpSum 8 | 9 | 10 | class MIOClustering: 11 | """ 12 | Class for solving clustering problems using Mixed-Integer Optimization. 
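A typical end-to-end use (illustrative; `X` is any 2-D feature array):

    >>> mio = MIOClustering(n_clusters=2, time_limit=60)
    >>> mio.fit(X)              # builds and solves the MIO with CBC
    >>> labels = mio.labels     # hard assignments recovered from the y variables
    >>> preds = mio.predict(X)  # nearest-center assignment for (new) points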
13 | """ 14 | 15 | def __init__( 16 | self, 17 | n_clusters: int = None, 18 | time_limit: float = 1200, 19 | ls_pairs_diff_cluster: Optional[List[Tuple[int, int]]] = None, 20 | ls_pairs_same_cluster: Optional[List[Tuple[int, int]]] = None, 21 | ): 22 | self.n_clusters = n_clusters 23 | self.ls_pairs_diff_cluster = ls_pairs_diff_cluster 24 | self.ls_pairs_same_cluster = ls_pairs_same_cluster 25 | self.time_limit = time_limit 26 | self.model = LpProblem("Clustering MIO", LpMinimize) 27 | self.z = None # For storing solution for z variables 28 | self.y = None # For storing solution for y variables 29 | self.cluster_means = None 30 | 31 | @staticmethod 32 | def euclidean_distance(point1: np.ndarray, point2: np.ndarray) -> float: 33 | return np.linalg.norm(point1 - point2) 34 | 35 | def _initialize_variables(self, num_points: int): 36 | """ 37 | Initialize the decision variables for the optimization problem. 38 | 39 | Args: 40 | num_points (int): The number of data points. 41 | 42 | Returns: 43 | Tuple: A tuple containing the dictionaries of z and y variables. 44 | """ 45 | z = LpVariable.dicts( 46 | "z", 47 | [ 48 | (i, j, k) 49 | for i in range(num_points - 1) 50 | for j in range(i + 1, num_points) 51 | for k in range(self.n_clusters) 52 | ], 53 | 0, 54 | 1, 55 | LpBinary, 56 | ) 57 | 58 | y = LpVariable.dicts( 59 | "y", 60 | [(i, k) for i in range(num_points) for k in range(self.n_clusters)], 61 | 0, 62 | 1, 63 | LpBinary, 64 | ) 65 | 66 | return z, y 67 | 68 | def _calculate_distances_noise(self, X: np.ndarray) -> np.ndarray: 69 | """ 70 | Calculate and return the matrix of pairwise distances with added noise. 71 | 72 | Args: 73 | X (np.ndarray): The input feature matrix. 74 | 75 | Returns: 76 | np.ndarray: The matrix of pairwise distances with noise. 77 | """ 78 | distances = np.linalg.norm(X[:, np.newaxis] - X, axis=2) 79 | min_dist = np.min(distances[np.nonzero(distances)]) 80 | noise = 0.1 * min_dist * (2 * np.random.rand(X.shape[0], X.shape[0], self.n_clusters) - 1) 81 | return distances[:, :, np.newaxis] + noise 82 | 83 | def _calculate_distances(self, X: np.ndarray) -> np.ndarray: 84 | """ 85 | Calculate and return the matrix of pairwise distances. 86 | 87 | Args: 88 | X (np.ndarray): The input feature matrix. 89 | 90 | Returns: 91 | np.ndarray: The matrix of pairwise distances, tiled across clusters. 92 | """ 93 | distances = np.linalg.norm(X[:, np.newaxis] - X, axis=2) 94 | return np.tile(distances[:, :, np.newaxis], (1, 1, self.n_clusters)) 95 | 96 | def _add_constraints(self, num_points: int, z: dict, y: dict, coef: np.ndarray, b: int): 97 | """ 98 | Add the objective and constraints to the optimization model. 99 | 100 | Args: 101 | num_points (int): The number of data points. 102 | z (dict): The decision variables representing pair assignments. 103 | y (dict): The decision variables representing individual assignments. 104 | coef (np.ndarray): Coefficient matrix for the objective function. 105 | b (int): Minimum number of points per cluster.
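Note:
    The pair variables are tied to the assignment variables through the standard
    AND-linearization z[i, j, k] <= y[i, k], z[i, j, k] <= y[j, k], and
    z[i, j, k] >= y[i, k] + y[j, k] - 1, so z[i, j, k] = 1 exactly when points
    i and j are both assigned to cluster k.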
106 | """ 107 | # Create the decision variables 108 | 109 | z_opt, y_opt = self._initialize_variables(num_points) 110 | # Fix z to zero for pairs already known to lie in different clusters 111 | if self.ls_pairs_diff_cluster: 112 | for (i, j) in self.ls_pairs_diff_cluster: 113 | for k in range(self.n_clusters): 114 | z_opt[i, j, k].setInitialValue(0) 115 | z_opt[i, j, k].fixValue() 116 | # Objective: minimize the total within-cluster pairwise distance 117 | self.model += lpSum( 118 | z_opt[i, j, k] * coef[i, j, k] 119 | for i in range(num_points - 1) 120 | for j in range(i + 1, num_points) 121 | for k in range(self.n_clusters) 122 | ) 123 | 124 | # Each point is assigned to exactly one cluster 125 | for i in range(num_points): 126 | self.model += lpSum(y_opt[i, k] for k in range(self.n_clusters)) == 1 127 | 128 | # Each cluster has at least b points 129 | for k in range(self.n_clusters): 130 | self.model += lpSum(y_opt[i, k] for i in range(num_points)) >= b 131 | 132 | # Relationship between y and z variables 133 | for i in range(num_points - 1): 134 | for j in range(i + 1, num_points): 135 | for k in range(self.n_clusters): 136 | self.model += z_opt[i, j, k] <= y_opt[i, k] 137 | self.model += z_opt[i, j, k] <= y_opt[j, k] 138 | self.model += z_opt[i, j, k] >= y_opt[i, k] + y_opt[j, k] - 1 139 | 140 | # Exclusion constraints 141 | if self.ls_pairs_diff_cluster: 142 | for (i, j) in self.ls_pairs_diff_cluster: 143 | for k in range(self.n_clusters): 144 | self.model += y_opt[i, k] + y_opt[j, k] <= 1 145 | 146 | # Inclusion constraints 147 | if self.ls_pairs_same_cluster: 148 | for (i, j) in self.ls_pairs_same_cluster: 149 | for k in range(self.n_clusters): 150 | self.model += y_opt[i, k] - y_opt[j, k] == 0 151 | 152 | def extract_cluster_means(self, X: np.ndarray) -> np.ndarray: 153 | """ 154 | Extract cluster means after fitting the model. 155 | 156 | Args: 157 | X (np.ndarray): The input feature matrix. 158 | 159 | Returns: 160 | np.ndarray: An array of cluster means. 161 | """ 162 | num_points = len(X) 163 | # Initialize an array to store means 164 | cluster_means = np.zeros((self.n_clusters, X.shape[1])) 165 | 166 | for k in range(self.n_clusters): 167 | cluster_points = [] # List to store data points assigned to the current cluster 168 | for i in range(num_points): 169 | if self.y[i, k] == 1.0: 170 | cluster_points.append(X[i]) 171 | 172 | if cluster_points: 173 | cluster_means[k] = np.mean(cluster_points, axis=0) 174 | 175 | return cluster_means 176 | 177 | def fit(self, X: np.ndarray, y: Optional[np.ndarray] = None): 178 | """ 179 | Fit the model to the given data using Mixed-Integer Optimization. 180 | 181 | Args: 182 | X (np.ndarray): The input feature matrix. 183 | y (Optional[np.ndarray]): The target vector (not used in this model).
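Note:
    After CBC returns, the PuLP variable values are read back by parsing variable
    names such as "y_(3,_1)" into integer indices, from which the dense `self.y`
    and `self.z` arrays are rebuilt.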
184 | """ 185 | num_points = len(X) 186 | b = int((num_points / self.n_clusters) * 0.1) # Minimum number of points per cluster 187 | 188 | coef = self._calculate_distances_noise(X) 189 | 190 | self._add_constraints(num_points, self.z, self.y, coef, b) 191 | 192 | solver = PULP_CBC_CMD(timeLimit=self.time_limit, warmStart=True) 193 | 194 | # Solve the problem 195 | self.model.solve(solver) 196 | 197 | self.y = np.zeros((num_points, self.n_clusters)) 198 | self.z = np.zeros((num_points, num_points, self.n_clusters)) 199 | 200 | for v in self.model.variables(): 201 | var_value = v.varValue 202 | var_name = v.name 203 | if var_name.startswith("y_"): 204 | # Parse the indices for y 205 | i, k = ( 206 | var_name.replace("(", "") 207 | .replace(")", "") 208 | .replace("y_", "") 209 | .replace(",", "") 210 | .split("_") 211 | ) 212 | i, k = int(i), int(k) 213 | self.y[i, k] = var_value 214 | elif var_name.startswith("z_"): 215 | # Parse the indices for z 216 | i, j, k = ( 217 | var_name.replace("(", "") 218 | .replace(")", "") 219 | .replace("z_", "") 220 | .replace(",", "") 221 | .split("_") 222 | ) 223 | i, j, k = int(i), int(j), int(k) 224 | self.z[i, j, k] = var_value 225 | 226 | # Derive labels, centers, and quality metrics from the solution 227 | self.labels = self._get_cluster_assignments(X.shape[0]) 228 | self.cluster_centers = self._compute_cluster_centers(X) 229 | self.wcss = self._compute_wcss(X) 230 | self.silhouette_score = self._compute_silhouette_score(X) 231 | 232 | def _get_cluster_assignments(self, n_rows: int) -> np.ndarray: 233 | """ 234 | Derive hard cluster assignments from the solved y variables. 235 | 236 | Args: 237 | n_rows (int): The number of data points in the fitted dataset. 238 | 239 | Returns: 240 | np.ndarray: An array with the cluster index assigned to each point. 241 | """ 242 | cluster_assignments = np.zeros(n_rows, dtype=int) 243 | 244 | for i in range(n_rows): 245 | cluster_assignments[i] = np.argmax(self.y[i, :]) # the k with y[i, k] == 1 246 | return cluster_assignments 247 | 248 | def _compute_wcss(self, X: np.ndarray) -> float: 249 | """ 250 | Compute the Within-Cluster Sum of Squares (WCSS) for the fitted model. 251 | 252 | Args: 253 | X (np.ndarray): The input feature matrix used for fitting the model. 254 | 255 | Returns: 256 | float: The computed WCSS value. 257 | 258 | Note: 259 | Assumes `fit` has been called so that labels and cluster centers are available. 260 | """ 261 | 262 | wcss = 0.0 263 | cluster_labels_pred = self.labels 264 | 265 | for cluster_idx in range(self.n_clusters): 266 | cluster_points = X[cluster_labels_pred == cluster_idx] 267 | wcss += np.sum((cluster_points - self.cluster_centers[cluster_idx]) ** 2) 268 | return wcss 269 | 270 | def _compute_cluster_centers(self, X: np.ndarray) -> np.ndarray: 271 | """ 272 | Extract cluster means after fitting the model. 273 | 274 | Args: 275 | X (np.ndarray): The input feature matrix. 276 | 277 | Returns: 278 | np.ndarray: An array of cluster means.
279 | """ 280 | # Initialize an array to store means 281 | cluster_centers = np.zeros((self.n_clusters, X.shape[1])) 282 | for k in range(self.n_clusters): 283 | # Collect every point assigned to cluster k 284 | cluster_points = X[self.labels == k] 285 | # Update the center only for non-empty clusters, 286 | # which avoids taking the mean of an empty slice 287 | if len(cluster_points) > 0: 288 | cluster_centers[k] = np.mean(cluster_points, axis=0) 289 | 290 | return cluster_centers 291 | 292 | def _compute_silhouette_score(self, X: np.ndarray) -> float: 293 | """Compute the mean silhouette score for the fitted cluster labels.""" 294 | from sklearn.metrics import silhouette_score 295 | 296 | silhouette_avg = silhouette_score(X, self.labels) 297 | return silhouette_avg 298 | 299 | def predict(self, X: np.ndarray) -> np.ndarray: 300 | """ 301 | Predict cluster assignments for new data points based on stored cluster means. 302 | 303 | Args: 304 | X (np.ndarray): The new data points for which predictions are to be made. 305 | 306 | Returns: 307 | np.ndarray: An array of cluster assignments for the new data points. 308 | """ 309 | 310 | num_new_points = len(X) 311 | n_clusters = self.n_clusters 312 | 313 | cluster_assignments = np.zeros(num_new_points, dtype=int) 314 | 315 | for i in range(num_new_points): 316 | # Calculate distances between the new data point and cluster means 317 | distances = [np.linalg.norm(X[i] - self.cluster_centers[k]) for k in range(n_clusters)] 318 | cluster_assignments[i] = np.argmin(distances) 319 | return cluster_assignments 320 | -------------------------------------------------------------------------------- /backbone_learn/heuristic_solvers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chziakas/backbone-learn/576749c7a1ffa1e57ea4c018a88a052d20fad2e2/backbone_learn/heuristic_solvers/__init__.py -------------------------------------------------------------------------------- /backbone_learn/heuristic_solvers/cart_decision_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | from sklearn.model_selection import GridSearchCV 6 | from sklearn.tree import DecisionTreeClassifier 7 | 8 | from .heauristic_solver_base import HeuristicSolverBase 9 | 10 | 11 | class CARTDecisionTree(HeuristicSolverBase): 12 | """ 13 | Implements a Classification And Regression Tree (CART) Decision Tree with cross-validation using AUC. 14 | This solver is a heuristic approach for fitting a decision tree model and identifying significant features. 15 | 16 | Attributes: 17 | _model (DecisionTreeClassifier): An instance of the sklearn DecisionTreeClassifier. 18 | _auc_score (float): The maximum AUC score obtained during cross-validation. 19 | """ 20 | 21 | def __init__(self, **kwargs): 22 | """ 23 | Initializes the CARTDecisionTree with a DecisionTreeClassifier model. 24 | """ 25 | self._model = DecisionTreeClassifier() 26 | self._auc_score = None 27 | 28 | @property 29 | def auc_score(self) -> float: 30 | """ 31 | Returns the maximum AUC score obtained from cross-validation. 32 | 33 | Returns: 34 | float: The maximum AUC score. 35 | """ 36 | return self._auc_score 37 | 38 | def fit(self, X: np.ndarray, y: np.ndarray, cv_folds: int = 5, random_state: int = 0) -> None: 39 | """ 40 | Fits a CART Decision Tree model to the data using hyperparameter tuning with cross-validation and evaluates it using AUC.
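A short usage sketch (illustrative; `X` and `y` are any feature matrix and binary labels):

    >>> solver = CARTDecisionTree()
    >>> solver.fit(X, y, cv_folds=3)
    >>> solver.auc_score                          # best cross-validated AUC found
    >>> idx = solver.get_relevant_variables(0.1)  # features with importance above 0.1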
41 | 42 | Args: 43 | X (np.ndarray): The input features as a NumPy array. 44 | y (np.ndarray): The target labels as a NumPy array. 45 | cv_folds (int): The number of folds to use for cross-validation. 46 | 47 | """ 48 | self._model.set_params(random_state=random_state) 49 | # Define the parameter grid for hyperparameter tuning 50 | param_grid = {"max_depth": [None, 5, 10, 20], "min_samples_leaf": [1, 2, 4]} 51 | 52 | # Initialize GridSearchCV with the model and parameter grid 53 | grid_search = GridSearchCV( 54 | self._model, param_grid, cv=cv_folds, scoring="roc_auc", verbose=1 55 | ) 56 | 57 | # Perform the grid search on the provided data 58 | grid_search.fit(X, y) 59 | 60 | # Update the model with the best found parameters 61 | self._model = grid_search.best_estimator_ 62 | 63 | # Store the best AUC score 64 | self._auc_score = grid_search.best_score_ 65 | 66 | def get_relevant_variables(self, threshold: float) -> np.ndarray: 67 | """ 68 | Identifies features with importance greater than a specified threshold. 69 | 70 | Args: 71 | threshold (float): The threshold for determining feature relevance. 72 | 73 | Returns: 74 | np.ndarray: An array of indices of relevant features. 75 | """ 76 | 77 | significant_indices = np.where(self._model.feature_importances_ > threshold)[0] 78 | return significant_indices 79 | 80 | def predict(self, X: np.ndarray) -> np.ndarray: 81 | """ 82 | Predicts the target labels for the given data. 83 | 84 | Args: 85 | X (np.ndarray): The input features as a NumPy array. 86 | 87 | Returns: 88 | np.ndarray: The predicted target labels. 89 | """ 90 | return self._model.predict(X) 91 | -------------------------------------------------------------------------------- /backbone_learn/heuristic_solvers/heauristic_solver_base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from abc import ABC, abstractmethod 5 | 6 | import numpy as np 7 | 8 | 9 | class HeuristicSolverBase(ABC): 10 | """ 11 | Abstract class for heuristic solvers. 12 | 13 | This class provides a framework for defining heuristic solvers that can fit models to data and identify relevant features. 14 | Derived classes need to implement the `fit` and `get_relevant_features` methods according to their specific heuristic approach. 15 | """ 16 | 17 | @abstractmethod 18 | def fit(self, X: np.ndarray, y: np.ndarray, random_state: int): 19 | """ 20 | Fits a model to the given data using a heuristic approach. 21 | 22 | This method should be implemented to solve a subproblem using the input data matrix X and the target vector y. 23 | It should fit a model based on a heuristic algorithm specific to the derived class. 24 | 25 | Args: 26 | X (np.ndarray): The input feature matrix. 27 | y (np.ndarray): The target vector. 28 | random_state (int): The seed used by the random number generator. Default is 0. 29 | 30 | Returns: 31 | None: The method should fit the model to the data, with the results stored internally within the class instance. 32 | """ 33 | 34 | @property 35 | def model(self): 36 | # Return the fitted model 37 | return self._model 38 | 39 | @abstractmethod 40 | def get_relevant_variables(self, **kwargs): 41 | """ 42 | Identifies relevant variables with importance greater than a specified threshold. 43 | 44 | This method should be implemented to determine the most relevant variables based on the model fitted using the `fit` method. 
45 | It should return the indices of the variables that will be passed to the exact solver. 46 | """ 47 | -------------------------------------------------------------------------------- /backbone_learn/heuristic_solvers/kmeans_solver.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from typing import List, Tuple 5 | 6 | import numpy as np 7 | from sklearn.cluster import KMeans 8 | from sklearn.metrics import silhouette_score 9 | 10 | from .heauristic_solver_base import HeuristicSolverBase 11 | 12 | 13 | class KMeansSolver(HeuristicSolverBase): 14 | """ 15 | A heuristic solver that applies KMeans clustering to identify relevant instances. 16 | """ 17 | 18 | def __init__(self, n_clusters: int = 10, **kwargs) -> None: 19 | """ 20 | Initializes the KMeansSolver with a specified number of clusters. 21 | Args: 22 | n_clusters (int): The number of clusters to form. 23 | """ 24 | self.n_clusters: int = n_clusters 25 | self._model = KMeans() 26 | self.wcss = None 27 | 28 | def _compute_cluster_centers(self, X: np.ndarray) -> np.ndarray: 29 | """ 30 | Extract cluster means after fitting the model. 31 | 32 | Args: 33 | X (np.ndarray): The input feature matrix. 34 | 35 | Returns: 36 | np.ndarray: An array of cluster means. 37 | """ 38 | # Initialize an array to store means 39 | cluster_centers = np.zeros((self.n_clusters, X.shape[1])) 40 | for k in range(self.n_clusters): 41 | # Collect every point assigned to cluster k 42 | cluster_points = X[self._model.labels_ == k] 43 | # Update the center only for non-empty clusters, 44 | # which avoids taking the mean of an empty slice 45 | if len(cluster_points) > 0: 46 | cluster_centers[k] = np.mean(cluster_points, axis=0) 47 | 48 | return cluster_centers 49 | 50 | def fit( 51 | self, 52 | X: np.ndarray, 53 | y: np.ndarray = None, 54 | init: str = "k-means++", 55 | n_init: int = 10, 56 | max_iter: int = 300, 57 | tol: float = 0.0001, 58 | random_state: int = 0, 59 | ) -> None: 60 | """ 61 | Applies KMeans clustering to the data with customizable hyperparameters. 62 | Args: 63 | X (np.ndarray): Input feature matrix. 64 | y (np.ndarray): Target vector (not used in clustering). 65 | init (str): Method for initialization. 66 | n_init (int): Number of times the k-means algorithm will be run with different centroid seeds. 67 | max_iter (int): Maximum number of iterations of the k-means algorithm for a single run. 68 | tol (float): Relative tolerance with regards to Frobenius norm of the difference in the cluster centers. 69 | random_state (int): Determines random number generation for centroid initialization. 70 | """ 71 | if X.shape[0] < self.n_clusters: 72 | self.n_clusters = X.shape[0] 73 | 74 | self._model.set_params( 75 | n_clusters=self.n_clusters, 76 | init=init, 77 | n_init=n_init, 78 | max_iter=max_iter, 79 | tol=tol, 80 | random_state=random_state, 81 | ) 82 | self._model.fit(X) 83 | self.cluster_centers = self._compute_cluster_centers(X) 84 | self.wcss = self._compute_wcss(X) 85 | self.silhouette_score = self._compute_silhouette_score(X) 86 | 87 | def get_relevant_variables(self) -> List[Tuple[int, int]]: 88 | """ 89 | Identifies tuples of instance indices that are not in the same cluster. 90 | Returns: 91 | List of tuples: Each tuple contains indices of instances not in the same cluster.
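Example (illustrative): with fitted labels [0, 0, 1], the pairs crossing
    clusters are [(0, 2), (1, 2)].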
92 | """ 93 | 94 | n = len(self._model.labels_) 95 | grid_x, grid_y = np.meshgrid(np.arange(n), np.arange(n), indexing="ij") 96 | mask = self._model.labels_[grid_x] != self._model.labels_[grid_y] 97 | upper_triangle_mask = np.triu(mask, k=1) 98 | i_indices, j_indices = np.where(upper_triangle_mask) 99 | different_pairs = [(min(i, j), max(i, j)) for i, j in zip(i_indices, j_indices)] 100 | return different_pairs 101 | 102 | def _compute_wcss(self, X: np.ndarray) -> float: 103 | """ 104 | Method to calculate the Within-Cluster Sum of Squares (WCSS). 105 | 106 | Args: 107 | X (np.ndarray): The input feature matrix. 108 | 109 | Returns: 110 | float: The WCSS value. 111 | """ 112 | wcss = 0.0 113 | cluster_labels_pred = self._model.labels_ 114 | 115 | for cluster_idx in range(self.n_clusters): 116 | cluster_points = X[cluster_labels_pred == cluster_idx] 117 | wcss += np.sum((cluster_points - self.cluster_centers[cluster_idx]) ** 2) 118 | 119 | return wcss 120 | 121 | def _compute_silhouette_score(self, X: np.ndarray) -> float: 122 | """Compute the mean silhouette score, returning 0.0 when it is undefined.""" 123 | # Check if the number of unique clusters is 1 or equal to the number of samples 124 | if len(set(self._model.labels_)) == 1 or len(X) == len(set(self._model.labels_)): 125 | # Silhouette score cannot be computed in this case 126 | return 0.0 127 | silhouette_avg = silhouette_score(X, self._model.labels_) 128 | return silhouette_avg 129 | -------------------------------------------------------------------------------- /backbone_learn/heuristic_solvers/lasso_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | from sklearn.linear_model import LassoCV 6 | 7 | from .heauristic_solver_base import HeuristicSolverBase 8 | 9 | 10 | class LassoRegression(HeuristicSolverBase): 11 | """ 12 | Implements Lasso regression for feature selection using cross-validation. 13 | 14 | This class uses Lasso (Least Absolute Shrinkage and Selection Operator) regression, 15 | which is a type of linear regression that uses shrinkage. Shrinkage is where data values 16 | are shrunk towards a central point, like the mean. The lasso procedure encourages simple, 17 | sparse models (i.e. models with fewer parameters). 18 | 19 | Attributes: 20 | _model (LassoCV): The LassoCV regression model. 21 | _mse_score (float): The mean squared error score of the trained model. 22 | """ 23 | 24 | def __init__(self, **kwargs): 25 | """ 26 | Initializes the LassoRegression with a LassoCV model. 27 | 28 | Cross-validation folds, random state, and the remaining hyperparameters 29 | are supplied to the `fit` method rather than to the constructor. 30 | """ 31 | self._model = LassoCV() 32 | self._mse_score = None 33 | 34 | @property 35 | def mse_score(self) -> float: 36 | """ 37 | Returns the mean squared error score of the trained model. 38 | 39 | Returns: 40 | float: The mean squared error score. 41 | """ 42 | return self._mse_score 43 | 44 | def fit( 45 | self, 46 | X: np.ndarray, 47 | y: np.ndarray, 48 | alphas=None, 49 | max_iter=1000, 50 | tol=0.0001, 51 | selection="cyclic", 52 | cv_folds=5, 53 | random_state=0, 54 | ) -> None: 55 | """ 56 | Fits a sparse regression model to the data using LassoCV. 57 | 58 | Args: 59 | X (np.ndarray): The input feature matrix. 60 | y (np.ndarray): The target variable. 61 | alphas (array-like, optional): List of alphas where to compute the models. If None, alphas are set automatically.
62 | max_iter (int): The maximum number of iterations. 63 | tol (float): The tolerance for the optimization. 64 | selection (str): If set to 'random', a random coefficient is updated every iteration. 65 | """ 66 | self._model.set_params( 67 | cv=cv_folds, 68 | random_state=random_state, 69 | alphas=alphas, 70 | max_iter=max_iter, 71 | tol=tol, 72 | selection=selection, 73 | ) 74 | self._model.fit(X, y) # Fit the _model on the dataset 75 | 76 | def get_relevant_variables(self, threshold: float) -> np.ndarray: 77 | """ 78 | Identifies features with coefficients greater than a specified threshold. 79 | 80 | Args: 81 | threshold (float): The threshold for determining feature relevance. 82 | 83 | Returns: 84 | np.ndarray: Indices of features whose coefficients are above the threshold. 85 | """ 86 | 87 | significant_indices = np.where(np.abs(self._model.coef_) > threshold)[0] 88 | return significant_indices 89 | 90 | def predict(self, X: np.ndarray) -> np.ndarray: 91 | """ 92 | Predicts the target values for the given data using the trained Lasso model. 93 | 94 | Args: 95 | X (np.ndarray): The input feature matrix. 96 | 97 | Returns: 98 | np.ndarray: The predicted target values. 99 | """ 100 | return self._model.predict(X) 101 | 102 | def keep_top_features(self, n_non_zeros: int) -> None: 103 | """ 104 | Retain only the top 'n_non_zeros' features in the Lasso model. 105 | 106 | Args: 107 | n_non_zeros (int): Number of features to retain. 108 | """ 109 | 110 | # Get the absolute values of the coefficients 111 | coef_magnitude = np.abs(self._model.coef_) 112 | 113 | # Find the threshold for the top 'n_non_zeros' coefficients 114 | threshold = ( 115 | np.sort(coef_magnitude)[-n_non_zeros] if n_non_zeros < len(coef_magnitude) else 0 116 | ) 117 | 118 | # Zero out coefficients that are below the threshold 119 | self._model.coef_[coef_magnitude < threshold] = 0 120 | -------------------------------------------------------------------------------- /backbone_learn/screen_selectors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chziakas/backbone-learn/576749c7a1ffa1e57ea4c018a88a052d20fad2e2/backbone_learn/screen_selectors/__init__.py -------------------------------------------------------------------------------- /backbone_learn/screen_selectors/linear_regression_selector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | 6 | from .screen_selector_base import ScreenSelectorBase 7 | 8 | 9 | class LinearRegressionSelector(ScreenSelectorBase): 10 | """ 11 | Screen selector that uses linear regression coefficients for calculating utilities. 12 | """ 13 | 14 | def calculate_utilities(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: 15 | """ 16 | Calculate utilities based on the coefficients of a linear regression model. 
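The coefficients are obtained from the normal equation
    beta = (X^T X)^(-1) X^T y
    (with an intercept column prepended and then dropped), and the utility of
    feature j is |beta_j|; if X^T X is singular, all utilities are set to zero.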
17 | """ 18 | # Add intercept term to X 19 | X_with_intercept = np.hstack([np.ones((X.shape[0], 1)), X]) 20 | 21 | # Calculate coefficients using normal equation 22 | try: 23 | inv = np.linalg.inv(np.dot(X_with_intercept.T, X_with_intercept)) 24 | coefficients = np.dot(inv, np.dot(X_with_intercept.T, y))[1:] # Exclude intercept 25 | except np.linalg.LinAlgError: 26 | # If X'X is not invertible, return zero utilities 27 | coefficients = np.zeros(X.shape[1]) 28 | 29 | # Set utilities as the absolute value of coefficients 30 | utilities = np.abs(coefficients) 31 | return utilities 32 | -------------------------------------------------------------------------------- /backbone_learn/screen_selectors/pearson_correlation_selector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | 6 | from .screen_selector_base import ScreenSelectorBase 7 | 8 | 9 | class PearsonCorrelationSelector(ScreenSelectorBase): 10 | """ 11 | Screen selector that uses Pearson correlation for calculating utilities. 12 | """ 13 | 14 | def calculate_utilities(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: 15 | """ 16 | Calculate utilities based on Pearson correlation. 17 | """ 18 | n_features = X.shape[1] 19 | utilities = np.zeros(n_features) 20 | 21 | y_mean = PearsonCorrelationSelector.compute_mean(y) 22 | y_std = PearsonCorrelationSelector.compute_std(y) 23 | 24 | for i in range(n_features): 25 | x_mean = PearsonCorrelationSelector.compute_mean(X[:, i]) 26 | x_std = PearsonCorrelationSelector.compute_std(X[:, i]) 27 | 28 | if x_std == 0 or y_std == 0: 29 | correlation = 0 30 | else: 31 | covariance = PearsonCorrelationSelector.compute_covariance( 32 | X[:, i], y, x_mean, y_mean 33 | ) 34 | correlation = covariance / (x_std * y_std) 35 | 36 | utilities[i] = np.abs(correlation) 37 | return utilities 38 | 39 | @staticmethod 40 | def compute_mean(array: np.ndarray) -> float: 41 | """ 42 | Compute the mean of a numpy array. 43 | """ 44 | return np.mean(array) 45 | 46 | @staticmethod 47 | def compute_std(array: np.ndarray) -> float: 48 | """ 49 | Compute the standard deviation of a numpy array. 50 | """ 51 | return np.std(array) 52 | 53 | @staticmethod 54 | def compute_covariance(x: np.ndarray, y: np.ndarray, x_mean: float, y_mean: float) -> float: 55 | """ 56 | Compute the covariance between two numpy arrays. 57 | """ 58 | return np.mean((x - x_mean) * (y - y_mean)) 59 | -------------------------------------------------------------------------------- /backbone_learn/screen_selectors/screen_selector_base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | import numpy as np 4 | 5 | from ..utils.utils import Utils 6 | from .variable_selector import VariableSelector 7 | 8 | 9 | class ScreenSelectorBase(VariableSelector): 10 | """ 11 | Abstract base class for screen selectors. 12 | """ 13 | 14 | def __init__(self, alpha: float = 1.0): 15 | """ 16 | Initialize the ScreenSelectorBase with a default alpha value. 17 | 18 | Args: 19 | alpha (float): The proportion of features to retain after screening. 20 | """ 21 | self.alpha = alpha 22 | self.utilities = None 23 | self.indices_keep = None 24 | 25 | @abstractmethod 26 | def calculate_utilities(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: 27 | """ 28 | Calculate the utilities for each feature. 
29 | 30 | Args: 31 | X (np.ndarray): The feature matrix. 32 | y (np.ndarray): The target array. 33 | 34 | Returns: 35 | np.ndarray: This method should return the utilities 36 | """ 37 | 38 | def select(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: 39 | """ 40 | Selects features based on calculated utilities and alpha value. 41 | 42 | Args: 43 | X (np.ndarray): The feature matrix. 44 | y (np.ndarray): The target array. 45 | 46 | Returns: 47 | np.ndarray: The selected feature matrix. 48 | """ 49 | n_features = X.shape[1] 50 | 51 | # Calculate utilities if not already done 52 | if self.utilities is None: 53 | self.utilities = self.calculate_utilities(X, y) 54 | 55 | # Determine the number of features to keep 56 | num_keep = int(self.alpha * n_features) 57 | 58 | # Select indices of the top utilities 59 | self.indices_keep = self.select_indices(self.utilities, num_keep) 60 | return X[:, self.indices_keep] 61 | 62 | @staticmethod 63 | def select_indices(utilities: np.ndarray, num_keep: int) -> np.ndarray: 64 | """ 65 | Selects indices of the top utilities. 66 | 67 | Args: 68 | utilities (np.ndarray): Array of utilities for each feature. 69 | num_keep (int): Number of top features to keep. 70 | 71 | Returns: 72 | np.ndarray: Indices of the top utilities. 73 | """ 74 | return Utils.find_idx_highest(utilities, num_keep) 75 | -------------------------------------------------------------------------------- /backbone_learn/screen_selectors/variable_selector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from abc import ABC, abstractmethod 5 | 6 | import numpy as np 7 | 8 | 9 | class VariableSelector(ABC): 10 | """ 11 | Abstract base class for variable selectors. 12 | """ 13 | 14 | @abstractmethod 15 | def select(self, X: np.ndarray, y: np.ndarray) -> np.ndarray: 16 | """ 17 | Abstract method to select features. 18 | 19 | Args: 20 | X (np.ndarray): The feature matrix. 21 | y (np.ndarray): The target array. 22 | 23 | Returns: 24 | np.ndarray: The selected feature matrix. 25 | """ 26 | -------------------------------------------------------------------------------- /backbone_learn/utils/__init__ .py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chziakas/backbone-learn/576749c7a1ffa1e57ea4c018a88a052d20fad2e2/backbone_learn/utils/__init__ .py -------------------------------------------------------------------------------- /backbone_learn/utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | 6 | 7 | class Utils: 8 | @staticmethod 9 | def keep_lowest(arr: np.ndarray, num_keep: int) -> np.ndarray: 10 | """ 11 | Keep the specified number of lowest values in a numpy array. 12 | 13 | This function identifies the `num_keep` lowest values in the provided numpy array `arr` 14 | and returns a new array containing only these values. 15 | 16 | Parameters: 17 | - arr (np.ndarray): The input numpy array from which the lowest values are to be selected. 18 | - num_keep (int): The number of lowest values to keep from the array. 19 | 20 | Returns: 21 | - np.ndarray: A numpy array containing the lowest `num_keep` values from the input array. 
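Example (illustrative):
    >>> Utils.keep_lowest(np.array([5.0, 1.0, 3.0, 2.0]), 2)
    array([1., 2.])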
22 | 23 | """ 24 | indices_keep = np.argpartition(arr, num_keep)[:num_keep] 25 | mask = np.zeros(len(arr), dtype=bool) 26 | mask[indices_keep] = True 27 | return arr[mask] 28 | 29 | @staticmethod 30 | def find_idx_highest(arr: np.ndarray, num_keep: int) -> np.ndarray: 31 | """ 32 | Keep the specified number of highest values in a numpy array. 33 | 34 | This function identifies the `num_keep` highest values in the provided numpy array `arr` 35 | and returns the indices of these values. 36 | 37 | Parameters: 38 | - arr (np.ndarray): The input numpy array from which the highest values are to be selected. 39 | - num_keep (int): The number of highest values whose indices are to be kept. 40 | 41 | Returns: 42 | - np.ndarray: An array of indices corresponding to the highest `num_keep` values in the input array. 43 | 44 | Raises: 45 | - ValueError: If `num_keep` is larger than the size of `arr` or if `num_keep` is negative. 46 | 47 | """ 48 | if not (0 <= num_keep <= len(arr)): 49 | raise ValueError( 50 | "num_keep must be non-negative and less than or equal to the length of arr" 51 | ) 52 | 53 | # np.argpartition is used to find the indices of the `num_keep` highest values 54 | # We use -num_keep to find the highest values (since argpartition sorts ascendingly) 55 | indices = np.argpartition(arr, -num_keep)[-num_keep:] 56 | 57 | # Sort the indices to get them in the order they appear in the original array 58 | return np.sort(indices) 59 | 60 | @staticmethod 61 | def merge_lists_and_sort(list_of_lists): 62 | """ 63 | Merges a list of lists into a single list, removes duplicates, and sorts the list. 64 | 65 | Args: 66 | list_of_lists (list of list of int): The list of lists to merge. 67 | 68 | Returns: 69 | list: A sorted list with unique elements. 70 | """ 71 | merged_list = list(set(item for sublist in list_of_lists for item in sublist)) 72 | merged_list.sort() 73 | return merged_list 74 | 75 | @staticmethod 76 | def find_common_tuples(list_of_lists): 77 | """ 78 | Find tuples that are common to all sublists within a given list of lists. 79 | 80 | Parameters: 81 | list_of_lists (list of list of tuples): A list containing sublists, where each sublist contains tuples. 82 | 83 | Returns: 84 | list: A list of tuples that are common to all sublists. 85 | """ 86 | if not list_of_lists or not all(list_of_lists): 87 | return [] 88 | # Find common tuples by intersecting all sublists 89 | common_tuples = set(list_of_lists[0]) 90 | for sublist in list_of_lists[1:]: 91 | common_tuples.intersection_update(sublist) 92 | 93 | return list(common_tuples) 94 | 95 | @staticmethod 96 | def generate_index_pairs(total_points, excluded_pairs): 97 | """ 98 | Generate a list of index pairs from a range of points, excluding specific pairs. 99 | 100 | Parameters: 101 | total_points (int): The total number of points to consider for generating index pairs. 102 | excluded_pairs (list of tuples): A list of index pairs to be excluded from the generated pairs. 103 | 104 | Returns: 105 | list: A list of tuples, each representing a pair of indices that are not in the excluded pairs. 
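Example (illustrative):
    >>> Utils.generate_index_pairs(3, [(0, 2)])
    [(0, 1), (1, 2)]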
106 | """ 107 | index_pairs = [] 108 | for i in range(total_points - 1): 109 | for j in range(i + 1, total_points): 110 | if (i, j) not in excluded_pairs: 111 | index_pairs.append((i, j)) 112 | return index_pairs 113 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/api_docs/backbone_learn.exact_solvers.rst: -------------------------------------------------------------------------------- 1 | backbone\_learn.exact\_solvers package 2 | ====================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | backbone\_learn.exact\_solvers.benders\_oct\_decision\_tree module 8 | ------------------------------------------------------------------ 9 | 10 | .. automodule:: backbone_learn.exact_solvers.benders_oct_decision_tree 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | backbone\_learn.exact\_solvers.exact\_solver\_base module 16 | --------------------------------------------------------- 17 | 18 | .. automodule:: backbone_learn.exact_solvers.exact_solver_base 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | backbone\_learn.exact\_solvers.lobnb\_regression module 24 | ------------------------------------------------------- 25 | 26 | .. automodule:: backbone_learn.exact_solvers.lobnb_regression 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | backbone\_learn.exact\_solvers.lobnb\_regression\_model module 32 | -------------------------------------------------------------- 33 | 34 | .. automodule:: backbone_learn.exact_solvers.lobnb_regression_model 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | backbone\_learn.exact\_solvers.mio\_clustering module 40 | ----------------------------------------------------- 41 | 42 | .. automodule:: backbone_learn.exact_solvers.mio_clustering 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | Module contents 48 | --------------- 49 | 50 | .. automodule:: backbone_learn.exact_solvers 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | -------------------------------------------------------------------------------- /docs/api_docs/backbone_learn.heuristic_solvers.rst: -------------------------------------------------------------------------------- 1 | backbone\_learn.heuristic\_solvers package 2 | ========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | backbone\_learn.heuristic\_solvers.cart\_decision\_tree module 8 | -------------------------------------------------------------- 9 | 10 | .. 
automodule:: backbone_learn.heuristic_solvers.cart_decision_tree 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | backbone\_learn.heuristic\_solvers.heauristic\_solver\_base module 16 | ------------------------------------------------------------------ 17 | 18 | .. automodule:: backbone_learn.heuristic_solvers.heauristic_solver_base 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | backbone\_learn.heuristic\_solvers.kmeans\_solver module 24 | -------------------------------------------------------- 25 | 26 | .. automodule:: backbone_learn.heuristic_solvers.kmeans_solver 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | backbone\_learn.heuristic\_solvers.lasso\_regression module 32 | ----------------------------------------------------------- 33 | 34 | .. automodule:: backbone_learn.heuristic_solvers.lasso_regression 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: backbone_learn.heuristic_solvers 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/api_docs/backbone_learn.rst: -------------------------------------------------------------------------------- 1 | backbone\_learn package 2 | ======================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | backbone_learn.exact_solvers 11 | backbone_learn.heuristic_solvers 12 | backbone_learn.screen_selectors 13 | 14 | Module contents 15 | --------------- 16 | 17 | .. automodule:: backbone_learn 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | -------------------------------------------------------------------------------- /docs/api_docs/backbone_learn.screen_selectors.rst: -------------------------------------------------------------------------------- 1 | backbone\_learn.screen\_selectors package 2 | ========================================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | backbone\_learn.screen\_selectors.linear\_regression\_selector module 8 | --------------------------------------------------------------------- 9 | 10 | .. automodule:: backbone_learn.screen_selectors.linear_regression_selector 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | backbone\_learn.screen\_selectors.pearson\_correlation\_selector module 16 | ----------------------------------------------------------------------- 17 | 18 | .. automodule:: backbone_learn.screen_selectors.pearson_correlation_selector 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | backbone\_learn.screen\_selectors.screen\_selector\_base module 24 | --------------------------------------------------------------- 25 | 26 | .. automodule:: backbone_learn.screen_selectors.screen_selector_base 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | backbone\_learn.screen\_selectors.variable\_selector module 32 | ----------------------------------------------------------- 33 | 34 | .. automodule:: backbone_learn.screen_selectors.variable_selector 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. 
automodule:: backbone_learn.screen_selectors 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | import os 3 | import sys 4 | 5 | sys.path.insert(0, os.path.abspath("..")) 6 | 7 | # 8 | # For the full list of built-in configuration values, see the documentation: 9 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 10 | 11 | # -- Project information ----------------------------------------------------- 12 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 13 | 14 | project = "backbone-learn" 15 | copyright = "2023, Christos Ziakas, Vassilis Digalakis Jr" 16 | author = "Christos Ziakas, Vassilis Digalakis Jr" 17 | 18 | # -- General configuration --------------------------------------------------- 19 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 20 | 21 | extensions = [ 22 | "readthedocs_ext.readthedocs", 23 | "sphinx.ext.viewcode", 24 | "sphinx.ext.todo", 25 | "sphinx.ext.autodoc", 26 | "sphinx.ext.napoleon", 27 | ] 28 | 29 | templates_path = ["_templates"] 30 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 31 | 32 | 33 | # -- Options for HTML output ------------------------------------------------- 34 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 35 | html_theme = "sphinx_rtd_theme" 36 | html_static_path = ["_static"] 37 | -------------------------------------------------------------------------------- /docs/contribution_guide/CONDUCT.rst: -------------------------------------------------------------------------------- 1 | Code of Conduct for BackboneLearn 2 | ================================= 3 | 4 | Our Pledge 5 | ---------- 6 | 7 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 8 | 9 | Our Standards 10 | ------------- 11 | 12 | Examples of behavior that contributes to creating a positive environment include: 13 | 14 | - Using welcoming and inclusive language 15 | - Being respectful of differing viewpoints and experiences 16 | - Gracefully accepting constructive criticism 17 | - Focusing on what is best for the community 18 | - Showing empathy towards other community members 19 | 20 | Examples of unacceptable behavior by participants include: 21 | 22 | - The use of sexualized language or imagery and unwelcome sexual attention or advances 23 | - Trolling, insulting/derogatory comments, and personal or political attacks 24 | - Public or private harassment 25 | - Publishing others' private information, such as a physical or electronic address, without explicit permission 26 | - Other conduct which could reasonably be considered inappropriate in a professional setting 27 | 28 | Our Responsibilities 29 | --------------------- 30 | 31 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 
32 | 33 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned with this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 34 | 35 | Scope 36 | ----- 37 | 38 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 39 | 40 | Enforcement 41 | ----------- 42 | 43 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at chziakas@gmail.com. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. 44 | 45 | This Code of Conduct is adapted from the `Contributor Covenant <https://www.contributor-covenant.org>`_, version 1.4, available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 46 | -------------------------------------------------------------------------------- /docs/contribution_guide/CONTRIBUTING.rst: -------------------------------------------------------------------------------- 1 | Contributing to BackboneLearn 2 | ============================= 3 | 4 | Introduction 5 | ------------ 6 | 7 | Thank you for considering a contribution to BackboneLearn! 8 | This project is dedicated to advancing the field of mixed integer optimization in machine learning. Contributions in various forms, such as new methods, bug fixes, or documentation enhancements, are all highly valued. 9 | 10 | How to Contribute 11 | ----------------- 12 | 13 | Step 1: Understanding the Project 14 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 15 | 16 | Explore the `BackboneLearn documentation `_ to understand the project's scope, functionalities, and architecture. We encourage future contributors to review our research paper and other key open-source libraries. 17 | 18 | Step 2: Setting Up 19 | ^^^^^^^^^^^^^^^^^^ 20 | 21 | **Fork and Clone the Repository:** 22 | 23 | - Fork the BackboneLearn repository on GitHub. 24 | - Clone your fork to your local environment. 25 | 26 | **Environment Setup:** 27 | 28 | - Install Python 3.9, if not already installed. 29 | - Set up your development environment using Poetry for dependency management: 30 | 31 | .. code-block:: bash 32 | 33 | pip install poetry 34 | poetry install 35 | 36 | - Utilize pre-commit hooks to maintain code quality: 37 | 38 | .. code-block:: bash 39 | 40 | pre-commit install 41 | 42 | Step 3: Making Changes 43 | ^^^^^^^^^^^^^^^^^^^^^^ 44 | 45 | **Finding or Creating an Issue:** 46 | 47 | - Check the `GitHub issues `_ for existing tasks or bugs. 48 | - You can also create new issues for proposing features or improvements. 49 | 50 | **Creating a Branch:** 51 | 52 | - Create a new branch in your fork for your changes. 53 | 54 | **Developing:** 55 | 56 | - Implement your changes, ensuring to adhere to the project's coding standards. 57 | - Write or update tests using Pytest. 58 | - Run and pass all tests before submission (see the example command after this list).
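For example, the full suite can be run locally the way the CI workflow runs it (assuming `pytest-cov` is installed in the Poetry environment):

.. code-block:: bash

    poetry run pytest --cov=backbone_learn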
59 | 60 | Step 4: Submitting Your Contribution 61 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 62 | 63 | **Committing and Pushing Changes:** 64 | 65 | - Commit your changes with clear messages. 66 | - Push your changes to your fork on GitHub. 67 | 68 | **Creating a Pull Request:** 69 | 70 | - Open a pull request against the main BackboneLearn repository. 71 | - Describe your changes in detail and link to any relevant issues. 72 | 73 | **Code Review and GitHub Actions:** 74 | 75 | - Engage in the code review process. 76 | - Your code will be automatically checked by GitHub Actions. 77 | - Make necessary changes based on feedback. 78 | 79 | 80 | Reporting Issues and Suggestions 81 | -------------------------------- 82 | 83 | **Check Existing Issues:** 84 | 85 | Before reporting, search the existing GitHub issues to make sure the problem has not already been reported. 86 | 87 | **Create New Issue:** 88 | 89 | Navigate to 'Issues' in the BackboneLearn repository and click 'New Issue'. 90 | 91 | **Fill the Template and Submit:** 92 | 93 | Provide a detailed title and description. Include steps to reproduce, expected and actual behavior, code snippets, or screenshots if applicable. After completing the form, click 'Submit new issue'. 94 | 95 | Questions or Issues? 96 | -------------------- 97 | 98 | If you encounter any issues or have questions, please open a discussion issue on GitHub. 99 | -------------------------------------------------------------------------------- /docs/developer_guide/custom_implementation_guide.rst: -------------------------------------------------------------------------------- 1 | Custom Implementation Guide 2 | --------------------------- 3 | 4 | To build customized backbone algorithms for BackboneLearn, follow these steps: 5 | 6 | Custom Screening Method 7 | ^^^^^^^^^^^^^^^^^^^^^^^ 8 | 9 | **Extend `ScreenSelectorBase` in `'backbone_learn/screen_selectors'`.** 10 | 11 | **Implement the `calculate_utilities` Method:** 12 | 13 | - Compute utilities or importances for each feature based on your criteria. 14 | - Features with the lowest scores may be considered for elimination. 15 | - The number of features to keep is `alpha * n_features`. 16 | 17 | Custom Heuristic Method 18 | ^^^^^^^^^^^^^^^^^^^^^^^ 19 | 20 | **Extend `HeuristicSolverBase` in `'backbone_learn/heuristic_solvers'`.** 21 | **Implement `fit` and `get_relevant_features`:** 22 | 23 | - Train a model on each subproblem efficiently. 24 | - Fit a sparse model to the subproblem's data inputs. 25 | - Identify and extract the relevant features. 26 | - Define any necessary parameters in the `__init__` or `fit` method. 27 | 28 | Custom Exact Solver 29 | ^^^^^^^^^^^^^^^^^^^ 30 | 31 | **Extend `ExactSolverBase` in `'backbone_learn/exact_solvers'`.** 32 | **Implement `fit` and `predict` Methods:** 33 | 34 | - Apply `fit` to the reduced backbone set. 35 | - Use a method with optimality guarantees. 36 | - Ensure the model can be used for prediction. 37 | 38 | Custom Backbone Algorithm 39 | ^^^^^^^^^^^^^^^^^^^^^^^^^ 40 | 41 | **Extend `BackboneSupervised` or `BackboneUnsupervised` in `'backbone_learn/backbone'`.** 42 | **Implement `set_solvers`:** 43 | 44 | - Add your customized screen selector, heuristic solver, and exact solver. 45 | - The screen selector and heuristic solver are optional; set them to `None` to skip those phases. 46 | - Pass solver parameters through manually (e.g., via `**kwargs`). 47 | 48 | Example Usage for Customized Backbone Algorithm 49 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 50 | 51 | .. 
code-block:: python 52 | 53 | class CustomBackboneAlgorithm(BackboneSupervised): 54 | def set_solvers(self, **kwargs): 55 | self.screen_selector = CustomScreenSelector(**kwargs) 56 | self.heuristic_solver = CustomHeuristicSolver(**kwargs) 57 | self.exact_solver = CustomExactSolver(**kwargs) 58 | 59 | # Initialize with custom parameters 60 | backbone_algorithm = CustomBackboneAlgorithm(alpha=0.5, beta=0.3, num_subproblems=3, **kwargs) 61 | 62 | # Fit the model 63 | backbone_algorithm.fit(X, y) 64 | 65 | # Make predictions 66 | predictions = backbone_algorithm.predict(X_new) 67 | -------------------------------------------------------------------------------- /docs/developer_guide/example_usage_customized_backbone.rst: -------------------------------------------------------------------------------- 1 | Example Usage for Customized Backbone Algorithm 2 | ----------------------------------------------- 3 | 4 | Here's an example of how you can create a custom Backbone algorithm for a supervised method: 5 | 6 | .. code-block:: python 7 | 8 | class CustomBackboneAlgorithm(BackboneSupervised): 9 | def set_solvers(self, **kwargs): 10 | # Init screen selector or set None to skip 11 | self.screen_selector = CustomScreenSelector(**kwargs) 12 | # Init heuristic solver or set None to skip 13 | self.heuristic_solver = CustomHeuristicSolver(**kwargs) 14 | # Init exact solver 15 | self.exact_solver = CustomExactSolver(**kwargs) 16 | 17 | Here's how you can use the custom Backbone algorithm: 18 | 19 | .. code-block:: python 20 | 21 | # Initialize with custom parameters 22 | backbone_algorithm = CustomBackboneAlgorithm(alpha=0.5, beta=0.3, num_subproblems=3, **kwargs) 23 | 24 | # Fit the model 25 | backbone_algorithm.fit(X, y) 26 | 27 | # Make predictions 28 | predictions = backbone_algorithm.predict(X_new) 29 | -------------------------------------------------------------------------------- /docs/getting_started/clustering.rst: -------------------------------------------------------------------------------- 1 | Clustering with MIO Formulation Model 2 | ------------------------------------- 3 | 4 | .. code-block:: python 5 | 6 | from backbone_learn.backbone.backbone_clustering import BackboneClustering 7 | # Initialize BackboneClustering 8 | backbone = BackboneClustering(beta=1.0, num_subproblems=5, num_iterations=1, n_clusters=5) 9 | # Fit the model 10 | backbone.fit(X) 11 | # Make predictions 12 | y_pred = backbone.predict(X) 13 | -------------------------------------------------------------------------------- /docs/getting_started/decision_trees.rst: -------------------------------------------------------------------------------- 1 | Decision Trees with BendersOCT Model 2 | ------------------------------------ 3 | 4 | .. code-block:: python 5 | 6 | from backbone_learn.backbone.backbone_decision_tree import BackboneDecisionTree 7 | # Initialize BackboneDecisionTree 8 | backbone = BackboneDecisionTree(alpha=0.5, beta=0.5, num_subproblems=5, num_iterations=1, depth=3, _lambda=0.5) 9 | # Fit the model 10 | backbone.fit(X, y) 11 | # Make predictions 12 | y_pred = backbone.predict(X) 13 | -------------------------------------------------------------------------------- /docs/getting_started/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ------------ 3 | 4 | Install BackboneLearn using pip: 5 | 6 | .. code-block:: bash 7 | 8 | pip install backbone-learn 9 | 10 | .. 
note:: 11 | 12 | For ODT implementations, please follow the instructions described in the library to install and compile CBC from source using coinbrew: `ODTLearn CBC Binaries `_. 13 | -------------------------------------------------------------------------------- /docs/getting_started/sparse_regression.rst: -------------------------------------------------------------------------------- 1 | Sparse Regression with L0BnB Model 2 | ---------------------------------- 3 | 4 | .. code-block:: python 5 | 6 | from backbone_learn.backbone.backbone_sparse_regression import BackboneSparseRegression 7 | # Initialize BackboneSparseRegression 8 | backbone = BackboneSparseRegression(alpha=0.5, beta=0.5, num_subproblems=5, num_iterations=1, lambda_2=0.001, max_nonzeros=10) 9 | # Fit the model 10 | backbone.fit(X, y) 11 | # Make predictions 12 | y_pred = backbone.predict(X) 13 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. _backbone-learn-documentation: 2 | 3 | ============================== 4 | BackboneLearn Documentation 5 | ============================== 6 | 7 | BackboneLearn: A Framework for Scaling Mixed Integer Optimization Problems with Indicator Variables to High-Dimensional Problems 8 | =============================================================================================================================== 9 | 10 | Introduction 11 | ------------ 12 | 13 | Welcome to the BackboneLearn documentation. This framework is designed to scale mixed integer optimization problems with indicator variables to high-dimensional problems, providing advanced tools and methodologies for researchers and developers. Here, you will find all the necessary information to get started, understand the core concepts, and effectively use or contribute to BackboneLearn. 14 | 15 | .. contents:: 16 | :local: 17 | :depth: 1 18 | 19 | .. _getting-started: 20 | 21 | Getting Started 22 | --------------- 23 | .. toctree:: 24 | :maxdepth: 1 25 | 26 | getting_started/installation 27 | getting_started/sparse_regression 28 | getting_started/decision_trees 29 | getting_started/clustering 30 | 31 | .. _user-guide: 32 | 33 | User Guide 34 | ---------- 35 | .. toctree:: 36 | :maxdepth: 1 37 | 38 | user_guide/overview 39 | user_guide/indicators_explanation 40 | user_guide/backbone_framework 41 | 42 | .. _developer-guide: 43 | 44 | Developer Guide 45 | --------------- 46 | .. toctree:: 47 | :maxdepth: 1 48 | 49 | developer_guide/custom_implementation_guide 50 | developer_guide/example_usage_customized_backbone 51 | 52 | .. _contribution-guide: 53 | 54 | Contribution Guide 55 | ------------------ 56 | .. toctree:: 57 | :maxdepth: 1 58 | 59 | contribution_guide/CONTRIBUTING 60 | contribution_guide/CONDUCT 61 | 62 | .. _api-documentation: 63 | 64 | API Documentation 65 | ----------------- 66 | .. 
toctree:: 67 | :maxdepth: 1 68 | 69 | api_docs/backbone_learn 70 | api_docs/backbone_learn.screen_selectors 71 | api_docs/backbone_learn.heuristic_solvers 72 | api_docs/backbone_learn.exact_solvers 73 | 74 | Indices and Tables 75 | ------------------ 76 | * :ref:`genindex` 77 | * :ref:`modindex` 78 | * :ref:`search` 79 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/modules.rst: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx==5.1.1 2 | sphinx-rtd-theme==1.0.0 3 | numpy ==1.26.1 4 | scikit-learn == 1.3.2 5 | l0bnb == 1.0.0 6 | seaborn == 0.13.0 7 | pulp == 2.7.0 8 | git+https://github.com/D3M-Research-Group/odtlearn.git#egg=odtlearn 9 | -------------------------------------------------------------------------------- /docs/user_guide/backbone_framework.rst: -------------------------------------------------------------------------------- 1 | Backbone Framework 2 | ------------------ 3 | 4 | The backbone framework, upon which *BackboneLearn* is built, operates in two phases: we first extract a “backbone set” of potentially ``relevant indicators`` (i.e., indicators that are nonzero in the optimal solution) by solving a number of specially chosen, tractable subproblems; we then use traditional techniques to solve a reduced problem to optimality or near-optimality, considering only the backbone indicators. A screening step often precedes the first phase, to discard indicators that are almost surely irrelevant. For more details, check the paper by Bertsimas and Digalakis Jr (2022) `here `_. 5 | -------------------------------------------------------------------------------- /docs/user_guide/indicators_explanation.rst: -------------------------------------------------------------------------------- 1 | What do we mean by indicators? 2 | ============================== 3 | 4 | Indicators are binary variables that are part of the MIO problem we use to train the aforementioned models. 5 | 6 | - Sparse regression: Each regression coefficient (and the corresponding feature) is paired with an indicator, which is 1 if the coefficient is nonzero and 0 otherwise. 
7 | - Decision trees: An indicator corresponds to a feature in a decision tree node, being nonzero if that feature is chosen for branching at that node. 8 | - Clustering: An indicator represents whether a pair of data points are in the same cluster, being nonzero if they are clustered together. 9 | -------------------------------------------------------------------------------- /docs/user_guide/overview.rst: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | *BackboneLearn* is an open-source software package and framework for scaling mixed integer optimization (MIO) problems with indicator variables to high-dimensional problems. This optimization paradigm can naturally be used to formulate fundamental problems in interpretable supervised learning (e.g., sparse regression and decision trees), in unsupervised learning (e.g., clustering), and beyond; *BackboneLearn* solves the aforementioned problems faster than exact methods and with higher accuracy than commonly used heuristics, while also scaling to large problem sizes. The package is built in Python and is user-friendly and easily extendible: users can directly implement a backbone algorithm for their MIO problem at hand. For more details, check our paper: `here `_. 5 | -------------------------------------------------------------------------------- /examples/decision_tree_toy_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%%capture\n", 10 | "!pip install backbone-learn\n", 11 | "!pip install git+https://github.com/D3M-Research-Group/odtlearn.git#egg=odtlearn\n" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "name": "stdout", 21 | "output_type": "stream", 22 | "text": [ 23 | "DYLD_LIBRARY_PATH: 'os.environ.get('DYLD_LIBRARY_PATH')'\n", 24 | "PMIP_CBC_LIBRARY: 'os.environ.get('PMIP_CBC_LIBRARY')'\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "import os\n", 30 | "# Resolve '~' to the actual home directory path\n", 31 | "home_directory = os.path.expanduser('~')\n", 32 | "os.environ['DYLD_LIBRARY_PATH'] = os.path.join(home_directory, 'CBC/dist/lib')\n", 33 | "os.environ['PMIP_CBC_LIBRARY'] = os.path.join(home_directory, 'CBC/dist/lib/libCbc.dylib')\n", 34 | "\n", 35 | "# Verify that the environment variables are set\n", 36 | "print(f\"DYLD_LIBRARY_PATH: {os.environ.get('DYLD_LIBRARY_PATH')}\")\n", 37 | "print(f\"PMIP_CBC_LIBRARY: {os.environ.get('PMIP_CBC_LIBRARY')}\")" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "2023-11-22 21:18:59,819 - INFO - Screen selection started.\n", 50 | "2023-11-22 21:18:59,819 - INFO - Number of variables included in the heuristic solver: 2\n", 51 | "2023-11-22 21:18:59,819 - INFO - Backbone construction with heuristic solver started for iterations:1,\n", 52 | " subproblems:5 , and beta:0.5\n", 53 | "2023-11-22 21:18:59,820 - INFO - Iteration 1 started.\n" 54 | ] 55 | }, 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "Fitting 5 folds for each of 12 candidates, totalling 60 fits\n", 61 | "Fitting 5 folds for each of 12 candidates, totalling 60 fits\n", 62 | "Fitting 5 folds for each of 12 candidates, 
totalling 60 fits\n", 63 | "Fitting 5 folds for each of 12 candidates, totalling 60 fits\n" 64 | ] 65 | }, 66 | { 67 | "name": "stderr", 68 | "output_type": "stream", 69 | "text": [ 70 | "2023-11-22 21:19:00,098 - INFO - Iteration 1 completed.\n", 71 | "2023-11-22 21:19:00,099 - INFO - Backbone set idx: [0 1]\n", 72 | "2023-11-22 21:19:00,099 - INFO - Number of variables included in the exact solver: 2\n" 73 | ] 74 | }, 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "Fitting 5 folds for each of 12 candidates, totalling 60 fits\n", 80 | "Cbc0045I May be able to increase cutoff increment to 0.0999 - but we have lazy constraints\n", 81 | "Cbc0031I 11 added rows had average density of 5.4545455\n", 82 | "Cbc0013I At root node, 11 cuts changed objective from 90 to 89.65 in 8 passes\n", 83 | "Cbc0014I Cut generator 0 (LazyConstraints) - 331 row cuts average 4.3 elements, 0 column cuts (0 active) in 0.019 seconds - new frequency is 1\n", 84 | "Cbc0014I Cut generator 1 (Probing) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.000 seconds - new frequency is -100\n", 85 | "Cbc0014I Cut generator 2 (Gomory) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.000 seconds - new frequency is -100\n", 86 | "Cbc0014I Cut generator 3 (Knapsack) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.000 seconds - new frequency is -100\n", 87 | "Cbc0014I Cut generator 4 (MixedIntegerRounding2) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.000 seconds - new frequency is -100\n", 88 | "Cbc0014I Cut generator 7 (ZeroHalf) - 0 row cuts average 0.0 elements, 0 column cuts (0 active) in 0.000 seconds - new frequency is -100\n", 89 | "Cbc0004I Integer solution of 80 found after 56 iterations and 5 nodes (0.16 seconds)\n", 90 | "Cbc0004I Integer solution of 81.7 found after 249 iterations and 25 nodes (0.41 seconds)\n", 91 | "Cbc0016I Integer solution of 81.8 found by strong branching after 275 iterations and 28 nodes (0.45 seconds)\n", 92 | "Cbc0010I After 48 nodes, 2 on tree, 81.8 best solution, best possible 88.866667 (0.69 seconds)\n", 93 | "Cbc0010I After 119 nodes, 1 on tree, 81.8 best solution, best possible 87.75 (1.40 seconds)\n" 94 | ] 95 | }, 96 | { 97 | "name": "stderr", 98 | "output_type": "stream", 99 | "text": [ 100 | "2023-11-22 21:19:01,791 - INFO - Exact problem solved.\n" 101 | ] 102 | }, 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Cbc0001I Search completed - best objective 81.80000000000004, took 1293 iterations and 146 nodes (1.66 seconds)\n", 108 | "Cbc0032I Strong branching done 260 times (560 iterations), fathomed 0 nodes and fixed 0 variables\n", 109 | "Cbc0035I Maximum depth 7, 7 variables fixed on reduced cost\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "from sklearn.datasets import make_classification\n", 115 | "from backbone_learn.backbone.backbone_decision_tree import BackboneDecisionTree\n", 116 | "\n", 117 | "# Generate synthetic data. 
\n", 118 | "X, y = make_classification(n_samples=100, n_features=5, n_classes=2, random_state=17)\n", 119 | "\n", 120 | "# Run backbone for decision tress\n", 121 | "backbone = BackboneDecisionTree(alpha=0.5, beta=0.5, num_subproblems=5, num_iterations=1, depth=3, time_limit=100, _lambda=0.1, num_threads=None, obj_mode=\"acc\")\n", 122 | "backbone.fit(X, y)\n", 123 | "\n", 124 | "# Make predictions\n", 125 | "y_pred = backbone.predict(X)" 126 | ] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 3", 132 | "language": "python", 133 | "name": "python3" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 3 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython3", 145 | "version": "3.9.7" 146 | }, 147 | "orig_nbformat": 4 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 2 151 | } 152 | -------------------------------------------------------------------------------- /examples/sparse_regression_toy_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%%capture\n", 10 | "!pip install backbone-learn;\n" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 8, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stderr", 20 | "output_type": "stream", 21 | "text": [ 22 | "2023-11-22 21:39:29,435 - INFO - Iteration 1 completed.\n", 23 | "2023-11-22 21:39:29,437 - INFO - Backbone set idx: [ 7 13 14 26 30 45 54 62 63 79 83 97 98 106 113 132 136 138\n", 24 | " 143 145 153 154 156 167 174 192 209 211 212 216 223 241 243 244 251 255\n", 25 | " 256 261 267 274 284 296 299 311 357 360 365 367 381 391 394 398 409 419\n", 26 | " 425 442 446 448 463 465 469 470 476 481 483 493]\n", 27 | "2023-11-22 21:39:29,440 - INFO - Number of variables included in the exact solver: 66\n" 28 | ] 29 | }, 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "Preprocessing Data.\n", 35 | "BnB Started.\n", 36 | "Iteration: 1. Number of non-zeros: 1\n", 37 | "Iteration: 2. Number of non-zeros: 1\n", 38 | "Iteration: 3. Number of non-zeros: 2\n", 39 | "Iteration: 4. Number of non-zeros: 3\n", 40 | "Iteration: 5. Number of non-zeros: 4\n", 41 | "Iteration: 6. Number of non-zeros: 4\n", 42 | "Iteration: 7. Number of non-zeros: 4\n", 43 | "Iteration: 8. Number of non-zeros: 4\n", 44 | "Iteration: 9. Number of non-zeros: 4\n", 45 | "Iteration: 10. Number of non-zeros: 4\n", 46 | "Iteration: 11. Number of non-zeros: 4\n", 47 | "Iteration: 12. Number of non-zeros: 4\n", 48 | "Iteration: 13. Number of non-zeros: 4\n", 49 | "Iteration: 14. Number of non-zeros: 4\n", 50 | "Iteration: 15. Number of non-zeros: 4\n" 51 | ] 52 | }, 53 | { 54 | "name": "stderr", 55 | "output_type": "stream", 56 | "text": [ 57 | "2023-11-22 21:39:30,987 - INFO - Exact problem solved.\n" 58 | ] 59 | }, 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "Iteration: 16. Number of non-zeros: 4\n", 65 | "Iteration: 17. 
Number of non-zeros: 65\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# Import libraries\n", 71 | "from sklearn.datasets import make_regression\n", 72 | "from backbone_learn.backbone.backbone_sparse_regression import BackboneSparseRegression\n", 73 | " \n", 74 | "# Generate fake data\n", 75 | "X, y = make_regression(\n", 76 | "    n_samples=100, n_features=500, n_informative=5, noise=0.1, random_state=42\n", 77 | ")\n", 78 | "\n", 79 | "# Run BackboneSparseRegression\n", 80 | "backbone = BackboneSparseRegression(alpha=0.5, beta=0.5, num_subproblems=5, num_iterations=1, lambda_2=0.001, max_nonzeros=10)\n", 81 | "backbone.fit(X, y)\n", 82 | "\n", 83 | "# Make predictions\n", 84 | "y_pred = backbone.predict(X)" 85 | ] 86 | } 87 | ], 88 | "metadata": { 89 | "kernelspec": { 90 | "display_name": "Python 3", 91 | "language": "python", 92 | "name": "python3" 93 | }, 94 | "language_info": { 95 | "codemirror_mode": { 96 | "name": "ipython", 97 | "version": 3 98 | }, 99 | "file_extension": ".py", 100 | "mimetype": "text/x-python", 101 | "name": "python", 102 | "nbconvert_exporter": "python", 103 | "pygments_lexer": "ipython3", 104 | "version": "3.9.7" 105 | }, 106 | "orig_nbformat": 4 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 2 110 | } 111 | -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chziakas/backbone-learn/576749c7a1ffa1e57ea4c018a88a052d20fad2e2/experiments/__init__.py -------------------------------------------------------------------------------- /experiments/benchmark_clustering.py: -------------------------------------------------------------------------------- 1 | import time 2 | from itertools import product 3 | 4 | import numpy as np 5 | from sklearn.datasets import make_blobs 6 | from utils import save_results 7 | 8 | from backbone_learn.backbone.backbone_clustering import BackboneClustering 9 | from backbone_learn.heuristic_solvers.kmeans_solver import KMeansSolver 10 | 11 | # Define parameter ranges for BackboneClustering 12 | beta_range = [1.0] 13 | num_subproblems_range = [3, 5, 10] 14 | num_iterations_range = [1] 15 | n_clusters_range = [5] 16 | n_features_range = [2] 17 | n_samples_range = [200] 18 | 19 | # Constants 20 | random_state = 17 21 | time_limit = 3600 22 | log_filename = "clustering_results.json" 23 | 24 | 25 | results = [] 26 | # Experiment loop 27 | for n_samples, n_clusters, n_features in product( 28 | n_samples_range, n_clusters_range, n_features_range 29 | ): 30 | X, _ = make_blobs( 31 | n_samples=n_samples, n_features=n_features, cluster_std=1.0, random_state=random_state 32 | ) 33 | X = np.random.default_rng(random_state).standard_normal((n_samples, n_features)) - 0.5 + X 34 | 35 | # KMeansSolver model iteration (labeled as 'heuristic') 36 | heuristic_model = KMeansSolver(n_clusters=n_clusters) 37 | start_time = time.time() 38 | heuristic_model.fit(X, random_state=random_state) 39 | heuristic_runtime = time.time() - start_time 40 | heuristic_wcss = heuristic_model.wcss 41 | heuristic_silhouette = heuristic_model.silhouette_score 42 | 43 | # Record heuristic model results 44 | result_heuristic = { 45 | "model_name": "heuristic", 46 | "n_samples": n_samples, 47 | "n_clusters": n_clusters, 48 | "n_features": n_features, 49 | "WCSS": heuristic_wcss, 50 | "silhouette": heuristic_silhouette, 51 | "Runtime (seconds)": heuristic_runtime, 52 | } 53 | results.append(result_heuristic) 54 
| save_results(results, log_filename) 55 | 56 | # BackboneClustering model iterations for 'backbone' solvers 57 | for beta, num_subproblems, num_iterations in product( 58 | beta_range, num_subproblems_range, num_iterations_range 59 | ): 60 | backbone_model = BackboneClustering( 61 | beta=beta, 62 | num_subproblems=num_subproblems, 63 | num_iterations=num_iterations, 64 | n_clusters=n_clusters, 65 | time_limit=time_limit, 66 | ) 67 | start_time = time.time() 68 | backbone_model.fit(X) 69 | backbone_runtime = time.time() - start_time 70 | backbone_size_diff_cluster = len(backbone_model.exact_solver.ls_pairs_diff_cluster) 71 | backbone_size_same_cluster = len(backbone_model.exact_solver.ls_pairs_same_cluster) 72 | backbone_wcss = backbone_model.exact_solver.wcss 73 | backbone_wcss_heur = backbone_model.heuristic_solver.wcss 74 | backbone_silhouette_heur = backbone_model.heuristic_solver.silhouette_score 75 | backbone_silhouette = backbone_model.exact_solver.silhouette_score 76 | 77 | # Record backbone model results 78 | result_backbone = { 79 | "model_name": "backbone", 80 | "n_samples": n_samples, 81 | "n_clusters": n_clusters, 82 | "beta": beta, 83 | "num_subproblems": num_subproblems, 84 | "num_iterations": num_iterations, 85 | "backbone_size_same_cluster": backbone_size_same_cluster, 86 | "backbone_size_diff_cluster": backbone_size_diff_cluster, 87 | "WCSS": backbone_wcss, 88 | "WCSS_heur": backbone_wcss_heur, 89 | "silhouette": backbone_silhouette, 90 | "silhouette_heur": backbone_silhouette_heur, 91 | "Runtime (seconds)": backbone_runtime, 92 | } 93 | results.append(result_backbone) 94 | save_results(results, log_filename) 95 | 96 | # BackboneClustering model iterations for 'exact' solver 97 | exact_model = BackboneClustering(n_clusters=n_clusters, time_limit=time_limit) 98 | exact_model.screen_selector = None 99 | exact_model.heuristic_solver = None 100 | start_time = time.time() 101 | exact_model.fit(X) 102 | exact_runtime = time.time() - start_time 103 | exact_wcss = exact_model.exact_solver.wcss 104 | exact_silhouette = exact_model.exact_solver.silhouette_score 105 | 106 | # Record exact model results 107 | result_exact = { 108 | "model_name": "exact", 109 | "n_samples": n_samples, 110 | "n_clusters": n_clusters, 111 | "n_features": n_features, 112 | "WCSS": exact_wcss, 113 | "silhouette": exact_silhouette, 114 | "Runtime (seconds)": exact_runtime, 115 | } 116 | 117 | results.append(result_exact) 118 | save_results(results, log_filename) 119 | 120 | save_results(results, log_filename) 121 | # Print or further process results 122 | for result in results: 123 | print(result) 124 | -------------------------------------------------------------------------------- /experiments/benchmark_decision_tree.py: -------------------------------------------------------------------------------- 1 | import time 2 | from itertools import product 3 | 4 | from sklearn.datasets import make_classification 5 | from sklearn.metrics import roc_auc_score 6 | from sklearn.model_selection import train_test_split 7 | from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder 8 | from utils import save_results 9 | 10 | from backbone_learn.backbone.backbone_decision_tree import BackboneDecisionTree 11 | from backbone_learn.heuristic_solvers.cart_decision_tree import CARTDecisionTree 12 | 13 | # Define parameter ranges for Backbone parameters 14 | alpha_range = [0.1, 0.5] 15 | beta_range = [0.5, 0.9] 16 | num_subproblems_range = [5, 10] 17 | num_iterations_range = [1] 18 | # Define parameter ranges for 
FlowOCT parameters 19 | depth_range = [2] 20 | _lambda_range = [0.5] 21 | 22 | # Define dataset parameters 23 | n_informative = 4 24 | n_bins = 5 25 | n_features_range = [20] 26 | n_samples = 500 27 | n_classes = 2 28 | random_state = 17 29 | time_limit = 3600 30 | log_filename = "decision_tree_results.json" 31 | results = [] 32 | 33 | # Experiment loop 34 | for n_features in n_features_range: 35 | # Generate synthetic classification data 36 | X, y = make_classification( 37 | n_samples=n_samples, 38 | n_informative=n_informative, 39 | n_features=n_features, 40 | n_classes=n_classes, 41 | random_state=random_state, 42 | ) 43 | # Convert features to binary 44 | est_X = KBinsDiscretizer( 45 | n_bins=n_bins, encode="ordinal", strategy="quantile", random_state=random_state 46 | ) 47 | est_X.fit(X) 48 | X_bin = est_X.transform(X) 49 | enc = OneHotEncoder(handle_unknown="error", drop="if_binary") 50 | X_cat_enc = enc.fit_transform(X_bin).toarray() 51 | # Split data into train and test sets 52 | X_train, X_test, y_train, y_test = train_test_split( 53 | X_cat_enc, y, test_size=0.2, random_state=random_state 54 | ) 55 | 56 | for depth in depth_range: 57 | # CARTDecisionTree model iteration for heuristic_model 58 | heuristic_model = CARTDecisionTree(max_depth=depth) 59 | start_time = time.time() 60 | heuristic_model.fit(X_train, y_train, random_state=random_state) 61 | runtime = time.time() - start_time 62 | y_pred_heuristic = heuristic_model.predict(X_test) 63 | auc_score_heuristic = roc_auc_score(y_test, y_pred_heuristic) 64 | 65 | # Record heuristic model results 66 | result_heuristic = { 67 | "model_name": "heuristic", 68 | "n_features": int(n_features * n_bins), 69 | "n_samples": n_samples, 70 | "n_informative": n_informative, 71 | "depth": depth, 72 | "AUC Score": auc_score_heuristic, 73 | "Runtime (seconds)": runtime, 74 | } 75 | results.append(result_heuristic) 76 | save_results(results, log_filename) 77 | 78 | for _lambda in _lambda_range: 79 | # BackboneDecisionTree model iterations for 'backbone' solution 80 | for alpha, beta, num_subproblems, num_iterations in product( 81 | alpha_range, beta_range, num_subproblems_range, num_iterations_range 82 | ): 83 | backbone_model = BackboneDecisionTree( 84 | alpha=alpha, 85 | beta=beta, 86 | num_subproblems=num_subproblems, 87 | num_iterations=num_iterations, 88 | depth=depth, 89 | time_limit=time_limit, 90 | threshold=0.001, 91 | n_bins=n_bins, 92 | is_data_fit=True, 93 | ) 94 | start_time = time.time() 95 | backbone_model.fit(X_train, y_train) 96 | runtime = time.time() - start_time 97 | y_pred_backbone = backbone_model.predict(X_test) 98 | backbone_size = len(backbone_model.variables_exact_idx) 99 | auc_score_backbone = roc_auc_score(y_test, y_pred_backbone) 100 | 101 | # Record backbone model results 102 | result_backbone = { 103 | "model_name": "backbone", 104 | "n_features": int(n_features * n_bins), 105 | "n_samples": n_samples, 106 | "n_informative": n_informative, 107 | "alpha": alpha, 108 | "beta": beta, 109 | "num_subproblems": num_subproblems, 110 | "num_iterations": num_iterations, 111 | "depth": depth, 112 | "_lambda": _lambda, 113 | "backbone_size": backbone_size, 114 | "AUC Score": auc_score_backbone, 115 | "Runtime (seconds)": runtime, 116 | } 117 | results.append(result_backbone) 118 | save_results(results, log_filename) 119 | # BackboneDecisionTree model iterations for 'exact' solution 120 | exact_model = None 121 | exact_model = BackboneDecisionTree( 122 | depth=depth, _lambda=_lambda, time_limit=time_limit, n_bins=n_bins, 
is_data_fit=True 123 | ) 124 | start_time = time.time() 125 | exact_model.fit(X_train, y_train) 126 | runtime = time.time() - start_time 127 | y_pred_exact = exact_model.predict(X_test) 128 | auc_score_exact = roc_auc_score(y_test, y_pred_exact) 129 | 130 | # Record exact model results 131 | result_exact = { 132 | "model_name": "exact", 133 | "n_features": int(n_features * n_bins), 134 | "n_samples": n_samples, 135 | "n_informative": n_informative, 136 | "depth": depth, 137 | "_lambda": _lambda, 138 | "AUC Score": auc_score_exact, 139 | "Runtime (seconds)": runtime, 140 | } 141 | results.append(result_exact) 142 | save_results(results, log_filename) 143 | 144 | save_results(results, log_filename) 145 | # Print or further process results 146 | for result in results: 147 | print(result) 148 | -------------------------------------------------------------------------------- /experiments/benchmark_sparse_regression.py: -------------------------------------------------------------------------------- 1 | import time 2 | from itertools import product 3 | 4 | from l0bnb import gen_synthetic 5 | from sklearn.metrics import r2_score 6 | from sklearn.model_selection import train_test_split 7 | from utils import save_results 8 | 9 | from backbone_learn.backbone.backbone_sparse_regression import BackboneSparseRegression 10 | from backbone_learn.heuristic_solvers.lasso_regression import LassoRegression 11 | 12 | # Define parameter ranges for Backbone parameters 13 | alpha_range = [0.1, 0.5] 14 | beta_range = [0.5, 0.9] 15 | num_subproblems_range = [5, 10] 16 | num_iterations_range = [1] 17 | # Define parameter ranges for L0BnB parameters 18 | lambda_2_range = [0.01] 19 | n_non_zeros = 10 20 | max_nonzeros = 10 21 | 22 | # Define range for features and other constants 23 | n_features_range = [5000] 24 | n_samples = 500 25 | random_state = 17 26 | time_limit = 1800 27 | log_filename = "sparse_regression_results.json" 28 | results = [] 29 | 30 | 31 | # Experiment loop 32 | for n_features in n_features_range: 33 | # Generate synthetic regression data 34 | X, y, b = gen_synthetic(n=n_samples, p=n_features, supp_size=n_non_zeros) 35 | 36 | # Split data into train and test sets 37 | X_train, X_test, y_train, y_test = train_test_split( 38 | X, y, test_size=0.2, random_state=random_state 39 | ) 40 | 41 | # Lasso regression model iteration for heuristic_model 42 | heuristic_model = LassoRegression() 43 | start_time = time.time() 44 | heuristic_model.fit(X_train, y_train, random_state=random_state) 45 | runtime = time.time() - start_time 46 | heuristic_model.keep_top_features(n_non_zeros) 47 | y_pred_heuristic = heuristic_model.predict(X_test) 48 | r2_score_heuristic = r2_score(y_test, y_pred_heuristic) 49 | 50 | # Record heuristic model results 51 | result_heuristic = { 52 | "model_name": "heuristic", 53 | "n_features": n_features, 54 | "R2 Score": r2_score_heuristic, 55 | "Runtime (seconds)": runtime, 56 | } 57 | results.append(result_heuristic) 58 | save_results(results, log_filename) 59 | 60 | # BackboneSparseRegression model iterations for 'backbone' and 'exact' solvers 61 | for lambda_2 in lambda_2_range: 62 | # Exact model iteration using BackboneSparseRegression 63 | exact_model = BackboneSparseRegression(lambda_2=lambda_2, max_nonzeros=max_nonzeros, time_limit=time_limit) 64 | exact_model.screen_selector = None 65 | exact_model.heuristic_solver = None 66 | start_time = time.time() 67 | exact_model.fit(X_train, y_train) 68 | runtime = time.time() - start_time 69 | y_pred_exact = exact_model.predict(X_test) 70 | 
r2_score_exact = r2_score(y_test, y_pred_exact) 71 | 72 | # Record exact model results 73 | result_exact = { 74 | "model_name": "exact", 75 | "n_features": n_features, 76 | "lambda_2": lambda_2, 77 | "R2 Score": r2_score_exact, 78 | "Runtime (seconds)": runtime, 79 | } 80 | results.append(result_exact) 81 | save_results(results, log_filename) 82 | 83 | # Backbone model iteration using BackboneSparseRegression 84 | for alpha, beta, num_subproblems, num_iterations in product( 85 | alpha_range, beta_range, num_subproblems_range, num_iterations_range 86 | ): 87 | backbone_model = BackboneSparseRegression( 88 | alpha=alpha, 89 | beta=beta, 90 | num_subproblems=num_subproblems, 91 | num_iterations=num_iterations, 92 | lambda_2=lambda_2, 93 | max_nonzeros=max_nonzeros, 94 | time_limit=time_limit, 95 | ) 96 | start_time = time.time() 97 | backbone_model.fit(X_train, y_train) 98 | runtime = time.time() - start_time 99 | y_pred_backbone = backbone_model.predict(X_test) 100 | backbone_size = len(backbone_model.variables_exact_idx) 101 | r2_score_backbone = r2_score(y_test, y_pred_backbone) 102 | 103 | # Record backbone model results 104 | result_backbone = { 105 | "model_name": "backbone", 106 | "n_features": n_features, 107 | "backbone_size": backbone_size, 108 | "alpha": alpha, 109 | "beta": beta, 110 | "num_subproblems": num_subproblems, 111 | "num_iterations": num_iterations, 112 | "lambda_2": lambda_2, 113 | "R2 Score": r2_score_backbone, 114 | "Runtime (seconds)": runtime, 115 | } 116 | results.append(result_backbone) 117 | save_results(results, log_filename) 118 | 119 | save_results(results, log_filename) 120 | # Print results 121 | for result in results: 122 | print(result) 123 | -------------------------------------------------------------------------------- /experiments/clustering_results.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "model_name": "heuristic", 4 | "n_samples": 200, 5 | "n_clusters": 5, 6 | "n_features": 2, 7 | "WCSS": 1485.2453931611212, 8 | "silhouette": 0.45449417943273795, 9 | "Runtime (seconds)": 0.2598001956939697 10 | }, 11 | { 12 | "model_name": "backbone", 13 | "n_samples": 200, 14 | "n_clusters": 5, 15 | "beta": 1.0, 16 | "num_subproblems": 3, 17 | "num_iterations": 1, 18 | "backbone_size_same_cluster": 3800, 19 | "backbone_size_diff_cluster": 15045, 20 | "WCSS": 1670.2469025116184, 21 | "WCSS_heur": 1666.0759199384097, 22 | "silhouette": 0.48111898151214705, 23 | "silhouette_heur": 0.4819429619600324, 24 | "Runtime (seconds)": 115.66834926605225 25 | }, 26 | { 27 | "model_name": "backbone", 28 | "n_samples": 200, 29 | "n_clusters": 5, 30 | "beta": 1.0, 31 | "num_subproblems": 5, 32 | "num_iterations": 1, 33 | "backbone_size_same_cluster": 3483, 34 | "backbone_size_diff_cluster": 14679, 35 | "WCSS": 1670.2469025116184, 36 | "WCSS_heur": 1782.0085648273534, 37 | "silhouette": 0.48111898151214705, 38 | "silhouette_heur": 0.4692787030931956, 39 | "Runtime (seconds)": 116.12881302833557 40 | }, 41 | { 42 | "model_name": "backbone", 43 | "n_samples": 200, 44 | "n_clusters": 5, 45 | "beta": 1.0, 46 | "num_subproblems": 10, 47 | "num_iterations": 1, 48 | "backbone_size_same_cluster": 3483, 49 | "backbone_size_diff_cluster": 14679, 50 | "WCSS": 1670.2469025116181, 51 | "WCSS_heur": 1422.6397776725594, 52 | "silhouette": 0.48111898151214705, 53 | "silhouette_heur": 0.47963434498809876, 54 | "Runtime (seconds)": 116.12559533119202 55 | } 56 | ] 57 | 
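The result logs above can be reloaded for post-hoc analysis with the helpers in experiments/utils.py. A minimal sketch (assuming it is run from the experiments/ directory so the relative filename resolves; the field names are taken from the JSON records above):

    from utils import load_results

    # Compare solvers on within-cluster sum of squares and runtime
    for record in load_results("clustering_results.json"):
        print(record["model_name"], record["WCSS"], record["Runtime (seconds)"])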
-------------------------------------------------------------------------------- /experiments/decision_tree_results.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "model_name": "heuristic", 4 | "n_features": 100, 5 | "n_samples": 500, 6 | "n_informative": 4, 7 | "depth": 2, 8 | "AUC Score": 0.7075320512820511, 9 | "Runtime (seconds)": 0.5711669921875 10 | }, 11 | { 12 | "model_name": "backbone", 13 | "n_features": 100, 14 | "n_samples": 500, 15 | "n_informative": 4, 16 | "alpha": 0.1, 17 | "beta": 0.5, 18 | "num_subproblems": 5, 19 | "num_iterations": 1, 20 | "depth": 2, 21 | "_lambda": 0.5, 22 | "backbone_size": 9, 23 | "AUC Score": 0.7139423076923077, 24 | "Runtime (seconds)": 33.90367794036865 25 | }, 26 | { 27 | "model_name": "backbone", 28 | "n_features": 100, 29 | "n_samples": 500, 30 | "n_informative": 4, 31 | "alpha": 0.1, 32 | "beta": 0.5, 33 | "num_subproblems": 10, 34 | "num_iterations": 1, 35 | "depth": 2, 36 | "_lambda": 0.5, 37 | "backbone_size": 10, 38 | "AUC Score": 0.7139423076923077, 39 | "Runtime (seconds)": 108.0183379650116 40 | }, 41 | { 42 | "model_name": "backbone", 43 | "n_features": 100, 44 | "n_samples": 500, 45 | "n_informative": 4, 46 | "alpha": 0.1, 47 | "beta": 0.9, 48 | "num_subproblems": 5, 49 | "num_iterations": 1, 50 | "depth": 2, 51 | "_lambda": 0.5, 52 | "backbone_size": 10, 53 | "AUC Score": 0.7139423076923077, 54 | "Runtime (seconds)": 130.53349113464355 55 | }, 56 | { 57 | "model_name": "backbone", 58 | "n_features": 100, 59 | "n_samples": 500, 60 | "n_informative": 4, 61 | "alpha": 0.1, 62 | "beta": 0.9, 63 | "num_subproblems": 10, 64 | "num_iterations": 1, 65 | "depth": 2, 66 | "_lambda": 0.5, 67 | "backbone_size": 10, 68 | "AUC Score": 0.7139423076923077, 69 | "Runtime (seconds)": 151.21881985664368 70 | }, 71 | { 72 | "model_name": "backbone", 73 | "n_features": 100, 74 | "n_samples": 500, 75 | "n_informative": 4, 76 | "alpha": 0.5, 77 | "beta": 0.5, 78 | "num_subproblems": 5, 79 | "num_iterations": 1, 80 | "depth": 2, 81 | "_lambda": 0.5, 82 | "backbone_size": 45, 83 | "AUC Score": 0.7139423076923077, 84 | "Runtime (seconds)": 3601.8234469890594 85 | }, 86 | { 87 | "model_name": "backbone", 88 | "n_features": 100, 89 | "n_samples": 500, 90 | "n_informative": 4, 91 | "alpha": 0.5, 92 | "beta": 0.5, 93 | "num_subproblems": 10, 94 | "num_iterations": 1, 95 | "depth": 2, 96 | "_lambda": 0.5, 97 | "backbone_size": 48, 98 | "AUC Score": 0.7139423076923077, 99 | "Runtime (seconds)": 2760.6268701553345 100 | }, 101 | { 102 | "model_name": "backbone", 103 | "n_features": 100, 104 | "n_samples": 500, 105 | "n_informative": 4, 106 | "alpha": 0.5, 107 | "beta": 0.9, 108 | "num_subproblems": 5, 109 | "num_iterations": 1, 110 | "depth": 2, 111 | "_lambda": 0.5, 112 | "backbone_size": 42, 113 | "AUC Score": 0.7139423076923077, 114 | "Runtime (seconds)": 2284.8329169750214 115 | }, 116 | { 117 | "model_name": "backbone", 118 | "n_features": 100, 119 | "n_samples": 500, 120 | "n_informative": 4, 121 | "alpha": 0.5, 122 | "beta": 0.9, 123 | "num_subproblems": 10, 124 | "num_iterations": 1, 125 | "depth": 2, 126 | "_lambda": 0.5, 127 | "backbone_size": 46, 128 | "AUC Score": 0.7139423076923077, 129 | "Runtime (seconds)": 3136.7788972854614 130 | }, 131 | { 132 | "model_name": "exact", 133 | "n_features": 100, 134 | "n_samples": 500, 135 | "n_informative": 4, 136 | "depth": 2, 137 | "_lambda": 0.5, 138 | "AUC Score": 0.6386217948717949, 139 | "Runtime (seconds)": 3613.021523952484 140 | } 141 | ] 142 | 
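A similar illustrative sketch for the decision-tree benchmarks, grouping the records above by model to compare the best AUC reached (field names as in the JSON above; the aggregation itself is not part of the repository):

    from collections import defaultdict

    from utils import load_results

    best_auc = defaultdict(float)
    for record in load_results("decision_tree_results.json"):
        best_auc[record["model_name"]] = max(best_auc[record["model_name"]], record["AUC Score"])
    print(dict(best_auc))  # best AUC per model: 'heuristic', 'backbone', 'exact'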
-------------------------------------------------------------------------------- /experiments/sparse_regression_results.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "model_name": "heuristic", 4 | "n_features": 5000, 5 | "R2 Score": 0.871319780401133, 6 | "Runtime (seconds)": 14.54098629951477 7 | }, 8 | { 9 | "model_name": "exact", 10 | "n_features": 5000, 11 | "lambda_2": 0.01, 12 | "R2 Score": 0.8831016782086357, 13 | "Runtime (seconds)": 1175.0771400928497 14 | }, 15 | { 16 | "model_name": "backbone", 17 | "n_features": 5000, 18 | "backbone_size": 376, 19 | "alpha": 0.1, 20 | "beta": 0.5, 21 | "num_subproblems": 5, 22 | "num_iterations": 1, 23 | "lambda_2": 0.01, 24 | "R2 Score": 0.883565153769904, 25 | "Runtime (seconds)": 483.44028091430664 26 | }, 27 | { 28 | "model_name": "backbone", 29 | "n_features": 5000, 30 | "backbone_size": 449, 31 | "alpha": 0.1, 32 | "beta": 0.5, 33 | "num_subproblems": 10, 34 | "num_iterations": 1, 35 | "lambda_2": 0.01, 36 | "R2 Score": 0.883565153769904, 37 | "Runtime (seconds)": 691.6709449291229 38 | }, 39 | { 40 | "model_name": "backbone", 41 | "n_features": 5000, 42 | "backbone_size": 87, 43 | "alpha": 0.1, 44 | "beta": 0.9, 45 | "num_subproblems": 5, 46 | "num_iterations": 1, 47 | "lambda_2": 0.01, 48 | "R2 Score": 0.883565153769904, 49 | "Runtime (seconds)": 205.71164202690125 50 | }, 51 | { 52 | "model_name": "backbone", 53 | "n_features": 5000, 54 | "backbone_size": 307, 55 | "alpha": 0.1, 56 | "beta": 0.9, 57 | "num_subproblems": 10, 58 | "num_iterations": 1, 59 | "lambda_2": 0.01, 60 | "R2 Score": 0.883565153769904, 61 | "Runtime (seconds)": 637.3638026714325 62 | }, 63 | { 64 | "model_name": "backbone", 65 | "n_features": 5000, 66 | "backbone_size": 471, 67 | "alpha": 0.5, 68 | "beta": 0.5, 69 | "num_subproblems": 5, 70 | "num_iterations": 1, 71 | "lambda_2": 0.01, 72 | "R2 Score": 0.883565153769904, 73 | "Runtime (seconds)": 574.69575715065 74 | }, 75 | { 76 | "model_name": "backbone", 77 | "n_features": 5000, 78 | "backbone_size": 308, 79 | "alpha": 0.5, 80 | "beta": 0.5, 81 | "num_subproblems": 10, 82 | "num_iterations": 1, 83 | "lambda_2": 0.01, 84 | "R2 Score": 0.883565153769904, 85 | "Runtime (seconds)": 647.5597870349884 86 | }, 87 | { 88 | "model_name": "backbone", 89 | "n_features": 5000, 90 | "backbone_size": 40, 91 | "alpha": 0.5, 92 | "beta": 0.9, 93 | "num_subproblems": 5, 94 | "num_iterations": 1, 95 | "lambda_2": 0.01, 96 | "R2 Score": 0.883565153769904, 97 | "Runtime (seconds)": 93.9878761768341 98 | }, 99 | { 100 | "model_name": "backbone", 101 | "n_features": 5000, 102 | "backbone_size": 87, 103 | "alpha": 0.5, 104 | "beta": 0.9, 105 | "num_subproblems": 10, 106 | "num_iterations": 1, 107 | "lambda_2": 0.01, 108 | "R2 Score": 0.883565153769904, 109 | "Runtime (seconds)": 202.19328689575195 110 | } 111 | ] 112 | -------------------------------------------------------------------------------- /experiments/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | 5 | def save_results(results, filename): 6 | """ 7 | Saves the provided results to a file in JSON format. 8 | 9 | This function takes a dictionary of results and a filename, then writes the results to the specified file in 10 | the same directory as the script. The results are stored in JSON format with an indentation of 4 spaces for readability. 11 | 12 | Args: 13 | results (dict): A dictionary containing the results data to be saved. 
14 | filename (str): The name of the file where the results will be saved. The file will be created in the same 15 | directory as the script. 16 | 17 | Raises: 18 | IOError: If the file cannot be opened or written to. 19 | """ 20 | script_dir = os.path.dirname(os.path.abspath(__file__)) 21 | filepath = os.path.join(script_dir, filename) 22 | with open(filepath, "w") as file: 23 | json.dump(results, file, indent=4) 24 | 25 | 26 | def load_results(filename): 27 | """ 28 | Loads existing results from a JSON file. 29 | 30 | This function attempts to open and read the specified JSON file. If the file exists, it parses the JSON content into a Python object (typically a list or dictionary, depending on the JSON structure) and returns it. If the file does not exist, the function returns an empty list. 31 | 32 | Args: 33 | filename (str): The name of the file from which to load the results. 34 | 35 | Returns: 36 | list or dict: The contents of the JSON file parsed into a Python object. Returns an empty list if the file does not exist. 37 | 38 | Raises: 39 | json.JSONDecodeError: If the file contents are not valid JSON. 40 | IOError: If the file cannot be opened for reasons other than non-existence. 41 | """ 42 | try: 43 | with open(filename, "r") as file: 44 | return json.load(file) 45 | except FileNotFoundError: 46 | return [] 47 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "backbone-learn" 3 | version = "0.1.3" 4 | description = "A Library for Scaling Mixed-Integer Optimization-Based Machine Learning." 5 | authors = ["Christos Ziakas, Vassilis Digalakis Jr."] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = ">=3.9,<3.12" 10 | pre-commit = "^3.5.0" 11 | pytest = "^7.4.3" 12 | numpy = "^1.26.1" 13 | scikit-learn = "^1.3.2" 14 | l0bnb = "^1.0.0" 15 | seaborn = "^0.13.0" 16 | pulp = "^2.7.0" 17 | jupyter = "^1.0.0" 18 | ipykernel = "^6.26.0" 19 | sphinx = "^7.2.6" 20 | 21 | 22 | [tool.poetry.group.dev.dependencies] 23 | ipykernel = "^6.26.0" 24 | 25 | [build-system] 26 | requires = ["poetry-core"] 27 | build-backend = "poetry.core.masonry.api" 28 | -------------------------------------------------------------------------------- /references.md: -------------------------------------------------------------------------------- 1 | @article{hazimeh2021sparse, 2 | title={Sparse regression at scale: Branch-and-bound rooted in first-order optimization}, 3 | author={Hazimeh, Hussein and Mazumder, Rahul and Saab, Ali}, 4 | journal={Mathematical Programming}, 5 | pages={1--42}, 6 | year={2021}, 7 | publisher={Springer} 8 | } 9 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/chziakas/backbone-learn/576749c7a1ffa1e57ea4c018a88a052d20fad2e2/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_backbone/test_backbone_clustering.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 
3 | 4 | import numpy as np 5 | import pytest 6 | from sklearn.datasets import make_blobs 7 | 8 | from backbone_learn.backbone.backbone_clustering import BackboneClustering 9 | from backbone_learn.heuristic_solvers.kmeans_solver import KMeansSolver 10 | 11 | 12 | @pytest.fixture 13 | def sample_data(): 14 | X, _ = make_blobs(n_samples=10, centers=2, n_features=3, random_state=17) 15 | return X 16 | 17 | 18 | def test_backbone_clustering(sample_data): 19 | n_clusters = 5 20 | backbone_model = BackboneClustering( 21 | beta=1.0, num_subproblems=1, num_iterations=1, n_clusters=n_clusters, time_limit=3600 22 | ) 23 | backbone_model.fit(sample_data) 24 | 25 | # Test if the model has fitted 26 | if backbone_model.exact_solver is None: 27 | raise AssertionError("Backbone model's exact solver is not initialized") 28 | if backbone_model.heuristic_solver is None: 29 | raise AssertionError("Backbone model's heuristic solver is not initialized") 30 | 31 | # Test constraints are applied 32 | for (i, j) in backbone_model.exact_solver.ls_pairs_diff_cluster: 33 | for k in range(n_clusters): 34 | y_sum = backbone_model.exact_solver.y[i, k] + backbone_model.exact_solver.y[j, k] 35 | if y_sum >= 2: 36 | raise AssertionError("Constraint on y_sum violated") 37 | if backbone_model.exact_solver.z[i, j, k] != 0.0: 38 | raise AssertionError("Constraint on z violated") 39 | 40 | # Test silhouette scores 41 | if not (0 <= backbone_model.exact_solver.silhouette_score <= 1): 42 | raise AssertionError("Exact solver's silhouette score out of range") 43 | if not (0 <= backbone_model.heuristic_solver.silhouette_score <= 1): 44 | raise AssertionError("Heuristic solver's silhouette score out of range") 45 | 46 | 47 | def test_kmeans_solver(sample_data): 48 | n_clusters = 5 49 | heuristic_model = KMeansSolver(n_clusters=n_clusters) 50 | heuristic_model.fit(sample_data) 51 | 52 | # Test if the model has fitted 53 | if heuristic_model.model is None: 54 | raise AssertionError("KMeans solver model is not initialized") 55 | 56 | # Test silhouette scores 57 | if not (0 <= heuristic_model.silhouette_score <= 1): 58 | raise AssertionError("KMeans solver's silhouette score out of range") 59 | 60 | # Test WCSS 61 | if heuristic_model.wcss < 0: 62 | raise AssertionError("KMeans solver's WCSS is negative") 63 | 64 | 65 | def test_preprocessing_predict_without_assert(sample_data): 66 | # Instantiate BackboneClustering class 67 | backbone_model = BackboneClustering() 68 | 69 | # Call preprocessing_predict method with sample data 70 | processed_X = backbone_model.preprocessing_predict(sample_data) 71 | 72 | # Check if the returned dataset is the same as the input 73 | if not np.array_equal(processed_X, sample_data): 74 | raise ValueError("preprocessing_predict should return the original dataset without changes") 75 | -------------------------------------------------------------------------------- /tests/test_backbone/test_backbone_decision_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 
3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from backbone_learn.backbone.backbone_decision_tree import BackboneDecisionTree 8 | from backbone_learn.exact_solvers.benders_oct_decision_tree import BendersOCTDecisionTree 9 | from backbone_learn.heuristic_solvers.cart_decision_tree import CARTDecisionTree 10 | 11 | 12 | @pytest.fixture 13 | def sample_data(): 14 | # Create a simple dataset for testing 15 | X = np.random.rand(50, 2) # 50 samples, 2 features 16 | y = np.random.randint(0, 2, 50) # Binary target 17 | return X, y 18 | 19 | 20 | def test_initialization(): 21 | """Test initialization of BackboneDecisionTree.""" 22 | backbone = BackboneDecisionTree() 23 | if not isinstance(backbone, BackboneDecisionTree): 24 | raise AssertionError("Initialization of BackboneDecisionTree failed") 25 | 26 | 27 | def test_set_solvers(sample_data): 28 | """Test the set_solvers method with sample data.""" 29 | backbone = BackboneDecisionTree() 30 | backbone.set_solvers( 31 | alpha=0.5, depth=3, time_limit=1000, _lambda=0.5, num_threads=1, obj_mode="acc", n_bins=2 32 | ) 33 | 34 | # Test if solvers are set correctly 35 | if not isinstance(backbone.exact_solver, BendersOCTDecisionTree): 36 | raise AssertionError("exact_solver is not an instance of BendersOCTDecisionTree") 37 | if not isinstance(backbone.heuristic_solver, CARTDecisionTree): 38 | raise AssertionError("heuristic_solver is not an instance of CARTDecisionTree") 39 | 40 | 41 | def test_feature_screening(sample_data): 42 | """Test the feature screening process.""" 43 | X, y = sample_data 44 | backbone = BackboneDecisionTree() 45 | backbone.set_solvers(alpha=0.5) 46 | screened_features = backbone.screen_selector.select(X, y) 47 | 48 | # Test that the number of features after screening is correct 49 | if not (0 < screened_features.shape[1] <= X.shape[1]): 50 | raise AssertionError("Feature screening did not return correct number of features") 51 | 52 | 53 | def test_exact_solver_integration(sample_data): 54 | """Test the integration of the exact solver.""" 55 | X, y = sample_data 56 | backbone = BackboneDecisionTree() 57 | backbone.set_solvers(depth=3, time_limit=500, _lambda=0.5) 58 | backbone.exact_solver.fit(X, y) 59 | 60 | # Asserting model has been fitted 61 | if backbone.exact_solver.model is None: 62 | raise AssertionError("exact_solver model has not been fitted") 63 | 64 | 65 | def test_heuristic_solver_integration(sample_data): 66 | """Test the integration of the heuristic solver.""" 67 | X, y = sample_data 68 | backbone = BackboneDecisionTree() 69 | backbone.set_solvers() 70 | backbone.heuristic_solver.fit(X, y) 71 | 72 | # Asserting model has been fitted and can predict 73 | predictions = backbone.heuristic_solver.predict(X) 74 | if len(predictions) != len(y): 75 | raise AssertionError("Length of predictions does not match length of y") 76 | if not isinstance(predictions, np.ndarray): 77 | raise AssertionError("Predictions are not an instance of np.ndarray") 78 | -------------------------------------------------------------------------------- /tests/test_backbone/test_backbone_sparse_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 
3 | 4 | from sklearn.datasets import make_regression 5 | 6 | from backbone_learn.backbone.backbone_sparse_regression import BackboneSparseRegression 7 | 8 | 9 | def test_backbone_sparse_regression_initialization(): 10 | backbone = BackboneSparseRegression(alpha=0.5, beta=0.3, num_subproblems=2) 11 | if backbone.screen_selector.alpha != 0.5: 12 | raise AssertionError("Backbone screen_selector alpha not set correctly") 13 | if backbone.beta != 0.3: 14 | raise AssertionError("Backbone beta not set correctly") 15 | if backbone.num_subproblems != 2: 16 | raise AssertionError("Backbone num_subproblems not set correctly") 17 | 18 | 19 | def test_backbone_sparse_regression_predict(): 20 | X, y = make_regression( 21 | n_samples=100, n_features=50, n_informative=20, noise=0.1, random_state=42 22 | ) 23 | backbone = BackboneSparseRegression(alpha=0.5, beta=0.3, num_subproblems=2) 24 | backbone.fit(X, y) 25 | predictions = backbone.predict(X) 26 | 27 | # Validate the predictions 28 | if len(predictions) != len(y): 29 | raise AssertionError("Prediction length mismatch") 30 | 31 | 32 | def test_backbone_sparse_regression_predict_no_screening(): 33 | X, y = make_regression( 34 | n_samples=100, n_features=50, n_informative=20, noise=0.1, random_state=42 35 | ) 36 | backbone = BackboneSparseRegression(alpha=0.5, beta=0.3, num_subproblems=2) 37 | backbone.screen_selector = None 38 | backbone.fit(X, y) 39 | predictions = backbone.predict(X) 40 | 41 | # Validate the predictions 42 | if len(predictions) != len(y): 43 | raise AssertionError("Prediction length mismatch with no screening") 44 | 45 | 46 | def test_backbone_sparse_regression_predict_no_backbone(): 47 | X, y = make_regression( 48 | n_samples=100, n_features=50, n_informative=20, noise=0.1, random_state=42 49 | ) 50 | backbone = BackboneSparseRegression(alpha=0.5, beta=0.3, num_subproblems=2) 51 | backbone.heuristic_solver = None 52 | backbone.fit(X, y) 53 | predictions = backbone.predict(X) 54 | 55 | # Validate the predictions 56 | if len(predictions) != len(y): 57 | raise AssertionError("Prediction length mismatch with no backbone") 58 | -------------------------------------------------------------------------------- /tests/test_backbone/test_subproblem_constructor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 
3 | 4 | import numpy as np 5 | 6 | from backbone_learn.backbone.subproblem_costructor import SubproblemConstructor 7 | 8 | 9 | def test_subproblem_constructor_initialization(): 10 | utilities = np.array([1, 2, 3, 4, 5]) 11 | beta = 0.5 12 | num_subproblems = 3 13 | 14 | constructor = SubproblemConstructor(utilities, beta, num_subproblems) 15 | 16 | if constructor.num_features != 5: 17 | raise AssertionError("Constructor num_features not set correctly") 18 | if constructor.beta != beta: 19 | raise AssertionError("Constructor beta not set correctly") 20 | if constructor.num_features_subproblem != 3: # 5 * 0.5 rounded up 21 | raise AssertionError("Constructor num_features_subproblem not set correctly") 22 | if constructor.num_subproblems != num_subproblems: 23 | raise AssertionError("Constructor num_subproblems not set correctly") 24 | 25 | 26 | def test_subproblem_constructor_correct_number_of_subproblems(): 27 | utilities = np.array([1, 2, 3, 4, 5]) 28 | beta = 0.5 29 | num_subproblems = 3 30 | 31 | constructor = SubproblemConstructor(utilities, beta, num_subproblems) 32 | subproblems = constructor.construct_subproblems() 33 | 34 | if len(subproblems) != num_subproblems: 35 | raise AssertionError("Incorrect number of subproblems created") 36 | 37 | 38 | def test_subproblem_constructor_correct_number_of_features_in_subproblems(): 39 | utilities = np.array([1, 2, 3, 4, 5, 6]) 40 | beta = 0.4 41 | num_subproblems = 2 42 | 43 | constructor = SubproblemConstructor(utilities, beta, num_subproblems) 44 | subproblems = constructor.construct_subproblems() 45 | 46 | for subproblem in subproblems: 47 | if len(subproblem) != 3: # 6 * 0.4 rounded up 48 | raise AssertionError("Incorrect number of features in a subproblem") 49 | 50 | 51 | def test_subproblem_constructor_valid_indices(): 52 | utilities = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) 53 | beta = 0.3 54 | num_subproblems = 3 55 | 56 | constructor = SubproblemConstructor(utilities, beta, num_subproblems) 57 | subproblems = constructor.construct_subproblems() 58 | 59 | total_features = len(utilities) 60 | 61 | for subproblem in subproblems: 62 | # Check if all indices are within range 63 | if not all(0 <= idx < total_features for idx in subproblem): 64 | raise AssertionError("Invalid indices in subproblem") 65 | 66 | # Check for duplicates within a subproblem 67 | if len(set(subproblem)) != len(subproblem): 68 | raise AssertionError("Duplicates found in a subproblem") 69 | 70 | 71 | def test_create_subsets_from_X(): 72 | # Simulating a dataset X 73 | np.random.seed(0) # Setting a seed for reproducibility 74 | X = np.random.rand(100, 10) # 100 samples, 10 features 75 | 76 | # Generate random utilities for features 77 | utilities = np.random.rand(10) 78 | 79 | # Initialize SubproblemConstructor 80 | beta = 0.3 81 | num_subproblems = 3 82 | constructor = SubproblemConstructor(utilities, beta, num_subproblems) 83 | 84 | # Create subproblems and corresponding subsets of X 85 | subproblems = constructor.construct_subproblems() 86 | subsets = [X[:, subproblem] for subproblem in subproblems] 87 | 88 | # Assertions and checks 89 | if len(subsets) != num_subproblems: 90 | raise AssertionError("Incorrect number of subsets created") 91 | for subset, subproblem in zip(subsets, subproblems): 92 | if subset.shape[1] != len(subproblem): # Check the number of features in each subset 93 | raise AssertionError("Mismatch in number of features in a subset") 94 | -------------------------------------------------------------------------------- 
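The constructor tests above pin down a sizing rule worth spelling out: each subproblem holds ceil(beta * num_features) distinct feature indices, all within range. A minimal stand-alone sketch of that contract (illustrative names, not the library's code):

import math

import numpy as np

# Each subproblem samples ceil(beta * num_features) feature indices without
# replacement, which is why the tests see no duplicates and only valid indices.
utilities = np.array([1, 2, 3, 4, 5])
beta = 0.5
size = math.ceil(beta * len(utilities))  # ceil(0.5 * 5) == 3, the "rounded up" size in the tests
rng = np.random.default_rng(0)
subproblem = rng.choice(len(utilities), size=size, replace=False)  # distinct, in-range indices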
/tests/test_backbone/test_subproblem_selector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | 6 | from backbone_learn.backbone.subproblem_feature_selector import SubproblemFeatureSelector 7 | 8 | 9 | def test_subproblem_feature_selector_initialization(): 10 | utilities = np.array([0.1, 0.4, 0.6, 0.3]) 11 | num_features_to_select = 2 12 | selector = SubproblemFeatureSelector(utilities, num_features_to_select) 13 | 14 | if not np.array_equal(selector.utilities, utilities): 15 | raise AssertionError("Selector utilities not set correctly") 16 | if selector.num_features_to_select != num_features_to_select: 17 | raise AssertionError("Selector num_features_to_select not set correctly") 18 | 19 | 20 | def test_subproblem_feature_selector_selection(): 21 | utilities = np.array([0.1, 0.4, 0.6, 0.3]) 22 | num_features_to_select = 2 23 | selector = SubproblemFeatureSelector(utilities, num_features_to_select) 24 | 25 | selected_indices = selector.select() 26 | 27 | # Check if the length of the selected indices is correct 28 | if len(selected_indices) != num_features_to_select: 29 | raise AssertionError("Incorrect number of features selected") 30 | 31 | # Check if selected indices are valid 32 | if not all(0 <= idx < len(utilities) for idx in selected_indices): 33 | raise AssertionError("Invalid indices selected") 34 | 35 | 36 | def test_subproblem_feature_selector_probability_distribution(): 37 | utilities = np.array([0, 10, 20, 30]) 38 | num_features_to_select = 1 39 | selector = SubproblemFeatureSelector(utilities, num_features_to_select) 40 | 41 | # Run the selection many times; the most frequently drawn index should have the highest utility 42 | counts = np.zeros(len(utilities)) 43 | for _ in range(1000): 44 | idx = selector.select()[0] 45 | counts[idx] += 1 46 | 47 | if np.argmax(counts) != np.argmax(utilities): 48 | raise AssertionError("Probability distribution does not align with utility values") 49 | 50 | 51 | def test_compute_probability_distribution(): 52 | utilities = np.array([1, 2, 3, 4, 5]) 53 | expected_probabilities = np.exp(utilities / np.max(utilities) + 1) 54 | expected_probabilities /= expected_probabilities.sum() 55 | 56 | selector = SubproblemFeatureSelector(utilities, num_features_to_select=3) 57 | computed_probabilities = selector.probability_distribution 58 | 59 | if not np.allclose(computed_probabilities.sum(), 1): 60 | raise AssertionError("Probabilities should sum up to 1") 61 | 62 | if not np.allclose(computed_probabilities, expected_probabilities): 63 | raise AssertionError("Computed probabilities do not match expected values") 64 | -------------------------------------------------------------------------------- /tests/test_exact_solvers/test_benders_oct_decision_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 
3 | 4 | import numpy as np 5 | from sklearn.datasets import make_classification 6 | from sklearn.preprocessing import KBinsDiscretizer, OneHotEncoder 7 | 8 | from backbone_learn.exact_solvers.benders_oct_decision_tree import BendersOCTDecisionTree 9 | 10 | 11 | def test_default_initialization(): 12 | tree = BendersOCTDecisionTree() 13 | if not isinstance(tree.est_X, KBinsDiscretizer): 14 | raise AssertionError("tree.est_X is not an instance of KBinsDiscretizer") 15 | if not isinstance(tree.enc, OneHotEncoder): 16 | raise AssertionError("tree.enc is not an instance of OneHotEncoder") 17 | if tree.is_data_fit: 18 | raise AssertionError("tree.is_data_fit should be False on default initialization") 19 | 20 | 21 | def test_preprocess_features_with_numpy(): 22 | tree = BendersOCTDecisionTree() 23 | X = np.random.rand(10, 2) # Sample data 24 | tree.fit_preprocessors(X) # Fit preprocessors first 25 | X_transformed = tree.preprocess_features(X) 26 | if X_transformed.shape != (10, 2): # Expected shape 27 | raise AssertionError("X_transformed does not have the expected shape") 28 | 29 | 30 | def test_fit_preprocessors(): 31 | tree = BendersOCTDecisionTree() 32 | X_train = np.random.rand(10, 2) # Sample training data 33 | tree.fit_preprocessors(X_train) 34 | if tree.est_X.n_bins_ is None: # est_X should be fitted 35 | raise AssertionError("tree.est_X.n_bins_ is not set after fitting preprocessors") 36 | if tree.enc.categories_ is None: # enc should be fitted 37 | raise AssertionError("tree.enc.categories_ is not set after fitting preprocessors") 38 | 39 | 40 | def test_predict_after_fitting_preprocessors(): 41 | X, y = make_classification(n_samples=100, n_features=5, random_state=42) 42 | model = BendersOCTDecisionTree() 43 | model.fit(X, y) 44 | predictions = model.predict(X) 45 | if len(predictions) != len(y): 46 | raise AssertionError("Prediction length mismatch with input") 47 | 48 | 49 | def test_predict_after_fitting_preprocessors_fit(): 50 | X, y = make_classification(n_samples=100, n_features=5, random_state=42) 51 | X_binary = (X > 0.5).astype(int) # Convert to binary (0 or 1) 52 | model = BendersOCTDecisionTree(is_data_fit=True) # Set is_data_fit to True for testing 53 | model.fit(X_binary, y) 54 | predictions = model.predict(X_binary) 55 | if len(predictions) != len(y): 56 | raise AssertionError("Prediction length mismatch with input") 57 | 58 | 59 | def test_predict_after_fitting_preprocessors_no_fit(): 60 | X, y = make_classification(n_samples=100, n_features=5, random_state=42) 61 | model = BendersOCTDecisionTree(is_data_fit=False) 62 | model.fit(X, y) 63 | predictions = model.predict(X) 64 | if len(predictions) != len(y): 65 | raise AssertionError("Prediction length mismatch with input") 66 | -------------------------------------------------------------------------------- /tests/test_exact_solvers/test_lobnb_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 
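# Annotation (assumption, not part of the original file): L0BnBRegression presumably
# wraps the l0bnb branch-and-bound solver for L0-regularized (best-subset) linear
# regression; in the test below, lambda_2 sets the additional L2 (ridge) penalty
# and max_nonzeros caps the number of nonzero coefficients in the solution.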
3 | 4 | from sklearn.datasets import make_regression 5 | 6 | from backbone_learn.exact_solvers.lobnb_regression import L0BnBRegression 7 | 8 | 9 | def test_l0bnb_regression(): 10 | # Generate a synthetic regression dataset 11 | X, y = make_regression( 12 | n_samples=100, n_features=20, n_informative=5, noise=0.1, random_state=42 13 | ) 14 | 15 | # Initialize and fit the L0BnBRegression model 16 | l0bnb_reg = L0BnBRegression(lambda_2=0.01, max_nonzeros=10) 17 | l0bnb_reg.fit(X, y) 18 | 19 | # Test that the model is initialized and solutions are found (check for None first to avoid an AttributeError) 20 | if l0bnb_reg.model is None: 21 | raise AssertionError("L0BnBRegression model is not initialized") 22 | if len(l0bnb_reg.model.coefficients) == 0: 23 | raise AssertionError("L0BnBRegression model did not find any coefficients") 24 | 25 | # Test predictions 26 | predictions = l0bnb_reg.predict(X) 27 | if len(predictions) != len(y): 28 | raise AssertionError("Number of predictions does not match number of samples") 29 | -------------------------------------------------------------------------------- /tests/test_exact_solvers/test_mio_clustering.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | import numpy as np 5 | from sklearn.datasets import make_blobs 6 | 7 | from backbone_learn.exact_solvers.mio_clustering import MIOClustering 8 | 9 | 10 | def test_calculate_distances(): 11 | X = np.array([[0, 0], [3, 4]]) 12 | clustering = MIOClustering(n_clusters=2) 13 | distances = clustering._calculate_distances(X) 14 | if distances.shape != (2, 2, 2): 15 | raise AssertionError("Distances array does not have the expected shape (2, 2, 2)") 16 | if not (4.5 <= distances[0, 1, 0] <= 5.5): 17 | raise AssertionError("Distance calculation is not within the expected range (4.5 to 5.5)") 18 | 19 | 20 | def test_fit_predict(): 21 | X, _ = make_blobs(n_samples=10, n_features=2, centers=3, random_state=42) 22 | clustering = MIOClustering(n_clusters=3) 23 | clustering.fit(X) 24 | assignments = clustering.predict(X) 25 | if len(assignments) != 10: 26 | raise AssertionError("Number of assignments does not match number of samples") 27 | if len(set(assignments)) != 3: # Assuming 3 clusters are identified 28 | raise AssertionError( 29 | "Number of unique assignments does not match expected number of clusters" 30 | ) 31 | 32 | 33 | def test_euclidean_distance(): 34 | point1 = np.array([0, 0]) 35 | point2 = np.array([3, 4]) 36 | calculated_distance = MIOClustering.euclidean_distance(point1, point2) 37 | expected_distance = 5 38 | if not np.isclose(calculated_distance, expected_distance): 39 | raise AssertionError("Euclidean distance calculation is incorrect") 40 | 41 | 42 | def test_extract_cluster_means(): 43 | X = np.array([[1, 2], [2, 3], [1, 2], [4, 5]]) 44 | # Simulate a scenario where the model has been fitted 45 | mio_clustering = MIOClustering(n_clusters=2) 46 | mio_clustering.y = np.array( 47 | [[1, 0], [1, 0], [1, 0], [0, 1]] 48 | ) # Assume the first three points are in cluster 0 and the last point is in cluster 1 49 | 50 | cluster_means = mio_clustering.extract_cluster_means(X) 51 | expected_means = np.array( 52 | [[4 / 3, 7 / 3], [4, 5]] # cluster 0: mean of the first three points; cluster 1: the last point 53 | ) 54 | if not np.allclose(cluster_means, expected_means): 55 | raise AssertionError("Cluster means calculation is incorrect") 56 | --------------------------------------------------------------------------------
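The expected_means fixture in test_extract_cluster_means is compact; a small stand-alone sketch (an assumed equivalent computation, not the library's code) makes the one-hot-to-means step explicit:

import numpy as np

# Each one-hot row of y assigns a point to a cluster; averaging the rows of X
# per cluster reproduces the expected means used in the test above.
X = np.array([[1, 2], [2, 3], [1, 2], [4, 5]], dtype=float)
y = np.array([[1, 0], [1, 0], [1, 0], [0, 1]])
labels = y.argmax(axis=1)  # [0, 0, 0, 1]
means = np.vstack([X[labels == k].mean(axis=0) for k in range(y.shape[1])])
# means -> [[4/3, 7/3], [4, 5]], matching expected_means in the test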
/tests/test_heuristic_solvers/test_cart_decision_tree.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | from sklearn.datasets import make_classification 5 | 6 | from backbone_learn.heuristic_solvers.cart_decision_tree import CARTDecisionTree 7 | 8 | 9 | def test_fit_method(): 10 | X, y = make_classification(n_samples=100, n_features=4, n_classes=2, random_state=42) 11 | cart = CARTDecisionTree() 12 | X = (X > 0.0).astype(int) # Binarize the features before fitting 13 | cart.fit(X, y) 14 | 15 | if cart.model is None: 16 | raise AssertionError("CARTDecisionTree model not initialized after fit") 17 | if not isinstance(cart.auc_score, float): 18 | raise AssertionError("CARTDecisionTree auc_score is not a float value") 19 | if not (0 <= cart.auc_score <= 1): 20 | raise AssertionError("CARTDecisionTree auc_score is out of the expected range (0-1)") 21 | 22 | 23 | def test_get_significant_features(): 24 | # Create a synthetic dataset 25 | X, y = make_classification(n_samples=100, n_features=10, n_classes=2, random_state=42) 26 | X = (X > 0.0).astype(int) # Binarize the features 27 | # Initialize and fit the CARTDecisionTree model 28 | cart = CARTDecisionTree() 29 | cart.fit(X, y) 30 | 31 | # Set a threshold for significant features 32 | threshold = 0.1 33 | significant_features = cart.get_relevant_variables(threshold) 34 | 35 | # Check if the method identifies significant features correctly 36 | if len(significant_features) == 0: 37 | raise AssertionError("No features were identified as significant") 38 | if not all(cart.model.feature_importances_[idx] > threshold for idx in significant_features): 39 | raise AssertionError("Identified significant features do not meet the threshold") 40 | 41 | 42 | def test_cart_decision_tree_predict(): 43 | X_train, y_train = make_classification( 44 | n_samples=100, n_features=4, n_classes=2, random_state=42 45 | ) 46 | X_test, _ = make_classification(n_samples=20, n_features=4, n_classes=2, random_state=42) 47 | 48 | model = CARTDecisionTree() 49 | model.fit(X_train, y_train) 50 | predictions = model.predict(X_test) 51 | 52 | if len(predictions) != len(X_test): 53 | raise AssertionError("Number of predictions does not match number of test samples") 54 | if not all(pred in [0, 1] for pred in predictions): 55 | raise AssertionError("Predictions contain values outside of [0, 1]") 56 | -------------------------------------------------------------------------------- /tests/test_heuristic_solvers/test_kmeans.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 
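# Annotation (not part of the original file): the tests below rely on two standard
# clustering diagnostics. WCSS is the within-cluster sum of squared distances from
# each point to its assigned centroid, so it is non-negative by construction; the
# silhouette score compares intra- and inter-cluster distances and lies in [-1, 1].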
3 | 4 | import numpy as np 5 | import pytest 6 | 7 | from backbone_learn.heuristic_solvers.kmeans_solver import KMeansSolver 8 | 9 | 10 | @pytest.fixture 11 | def sample_data(): 12 | return np.array([[1, 2], [1, 4], [1, 0], [10, 2], [10, 4], [10, 0]]) 13 | 14 | 15 | def test_fit(sample_data): 16 | """Test fitting the KMeans model.""" 17 | solver = KMeansSolver(n_clusters=2) 18 | solver.fit(sample_data) 19 | if solver._model is None: 20 | raise AssertionError("KMeans model not initialized after fit") 21 | if len(solver.cluster_centers) != 2: 22 | raise AssertionError("Number of cluster centers is not 2 as expected") 23 | 24 | 25 | def test_compute_cluster_centers(sample_data): 26 | """Test computation of cluster centers.""" 27 | solver = KMeansSolver(n_clusters=2) 28 | solver.fit(sample_data) 29 | if solver.cluster_centers.shape != (2, 2): 30 | raise AssertionError("Cluster centers shape is not (2, 2) as expected") 31 | 32 | 33 | def test_compute_wcss(sample_data): 34 | """Test computation of WCSS.""" 35 | solver = KMeansSolver(n_clusters=2) 36 | solver.fit(sample_data) 37 | wcss = solver._compute_wcss(sample_data) 38 | if not isinstance(wcss, float): 39 | raise AssertionError("WCSS is not a float value") 40 | if wcss <= 0: 41 | raise AssertionError("WCSS is not greater than 0") 42 | 43 | 44 | def test_get_relevant_variables(sample_data): 45 | """Test identification of relevant variables.""" 46 | solver = KMeansSolver(n_clusters=2) 47 | solver.fit(sample_data) 48 | relevant_vars = solver.get_relevant_variables() 49 | if not isinstance(relevant_vars, list): 50 | raise AssertionError("Relevant variables not returned as a list") 51 | if not all(isinstance(pair, tuple) for pair in relevant_vars): 52 | raise AssertionError("Elements in relevant variables are not tuples") 53 | 54 | 55 | def test_compute_silhouette_score(sample_data): 56 | """Test computation of silhouette score.""" 57 | solver = KMeansSolver(n_clusters=2) 58 | solver.fit(sample_data) 59 | score = solver._compute_silhouette_score(sample_data) 60 | if not isinstance(score, float): 61 | raise AssertionError("Silhouette score is not a float value") 62 | 63 | 64 | def test_fit_with_fewer_points_than_clusters(): 65 | X = np.random.rand(5, 3) # 5 data points, 3 features 66 | solver = KMeansSolver(n_clusters=10) 67 | solver.fit(X) 68 | if solver._model is None: 69 | raise AssertionError("Solver did not fit properly with fewer points than clusters") 70 | -------------------------------------------------------------------------------- /tests/test_heuristic_solvers/test_sparse_regression.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 
3 | 4 | import numpy as np 5 | import pytest 6 | from sklearn.datasets import make_regression 7 | 8 | from backbone_learn.heuristic_solvers.lasso_regression import LassoRegression 9 | 10 | 11 | # Test Initialization 12 | def test_initialization(): 13 | reg = LassoRegression() 14 | if not isinstance(reg, LassoRegression): 15 | raise AssertionError("LassoRegression instance is not created correctly") 16 | 17 | 18 | # Test Model Fitting 19 | def test_fit(): 20 | X, y = make_regression(n_samples=100, n_features=10, noise=0.1) 21 | reg = LassoRegression() 22 | reg.fit(X, y) 23 | if not hasattr(reg._model, "coef_"): 24 | raise AssertionError("Model coefficients (coef_) not found after fitting") 25 | 26 | 27 | # Test Predict Function 28 | def test_predict(): 29 | X, y = make_regression(n_samples=100, n_features=10, noise=0.1) 30 | reg = LassoRegression() 31 | reg.fit(X, y) 32 | predictions = reg.predict(X) 33 | if len(predictions) != len(y): 34 | raise AssertionError("Prediction length mismatch") 35 | 36 | 37 | # Test Getting Relevant Variables 38 | def test_get_relevant_variables(): 39 | X, y = make_regression(n_samples=100, n_features=10, noise=0.1) 40 | reg = LassoRegression() 41 | reg.fit(X, y) 42 | threshold = 0.1 43 | significant_vars = reg.get_relevant_variables(threshold) 44 | if not isinstance(significant_vars, np.ndarray): 45 | raise AssertionError("Output is not numpy array") 46 | if len(significant_vars) == 0: 47 | raise AssertionError("No significant variables found") 48 | 49 | 50 | # Test Keeping Top Features 51 | def test_keep_top_features(): 52 | n_features = 10 53 | n_non_zeros = 5 54 | X, y = make_regression(n_samples=100, n_features=n_features, noise=0.1) 55 | reg = LassoRegression() 56 | reg.fit(X, y) 57 | reg.keep_top_features(n_non_zeros) 58 | if np.count_nonzero(reg._model.coef_) > n_non_zeros: 59 | raise AssertionError("More features retained than specified") 60 | 61 | 62 | # Test Error on Unfitted Model Prediction 63 | def test_error_on_unfitted_predict(): 64 | reg = LassoRegression() 65 | X = np.random.randn(10, 5) 66 | with pytest.raises(ValueError): 67 | _ = reg.predict(X) 68 | 69 | 70 | def test_mse_score(): 71 | # Create a small synthetic dataset 72 | X, y = make_regression(n_samples=100, n_features=10, noise=0.1, random_state=42) 73 | 74 | # Initialize and fit the LassoRegression model 75 | lasso_reg = LassoRegression() 76 | lasso_reg.fit(X, y) 77 | 78 | # Manually calculate MSE for comparison 79 | predictions = lasso_reg.predict(X) 80 | expected_mse = np.mean((y - predictions) ** 2) 81 | 82 | # Set the _mse_score manually (if it's not set in fit) 83 | lasso_reg._mse_score = expected_mse 84 | 85 | # Test that mse_score property returns the correct MSE 86 | if not np.isclose(lasso_reg.mse_score, expected_mse): 87 | raise AssertionError("The mse_score property did not return the expected value") 88 | -------------------------------------------------------------------------------- /tests/test_screen_selectors/test_linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from backbone_learn.screen_selectors.linear_regression_selector import LinearRegressionSelector 4 | 5 | 6 | def test_linear_regression_selector(): 7 | # Test data: a simple linear relationship 8 | X = np.array([[1], [2], [3], [4]]) 9 | y = np.array([3, 5, 7, 9]) 10 | 11 | # Initialize LinearRegressionSelector 12 | selector = LinearRegressionSelector() 13 | utilities = selector.calculate_utilities(X, y) 14 | 15 | # Expected 
utilities: [2], the slope of y = 2x + 1 (the intercept is excluded) 16 | expected_utilities = np.array([2]) 17 | 18 | # Verify that calculated utilities match expected values 19 | if not np.allclose(utilities, expected_utilities): 20 | raise AssertionError(f"Expected utilities {expected_utilities}, got {utilities}") 21 | 22 | 23 | # Test Utilities Calculation with Singular Matrix 24 | def test_utilities_with_singular_matrix(): 25 | selector = LinearRegressionSelector() 26 | X = np.ones((10, 3)) # Singular matrix (not invertible) 27 | y = np.random.rand(10) 28 | utilities = selector.calculate_utilities(X, y) 29 | 30 | if not np.all(utilities == 0): 31 | raise AssertionError("Utilities should be zero for singular matrix") 32 | -------------------------------------------------------------------------------- /tests/test_screen_selectors/test_pearson_correlation.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from backbone_learn.screen_selectors.pearson_correlation_selector import PearsonCorrelationSelector 5 | 6 | 7 | @pytest.fixture 8 | def synthetic_data(): 9 | X = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 10 | y = np.array([1, 2, 3]) 11 | return X, y 12 | 13 | 14 | def test_initialization(): 15 | selector = PearsonCorrelationSelector() 16 | if selector.alpha != 1.0: 17 | raise AssertionError("Selector alpha not initialized to 1.0") 18 | if selector.utilities is not None: 19 | raise AssertionError("Selector utilities not initialized as None") 20 | if selector.indices_keep is not None: 21 | raise AssertionError("Selector indices_keep not initialized as None") 22 | 23 | 24 | def test_utilities_computation(synthetic_data): 25 | X, y = synthetic_data 26 | selector = PearsonCorrelationSelector() 27 | utilities = selector.calculate_utilities(X, y) 28 | 29 | if utilities is None: 30 | raise AssertionError("Utilities computation returned None") 31 | if len(utilities) != X.shape[1]: 32 | raise AssertionError("Incorrect number of utilities computed") 33 | 34 | 35 | def test_compute_mean(): 36 | array = np.array([1, 2, 3, 4, 5]) 37 | mean = PearsonCorrelationSelector.compute_mean(array) 38 | if mean != np.mean(array): 39 | raise AssertionError("Computed mean does not match expected mean") 40 | 41 | 42 | def test_compute_std(): 43 | array = np.array([1, 2, 3, 4, 5]) 44 | std = PearsonCorrelationSelector.compute_std(array) 45 | if std != np.std(array): 46 | raise AssertionError("Computed standard deviation does not match expected value") 47 | 48 | 49 | def test_compute_covariance(): 50 | x = np.array([1, 2, 3]) 51 | y = np.array([1, 2, 3]) 52 | x_mean = np.mean(x) 53 | y_mean = np.mean(y) 54 | covariance = PearsonCorrelationSelector.compute_covariance(x, y, x_mean, y_mean) 55 | expected_covariance = np.mean((x - x_mean) * (y - y_mean)) 56 | if covariance != expected_covariance: 57 | raise AssertionError("Computed covariance does not match expected value") 58 | 59 | 60 | def test_select_with_custom_alpha(synthetic_data): 61 | X, y = synthetic_data 62 | 63 | alpha = 0.5 64 | selector = PearsonCorrelationSelector(alpha=alpha) 65 | X_selected = selector.select(X, y) 66 | 67 | expected_features = int(alpha * X.shape[1]) 68 | if X_selected.shape[1] != expected_features: 69 | raise AssertionError("Selected features do not match expected number based on alpha") 70 | 71 | 72 | def test_select_indices(): 73 | utilities = np.array([0.1, 0.5, 0.2]) 74 | num_keep = 2 75 | selected_indices = 
PearsonCorrelationSelector.select_indices(utilities, num_keep) 76 | 77 | if len(selected_indices) != num_keep: 78 | raise AssertionError("Incorrect number of indices selected") 79 | if not np.array_equal(selected_indices, np.array([1, 2])): 80 | raise AssertionError("Selected indices do not match expected top two utilities") 81 | 82 | 83 | def test_calculate_utilities_with_zero_correlation(): 84 | # Create a dataset where features are uncorrelated with the target 85 | np.random.seed(42) 86 | X = np.random.rand(100, 5) # 100 samples, 5 features 87 | y = np.full(100, 0.5) # Target variable, constant and therefore uncorrelated with X 88 | 89 | selector = PearsonCorrelationSelector() 90 | utilities = selector.calculate_utilities(X, y) 91 | 92 | # Check if all calculated utilities are zero 93 | if not np.all(utilities == 0): 94 | raise AssertionError("Utilities should be zero for uncorrelated features and target") 95 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023 Vassilis Digalakis Jr, Christos Ziakas 2 | # Licensed under the MIT License. 3 | 4 | 5 | 6 | import numpy as np 7 | 8 | from backbone_learn.utils.utils import Utils 9 | 10 | 11 | def test_keep_lowest(): 12 | # keep_lowest should return the k smallest values in their original order of appearance 13 | # Test keeping 3 lowest values 14 | arr = np.array([5, 2, 7, 3, 8, 1]) 15 | expected = np.array([2, 3, 1]) 16 | if not np.array_equal(Utils.keep_lowest(arr, 3), expected): 17 | raise AssertionError("Test keeping 3 lowest values failed") 18 | 19 | # Test keeping lowest value 20 | arr = np.array([5, 2, 7]) 21 | expected = np.array([2]) 22 | if not np.array_equal(Utils.keep_lowest(arr, 1), expected): 23 | raise AssertionError("Test keeping lowest value failed") 24 | 25 | # Test empty array 26 | arr = np.array([]) 27 | expected = np.array([]) 28 | if not np.array_equal(Utils.keep_lowest(arr, 0), expected): 29 | raise AssertionError("Test empty array failed") 30 | 31 | 32 | def test_keep_highest_standard(): 33 | arr = np.array([1, 3, 2, 4, 5]) 34 | if not np.array_equal(Utils.find_idx_highest(arr, 3), np.array([1, 3, 4])): 35 | raise AssertionError("Test keep highest standard failed") 36 | 37 | 38 | def test_keep_highest_edge_cases(): 39 | # Edge case: Empty array 40 | if not (Utils.find_idx_highest(np.array([]), 0).size == 0): 41 | raise AssertionError("Edge case with empty array failed") 42 | 43 | # Edge case: num_keep is equal to the length of the array 44 | arr = np.array([1, 2, 3]) 45 | if not np.array_equal(Utils.find_idx_highest(arr, 3), np.array([0, 1, 2])): 46 | raise AssertionError("Edge case with num_keep equal to array length failed") 47 | 48 | 49 | def test_keep_highest_error_handling(): 50 | arr = np.array([1, 2, 3]) 51 | 52 | # num_keep is negative 53 | try: 54 | Utils.find_idx_highest(arr, -1) 55 | raise AssertionError("No error raised for negative num_keep") 56 | except ValueError: 57 | pass 58 | 59 | # num_keep is larger than array length 60 | try: 61 | Utils.find_idx_highest(arr, 4) 62 | raise AssertionError("No error raised for num_keep larger than array length") 63 | except ValueError: 64 | pass 65 | 66 | 67 | def test_merge_lists_and_sort(): 68 | # Test with a simple case 69 | input_lists = [[1, 3, 2], [3, 4], [5, 1]] 70 | expected_output = [1, 2, 3, 4, 5] 71 | if not Utils.merge_lists_and_sort(input_lists) == expected_output: 72 | raise AssertionError("Simple case merge and sort failed") 73 | 74 | # Test with empty lists 
75 | input_lists = [[], [1], [], [2, 1]] 76 | expected_output = [1, 2] 77 | if not Utils.merge_lists_and_sort(input_lists) == expected_output: 78 | raise AssertionError("Test with empty lists failed") 79 | 80 | # Test with all empty lists 81 | input_lists = [[], []] 82 | expected_output = [] 83 | if not Utils.merge_lists_and_sort(input_lists) == expected_output: 84 | raise AssertionError("Test with all empty lists failed") 85 | 86 | # Test with tuples 87 | input_lists = [[(1, 2), (3, 4)], [(3, 4), (5, 6)], [(1, 2)]] 88 | expected_output = [(1, 2), (3, 4), (5, 6)] 89 | if not Utils.merge_lists_and_sort(input_lists) == expected_output: 90 | raise AssertionError("Test with tuples failed") 91 | 92 | 93 | def test_find_common_tuples(): 94 | if not Utils.find_common_tuples([[(1, 2), (1, 3)], [(2, 3), (1, 3)], [(1, 3), (4, 5)]]) == [ 95 | (1, 3) 96 | ]: 97 | raise AssertionError("Test with three sublists failed") 98 | 99 | if not (Utils.find_common_tuples([[(1, 2), (3, 4)], [(5, 6)], [(7, 8)]]) == []): 100 | raise AssertionError("Test with no common tuples failed") 101 | 102 | if not Utils.find_common_tuples([[(1, 2), (3, 4)], [(1, 2), (4, 5)], [(1, 2), (5, 6)]]) == [ 103 | (1, 2) 104 | ]: 105 | raise AssertionError("Test with common tuple in all lists failed") 106 | 107 | if not Utils.find_common_tuples([]) == []: 108 | raise AssertionError("Test with empty list failed") 109 | 110 | if not Utils.find_common_tuples([[]]) == []: 111 | raise AssertionError("Test with single empty sublist failed") 112 | 113 | 114 | def test_generate_index_pairs(): 115 | total_points = 4 116 | excluded_pairs = [(0, 2), (1, 3)] 117 | expected_output = [(0, 1), (0, 3), (1, 2), (2, 3)] 118 | 119 | if not Utils.generate_index_pairs(total_points, excluded_pairs) == expected_output: 120 | raise AssertionError("Test for generate index pairs failed") 121 | --------------------------------------------------------------------------------
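The last test pins down the contract of Utils.generate_index_pairs: all unordered index pairs (i, j) with i < j over range(total_points), minus the excluded pairs, in lexicographic order. A minimal sketch of a function meeting that contract (an assumed reimplementation for illustration, not the repository's code):

from itertools import combinations

def generate_index_pairs_sketch(total_points, excluded_pairs):
    # combinations(..., 2) yields exactly the i < j pairs in lexicographic order;
    # filtering drops the excluded ones while preserving that order.
    return [pair for pair in combinations(range(total_points), 2) if pair not in excluded_pairs]

# Reproduces the expectation in test_generate_index_pairs:
# generate_index_pairs_sketch(4, [(0, 2), (1, 3)]) == [(0, 1), (0, 3), (1, 2), (2, 3)]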