├── .dockerignore ├── .github └── workflows │ ├── pr.md │ ├── pr.yml │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── CITATION.cff ├── CNAME ├── LICENSE ├── Makefile ├── README.md ├── data ├── electricity.arff ├── electricity.csv ├── electricity_tiny.arff ├── electricity_tiny.csv ├── fried.arff └── fried.csv ├── docker ├── .env ├── docker-compose.yml └── dockerfile ├── docs ├── _static │ ├── .placeholder │ └── css │ │ └── citation.css ├── _templates │ └── autosummary │ │ ├── class.rst │ │ └── module.rst ├── about.rst ├── api │ └── index.rst ├── conf.py ├── contributing │ ├── docs.rst │ ├── index.rst │ ├── learners.md │ ├── tests.md │ ├── updating_moajar.md │ └── vcs.md ├── docker.md ├── images │ ├── CapyMOA.jpeg │ ├── arf100_cpu_time.png │ ├── arf100_cpu_time_dark.png │ ├── benchmark_20240422_221824_performance_plot_wallclock.png │ ├── docker_demo.png │ └── profiles │ │ ├── anton_lee.jpg │ │ ├── guilherme_weigert_cassales.jpg │ │ ├── heitor_murilo_gomes.jpg │ │ ├── justin_liu.jpg │ │ ├── marco_heyden.jpg │ │ ├── nuwan_gunasekara.jpg │ │ ├── vitor_cerqueira.jpg │ │ └── yibin_sun.jpg ├── index.rst ├── installation.rst ├── tutorials.rst └── util │ └── github_link.py ├── invoke.yml ├── notebooks ├── 00_getting_started.ipynb ├── 01_evaluation.ipynb ├── 02_sklearn.ipynb ├── 03_pytorch.ipynb ├── 04_drift_streams.ipynb ├── 05_new_learner.ipynb ├── 06_advanced_API.ipynb ├── 07_pipelines.ipynb ├── 08_prediction_interval.ipynb ├── 09_automl.ipynb ├── 10_ocl.ipynb ├── ClusTree_clustering_evolution.gif ├── Clustream_clustering_evolution.gif ├── Clustream_with_Kmeans_clustering_evolution.gif ├── DeNSTReaM_clustering_custom_name.gif ├── SSL_example.ipynb ├── anomaly_detection.ipynb ├── benchmarking.py ├── clustering.ipynb ├── drift_detection.ipynb ├── parallel_ensembles.ipynb ├── save_and_load_model.ipynb ├── settings_autoclass.json ├── under_construction.jpeg └── util │ └── nbmock.py ├── pyproject.toml ├── src ├── __init__.py └── capymoa │ ├── __about__.py │ ├── __init__.py │ ├── _pickle.py │ ├── _prepare_jpype.py │ ├── _utils.py │ ├── ann │ ├── __init__.py │ └── _perceptron.py │ ├── anomaly │ ├── __init__.py │ ├── _autoencoder.py │ ├── _half_space_trees.py │ ├── _online_isolation_forest.py │ └── _stream_rhf.py │ ├── automl │ ├── __init__.py │ └── _autoclass.py │ ├── base │ ├── __init__.py │ ├── _base.py │ ├── _classifier.py │ ├── _regressor.py │ └── _ssl.py │ ├── classifier │ ├── __init__.py │ ├── _adaptive_random_forest.py │ ├── _csmote.py │ ├── _dynamic_weighted_majority.py │ ├── _efdt.py │ ├── _finetune.py │ ├── _hoeffding_adaptive_tree.py │ ├── _hoeffding_tree.py │ ├── _knn.py │ ├── _leveraging_bagging.py │ ├── _majority_class.py │ ├── _naive_bayes.py │ ├── _no_change.py │ ├── _online_adwin_bagging.py │ ├── _online_bagging.py │ ├── _online_smooth_boost.py │ ├── _oza_boost.py │ ├── _passive_aggressive_classifier.py │ ├── _samknn.py │ ├── _sgbt.py │ ├── _sgd_classifier.py │ ├── _shrubs_classifier.py │ ├── _shrubs_ensemble.py │ ├── _srp.py │ └── _weightedknn.py │ ├── clusterers │ ├── __init__.py │ ├── _clustream.py │ ├── _clustream_with_kmeans.py │ ├── _clustree.py │ └── _denstream_with_dbscan.py │ ├── datasets │ ├── __init__.py │ ├── __main__.py │ ├── _datasets.py │ ├── _source_list.py │ ├── _utils.py │ └── downloader.py │ ├── drift │ ├── __init__.py │ ├── base_detector.py │ ├── detectors │ │ ├── __init__.py │ │ ├── abcd.py │ │ ├── abcd_components │ │ │ ├── __init__.py │ │ │ ├── feature_extraction.py │ │ │ ├── std.py │ │ │ └── windowing.py │ │ ├── adwin.py │ │ ├── cusum.py │ │ ├── ddm.py │ │ ├── 
ewma_chart.py │ │ ├── geometric_ma.py │ │ ├── hddm_a.py │ │ ├── hddm_w.py │ │ ├── page_hinkley.py │ │ ├── rddm.py │ │ ├── seed.py │ │ └── stepd.py │ └── eval_detector.py │ ├── env.py │ ├── evaluation │ ├── __init__.py │ ├── _progress_bar.py │ ├── evaluation.py │ ├── results.py │ └── visualization.py │ ├── instance.py │ ├── jar │ ├── Home.class │ └── home.java │ ├── misc.py │ ├── ocl │ ├── __init__.py │ ├── ann.py │ ├── base.py │ ├── datasets.py │ ├── evaluation.py │ ├── strategy │ │ ├── __init__.py │ │ ├── _experience_replay.py │ │ ├── _ncm.py │ │ └── _slda.py │ └── util │ │ └── data.py │ ├── prediction_interval │ ├── __init__.py │ ├── _adaptive_prediction_interval.py │ └── _mean_and_standard_deviation_estimation.py │ ├── regressor │ ├── __init__.py │ ├── _adaptive_random_forest.py │ ├── _arffimtdd.py │ ├── _fimtdd.py │ ├── _knn.py │ ├── _orto.py │ ├── _passive_aggressive_regressor.py │ ├── _sgd_regressor.py │ ├── _shrubs_regressor.py │ ├── _soknl.py │ └── _soknl_base_tree.py │ ├── splitcriteria.py │ ├── ssl │ └── classifier │ │ ├── __init__.py │ │ └── _osnn.py │ ├── stream │ ├── __init__.py │ ├── _stream.py │ ├── drift.py │ ├── generator.py │ ├── preprocessing │ │ ├── __init__.py │ │ ├── pipeline.py │ │ └── transformer.py │ └── torch.py │ └── type_alias.py ├── tasks.py └── tests ├── conftest.py ├── ocl ├── test_datasets.py └── test_strategy.py ├── test_anomaly_detectors.py ├── test_classifiers.py ├── test_datasets.py ├── test_evaluation.py ├── test_import.py ├── test_misc.py ├── test_moajar.py ├── test_pi.py ├── test_progress_bar.py ├── test_regressors.py ├── test_ssl_classifiers.py └── test_stream.py /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | !notebooks/*.ipynb -------------------------------------------------------------------------------- /.github/workflows/pr.md: -------------------------------------------------------------------------------- 1 | # Pull Request Automation Workflow 2 | Whenever a pull request is opened, a number of automated checks are run to ensure breaking changes are not introduced and to enforce consistent style and formatting. 3 | 4 | These checks are defined in the `.github/workflows/pr.yml` file, and this documentation is defined in `.github/workflows/pr.md`. 5 | 6 | If you encounter any issues, reading the error messages and trying to reproduce them locally is a good first step. 7 | Don't hesitate to ask for assistance in the pull request or join the Discord server. 8 | 9 | ## Tests 10 | 11 | This job runs formatting, linting, tests, doctests, and checks notebooks. 12 | 13 | - If the "**Tests and Doctests**" step fails, the [Adding Tests guide](https://capymoa.org/contributing/tests.html) may help. 14 | - If the "**Check Notebooks**" step fails, the [Notebooks guide](https://capymoa.org/contributing/docs.html#notebooks) may help. 15 | 16 | ## Code Style 17 | 18 | This job uses `ruff` to check the code style. If this job fails, the [Linting and Formatting guide](https://capymoa.org/contributing/vcs.html#linting-and-formatting) may help. 19 | 20 | ## Commit Style 21 | 22 | This job checks that commit messages comply with the Conventional Commits specification. If these checks fail, the [Commit Messages guide](https://capymoa.org/contributing/vcs.html#commit-messages) may help. **Don't worry too much about this check, as the reviewer can assist by squashing and merging commits with a compliant message.** 23 | 24 | ## Check Documentation 25 | 26 | This job ensures that the documentation can be built successfully. 
If this check fails, the [Documentation guide](https://capymoa.org/contributing/docs.html) may help. 27 | -------------------------------------------------------------------------------- /.github/workflows/pr.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | # TODO: In the future we ought to perform code coverage checks 5 | 6 | name: Pull Request 7 | 8 | on: 9 | push: 10 | branches: [ "main" ] 11 | pull_request: 12 | branches: [ "main" ] 13 | 14 | concurrency: 15 | group: ${{ github.workflow }}-${{ github.ref }} 16 | cancel-in-progress: true 17 | 18 | permissions: 19 | contents: read 20 | pull-requests: read 21 | 22 | env: 23 | PYTHON_VERSION: "3.10" 24 | 25 | jobs: 26 | tests: 27 | name: "Tests" 28 | timeout-minutes: 20 29 | runs-on: ubuntu-latest 30 | strategy: 31 | fail-fast: true 32 | 33 | steps: 34 | - uses: actions/checkout@v3 35 | 36 | - name: Add Job Summary 37 | run: cat .github/workflows/pr.md >$GITHUB_STEP_SUMMARY 38 | 39 | - name: Set up Python ${{ env.PYTHON_VERSION }} 40 | uses: actions/setup-python@v5 41 | with: 42 | python-version: ${{ env.PYTHON_VERSION }} 43 | cache: 'pip' # caching pip dependencies 44 | 45 | - name: Install dependencies 46 | run: | 47 | python -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu 48 | python -m pip install ".[dev]" 49 | 50 | - name: Cache Test Datasets 51 | id: cache-test-datasets 52 | uses: actions/cache@v4 53 | with: 54 | path: data 55 | key: "test-datasets" 56 | 57 | - name: PyTest 58 | run: invoke test.pytest 59 | 60 | - name: Doctest 61 | run: invoke test.doctest 62 | 63 | - name: Check Notebooks 64 | run: invoke test.nb 65 | 66 | lint: 67 | name: "Code Style" 68 | timeout-minutes: 10 69 | runs-on: ubuntu-latest 70 | steps: 71 | - uses: actions/checkout@v4 72 | - name: Formatting 73 | uses: astral-sh/ruff-action@v3 74 | with: 75 | args: "format --check" 76 | - name: Linting 77 | uses: astral-sh/ruff-action@v3 78 | 79 | commit: 80 | name: "Commit Style" 81 | runs-on: ubuntu-latest 82 | steps: 83 | - uses: actions/checkout@v4 84 | - uses: wagoid/commitlint-github-action@v6 85 | 86 | documentation: 87 | name: "Documentation" 88 | timeout-minutes: 10 89 | runs-on: ubuntu-latest 90 | steps: 91 | - uses: actions/checkout@v3 92 | - name: Set up Python 93 | uses: actions/setup-python@v5 94 | with: 95 | python-version: ${{ env.PYTHON_VERSION }} 96 | cache: 'pip' # caching pip dependencies 97 | - name: Install Dependencies 98 | run: | 99 | sudo apt-get install -y pandoc 100 | python -m pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu 101 | python -m pip install ".[dev,doc]" 102 | - name: Documentation Coverage Report 103 | run: invoke docs.coverage 104 | - name: Build Documentation 105 | run: invoke docs.build 106 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use CapyMOA, please cite it as described below." 
3 | version: 0.8.2 4 | doi: "" 5 | date-released: 2025-02-11 6 | authors: 7 | - family-names: "Gomes" 8 | given-names: "Heitor Murilo" 9 | orcid: "https://orcid.org/0000-0002-5276-637X" 10 | - family-names: "Lee" 11 | given-names: "Anton" 12 | orcid: "https://orcid.org/0009-0008-6566-7785" 13 | - family-names: "Gunasekara" 14 | given-names: "Nuwan" 15 | orcid: "https://orcid.org/0000-0002-7964-6036" 16 | - family-names: "Sun" 17 | given-names: "Yibin" 18 | orcid: "https://orcid.org/0000-0002-8325-1889" 19 | - family-names: "Cassales" 20 | given-names: "Guilherme Weigert" 21 | orcid: "https://orcid.org/0000-0003-4029-2047" 22 | - family-names: "Liu" 23 | given-names: "Justin Jia" 24 | - family-names: "Heyden" 25 | given-names: "Marco" 26 | orcid: "https://orcid.org/0000-0003-4981-709X" 27 | - family-names: "Cerqueira" 28 | given-names: "Vitor" 29 | orcid: "https://orcid.org/0000-0002-9694-8423" 30 | - family-names: "Bahri" 31 | given-names: "Maroua" 32 | orcid: "https://orcid.org/0000-0002-7420-7464" 33 | - family-names: "Koh" 34 | given-names: "Yun Sing" 35 | orcid: "https://orcid.org/0000-0001-7256-4049" 36 | - family-names: "Pfahringer" 37 | given-names: "Bernhard" 38 | orcid: "https://orcid.org/0000-0002-3732-5787" 39 | - family-names: "Bifet" 40 | given-names: "Albert" 41 | orcid: "https://orcid.org/0000-0002-8339-7773" 42 | license: "BSD-3-Clause" 43 | url: "https://github.com/adaptive-machine-learning/CapyMOA" 44 | preferred-citation: 45 | type: article 46 | authors: 47 | - family-names: "Gomes" 48 | given-names: "Heitor Murilo" 49 | orcid: "https://orcid.org/0000-0002-5276-637X" 50 | - family-names: "Lee" 51 | given-names: "Anton" 52 | orcid: "https://orcid.org/0009-0008-6566-7785" 53 | - family-names: "Gunasekara" 54 | given-names: "Nuwan" 55 | orcid: "https://orcid.org/0000-0002-7964-6036" 56 | - family-names: "Sun" 57 | given-names: "Yibin" 58 | orcid: "https://orcid.org/0000-0002-8325-1889" 59 | - family-names: "Cassales" 60 | given-names: "Guilherme Weigert" 61 | orcid: "https://orcid.org/0000-0003-4029-2047" 62 | - family-names: "Liu" 63 | given-names: "Justin Jia" 64 | - family-names: "Heyden" 65 | given-names: "Marco" 66 | orcid: "https://orcid.org/0000-0003-4981-709X" 67 | - family-names: "Cerqueira" 68 | given-names: "Vitor" 69 | orcid: "https://orcid.org/0000-0002-9694-8423" 70 | - family-names: "Bahri" 71 | given-names: "Maroua" 72 | orcid: "https://orcid.org/0000-0002-7420-7464" 73 | - family-names: "Koh" 74 | given-names: "Yun Sing" 75 | orcid: "https://orcid.org/0000-0001-7256-4049" 76 | - family-names: "Pfahringer" 77 | given-names: "Bernhard" 78 | orcid: "https://orcid.org/0000-0002-3732-5787" 79 | - family-names: "Bifet" 80 | given-names: "Albert" 81 | orcid: "https://orcid.org/0000-0002-8339-7773" 82 | title: "CapyMOA: Efficient Machine Learning for Data Streams in Python" 83 | journal: "arXiv" 84 | year: 2025 85 | doi: "10.48550/arXiv.2502.07432" -------------------------------------------------------------------------------- /CNAME: -------------------------------------------------------------------------------- 1 | capymoa.org -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2024, The CapyMOA developers 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | 1. 
Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | 2. Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | 3. Neither the name of the copyright holder nor the names of its 16 | contributors may be used to endorse or promote products derived from 17 | this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Variables 3 | NOTEBOOKS := $(wildcard notebooks/*.ipynb) 4 | 5 | test: 6 | python -m pytest --durations=0 7 | @echo "All Unit Tests Passed" 8 | @echo "Testing Notebook" 9 | python -m pytest --nbmake -n=auto notebooks 10 | @echo "All Notebook Tests Passed" 11 | 12 | download: 13 | python scripts/download_datasets_and_moa.py 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # [CapyMOA](https://capymoa.org) 2 | 3 | ![Banner Image](https://github.com/adaptive-machine-learning/CapyMOA/raw/main/docs/images/CapyMOA.jpeg) 4 | 5 | [![PyPi Version](https://img.shields.io/pypi/v/capymoa)](https://pypi.org/project/capymoa/) 6 | [![Join the Discord](https://img.shields.io/discord/1235780483845984367?label=Discord)](https://discord.gg/spd2gQJGAb) 7 | [![Documentation](https://img.shields.io/badge/docs-latest-blue)](https://capymoa.org) 8 | [![GitHub](https://img.shields.io/github/stars/adaptive-machine-learning/CapyMOA?style=social)](https://github.com/adaptive-machine-learning/CapyMOA) 9 | 10 | 11 | Machine learning library tailored for data streams. Featuring a Python API 12 | tightly integrated with MOA (**Stream Learners**), PyTorch (**Neural 13 | Networks**), and scikit-learn (**Machine Learning**). CapyMOA provides a 14 | **fast** Python interface to leverage state-of-the-art algorithms in the 15 | field of data streams. 16 | 17 | To set up CapyMOA, simply install it via pip. If you have any issues with the 18 | installation (like not having Java installed) or if you want GPU support, please 19 | refer to the [installation guide](https://capymoa.org/installation). Once installed, take a 20 | look at the [tutorials](https://capymoa.org/tutorials.html) to get started. 21 | 22 | ```bash 23 | # CapyMOA requires Java. This checks if you have it installed 24 | java -version 25 | 26 | # CapyMOA requires PyTorch. 
This installs the CPU version 27 | pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu 28 | 29 | # Install CapyMOA and its dependencies 30 | pip install capymoa 31 | 32 | # Check that the install worked 33 | python -c "import capymoa; print(capymoa.__version__)" 34 | ``` 35 | 36 | > **⚠️ WARNING** 37 | > 38 | > CapyMOA is still in the early stages of development. The API is subject to 39 | > change until version 1.0.0. If you encounter any issues, please report 40 | > them in [GitHub Issues](https://github.com/adaptive-machine-learning/CapyMOA/issues) 41 | > or talk to us on [Discord](https://discord.gg/spd2gQJGAb). 42 | 43 | --- 44 | 45 | ![Benchmark Image](https://github.com/adaptive-machine-learning/CapyMOA/raw/main/docs/images/arf100_cpu_time.png) 46 | Benchmark comparing CapyMOA against other data stream libraries. The benchmark 47 | was performed using an ensemble of 100 ARF learners trained on 48 | the `capymoa.datasets.RTG_2abrupt` dataset, containing 100,000 samples and 30 49 | features. You can find the code to reproduce this benchmark in 50 | [`notebooks/benchmarking.py`](https://github.com/adaptive-machine-learning/CapyMOA/blob/main/notebooks/benchmarking.py). 51 | *CapyMOA has the speed of MOA with the flexibility of Python and the richness of 52 | Python's data science ecosystem.* 53 | 54 | ## Cite Us 55 | 56 | If you use CapyMOA in your research, please cite us using the following BibTeX item. 57 | ``` 58 | @misc{ 59 | gomes2025capymoaefficientmachinelearning, 60 | title={{CapyMOA}: Efficient Machine Learning for Data Streams in Python}, 61 | author={Heitor Murilo Gomes and Anton Lee and Nuwan Gunasekara and Yibin Sun and Guilherme Weigert Cassales and Justin Jia Liu and Marco Heyden and Vitor Cerqueira and Maroua Bahri and Yun Sing Koh and Bernhard Pfahringer and Albert Bifet}, 62 | year={2025}, 63 | eprint={2502.07432}, 64 | archivePrefix={arXiv}, 65 | primaryClass={cs.LG}, 66 | url={https://arxiv.org/abs/2502.07432}, 67 | } 68 | ``` 69 | -------------------------------------------------------------------------------- /docker/.env: -------------------------------------------------------------------------------- 1 | CAPYMOA_VERSION="0.9.1" -------------------------------------------------------------------------------- /docker/docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | jupyter-capymoa: 3 | image: jupyter-capymoa:${CAPYMOA_VERSION} 4 | ports: 5 | - "8888:8888" 6 | build: 7 | context: ../ 8 | dockerfile: docker/dockerfile 9 | args: 10 | - CAPYMOA_VERSION=${CAPYMOA_VERSION} 11 | -------------------------------------------------------------------------------- /docker/dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/jupyter/base-notebook:latest 2 | 3 | # Install OpenJDK-8. 4 | # 5 | # * `apt-get clean` and `rm -rf /var/lib/apt/lists/*` are used to reduce the 6 | # image size by not caching the downloaded index files. 7 | # * `gcc, g++` is installed to compile dependencies that are not pre-built. 8 | USER root 9 | RUN apt-get update --yes && \ 10 | apt-get install -y --no-install-recommends gcc g++ openjdk-8-jre && \ 11 | apt-get clean && \ 12 | rm -rf /var/lib/apt/lists/* 13 | USER ${NB_UID} 14 | 15 | # Install the pinned version of the `capymoa` package. 
16 | ARG CAPYMOA_VERSION 17 | RUN python -m pip install --no-cache-dir torch torchvision \ 18 | --index-url https://download.pytorch.org/whl/cpu && \ 19 | python -m pip install --no-cache-dir capymoa==${CAPYMOA_VERSION} 20 | 21 | ENV CAPYMOA_DATASETS_DIR=${HOME}/data 22 | COPY --chown=${NB_UID} data ${HOME}/sample/data 23 | COPY --chown=${NB_UID} notebooks ${HOME}/sample/notebooks 24 | -------------------------------------------------------------------------------- /docs/_static/.placeholder: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/_static/.placeholder -------------------------------------------------------------------------------- /docs/_static/css/citation.css: -------------------------------------------------------------------------------- 1 | /* make citations muted and smaller */ 2 | .citation { 3 | font-size: var(--pst-font-size-milli); 4 | font-weight: var(--pst-font-weight-caption); 5 | color: var(--pst-color-text-muted); 6 | font-style: italic; 7 | } 8 | 9 | /* add horizontal rule */ 10 | .citation-list { 11 | margin-top: var(--pst-font-size-base); 12 | margin-bottom: var(--pst-font-size-base); 13 | padding-top: var(--pst-font-size-base); 14 | border-top: 1px solid var(--pst-color-border); 15 | } 16 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | {{ name | escape | underline}} 2 | 3 | .. currentmodule:: {{ module }} 4 | .. autoclass:: {{ objname }} 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | :special-members: __init__, __call__, __iter__, __next__ 9 | :member-order: groupwise 10 | {%- if module not in inherited_members_module_denylist %} 11 | :inherited-members: 12 | {% endif %} 13 | -------------------------------------------------------------------------------- /docs/_templates/autosummary/module.rst: -------------------------------------------------------------------------------- 1 | {{ name | escape | underline}} 2 | 3 | .. currentmodule:: {{ fullname }} 4 | 5 | .. automodule:: {{ fullname }} 6 | {% block modules %} 7 | {%- if modules %} 8 | Modules 9 | ------- 10 | 11 | .. autosummary:: 12 | :toctree: 13 | :recursive: 14 | {% for item in modules %} 15 | {{ item }} 16 | {%- endfor %} 17 | {%- endif %} 18 | {% endblock %} 19 | {% block classes %} 20 | {%- if classes %} 21 | Classes 22 | ------- 23 | 24 | .. autosummary:: 25 | :toctree: 26 | :nosignatures: 27 | {% for item in classes %} 28 | {{ item }} 29 | {%- endfor %} 30 | {%- endif %} 31 | {%- endblock %} 32 | 33 | {% block attributes %} 34 | {%- if attributes %} 35 | Module Attributes 36 | ----------------- 37 | 38 | {%- for item in attributes %} 39 | .. autodata:: {{ item }} 40 | {%- endfor %} 41 | {%- endif %} 42 | {%- endblock %} 43 | {% block functions %} 44 | {%- if functions %} 45 | Functions 46 | --------- 47 | 48 | {%- for item in functions %} 49 | .. autofunction:: {{ item }} 50 | {%- endfor %} 51 | {%- endif %} 52 | {%- endblock %} 53 | 54 | {% block exceptions %} 55 | {%- if exceptions %} 56 | Exceptions 57 | ---------- 58 | 59 | {%- for item in exceptions %} 60 | .. 
autoexception:: {{ item }} 61 | {%- endfor %} 62 | {%- endif %} 63 | {%- endblock %} 64 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | API Reference 2 | ============= 3 | 4 | Welcome to the capymoa API reference. This documentation is automatically 5 | generated from the source code and provides detailed information on the classes 6 | and functions available in capymoa. 7 | 8 | If you are looking to just use CapyMOA, you should start with the 9 | :ref:`tutorials`. 10 | 11 | Types 12 | ----- 13 | 14 | These modules provide interfaces for learners and other basic types used by 15 | capymoa. 16 | 17 | .. autosummary:: 18 | :toctree: modules 19 | :caption: Types 20 | :recursive: 21 | 22 | capymoa.base 23 | capymoa.type_alias 24 | capymoa.instance 25 | 26 | Data Streams 27 | ------------ 28 | 29 | These modules provide classes for loading and simulating data streams. They also 30 | include utilities for simulating concept drift. 31 | 32 | .. autosummary:: 33 | :toctree: modules 34 | :caption: Data Streams 35 | :recursive: 36 | 37 | capymoa.datasets 38 | capymoa.stream 39 | 40 | Problem Settings 41 | ---------------- 42 | 43 | These modules provide classes for defining machine learning problem settings. 44 | 45 | .. autosummary:: 46 | :toctree: modules 47 | :caption: Problem Settings 48 | :recursive: 49 | 50 | capymoa.classifier 51 | capymoa.regressor 52 | capymoa.anomaly 53 | capymoa.ssl 54 | capymoa.ocl 55 | capymoa.drift 56 | capymoa.clusterers 57 | capymoa.automl 58 | 59 | Evaluation 60 | ---------- 61 | 62 | These modules provide classes for evaluating learners. 63 | 64 | .. autosummary:: 65 | :toctree: modules 66 | :caption: Evaluation 67 | :recursive: 68 | 69 | capymoa.evaluation 70 | capymoa.prediction_interval 71 | 72 | Miscellaneous 73 | ------------- 74 | 75 | These modules provide miscellaneous utilities. 76 | 77 | .. autosummary:: 78 | :toctree: modules 79 | :caption: Miscellaneous 80 | :recursive: 81 | 82 | capymoa.ann 83 | capymoa.splitcriteria 84 | capymoa.misc 85 | capymoa.env 86 | 87 | Functions 88 | --------- 89 | 90 | .. automodule:: capymoa 91 | :members: -------------------------------------------------------------------------------- /docs/contributing/index.rst: -------------------------------------------------------------------------------- 1 | Contributing 2 | ============ 3 | 4 | This part of the documentation is for developers and contributors. 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | learners 10 | tests 11 | docs 12 | vcs 13 | updating_moajar 14 | -------------------------------------------------------------------------------- /docs/contributing/tests.md: -------------------------------------------------------------------------------- 1 | # Adding Tests 2 | 3 | Ensure you have installed the development dependencies by following the instructions 4 | in the [installation guide](../installation.rst). To run all tests, use the following command: 5 | 6 | ```bash
7 | invoke test 8 | ``` 9 | 10 | ## PyTest 11 | 12 | Tests can be added to the ``tests`` directory. PyTest will automatically discover 13 | and run these tests. They should be named ``test_*.py``, and the test functions 14 | should be named ``test_*``. See the [PyTest documentation](https://docs.pytest.org) 15 | for more information. 16 | 17 | Use PyTest-style tests for parameterized tests, tests that require fixtures, 18 | and tests that require setup. 
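For example, a minimal parameterized test might look like the following sketch; the learners, dataset, and instance budget here are illustrative choices rather than a required pattern:

```python
import pytest

from capymoa.classifier import HoeffdingTree, NaiveBayes
from capymoa.datasets import ElectricityTiny


@pytest.mark.parametrize("learner_class", [HoeffdingTree, NaiveBayes])
def test_classifier_smoke(learner_class):
    # Test-then-train loop over a small number of instances.
    stream = ElectricityTiny()
    learner = learner_class(schema=stream.get_schema())
    for _ in range(100):
        instance = stream.next_instance()
        learner.predict(instance)  # predict before training on the instance
        learner.train(instance)
```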
19 | 20 | These tests can be run with: 21 | 22 | ```bash 23 | pytest 24 | ``` 25 | 26 | Or, to run a specific test: 27 | 28 | ```bash 29 | pytest tests/test_*.py 30 | ``` 31 | 32 | Or to run with the same configuration as continuous integration: 33 | 34 | ```bash 35 | invoke test.pytest 36 | ``` 37 | 38 | ## Doctest 39 | 40 | [Doctest](https://docs.python.org/3/library/doctest.html) allows you to write 41 | tests directly in the docstrings of your code, making it easier to keep documentation 42 | up-to-date. The tests are written as examples in a Python interactive shell. 43 | 44 | Use doctest-style tests to document code with simple, tested examples. 45 | 46 | Here's an example of a function with a doctest: 47 | 48 | ```python 49 | def hello_world(): 50 | """ 51 | >>> hello_world() 52 | Hello, World! 53 | """ 54 | print("Hello, World!") 55 | ``` 56 | 57 | You can run this test with: 58 | 59 | ```bash 60 | pytest --doctest-modules -k src 61 | ``` 62 | 63 | The `-k src` flag tells PyTest to run tests only in the `src` directory. This is useful 64 | if you only want to run doctests in source code but not PyTest tests. 65 | 66 | Alternatively, you can run all doctests with the same configuration as continuous integration: 67 | 68 | ```bash 69 | invoke test.doctest 70 | ``` 71 | -------------------------------------------------------------------------------- /docs/contributing/updating_moajar.md: -------------------------------------------------------------------------------- 1 | # Updating CapyMOA's `moa.jar` version 2 | 3 | This document describes how to change the version of MOA that the CapyMOA 4 | project uses. **It is only intended for developers who are contributing to 5 | CapyMOA**. Before you start, make sure you have the following: 6 | 7 | * You have installed the development dependencies and have an editable install 8 | of CapyMOA. If you have not, follow the instructions in the [installation guide](../installation.rst). 9 | 10 | * You must **NOT** have set the environment variables that would 11 | override the default `moa.jar` location. `CAPYMOA_MOA_JAR` must be unset. 12 | 13 | ## Refreshing the `moa.jar` 14 | 15 | When a developer wants to replace the `moa.jar` in their local CapyMOA as a 16 | consequence of pulling or rebasing changes, they can run the following command: 17 | 18 | ```console 19 | python -m invoke refresh-moa 20 | ``` 21 | 22 | ## Changing Project MOA Version 23 | 24 | When a developer needs to update the version of MOA that the capymoa project uses, 25 | they need to follow these steps: 26 | 27 | 1. **Upload the new version of MOA to the CapyMOA Dropbox.** 28 | 29 | Please name the new version with the date, e.g. `240412_moa.jar` 30 | (`yymmdd_moa.jar`) so we may roll back easily if needed. 31 | 2. **Update `invoke.yml`'s `moa_url` to point to the new version of MOA.** 32 | 33 | This file tells capymoa where to download the `moa.jar` from during the 34 | packaging process. 35 | * Must be the complete URL. 36 | 3. **Remove the old `moa.jar` with `python -m invoke refresh-moa`.** 37 | 4. **Update `tests/test_moajar.py` with the updated sha256 hash.** 38 | * macOS: ```shasum -a 256 moa_jar_file.jar``` 39 | * Linux: ```sha256sum moa_jar_file.jar``` 40 | 41 | `tests/test_moajar.py` is used to avoid using an outdated version of MOA by 42 | mistake. It will generate a warning for the user if the hash of the `moa.jar` 43 | does not match the one in the file. 
The tests on GitHub Actions will fail 44 | if the hash does not match the file downloaded from the URL in `invoke.yml`. 45 | 46 | 5. **Verify that capymoa is pointing to the new version of MOA by running:** 47 | 48 | ```console 49 | $ python -c "import capymoa; capymoa.about()" 50 | CapyMOA 0.2.0 51 | CAPYMOA_DATASETS_DIR: ... 52 | CAPYMOA_MOA_JAR: .../CapyMOA/src/capymoa/jar/moa.jar 53 | CAPYMOA_JVM_ARGS: ['-Xmx8g', '-Xss10M'] 54 | JAVA_HOME: ... 55 | MOA version: A SHA256 hash of the actual moa.jar 56 | JAVA version: ... 57 | ``` 58 | 59 | In particular, check if the hash matches the one you calculated in the 60 | previous step. If it does not, you should double-check the URL in 61 | `invoke.yml` and re-run `python -m invoke build.clean-moa` and `python -m 62 | invoke build.download-moa`. 63 | 64 | 6. Your pull request should include the changes to `invoke.yml` and 65 | `tests/test_moajar.py`. 66 | -------------------------------------------------------------------------------- /docs/contributing/vcs.md: -------------------------------------------------------------------------------- 1 | # Version Control 2 | 3 | This document outlines the version control practices used in the CapyMOA project. 4 | 5 | ## Linting and Formatting 6 | 7 | Linting is the process of automatically checking code for style, syntax, and other issues. 8 | Code formatting ensures code meets a consistent style. 9 | Together they help ensure the codebase is consistent and clean. 10 | Developers don't have to worry about formatting, and reviewers can focus on code rather than style preferences. 11 | 12 | CapyMOA uses the [ruff](https://astral.sh/ruff) linter to enforce both. 13 | Checks are run automatically using GitHub Actions on every pull request. 14 | 15 | You will need to run `ruff` locally before committing changes. 16 | Ruff is installed as part of the [development dependencies](../installation.rst). 17 | 18 | To format files, run: 19 | ```bash 20 | ruff format # or python -m invoke format 21 | ``` 22 | 23 | To lint files, run: 24 | ```bash 25 | ruff check # or python -m invoke lint 26 | ``` 27 | 28 | Furthermore, `python -m invoke commit` will run the linter and check that 29 | formatting is correct before committing. 30 | 31 | 32 | ## Commit Messages 33 | 34 | **tldr; Run `python -m invoke commit` (or `invoke commit`, `python -m commitizen commit`) to commit changes.** (Requires that you've [installed the optional development dependencies](../installation.rst).) 35 | 36 | CapyMOA uses conventional commit messages to streamline the release process. 37 | 38 | > "The Conventional Commits specification is a lightweight convention on top of 39 | > commit messages. It provides an easy set of rules for creating an explicit 40 | > commit history; which makes it easier to write automated tools on top of. 41 | > This convention dovetails with SemVer, by describing the features, fixes, 42 | > and breaking changes made in commit messages." -- [conventionalcommits.org](https://www.conventionalcommits.org/en/v1.0.0/#summary) 43 | 44 | Conventional commits are structured as follows: 45 | 46 | <type>[optional scope]: <description> 47 | 48 | or 49 | 50 | <type>[optional scope]: <description> 51 | 52 | [optional body] 53 | 54 | [optional footer(s)] 55 | 56 | Here are some basic examples: 57 | 58 | docs: correct spelling of CHANGELOG 59 | 60 | 61 | feat(lang): add Polish language 62 | 63 | Where: 64 | 65 | * `<type>` is one of 66 | * `feat`: New feature. **Will increment the MINOR version number.** 67 | * `fix`: Bug fix. 
**Will increment the PATCH version number.** 68 | * `build`: Changes that affect the build system or external dependencies 69 | * `chore`: Repetitive tasks such as updating dependencies 70 | * `ci`: Changes to continuous integration configuration files and scripts 71 | * `docs`: Documentation changes 72 | * `perf`: Performance improvement 73 | * `refactor`: Code changes that neither fix a bug nor add a feature 74 | * `revert`: Revert a previous commit 75 | * `style`: Changes that do not affect the meaning of the code (white-space, formatting, missing semi-colons, etc.) 76 | * `test`: Adding missing tests or correcting existing tests 77 | * `[optional scope]` is a module or component affected by the commit. A top-level Python module is a good example of a scope: 78 | * `classifier` 79 | * `datasets` 80 | * `stream` 81 | * etc. 82 | 83 | It's okay to leave out the scope if it's not obvious or not applicable. 84 | 85 | * `<description>` This should be a short, concise lowercase description of the change in the imperative mood (e.g. "add ...", "change ...", "fix ...", "remove ..."). 86 | 87 | ## Breaking Changes 88 | 89 | If the API changes in a way that is not backwards-compatible, the commit message 90 | should include a `!` after the type/scope, e.g. `feat(classifiers)!: ...`. 91 | 92 | You can and probably should include more information in the body and footer of 93 | the commit message to explain the breaking change. See [conventionalcommits.org](https://www.conventionalcommits.org/en/v1.0.0/) for more information. 94 | 95 | chore!: drop support for python 3.9 96 | 97 | BREAKING CHANGE: use Python features only available in 3.10 98 | -------------------------------------------------------------------------------- /docs/docker.md: -------------------------------------------------------------------------------- 1 | # Docker 2 | 3 | CapyMOA provides a Docker image containing a ready-to-go Jupyter 4 | notebook. If you require a reproducible environment or want to experiment with 5 | CapyMOA and have experience with Docker, this is the best place to begin. 6 | 7 | Before you begin, you must have Docker installed. You can download and install 8 | Docker from the [official website](https://docs.docker.com/get-docker/). 9 | 10 | ![Image of CapyMOA Jupyter Notebook](images/docker_demo.png) 11 | 12 | 13 | ## Example 14 | 15 | ```bash 16 | mkdir data work 17 | docker run \ 18 | -p 8888:8888 \ 19 | -v ./work:/home/jovyan/work \ 20 | -v ./data:/home/jovyan/data \ 21 | tachyonic/jupyter-capymoa 22 | ``` 23 | 24 | With the container running, you can access the Jupyter notebook interface by 25 | visiting `http://<hostname>:8888/?token=<token>` in a browser, where: 26 | 27 | * The hostname is the name of the computer running Docker. Usually, this is 28 | `localhost`. 29 | * The token is the secret token printed in the console. 30 | 31 | Once you have accessed the Jupyter notebook interface, you can play with the sample datasets and notebooks provided in the `sample/notebooks` directory. 32 | 33 | ### Notes 34 | 35 | * [`-p, --port`](https://docs.docker.com/engine/reference/run/#exposed-ports) maps 36 | port 8888 in the container to port 8888 on the host machine. If you change 37 | the port on the host machine, make sure to change the port in the URL above. 38 | * The [`-v`](https://docs.docker.com/storage/bind-mounts/) flag lets the container 39 | access the `./work` and `./data` directories on the host machine as 40 | `/home/jovyan/work` and `/home/jovyan/data` in the container, respectively. 
41 | You can change the host directories to any directory on your machine. 42 | 43 | 44 | ## Directory Structure 45 | 46 | * `/home/jovyan`: The home directory for the default user in the container. 47 | * `sample`: Contains sample datasets and notebooks. 48 | * `data`: Contains sample datasets. 49 | * `notebooks`: Contains sample notebooks. 50 | * `work`: An empty directory where you can mount your work. 51 | * `data`: The default directory where datasets will be downloaded to. You 52 | can mount a directory to this location to access datasets from your host 53 | machine and avoid downloading them multiple times. 54 | 55 | ## Environment Variables 56 | 57 | See the [Environment Variables](https://capymoa.org/api/env.html#module-capymoa.env) 58 | section for a list of environment variables that can be set to configure the tool. 59 | You can set these environment variables using the [`-e, --env`](https://docs.docker.com/reference/cli/docker/container/run/#env) flag when running the Docker container. For example, 60 | to change the JVM arguments via `CAPYMOA_JVM_ARGS`: 61 | 62 | ```bash 63 | docker run -p 8888:8888 -e CAPYMOA_JVM_ARGS="-Xmx4g" tachyonic/jupyter-capymoa 64 | ``` 65 | 66 | 67 | ## Docker Compose Example 68 | 69 | You can also use [Docker Compose](https://docs.docker.com/compose/) to remove the need to remember the command-line arguments. Create a `docker-compose.yml` file with the following contents: 70 | 71 | ```yaml 72 | services: 73 | jupyter-capymoa: 74 | image: tachyonic/jupyter-capymoa 75 | ports: 76 | - "8888:8888" 77 | volumes: 78 | - ./work:/home/jovyan/work 79 | - ./data:/home/jovyan/data 80 | ``` 81 | 82 | Then run the following command: 83 | 84 | ```bash 85 | docker-compose up 86 | ``` 87 | 88 | Just like the earlier example, you can access the Jupyter notebook interface by visiting `http://<hostname>:8888/?token=<token>` in a browser. 
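Whichever way you start the container, you can confirm the configuration took effect from inside a notebook cell. `capymoa.about()` (defined in `src/capymoa/_prepare_jpype.py`, shown later in this listing) prints the active settings, including `CAPYMOA_JVM_ARGS`:

```python
# Run inside a notebook cell in the container.
import capymoa

capymoa.about()  # prints CAPYMOA_DATASETS_DIR, CAPYMOA_MOA_JAR, CAPYMOA_JVM_ARGS, ...
```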
89 | -------------------------------------------------------------------------------- /docs/images/CapyMOA.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/CapyMOA.jpeg -------------------------------------------------------------------------------- /docs/images/arf100_cpu_time.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/arf100_cpu_time.png -------------------------------------------------------------------------------- /docs/images/arf100_cpu_time_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/arf100_cpu_time_dark.png -------------------------------------------------------------------------------- /docs/images/benchmark_20240422_221824_performance_plot_wallclock.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/benchmark_20240422_221824_performance_plot_wallclock.png -------------------------------------------------------------------------------- /docs/images/docker_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/docker_demo.png -------------------------------------------------------------------------------- /docs/images/profiles/anton_lee.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/profiles/anton_lee.jpg -------------------------------------------------------------------------------- /docs/images/profiles/guilherme_weigert_cassales.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/profiles/guilherme_weigert_cassales.jpg -------------------------------------------------------------------------------- /docs/images/profiles/heitor_murilo_gomes.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/profiles/heitor_murilo_gomes.jpg -------------------------------------------------------------------------------- /docs/images/profiles/justin_liu.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/profiles/justin_liu.jpg -------------------------------------------------------------------------------- /docs/images/profiles/marco_heyden.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/profiles/marco_heyden.jpg 
-------------------------------------------------------------------------------- /docs/images/profiles/nuwan_gunasekara.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/profiles/nuwan_gunasekara.jpg -------------------------------------------------------------------------------- /docs/images/profiles/vitor_cerqueira.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/profiles/vitor_cerqueira.jpg -------------------------------------------------------------------------------- /docs/images/profiles/yibin_sun.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/docs/images/profiles/yibin_sun.jpg -------------------------------------------------------------------------------- /docs/tutorials.rst: -------------------------------------------------------------------------------- 1 | .. _tutorials: 2 | 3 | Tutorials 4 | ========= 5 | These tutorials will show you how to get started with the CapyMOA library. 6 | 7 | 8 | .. toctree:: 9 | :maxdepth: 1 10 | :caption: Tutorials: 11 | 12 | notebooks/00_getting_started.ipynb 13 | notebooks/01_evaluation.ipynb 14 | notebooks/02_sklearn.ipynb 15 | notebooks/03_pytorch.ipynb 16 | notebooks/04_drift_streams.ipynb 17 | notebooks/05_new_learner.ipynb 18 | notebooks/06_advanced_API.ipynb 19 | notebooks/07_pipelines.ipynb 20 | notebooks/08_prediction_interval.ipynb 21 | notebooks/09_automl.ipynb 22 | notebooks/10_ocl.ipynb 23 | 24 | .. 
toctree:: 25 | :maxdepth: 1 26 | :caption: Extra Tutorials: 27 | 28 | notebooks/SSL_example.ipynb 29 | notebooks/anomaly_detection.ipynb 30 | notebooks/drift_detection.ipynb 31 | notebooks/parallel_ensembles.ipynb 32 | notebooks/save_and_load_model.ipynb 33 | notebooks/clustering.ipynb 34 | 35 | Talks 36 | ===== 37 | 38 | Talks and workshops on CapyMOA: 39 | 40 | * `ECML/PAKDD 2024 "Navigating Complex Machine Learning Challenges in Streaming Data - A Hands-On Tutorial" `_ 41 | * `Kiwi PyCon 2024 "Data Stream AI" `_ 42 | * IJCAI 2024 "Machine Learning for Streaming Data" 43 | * PAKDD 2024 "Machine Learning for Streaming Data" 44 | * KDD 2024 "Practical Machine Learning for Streaming Data" 45 | -------------------------------------------------------------------------------- /docs/util/github_link.py: -------------------------------------------------------------------------------- 1 | """Code taken from: 2 | https://github.com/scikit-learn/scikit-learn/blob/8721245511de2f225ff5f9aa5f5fadce663cd4a3/doc/sphinxext/github_link.py 3 | """ 4 | 5 | import inspect 6 | import os 7 | import subprocess 8 | import sys 9 | from functools import partial 10 | from operator import attrgetter 11 | 12 | REVISION_CMD = "git rev-parse --short HEAD" 13 | 14 | 15 | def _get_git_revision(): 16 | try: 17 | revision = subprocess.check_output(REVISION_CMD.split()).strip() 18 | except (subprocess.CalledProcessError, OSError): 19 | print("Failed to execute git to get revision") 20 | return None 21 | return revision.decode("utf-8") 22 | 23 | 24 | def _linkcode_resolve(domain, info, package, url_fmt, revision): 25 | """Determine a link to online source for a class/method/function 26 | 27 | This is called by sphinx.ext.linkcode 28 | 29 | An example with a long-untouched module that everyone has 30 | >>> _linkcode_resolve('py', {'module': 'tty', 31 | ... 'fullname': 'setraw'}, 32 | ... package='tty', 33 | ... url_fmt='https://hg.python.org/cpython/file/' 34 | ... '{revision}/Lib/{package}/{path}#L{lineno}', 35 | ... 
revision='xxxx') 36 | 'https://hg.python.org/cpython/file/xxxx/Lib/tty/tty.py#L18' 37 | """ 38 | if revision is None: 39 | return 40 | if domain not in ("py", "pyx"): 41 | return 42 | if not info.get("module") or not info.get("fullname"): 43 | return 44 | 45 | class_name = info["fullname"].split(".")[0] 46 | module = __import__(info["module"], fromlist=[class_name]) 47 | 48 | try: 49 | obj = attrgetter(info["fullname"])(module) 50 | except AttributeError: 51 | return 52 | 53 | # Unwrap the object to get the correct source 54 | # file in case that is wrapped by a decorator 55 | obj = inspect.unwrap(obj) 56 | 57 | try: 58 | fn = inspect.getsourcefile(obj) 59 | except Exception: 60 | fn = None 61 | if not fn: 62 | try: 63 | fn = inspect.getsourcefile(sys.modules[obj.__module__]) 64 | except Exception: 65 | fn = None 66 | if not fn: 67 | return 68 | 69 | fn = os.path.relpath(fn, start=os.path.dirname(__import__(package).__file__)) 70 | try: 71 | lineno = inspect.getsourcelines(obj)[1] 72 | except Exception: 73 | lineno = "" 74 | return url_fmt.format(revision=revision, package=package, path=fn, lineno=lineno) 75 | 76 | 77 | def make_linkcode_resolve(package, url_fmt): 78 | """Returns a linkcode_resolve function for the given URL format 79 | 80 | revision is a git commit reference (hash or name) 81 | 82 | package is the name of the root module of the package 83 | 84 | url_fmt is along the lines of ('https://github.com/USER/PROJECT/' 85 | 'blob/{revision}/{package}/' 86 | '{path}#L{lineno}') 87 | """ 88 | revision = _get_git_revision() 89 | return partial( 90 | _linkcode_resolve, revision=revision, package=package, url_fmt=url_fmt 91 | ) 92 | -------------------------------------------------------------------------------- /invoke.yml: -------------------------------------------------------------------------------- 1 | # This file contains the configuration for the invoke command 2 | 3 | moa_path: "src/capymoa/jar/moa.jar" 4 | 5 | # DROPBOX: When using a dropbox link, ensure that the dl=1 query parameter is 6 | # present in the link. This ensures the file is downloaded directly instead 7 | # of going to the dropbox page 8 | moa_url: "https://www.dropbox.com/scl/fi/pdo2eqrmx04hdgduwy6qa/250328_moa.jar?rlkey=k1jyudsqlm89fzemndw75u4rn&st=9s4zpubi&dl=1" 9 | 10 | # What notebooks to skip when running them as tests. 11 | # YOU SHOULD NOT SKIP NOTEBOOKS! 12 | # Your notebook will eventually break and we need to know when it does. 
13 | test_skip_notebooks: 14 | - notebooks/anomaly_detection.ipynb 15 | - notebooks/06_advanced_API.ipynb 16 | -------------------------------------------------------------------------------- /notebooks/ClusTree_clustering_evolution.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/notebooks/ClusTree_clustering_evolution.gif -------------------------------------------------------------------------------- /notebooks/Clustream_clustering_evolution.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/notebooks/Clustream_clustering_evolution.gif -------------------------------------------------------------------------------- /notebooks/Clustream_with_Kmeans_clustering_evolution.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/notebooks/Clustream_with_Kmeans_clustering_evolution.gif -------------------------------------------------------------------------------- /notebooks/DeNSTReaM_clustering_custom_name.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/notebooks/DeNSTReaM_clustering_custom_name.gif -------------------------------------------------------------------------------- /notebooks/settings_autoclass.json: -------------------------------------------------------------------------------- 1 | { 2 | 3 | "windowSize" : 1000, 4 | "ensembleSize" : 10, 5 | "newConfigurations" : 10, 6 | "keepCurrentModel" : true, 7 | "lambda" : 0.05, 8 | "preventAlgorithmDeath" : true, 9 | "keepGlobalIncumbent" : true, 10 | "keepAlgorithmIncumbents" : true, 11 | "keepInitialConfigurations" : true, 12 | "useTestEnsemble" : true, 13 | "resetProbability" : 0.01, 14 | "numberOfCores" : 1, 15 | "performanceMeasureMaximisation": true, 16 | 17 | "algorithms": [ 18 | { 19 | "algorithm": "moa.classifiers.lazy.kNN", 20 | "parameters": [ 21 | {"parameter": "k", "type":"integer", "value":10, "range":[2,30]} 22 | ] 23 | } 24 | , 25 | { 26 | "algorithm": "moa.classifiers.trees.HoeffdingTree", 27 | "parameters": [ 28 | {"parameter": "g", "type":"integer", "value":200, "range":[10, 200]}, 29 | {"parameter": "c", "type":"float", "value":0.01, "range":[0, 1]} 30 | ] 31 | } 32 | , 33 | { 34 | "algorithm": "moa.classifiers.lazy.kNNwithPAWandADWIN", 35 | "parameters": [ 36 | {"parameter": "k", "type":"integer", "value":10, "range":[2,30]} 37 | ] 38 | } 39 | , 40 | { 41 | "algorithm": "moa.classifiers.trees.HoeffdingAdaptiveTree", 42 | "parameters": [ 43 | {"parameter": "g", "type":"integer", "value":200, "range":[10, 200]}, 44 | {"parameter": "c", "type":"float", "value":0.01, "range":[0, 1]} 45 | ] 46 | } 47 | 48 | ] 49 | } 50 | -------------------------------------------------------------------------------- /notebooks/under_construction.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/notebooks/under_construction.jpeg -------------------------------------------------------------------------------- 
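The `settings_autoclass.json` configuration above defines the AutoClass search space: the window and ensemble sizes, and the candidate MOA learners with their tunable parameter ranges. Below is a hedged usage sketch of how such a file is consumed; the `AutoClass` signature appears in `src/capymoa/automl/_autoclass.py` at the end of this listing, while the dataset choice and the repo-root-relative path are illustrative assumptions:

```python
from capymoa.automl import AutoClass
from capymoa.datasets import ElectricityTiny

stream = ElectricityTiny()
automl = AutoClass(
    schema=stream.get_schema(),
    configuration_json="notebooks/settings_autoclass.json",  # assumes running from the repo root
)
while stream.has_more_instances():
    instance = stream.next_instance()
    automl.predict(instance)  # test-then-train: predict before training
    automl.train(instance)
```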
/notebooks/util/nbmock.py: -------------------------------------------------------------------------------- 1 | """The nbmock module provides support for mocking datasets to speed up testing.""" 2 | 3 | from os import environ 4 | 5 | 6 | def mock_datasets(): 7 | """Mock the datasets to use the tiny versions for testing.""" 8 | import unittest.mock as mock 9 | from capymoa.datasets import ElectricityTiny, CovtypeTiny, FriedTiny 10 | from capymoa.ocl.datasets import TinySplitMNIST 11 | 12 | mock.patch("capymoa.datasets.Electricity", ElectricityTiny).start() 13 | mock.patch("capymoa.datasets.Covtype", CovtypeTiny).start() 14 | mock.patch("capymoa.datasets.Fried", FriedTiny).start() 15 | mock.patch("capymoa.ocl.datasets.SplitMNIST", TinySplitMNIST).start() 16 | 17 | 18 | def is_nb_fast() -> bool: 19 | """Should the notebook be run with faster settings. 20 | 21 | Some notebooks are slow to run because they use large datasets and run 22 | for many iterations. This is good for documentation purposes but not for 23 | testing. This function returns True if the notebook should be run with 24 | faster settings. 25 | 26 | Care should be taken to hide cells in capymoa.org that are meant for testing 27 | only. This is done by adding ``"nbsphinx": "hidden"`` to the cell metadata. 28 | See: https://nbsphinx.readthedocs.io/en/0.9.3/hidden-cells.html 29 | """ 30 | return bool(environ.get("NB_FAST", False)) 31 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/src/__init__.py -------------------------------------------------------------------------------- /src/capymoa/__about__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.9.1" 2 | -------------------------------------------------------------------------------- /src/capymoa/__init__.py: -------------------------------------------------------------------------------- 1 | """Machine learning library tailored for data streams.""" 2 | 3 | from ._prepare_jpype import _start_jpype, about 4 | from .__about__ import __version__ 5 | 6 | # It is important that this is called before importing any other module 7 | _start_jpype() 8 | 9 | from . import stream # noqa Module imported here to ensure that jpype has been started 10 | 11 | 12 | __all__ = [ 13 | "about", 14 | "__version__", 15 | "stream", 16 | ] 17 | -------------------------------------------------------------------------------- /src/capymoa/_prepare_jpype.py: -------------------------------------------------------------------------------- 1 | # Python imports 2 | import jpype 3 | import jpype.imports 4 | import os 5 | from pathlib import Path 6 | from hashlib import sha256 7 | import subprocess 8 | from .__about__ import __version__ 9 | from .env import ( 10 | capymoa_jvm_args, 11 | capymoa_moa_jar, 12 | capymoa_datasets_dir, 13 | ) 14 | 15 | _CAPYMOA_PACKAGE_ROOT = Path(__file__).parent 16 | 17 | 18 | class CapymoaImportError(RuntimeError): 19 | pass 20 | 21 | 22 | def _get_java_home() -> Path: 23 | """Find java home. 24 | 25 | Respects the JAVA_HOME environment variable if it is set, otherwise tries to 26 | find the java home by running a special java program that prints it. 
27 | """ 28 | 29 | if "JAVA_HOME" in os.environ: 30 | java_home = Path(os.environ["JAVA_HOME"]) 31 | 32 | if not java_home.exists(): 33 | raise CapymoaImportError( 34 | f"The JAVA_HOME (`{java_home}`) environment variable is set, " 35 | "but the path does not exist." 36 | ) 37 | else: 38 | # We can find the java home by asking a special java program to print it for us 39 | java_class_path = _CAPYMOA_PACKAGE_ROOT / "jar" 40 | try: 41 | result = subprocess.run( 42 | ["java", "-classpath", java_class_path.as_posix(), "Home"], 43 | capture_output=True, 44 | ) 45 | except FileNotFoundError: 46 | raise CapymoaImportError( 47 | "Java not found ensure `java -version` runs successfully. " 48 | "Alternatively, you may set the JAVA_HOME environment variable to the " 49 | "path of your Java installation for non-standard installations." 50 | ) 51 | 52 | java_home = Path(result.stdout.decode().strip()) 53 | 54 | assert java_home.exists(), ( 55 | f"The java.home reported by the java program does not exist: {java_home}" 56 | ) 57 | 58 | return java_home 59 | 60 | 61 | def _moa_hash(): 62 | with open(capymoa_moa_jar(), "rb") as f: 63 | return sha256(f.read()).hexdigest() 64 | 65 | 66 | def about(): 67 | """Print useful debug information about the CapyMOA setup. 68 | 69 | >>> import capymoa 70 | >>> capymoa.about() # doctest: +ELLIPSIS 71 | CapyMOA ... 72 | """ 73 | java_version = jpype.java.lang.System.getProperty("java.version") 74 | print(f"CapyMOA {__version__}") 75 | print(f" CAPYMOA_DATASETS_DIR: {capymoa_datasets_dir()}") 76 | print(f" CAPYMOA_MOA_JAR: {capymoa_moa_jar()}") 77 | print(f" CAPYMOA_JVM_ARGS: {capymoa_jvm_args()}") 78 | print(f" JAVA_HOME: {_get_java_home()}") 79 | print(f" MOA version: {_moa_hash()}") 80 | print(f" JAVA version: {java_version}") 81 | 82 | 83 | def _start_jpype(): 84 | # If it has already been started, we don't need to start it again 85 | if jpype.isJVMStarted(): 86 | return 87 | 88 | # Jpype is looking for the JAVA_HOME environment variable. 89 | os.environ["JAVA_HOME"] = _get_java_home().as_posix() 90 | 91 | # Add the MOA jar to the classpath 92 | moa_jar = capymoa_moa_jar() 93 | if not (moa_jar.exists() and moa_jar.is_file()): 94 | raise CapymoaImportError(f"MOA jar not found at `{moa_jar}`.") 95 | jpype.addClassPath(moa_jar) 96 | 97 | # Start the JVM 98 | jpype.startJVM(jpype.getDefaultJVMPath(), *capymoa_jvm_args()) 99 | 100 | # The JVM automatically shutdown with python, no need to explicitly call the shutdown method 101 | # https://jpype.readthedocs.io/en/latest/userguide.html#shutdownjvm 102 | -------------------------------------------------------------------------------- /src/capymoa/ann/__init__.py: -------------------------------------------------------------------------------- 1 | """Artificial Neural Networks for CapyMOA.""" 2 | 3 | from ._perceptron import Perceptron 4 | 5 | __all__ = [ 6 | "Perceptron", 7 | ] 8 | -------------------------------------------------------------------------------- /src/capymoa/ann/_perceptron.py: -------------------------------------------------------------------------------- 1 | from capymoa.stream._stream import Schema 2 | from torch import nn 3 | from torch import Tensor 4 | 5 | 6 | class Perceptron(nn.Module): 7 | """A simple feedforward neural network with one hidden layer.""" 8 | 9 | def __init__(self, schema: Schema, hidden_size: int = 50): 10 | """Initialize the model. 11 | 12 | :param schema: Schema describing the data types and shapes. 13 | :param hidden_size: Number of hidden units in the first layer. 
14 | """ 15 | super(Perceptron, self).__init__() 16 | in_features = schema.get_num_attributes() 17 | out_features = schema.get_num_classes() 18 | self._fc1 = nn.Linear(in_features, hidden_size) 19 | self._relu = nn.ReLU() 20 | self._fc2 = nn.Linear(hidden_size, out_features) 21 | 22 | def forward(self, x: Tensor) -> Tensor: 23 | """Forward pass through the network. 24 | 25 | :param x: Input tensor of shape ``(batch_size, num_features)``. 26 | :return: Output tensor of shape ``(batch_size, num_classes)``. 27 | """ 28 | x = self._fc1(x) 29 | x = self._relu(x) 30 | x = self._fc2(x) 31 | return x 32 | -------------------------------------------------------------------------------- /src/capymoa/anomaly/__init__.py: -------------------------------------------------------------------------------- 1 | from ._half_space_trees import HalfSpaceTrees 2 | from ._online_isolation_forest import OnlineIsolationForest 3 | from ._autoencoder import Autoencoder 4 | from ._stream_rhf import StreamRHF 5 | 6 | __all__ = [ 7 | "HalfSpaceTrees", 8 | "OnlineIsolationForest", 9 | "Autoencoder", 10 | "StreamRHF", 11 | ] 12 | -------------------------------------------------------------------------------- /src/capymoa/anomaly/_half_space_trees.py: -------------------------------------------------------------------------------- 1 | from capymoa.base import ( 2 | MOAAnomalyDetector, 3 | ) 4 | 5 | from moa.classifiers.oneclass import HSTrees as _MOA_HSTrees 6 | 7 | 8 | class HalfSpaceTrees(MOAAnomalyDetector): 9 | """Half-Space Trees 10 | 11 | This class implements the Half-Space Trees (HS-Trees) algorithm, which is 12 | an ensemble anomaly detector capable of adapting to concept drift. 13 | 14 | HS-Trees is implemented in MOA (Massive Online Analysis) and provides several 15 | parameters for customization. 16 | 17 | References: 18 | 19 | `Fast anomaly detection for streaming data. 20 | Swee Chuan Tan, Kai Ming Ting and Tony Fei Liu. 21 | International joint conference on artificial intelligence (IJCAI), 106, 1469-1495, 2017. 22 | `_ 23 | 24 | Example: 25 | 26 | >>> from capymoa.datasets import ElectricityTiny 27 | >>> from capymoa.anomaly import HalfSpaceTrees 28 | >>> from capymoa.evaluation import AnomalyDetectionEvaluator 29 | >>> stream = ElectricityTiny() 30 | >>> schema = stream.get_schema() 31 | >>> learner = HalfSpaceTrees(schema) 32 | >>> evaluator = AnomalyDetectionEvaluator(schema) 33 | >>> while stream.has_more_instances(): 34 | ... instance = stream.next_instance() 35 | ... proba = learner.score_instance(instance) 36 | ... evaluator.update(instance.y_index, proba) 37 | ... learner.train(instance) 38 | >>> auc = evaluator.auc() 39 | >>> print(f"AUC: {auc:.2f}") 40 | AUC: 0.54 41 | 42 | """ 43 | 44 | def __init__( 45 | self, 46 | schema=None, 47 | CLI=None, 48 | random_seed=1, 49 | window_size=100, 50 | number_of_trees=25, 51 | max_depth=15, 52 | anomaly_threshold=0.5, 53 | size_limit=0.1, 54 | ): 55 | """Construct a Half-Space Trees anomaly detector 56 | 57 | :param schema: The schema of the stream. If not provided, it will be inferred from the data. 58 | :param CLI: Command Line Interface (CLI) options for configuring the HS-Trees algorithm. 59 | :param random_seed: Random seed for reproducibility. 60 | :param window_size: The size of the window for each tree. 61 | :param number_of_trees: The number of trees in the ensemble. 62 | :param max_depth: The maximum depth of each tree. 
63 | """ 64 | if CLI is None: 65 | self.window_size = window_size 66 | self.number_of_trees = number_of_trees 67 | self.max_depth = max_depth 68 | self.anomaly_threshold = anomaly_threshold 69 | self.size_limit = size_limit 70 | CLI = ( 71 | f"-p {self.window_size} -t {self.number_of_trees} -h {self.max_depth} \ 72 | -a {self.anomaly_threshold} -s {self.size_limit}" 73 | ) 74 | 75 | super().__init__( 76 | schema=schema, CLI=CLI, random_seed=random_seed, moa_learner=_MOA_HSTrees() 77 | ) 78 | 79 | def __str__(self): 80 | # Overrides the default class name from MOA 81 | return "HalfSpaceTrees" 82 | -------------------------------------------------------------------------------- /src/capymoa/automl/__init__.py: -------------------------------------------------------------------------------- 1 | from ._autoclass import AutoClass 2 | 3 | __all__ = ["AutoClass"] 4 | -------------------------------------------------------------------------------- /src/capymoa/automl/_autoclass.py: -------------------------------------------------------------------------------- 1 | from capymoa.base import ( 2 | Classifier, 3 | MOAClassifier, 4 | ) 5 | from capymoa.stream import Schema 6 | from capymoa._utils import build_cli_str_from_mapping_and_locals 7 | from moa.classifiers.meta.AutoML import AutoClass as _MOA_AUTOCLASS 8 | import os 9 | 10 | 11 | class AutoClass(MOAClassifier): 12 | """AutoClass 13 | 14 | Reference: 15 | `Maroua Bahri, Nikolaos Georgantas. 16 | Autoclass: Automl for data stream classification. 17 | In BigData, IEEE, 2023. `_ 18 | 19 | """ 20 | 21 | def __init__( 22 | self, 23 | schema: Schema = None, 24 | random_seed: int = 0, 25 | configuration_json: str = "../../data/settings_autoclass.json", 26 | base_classifiers: list[Classifier] = [ 27 | "lazy.kNN", 28 | "trees.HoeffdingTree", 29 | "trees.HoeffdingAdaptiveTree", 30 | ], 31 | number_active_classifiers: int = 1, 32 | weight_classifiers: bool = False, 33 | ): 34 | """AutoClass automl algorithm by Bahri and Georgantas. 35 | 36 | Note that configuration json file reading is delegated to the MOA object, thus in the configuration file 37 | the name of the learners should correspond to the MOA class full name. 38 | 39 | :param schema: The schema of the stream. 40 | :param random_seed: The random seed passed to the MOA learner. 41 | :param configuration: A json file with the configuration for learners 42 | :param base_classifiers: The learners that compose the ensemble 43 | :param number_active_classifiers: The number of active classifiers (used for voting) 44 | :param weight_classifiers: Uses online performance estimation to weight the classifiers 45 | """ 46 | 47 | # Check if the json configuration file exists. 
48 | if not os.path.exists(configuration_json): 49 | raise FileNotFoundError( 50 | f"The configuration json file was not found: {configuration_json}" 51 | ) 52 | 53 | mapping = { 54 | # Configuration json file or dictionary 55 | "configuration_json": "-f", 56 | # How many instances before we re-evaluate the best classifier 57 | # "grace_period": "-g", not used currently 58 | # The classifiers the ensemble consists of 59 | "base_classifiers": "-b", 60 | # The number of active classifiers (used for voting) 61 | "number_active_classifiers": "-k", 62 | # Uses online performance estimation to weight the classifiers 63 | "weight_classifiers": "-p", 64 | } 65 | 66 | if all(isinstance(classifier, str) for classifier in base_classifiers): 67 | # Join the list of strings as 'x,y' 68 | base_classifiers = ",".join(base_classifiers) 69 | # Check if base_classifiers is a list of Classifier objects 70 | elif all( 71 | issubclass(classifier, MOAClassifier) for classifier in base_classifiers 72 | ): 73 | # Join the strings from the classifiers' class names 74 | base_classifiers = ",".join( 75 | str(classifier(schema).moa_learner.getClass().getName()) 76 | for classifier in base_classifiers 77 | ) 78 | else: 79 | raise ValueError( 80 | "base_classifiers must be either a list of strings or a list of Classifier objects" 81 | ) 82 | 83 | config_str = build_cli_str_from_mapping_and_locals(mapping, locals()) 84 | super(AutoClass, self).__init__( 85 | moa_learner=_MOA_AUTOCLASS, 86 | schema=schema, 87 | CLI=config_str, 88 | random_seed=random_seed, 89 | ) 90 | -------------------------------------------------------------------------------- /src/capymoa/base/__init__.py: -------------------------------------------------------------------------------- 1 | from capymoa.base._base import ( 2 | AnomalyDetector, 3 | Clusterer, 4 | ClusteringResult, 5 | MOAAnomalyDetector, 6 | MOAClusterer, 7 | MOAPredictionIntervalLearner, 8 | PredictionIntervalLearner, 9 | _extract_moa_drift_detector_CLI, 10 | _extract_moa_learner_CLI, 11 | ) 12 | from capymoa.base._classifier import ( 13 | BatchClassifier, 14 | Classifier, 15 | MOAClassifier, 16 | SKClassifier, 17 | ) 18 | from capymoa.base._regressor import BatchRegressor, MOARegressor, Regressor, SKRegressor 19 | from capymoa.base._ssl import ( 20 | ClassifierSSL, 21 | MOAClassifierSSL, 22 | ) 23 | 24 | __all__ = [ 25 | "_extract_moa_drift_detector_CLI", 26 | "_extract_moa_learner_CLI", 27 | "Classifier", 28 | "BatchClassifier", 29 | "MOAClassifier", 30 | "SKClassifier", 31 | "ClassifierSSL", 32 | "MOAClassifierSSL", 33 | "Regressor", 34 | "BatchRegressor", 35 | "MOARegressor", 36 | "SKRegressor", 37 | "AnomalyDetector", 38 | "Clusterer", 39 | "ClusteringResult", 40 | "MOAAnomalyDetector", 41 | "MOAClusterer", 42 | "MOAPredictionIntervalLearner", 43 | "PredictionIntervalLearner", 44 | ] 45 | -------------------------------------------------------------------------------- /src/capymoa/base/_ssl.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | 3 | from ._base import Instance 4 | from ._classifier import Classifier, MOAClassifier 5 | 6 | 7 | class ClassifierSSL(Classifier): 8 | """Base class for semi-supervised learning classifiers.""" 9 | 10 | @abstractmethod 11 | def train_on_unlabeled(self, instance: Instance): 12 | pass 13 | 14 | 15 | class MOAClassifierSSL(MOAClassifier, ClassifierSSL): 16 | """Wrapper for using MOA semi-supervised learning classifiers.""" 17 | 18 | def train_on_unlabeled(self, instance: 
Instance): 19 | self.moa_learner.trainOnUnlabeledInstance(instance.java_instance.getData()) 20 | -------------------------------------------------------------------------------- /src/capymoa/classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from ._adaptive_random_forest import AdaptiveRandomForestClassifier 2 | from ._efdt import EFDT 3 | from ._hoeffding_tree import HoeffdingTree 4 | from ._naive_bayes import NaiveBayes 5 | from ._online_bagging import OnlineBagging 6 | from ._online_adwin_bagging import OnlineAdwinBagging 7 | from ._leveraging_bagging import LeveragingBagging 8 | from ._passive_aggressive_classifier import PassiveAggressiveClassifier 9 | from ._sgd_classifier import SGDClassifier 10 | from ._knn import KNN 11 | from ._sgbt import StreamingGradientBoostedTrees 12 | from ._oza_boost import OzaBoost 13 | from ._majority_class import MajorityClass 14 | from ._no_change import NoChange 15 | from ._online_smooth_boost import OnlineSmoothBoost 16 | from ._srp import StreamingRandomPatches 17 | from ._hoeffding_adaptive_tree import HoeffdingAdaptiveTree 18 | from ._samknn import SAMkNN 19 | from ._dynamic_weighted_majority import DynamicWeightedMajority 20 | from ._csmote import CSMOTE 21 | from ._weightedknn import WeightedkNN 22 | from ._shrubs_classifier import ShrubsClassifier 23 | from ._finetune import Finetune 24 | 25 | __all__ = [ 26 | "AdaptiveRandomForestClassifier", 27 | "EFDT", 28 | "HoeffdingTree", 29 | "NaiveBayes", 30 | "OnlineBagging", 31 | "OnlineAdwinBagging", 32 | "LeveragingBagging", 33 | "KNN", 34 | "PassiveAggressiveClassifier", 35 | "SGDClassifier", 36 | "StreamingGradientBoostedTrees", 37 | "OzaBoost", 38 | "MajorityClass", 39 | "NoChange", 40 | "OnlineSmoothBoost", 41 | "StreamingRandomPatches", 42 | "HoeffdingAdaptiveTree", 43 | "SAMkNN", 44 | "DynamicWeightedMajority", 45 | "CSMOTE", 46 | "WeightedkNN", 47 | "ShrubsClassifier", 48 | "Finetune", 49 | ] 50 | -------------------------------------------------------------------------------- /src/capymoa/classifier/_csmote.py: -------------------------------------------------------------------------------- 1 | from capymoa.base import ( 2 | MOAClassifier, 3 | ) 4 | from capymoa.stream import Schema 5 | from capymoa._utils import build_cli_str_from_mapping_and_locals 6 | from moa.classifiers.meta.imbalanced import CSMOTE as _MOA_CSMOTE 7 | 8 | 9 | class CSMOTE(MOAClassifier): 10 | """CSMOTE 11 | 12 | This strategy saves all the minority samples in a window managed by ADWIN. Meanwhile, a model is trained with 13 | the input data. When the minority sample ratio falls below a certain threshold, an online version of SMOTE is 14 | applied. A random minority sample is chosen from the window, and a new synthetic sample is generated until the 15 | minority sample ratio is greater than or equal to the threshold. The model is then trained with the newly 16 | generated samples. 17 | 18 | Reference: 19 | `Alessio Bernardo, Heitor Murilo Gomes, Jacob Montiel, Bernhard Pfahringer, Albert Bifet, Emanuele Della Valle. 20 | C-SMOTE: Continuous Synthetic Minority Oversampling for Evolving Data Streams. 21 | In BigData, IEEE, 2020. 
`_
22 | 
23 | 
24 |     Example usages:
25 | 
26 |     >>> from capymoa.datasets import ElectricityTiny
27 |     >>> from capymoa.classifier import CSMOTE
28 |     >>> from capymoa.evaluation import prequential_evaluation
29 |     >>> stream = ElectricityTiny()
30 |     >>> schema = stream.get_schema()
31 |     >>> learner = CSMOTE(schema)
32 |     >>> results = prequential_evaluation(stream, learner, max_instances=1000)
33 |     >>> results["cumulative"].accuracy()
34 |     83.1
35 |     """
36 | 
37 |     def __init__(
38 |         self,
39 |         schema: Schema = None,
40 |         random_seed: int = 0,
41 |         base_learner="trees.HoeffdingTree",
42 |         neighbors: int = 10,
43 |         threshold: float = 0.5,
44 |         min_size_allowed: int = 100,
45 |         disable_drift_detection: bool = False,
46 |     ):
47 |         """Continuous Synthetic Minority Oversampling (C-SMOTE) by Bernardo et al.
48 | 
49 |         :param schema: The schema of the stream.
50 |         :param random_seed: The random seed passed to the MOA learner.
51 |         :param base_learner: The base learner to be trained. Default trees.HoeffdingTree.
52 |         :param neighbors: Number of neighbors for SMOTE.
53 |         :param threshold: Minority class samples threshold.
54 |         :param min_size_allowed: Minimum number of samples in the minority class for applying SMOTE.
55 |         :param disable_drift_detection: If set, disables the ADWIN drift detector.
56 |         """
57 | 
58 |         mapping = {
59 |             "base_learner": "-l",
60 |             "neighbors": "-k",
61 |             "threshold": "-t",
62 |             "min_size_allowed": "-m",
63 |             "disable_drift_detection": "-d",
64 |         }
65 | 
66 |         assert isinstance(base_learner, str), (
67 |             "Only MOA CLI strings are supported for CSMOTE base_learner, at the moment."
68 |         )
69 | 
70 |         config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
71 |         super(CSMOTE, self).__init__(
72 |             moa_learner=_MOA_CSMOTE,
73 |             schema=schema,
74 |             CLI=config_str,
75 |             random_seed=random_seed,
76 |         )
77 | 
--------------------------------------------------------------------------------
/src/capymoa/classifier/_dynamic_weighted_majority.py:
--------------------------------------------------------------------------------
1 | from capymoa.base import MOAClassifier
2 | from capymoa.stream import Schema
3 | from capymoa._utils import build_cli_str_from_mapping_and_locals
4 | from moa.classifiers.meta import DynamicWeightedMajority as _MOA_DWM
5 | 
6 | 
7 | class DynamicWeightedMajority(MOAClassifier):
8 |     """Dynamic Weighted Majority Classifier.
9 | 
10 |     Reference:
11 | 
12 |     J. Zico Kolter and Marcus A. Maloof. Dynamic weighted majority: An ensemble
13 |     method for drifting concepts. The Journal of Machine Learning Research,
14 |     8:2755-2790, December 2007. ISSN 1532-4435. URL
15 |     http://dl.acm.org/citation.cfm?id=1314498.1390333.
16 | 
17 |     Example usages:
18 | 
19 |     >>> from capymoa.datasets import ElectricityTiny
20 |     >>> from capymoa.classifier import DynamicWeightedMajority
21 |     >>> from capymoa.evaluation import prequential_evaluation
22 |     >>> stream = ElectricityTiny()
23 |     >>> schema = stream.get_schema()
24 |     >>> learner = DynamicWeightedMajority(schema)
25 |     >>> results = prequential_evaluation(stream, learner, max_instances=1000)
26 |     >>> results["cumulative"].accuracy()
27 |     85.7
28 |     """
29 | 
30 |     def __init__(
31 |         self,
32 |         schema: Schema,
33 |         random_seed: int = 1,
34 |         base_learner="bayes.NaiveBayes",
35 |         period: int = 50,
36 |         beta: float = 0.5,
37 |         theta: float = 0.01,
38 |         max_experts: int = 10000,  # overwrite Integer.MAX_VALUE in Java with 10000
39 |     ):
40 |         """Dynamic Weighted Majority classifier.
41 | 
42 |         :param base_learner: The base learner to be used, default bayes.NaiveBayes.
43 |         :param period: Period between expert removal, creation, and weight update, default 50.
44 |         :param beta: Factor to punish mistakes by, default 0.5.
45 |         :param theta: Minimum fraction of weight per model, default 0.01.
46 |         :param max_experts: Maximum number of allowed experts; defaults to 10000 here, standing in for Java's effectively unlimited Integer.MAX_VALUE.
47 | 
48 |         """
49 | 
50 |         mapping = {
51 |             "base_learner": "-l",
52 |             "period": "-p",
53 |             "beta": "-b",
54 |             "theta": "-t",
55 |             "max_experts": "-e",
56 |         }
57 | 
58 |         config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
59 |         super(DynamicWeightedMajority, self).__init__(
60 |             schema=schema,
61 |             random_seed=random_seed,
62 |             CLI=config_str,
63 |             moa_learner=_MOA_DWM,
64 |         )
65 | 
--------------------------------------------------------------------------------
/src/capymoa/classifier/_hoeffding_tree.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from typing import Union
3 | 
4 | from capymoa.base import MOAClassifier
5 | from capymoa.splitcriteria import SplitCriterion, _split_criterion_to_cli_str
6 | from capymoa.stream import Schema
7 | from capymoa._utils import build_cli_str_from_mapping_and_locals, _leaf_prediction
8 | 
9 | import moa.classifiers.trees as moa_trees
10 | 
11 | 
12 | class HoeffdingTree(MOAClassifier):
13 |     """Hoeffding Tree classifier.
14 | 
15 |     Parameters
16 |     ----------
17 |     schema
18 |         The schema of the stream.
19 |     random_seed
20 |         The random seed passed to the MOA learner.
21 |     grace_period
22 |         Number of instances a leaf should observe between split attempts.
23 |     split_criterion
24 |         Split criterion to use. Defaults to `InfoGainSplitCriterion`.
25 |     confidence
26 |         Significance level to calculate the Hoeffding bound. The significance level is given by
27 |         `1 - delta`. Values closer to zero imply longer split decision delays.
28 |     tie_threshold
29 |         Threshold below which a split will be forced to break ties.
30 |     leaf_prediction
31 |         Prediction mechanism used at leaves.<br>
32 |         - 0 - Majority Class<br>
33 |         - 1 - Naive Bayes<br>
34 |         - 2 - Naive Bayes Adaptive<br>
35 | nb_threshold 36 | Number of instances a leaf should observe before allowing Naive Bayes. 37 | numeric_attribute_observer 38 | The Splitter or Attribute Observer (AO) used to monitor the class statistics of numeric 39 | features and perform splits. 40 | binary_split 41 | If True, only allow binary splits. 42 | max_byte_size 43 | The max size of the tree, in bytes. 44 | memory_estimate_period 45 | Interval (number of processed instances) between memory consumption checks. 46 | stop_mem_management 47 | If True, stop growing as soon as memory limit is hit. 48 | remove_poor_attrs 49 | If True, disable poor attributes to reduce memory usage. 50 | disable_prepruning 51 | If True, disable merit-based tree pre-pruning. 52 | """ 53 | 54 | def __init__( 55 | self, 56 | schema: Schema | None = None, 57 | random_seed: int = 0, 58 | grace_period: int = 200, 59 | split_criterion: Union[str, SplitCriterion] = "InfoGainSplitCriterion", 60 | confidence: float = 1e-3, 61 | tie_threshold: float = 0.05, 62 | leaf_prediction: int = "NaiveBayesAdaptive", 63 | nb_threshold: int = 0, 64 | numeric_attribute_observer: str = "GaussianNumericAttributeClassObserver", 65 | binary_split: bool = False, 66 | max_byte_size: float = 33554433, 67 | memory_estimate_period: int = 1000000, 68 | stop_mem_management: bool = True, 69 | remove_poor_attrs: bool = False, 70 | disable_prepruning: bool = True, 71 | ): 72 | mapping = { 73 | "grace_period": "-g", 74 | "max_byte_size": "-m", 75 | "numeric_attribute_observer": "-n", 76 | "memory_estimate_period": "-e", 77 | "split_criterion": "-s", 78 | "confidence": "-c", 79 | "tie_threshold": "-t", 80 | "binary_split": "-b", 81 | "stop_mem_management": "-z", 82 | "remove_poor_attrs": "-r", 83 | "disable_prepruning": "-p", 84 | "leaf_prediction": "-l", 85 | "nb_threshold": "-q", 86 | } 87 | split_criterion = _split_criterion_to_cli_str(split_criterion) 88 | leaf_prediction = _leaf_prediction(leaf_prediction) 89 | config_str = build_cli_str_from_mapping_and_locals(mapping, locals()) 90 | super(HoeffdingTree, self).__init__( 91 | moa_learner=moa_trees.HoeffdingTree, 92 | schema=schema, 93 | CLI=config_str, 94 | random_seed=random_seed, 95 | ) 96 | -------------------------------------------------------------------------------- /src/capymoa/classifier/_knn.py: -------------------------------------------------------------------------------- 1 | from capymoa.base import MOAClassifier 2 | from moa.classifiers.lazy import kNN as _moa_kNN 3 | 4 | 5 | class KNN(MOAClassifier): 6 | """ 7 | The default number of neighbors (k) is set to 3 instead of 10 (as in MOA) 8 | """ 9 | 10 | def __init__(self, schema=None, CLI=None, random_seed=1, k=3, window_size=1000): 11 | # Important, should create the MOA object before invoking the super class __init__ 12 | self.moa_learner = _moa_kNN() 13 | super().__init__( 14 | schema=schema, 15 | CLI=CLI, 16 | random_seed=random_seed, 17 | moa_learner=self.moa_learner, 18 | ) 19 | 20 | # Initialize instance attributes with default values, CLI was not set. 
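        # (When a CLI string is provided instead, these keyword arguments are
        # ignored and the configuration is left to the superclass constructor.)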
21 |         if self.CLI is None:
22 |             self.k = k
23 |             self.window_size = window_size
24 |             self.moa_learner.getOptions().setViaCLIString(
25 |                 f"-k {self.k} -w {self.window_size}"
26 |             )
27 |             self.moa_learner.prepareForUse()
28 |             self.moa_learner.resetLearning()
29 | 
30 |     def __str__(self):
31 |         # Overrides the default class name from MOA
32 |         return "kNN"
33 | 
--------------------------------------------------------------------------------
/src/capymoa/classifier/_majority_class.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | 
3 | from capymoa.base import (
4 |     MOAClassifier,
5 | )
6 | from capymoa.stream import Schema
7 | from capymoa._utils import build_cli_str_from_mapping_and_locals
8 | 
9 | from moa.classifiers.functions import MajorityClass as _MOA_MajorityClass
10 | 
11 | 
12 | class MajorityClass(MOAClassifier):
13 |     """Majority class classifier.
14 | 
15 |     Always predicts the class that has been observed most frequently in the training data.
16 | 
17 |     Example usages:
18 | 
19 |     >>> from capymoa.datasets import ElectricityTiny
20 |     >>> from capymoa.classifier import MajorityClass
21 |     >>> from capymoa.evaluation import prequential_evaluation
22 |     >>> stream = ElectricityTiny()
23 |     >>> schema = stream.get_schema()
24 |     >>> learner = MajorityClass(schema)
25 |     >>> results = prequential_evaluation(stream, learner, max_instances=1000)
26 |     >>> results["cumulative"].accuracy()
27 |     50.2
28 |     """
29 | 
30 |     def __init__(
31 |         self,
32 |         schema: Schema | None = None,
33 |     ):
34 |         """Majority class classifier.
35 | 
36 |         :param schema: The schema of the stream.
37 |         """
38 | 
39 |         mapping = {}
40 | 
41 |         config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
42 |         super(MajorityClass, self).__init__(
43 |             moa_learner=_MOA_MajorityClass,
44 |             schema=schema,
45 |             CLI=config_str,
46 |         )
47 | 
--------------------------------------------------------------------------------
/src/capymoa/classifier/_naive_bayes.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import typing
3 | 
4 | from capymoa.base import MOAClassifier
5 | from capymoa.stream import Schema
6 | 
7 | import moa.classifiers.bayes as moa_bayes
8 | 
9 | 
10 | class NaiveBayes(MOAClassifier):
11 |     """Naive Bayes incremental learner.
12 |     Performs classic Bayesian prediction while making the naive assumption that all inputs are independent. Naive Bayes is a classification algorithm known for its simplicity and low computational cost. Given n different classes, the trained Naive Bayes classifier predicts, for every unlabeled instance I, the class C to which it belongs with high accuracy.
13 | 
14 |     :param schema: The schema of the stream, defaults to None.
15 |     :param random_seed: The random seed passed to the MOA learner, defaults to 0.
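    Example usage, following the pattern of the other classifiers in this module
    (the final accuracy depends on the stream and is therefore not asserted here):

    >>> from capymoa.datasets import ElectricityTiny
    >>> from capymoa.classifier import NaiveBayes
    >>> from capymoa.evaluation import prequential_evaluation
    >>> stream = ElectricityTiny()
    >>> learner = NaiveBayes(schema=stream.get_schema())
    >>> results = prequential_evaluation(stream, learner, max_instances=1000)
    >>> acc = results["cumulative"].accuracy()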
16 | """ 17 | 18 | def __init__(self, schema: typing.Union[Schema, None] = None, random_seed: int = 0): 19 | super(NaiveBayes, self).__init__( 20 | moa_learner=moa_bayes.NaiveBayes(), schema=schema, random_seed=random_seed 21 | ) 22 | 23 | def __str__(self): 24 | # Overrides the default class name from MOA (OzaBag) 25 | return "Naive Bayes CapyMOA Classifier" 26 | -------------------------------------------------------------------------------- /src/capymoa/classifier/_no_change.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from capymoa.base import ( 4 | MOAClassifier, 5 | ) 6 | from capymoa.stream import Schema 7 | from capymoa._utils import build_cli_str_from_mapping_and_locals 8 | 9 | from moa.classifiers.functions import NoChange as _MOA_NoChange 10 | 11 | 12 | class NoChange(MOAClassifier): 13 | """NoChange classifier. 14 | 15 | Always predicts the last class seen. 16 | 17 | Example usages: 18 | 19 | >>> from capymoa.datasets import ElectricityTiny 20 | >>> from capymoa.classifier import NoChange 21 | >>> from capymoa.evaluation import prequential_evaluation 22 | >>> stream = ElectricityTiny() 23 | >>> schema = stream.get_schema() 24 | >>> learner = NoChange(schema) 25 | >>> results = prequential_evaluation(stream, learner, max_instances=1000) 26 | >>> results["cumulative"].accuracy() 27 | 85.9 28 | """ 29 | 30 | def __init__( 31 | self, 32 | schema: Schema | None = None, 33 | ): 34 | """NoChange class classifier. 35 | 36 | :param schema: The schema of the stream. 37 | """ 38 | 39 | mapping = {} 40 | 41 | config_str = build_cli_str_from_mapping_and_locals(mapping, locals()) 42 | super(NoChange, self).__init__( 43 | moa_learner=_MOA_NoChange, 44 | schema=schema, 45 | CLI=config_str, 46 | ) 47 | -------------------------------------------------------------------------------- /src/capymoa/classifier/_online_smooth_boost.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from capymoa.base import ( 4 | MOAClassifier, 5 | ) 6 | from capymoa.stream import Schema 7 | from capymoa._utils import build_cli_str_from_mapping_and_locals 8 | 9 | from moa.classifiers.meta import OnlineSmoothBoost as _MOA_OnlineSmoothBoost 10 | 11 | 12 | class OnlineSmoothBoost(MOAClassifier): 13 | """OnlineSmoothBoost Classifier 14 | 15 | Incremental on-line boosting with Theoretical Justifications of Shang-Tse Chen 16 | 17 | Reference: 18 | 19 | `An Online Boosting Algorithm with Theoretical Justifications. 20 | Shang-Tse Chen, Hsuan-Tien Lin, Chi-Jen Lu. 21 | ICML, 2012. 22 | `_ 23 | 24 | Example usages: 25 | 26 | >>> from capymoa.datasets import ElectricityTiny 27 | >>> from capymoa.classifier import OnlineSmoothBoost 28 | >>> from capymoa.evaluation import prequential_evaluation 29 | >>> stream = ElectricityTiny() 30 | >>> schema = stream.get_schema() 31 | >>> learner = OnlineSmoothBoost(schema) 32 | >>> results = prequential_evaluation(stream, learner, max_instances=1000) 33 | >>> results["cumulative"].accuracy() 34 | 87.8 35 | """ 36 | 37 | def __init__( 38 | self, 39 | schema: Schema | None = None, 40 | random_seed: int = 0, 41 | base_learner="trees.HoeffdingTree", 42 | boosting_iterations: int = 100, 43 | gamma=0.1, 44 | ): 45 | """OnlineSmoothBoost Classifier 46 | 47 | :param schema: The schema of the stream. 48 | :param random_seed: The random seed passed to the MOA learner. 49 | :param base_learner: The base learner to be trained. 
Default trees.HoeffdingTree. 50 | :param boosting_iterations: The number of boosting iterations (ensemble size). 51 | :param gamma: The value of the gamma parameter. 52 | """ 53 | 54 | mapping = { 55 | "base_learner": "-l", 56 | "boosting_iterations": "-s", 57 | "gamma": "-g", 58 | } 59 | 60 | assert isinstance(base_learner, str), ( 61 | "Only MOA CLI strings are supported for OnlineSmoothBoost base_learner, at the moment." 62 | ) 63 | 64 | config_str = build_cli_str_from_mapping_and_locals(mapping, locals()) 65 | super(OnlineSmoothBoost, self).__init__( 66 | moa_learner=_MOA_OnlineSmoothBoost, 67 | schema=schema, 68 | CLI=config_str, 69 | random_seed=random_seed, 70 | ) 71 | -------------------------------------------------------------------------------- /src/capymoa/classifier/_oza_boost.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from capymoa.base import ( 4 | MOAClassifier, 5 | ) 6 | from capymoa.stream import Schema 7 | from capymoa._utils import build_cli_str_from_mapping_and_locals 8 | 9 | from moa.classifiers.meta import OzaBoost as _MOA_OzaBoost 10 | 11 | 12 | class OzaBoost(MOAClassifier): 13 | """Incremental on-line boosting classifier of Oza and Russell. 14 | 15 | For the boosting method, Oza and Russell note that the 16 | weighting procedure of AdaBoost actually divides the total example weight 17 | into two halves – half of the weight is assigned to the correctly classified 18 | examples, and the other half goes to the misclassified examples. They use the 19 | Poisson distribution for deciding the random probability that an example is 20 | used for training, only this time the parameter changes according to the 21 | boosting weight of the example as it is passed through each model in 22 | sequence. 23 | 24 | Reference: 25 | 26 | `Online bagging and boosting. 27 | Nikunj Oza, Stuart Russell. 28 | Artificial Intelligence and Statistics 2001. 29 | `_ 30 | 31 | Example usages: 32 | 33 | >>> from capymoa.datasets import ElectricityTiny 34 | >>> from capymoa.classifier import OzaBoost 35 | >>> from capymoa.evaluation import prequential_evaluation 36 | >>> stream = ElectricityTiny() 37 | >>> schema = stream.get_schema() 38 | >>> learner = OzaBoost(schema) 39 | >>> results = prequential_evaluation(stream, learner, max_instances=1000) 40 | >>> results["cumulative"].accuracy() 41 | 88.8 42 | """ 43 | 44 | def __init__( 45 | self, 46 | schema: Schema | None = None, 47 | random_seed: int = 0, 48 | base_learner="trees.HoeffdingTree", 49 | boosting_iterations: int = 10, 50 | use_pure_boost: bool = False, 51 | ): 52 | """Incremental on-line boosting classifier of Oza and Russell. 53 | 54 | :param schema: The schema of the stream. 55 | :param random_seed: The random seed passed to the MOA learner. 56 | :param base_learner: The base learner to be trained. Default trees.HoeffdingTree. 57 | :param boosting_iterations: The number of boosting iterations. 58 | :param use_pure_boost: Boost with weights only; no poisson.. 59 | """ 60 | 61 | mapping = { 62 | "base_learner": "-l", 63 | "boosting_iterations": "-s", 64 | "use_pure_boost": "-p", 65 | } 66 | 67 | assert isinstance(base_learner, str), ( 68 | "Only MOA CLI strings are supported for OzaBoost base_learner, at the moment." 
69 | ) 70 | 71 | config_str = build_cli_str_from_mapping_and_locals(mapping, locals()) 72 | super(OzaBoost, self).__init__( 73 | moa_learner=_MOA_OzaBoost, 74 | schema=schema, 75 | CLI=config_str, 76 | random_seed=random_seed, 77 | ) 78 | -------------------------------------------------------------------------------- /src/capymoa/classifier/_passive_aggressive_classifier.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Dict, Union, Literal 2 | from capymoa.base import SKClassifier 3 | from sklearn.linear_model import ( 4 | PassiveAggressiveClassifier as _SKPassiveAggressiveClassifier, 5 | ) 6 | from capymoa.stream._stream import Schema 7 | 8 | 9 | class PassiveAggressiveClassifier(SKClassifier): 10 | """Streaming Passive Aggressive Classifier 11 | 12 | This wraps :sklearn:`linear_model.PassiveAggressiveClassifier` for 13 | ease of use in the streaming context. Some options are missing because 14 | they are not relevant in the streaming context. 15 | 16 | `Online Passive-Aggressive Algorithms K. Crammer, O. Dekel, J. Keshat, S. 17 | Shalev-Shwartz, Y. Singer - JMLR (2006) 18 | `_ 19 | 20 | >>> from capymoa.datasets import ElectricityTiny 21 | >>> from capymoa.classifier import PassiveAggressiveClassifier 22 | >>> from capymoa.evaluation import prequential_evaluation 23 | >>> stream = ElectricityTiny() 24 | >>> schema = stream.get_schema() 25 | >>> learner = PassiveAggressiveClassifier(schema) 26 | >>> results = prequential_evaluation(stream, learner, max_instances=1000) 27 | >>> results["cumulative"].accuracy() 28 | 84.3 29 | """ 30 | 31 | sklearner: _SKPassiveAggressiveClassifier 32 | """The underlying scikit-learn object. See: :sklearn:`linear_model.PassiveAggressiveClassifier`""" 33 | 34 | def __init__( 35 | self, 36 | schema: Schema, 37 | max_step_size: float = 1.0, 38 | fit_intercept: bool = True, 39 | loss: str = "hinge", 40 | n_jobs: Optional[int] = None, 41 | class_weight: Union[Dict[int, float], None, Literal["balanced"]] = None, 42 | average: bool = False, 43 | random_seed=1, 44 | ): 45 | """Construct a passive aggressive classifier. 46 | 47 | :param schema: Stream schema 48 | :param max_step_size: Maximum step size (regularization). 49 | :param fit_intercept: Whether the intercept should be estimated or not. 50 | If False, the data is assumed to be already centered. 51 | :param loss: The loss function to be used: hinge: equivalent to PA-I in 52 | the reference paper. squared_hinge: equivalent to PA-II in the reference paper. 53 | :param n_jobs: The number of CPUs to use to do the OVA (One Versus All, 54 | for multi-class problems) computation. None means 1 unless in a 55 | ``joblib.parallel_backend`` context. -1 means using all processors. 56 | :param class_weight: Preset for the ``sklearner.class_weight`` fit parameter. 57 | 58 | Weights associated with classes. If not given, all classes are 59 | supposed to have weight one. 60 | 61 | The “balanced” mode uses the values of y to automatically adjust 62 | weights inversely proportional to class frequencies in the input 63 | data as ``n_samples / (n_classes * np.bincount(y))``. 64 | :param average: When set to True, computes the averaged SGD weights and 65 | stores the result in the ``sklearner.coef_`` attribute. If set to an int greater 66 | than 1, averaging will begin once the total number of samples 67 | seen reaches average. So ``average=10`` will begin averaging after 68 | seeing 10 samples. 69 | :param random_seed: Seed for the random number generator. 
70 | """ 71 | 72 | super().__init__( 73 | _SKPassiveAggressiveClassifier( 74 | C=max_step_size, 75 | fit_intercept=fit_intercept, 76 | early_stopping=False, 77 | shuffle=False, 78 | verbose=0, 79 | loss=loss, 80 | n_jobs=n_jobs, 81 | warm_start=False, 82 | class_weight=class_weight, 83 | average=average, 84 | random_state=random_seed, 85 | ), 86 | schema, 87 | random_seed, 88 | ) 89 | 90 | def __str__(self): 91 | return str("PassiveAggressiveClassifier") 92 | -------------------------------------------------------------------------------- /src/capymoa/classifier/_samknn.py: -------------------------------------------------------------------------------- 1 | from capymoa.base import MOAClassifier 2 | from moa.classifiers.lazy import SAMkNN as _MOA_SAMkNN 3 | from capymoa.stream import Schema 4 | from capymoa._utils import build_cli_str_from_mapping_and_locals 5 | 6 | 7 | class SAMkNN(MOAClassifier): 8 | """Self Adjusted Memory k Nearest Neighbor (SAMkNN) Classifier 9 | 10 | Reference: 11 | 12 | "KNN Classifier with Self Adjusting Memory for Heterogeneous Concept Drift" 13 | Viktor Losing, Barbara Hammer and Heiko Wersing 14 | http://ieeexplore.ieee.org/document/7837853 15 | PDF can be found at https://pub.uni-bielefeld.de/download/2907622/2907623 16 | BibTex: 17 | "@INPROCEEDINGS{7837853, 18 | author={V. Losing and B. Hammer and H. Wersing}, 19 | booktitle={2016 IEEE 16th International Conference on Data Mining (ICDM)}, 20 | title={KNN Classifier with Self Adjusting Memory for Heterogeneous Concept Drift}, 21 | year={2016}, 22 | ages={291-300}, 23 | keywords={data mining;optimisation;pattern classification;Big Data;Internet of Things;KNN classifier;SAM-kNN robustness;data mining;k nearest neighbor algorithm;metaparameter optimization;nonstationary data streams;performance evaluation;self adjusting memory model;Adaptation models;Benchmark testing;Biological system modeling;Data mining;Heuristic algorithms;Prediction algorithms;Predictive models;Data streams;concept drift;data mining;kNN}, 24 | doi={10.1109/ICDM.2016.0040}, 25 | month={Dec} 26 | }" 27 | 28 | Example usages: 29 | 30 | >>> from capymoa.datasets import ElectricityTiny 31 | >>> from capymoa.classifier import SAMkNN 32 | >>> from capymoa.evaluation import prequential_evaluation 33 | >>> stream = ElectricityTiny() 34 | >>> schema = stream.get_schema() 35 | >>> learner = SAMkNN(schema) 36 | >>> results = prequential_evaluation(stream, learner, max_instances=1000) 37 | >>> results["cumulative"].accuracy() 38 | 78.60000000000001 39 | """ 40 | 41 | def __init__( 42 | self, 43 | schema: Schema, 44 | random_seed: int = 1, 45 | k: int = 5, 46 | limit: int = 5000, 47 | min_stm_size: int = 50, 48 | relative_ltm_size: float = 0.4, 49 | recalculate_stm_error: bool = False, 50 | ): 51 | """Self Adjusted Memory k Nearest Neighbor (SAMkNN) Classifier 52 | 53 | :param schema: The schema of the stream. 54 | :param random_seed: The random seed passed to the MOA learner. 55 | :param k: The number of nearest neighbors. 56 | :param limit: The maximum number of instances to store. 57 | :param min_stm_size: The minimum number of instances in the STM. 58 | :param relative_ltm_size: The allowed LTM size relative to the total limit. 59 | :param recalculate_stm_error: Recalculates the error rate of the STM for size adaption (Costly operation). 60 | Otherwise, an approximation is used. 
61 | """ 62 | 63 | mapping = { 64 | "k": "-k", 65 | "limit": "-w", 66 | "min_stm_size": "-m", 67 | "relative_ltm_size": "-p", 68 | "recalculate_stm_error": "-r", 69 | } 70 | 71 | config_str = build_cli_str_from_mapping_and_locals(mapping, locals()) 72 | self.moa_learner = _MOA_SAMkNN() 73 | super(SAMkNN, self).__init__( 74 | schema=schema, 75 | random_seed=random_seed, 76 | CLI=config_str, 77 | moa_learner=self.moa_learner, 78 | ) 79 | -------------------------------------------------------------------------------- /src/capymoa/classifier/_weightedknn.py: -------------------------------------------------------------------------------- 1 | from capymoa.base import MOAClassifier 2 | from moa.classifiers.lazy import WeightedkNN as _MOA_WeightedkNN 3 | from capymoa.stream import Schema 4 | from capymoa._utils import build_cli_str_from_mapping_and_locals 5 | 6 | 7 | class WeightedkNN(MOAClassifier): 8 | """WeightedKNN 9 | Reference: 10 | 11 | 'Effective Weighted k-Nearest Neighbors for Dynamic Data Streams' 12 | Maroua Bahri 13 | IEEE International Conference on Big Data (Big Data), 2022 14 | 15 | Example usages: 16 | >>> from capymoa.datasets import ElectricityTiny 17 | >>> from capymoa.classifier import WeightedkNN 18 | >>> from capymoa.evaluation import prequential_evaluation 19 | >>> stream = ElectricityTiny() 20 | >>> schema = stream.get_schema() 21 | >>> learner = WeightedkNN(schema) 22 | >>> results = prequential_evaluation(stream, learner, max_instances=1000) 23 | >>> results["cumulative"].accuracy() 24 | 74.7 25 | """ 26 | 27 | def __init__(self, schema: Schema, k: int = 10, limit: int = 1000): 28 | """Weighted KNN Classifier 29 | :param schema: The schema of the stream. 30 | :param k: The number of neighbors. 31 | :param w: The maximum number of instances to store. 32 | """ 33 | 34 | mapping = { 35 | "k": "-k", 36 | "limit": "-w", 37 | } 38 | 39 | config_str = build_cli_str_from_mapping_and_locals(mapping, locals()) 40 | self.moa_learner = _MOA_WeightedkNN() 41 | super(WeightedkNN, self).__init__( 42 | schema=schema, 43 | CLI=config_str, 44 | moa_learner=self.moa_learner, 45 | ) 46 | -------------------------------------------------------------------------------- /src/capymoa/clusterers/__init__.py: -------------------------------------------------------------------------------- 1 | from ._clustream import Clustream 2 | from ._clustream_with_kmeans import Clustream_with_kmeans 3 | from ._clustree import ClusTree 4 | from ._denstream_with_dbscan import Denstream_with_dbscan 5 | # from ._dstream import Dstream 6 | 7 | __all__ = [ 8 | "Clustream", 9 | "Clustream_with_kmeans", 10 | "ClusTree", 11 | "Denstream_with_dbscan", 12 | # "Dstream" 13 | ] 14 | -------------------------------------------------------------------------------- /src/capymoa/clusterers/_clustream.py: -------------------------------------------------------------------------------- 1 | from capymoa.base import MOAClusterer 2 | import typing 3 | from moa.clusterers.clustream import Clustream as _MOA_Clustream 4 | from capymoa.stream import Schema 5 | from capymoa._utils import build_cli_str_from_mapping_and_locals 6 | # import numpy as np 7 | 8 | 9 | class Clustream(MOAClusterer): 10 | """ 11 | Clustream clustering algorithm without Macro-clustering. 12 | """ 13 | 14 | def __init__( 15 | self, 16 | schema: typing.Union[Schema, None] = None, 17 | time_window: int = 1000, 18 | max_num_kernels: int = 100, 19 | kernel_radi_factor: float = 2, 20 | ): 21 | """Clustream clusterer. 
22 | 
23 |         :param schema: The schema of the stream.
24 |         :param time_window: The size of the time window.
25 |         :param max_num_kernels: Maximum number of micro kernels to use.
26 |         :param kernel_radi_factor: Multiplier for the kernel radius.
27 |         """
28 | 
29 |         mapping = {
30 |             "time_window": "-h",
31 |             "max_num_kernels": "-k",
32 |             "kernel_radi_factor": "-t",
33 |         }
34 | 
35 |         config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
36 |         self.moa_learner = _MOA_Clustream()
37 |         super(Clustream, self).__init__(
38 |             schema=schema, CLI=config_str, moa_learner=self.moa_learner
39 |         )
40 | 
41 |     def implements_micro_clusters(self) -> bool:
42 |         return True
43 | 
44 |     def implements_macro_clusters(self) -> bool:
45 |         return False
46 | 
47 |     # def predict(self, X):
48 |     #     clusters = self.get_micro_clustering_result()
49 |     #     min_dist = np.inf
50 |     #     closest_center = None
51 |     #     for center in clusters.get_centers():
52 |     #         if np.linalg.norm(center - X) < min_dist:
53 |     #             min_dist = np.linalg.norm(center - X)
54 |     #             closest_center = center
55 |     #     print(closest_center)
56 |     #     return closest_center
57 | 
--------------------------------------------------------------------------------
/src/capymoa/clusterers/_clustream_with_kmeans.py:
--------------------------------------------------------------------------------
1 | from capymoa.base import MOAClusterer
2 | import typing
3 | from moa.clusterers.clustream import WithKmeans as _MOA_Clustream_WKM
4 | from capymoa.stream import Schema
5 | from capymoa._utils import build_cli_str_from_mapping_and_locals
6 | # import numpy as np
7 | 
8 | 
9 | class Clustream_with_kmeans(MOAClusterer):
10 |     """
11 |     Clustream clustering algorithm with k-means macro-clustering.
12 |     """
13 | 
14 |     def __init__(
15 |         self,
16 |         schema: typing.Union[Schema, None] = None,
17 |         time_window: int = 1000,
18 |         max_num_kernels: int = 100,
19 |         kernel_radi_factor: float = 2,
20 |         k_option: int = 5,
21 |     ):
22 |         """Clustream clusterer with K-means offline clustering.
23 | 
24 |         :param schema: The schema of the stream.
25 |         :param time_window: The size of the time window.
26 |         :param max_num_kernels: Maximum number of micro kernels to use.
27 |         :param kernel_radi_factor: Multiplier for the kernel radius.
28 |         :param k_option: Number of clusters to use in the k-means offline step.
29 |         """
30 | 
31 |         mapping = {
32 |             "time_window": "-h",
33 |             "max_num_kernels": "-m",
34 |             "kernel_radi_factor": "-t",
35 |             "k_option": "-k",
36 |         }
37 | 
38 |         config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
39 |         self.moa_learner = _MOA_Clustream_WKM()
40 |         super(Clustream_with_kmeans, self).__init__(
41 |             schema=schema, CLI=config_str, moa_learner=self.moa_learner
42 |         )
43 | 
44 |     def implements_micro_clusters(self) -> bool:
45 |         return True
46 | 
47 |     def implements_macro_clusters(self) -> bool:
48 |         return True
49 | 
50 |     # def predict(self, X):
51 |     #     clusters = self.get_micro_clustering_result()
52 |     #     min_dist = np.inf
53 |     #     closest_center = None
54 |     #     for center in clusters.get_centers():
55 |     #         if np.linalg.norm(center - X) < min_dist:
56 |     #             min_dist = np.linalg.norm(center - X)
57 |     #             closest_center = center
58 |     #     print(closest_center)
59 |     #     return closest_center
60 | 
61 |     def __str__(self):
62 |         return "Clustream with KMeans"
63 | 
--------------------------------------------------------------------------------
/src/capymoa/clusterers/_clustree.py:
--------------------------------------------------------------------------------
1 | from capymoa.base import MOAClusterer
2 | import typing
3 | from moa.clusterers.clustree import ClusTree as _MOA_ClusTree
4 | from capymoa.stream import Schema
5 | from capymoa._utils import build_cli_str_from_mapping_and_locals
6 | # import numpy as np
7 | 
8 | 
9 | class ClusTree(MOAClusterer):
10 |     """
11 |     ClusTree clustering algorithm without macro-clustering.
12 |     """
13 | 
14 |     def __init__(
15 |         self,
16 |         schema: typing.Union[Schema, None] = None,
17 |         horizon: int = 1000,
18 |         max_height: int = 8,
19 |         breadth_first_strategy: bool = False,
20 |     ):
21 |         """ClusTree clusterer.
22 | 
23 |         :param schema: The schema of the stream.
24 |         :param horizon: The size of the time window.
25 |         :param max_height: The maximum height of the tree.
26 |         :param breadth_first_strategy: Whether to use the breadth-first strategy.
27 |         """
28 | 
29 |         mapping = {"horizon": "-h", "max_height": "-H", "breadth_first_strategy": "-B"}
30 | 
31 |         config_str = build_cli_str_from_mapping_and_locals(mapping, locals())
32 |         self.moa_learner = _MOA_ClusTree()
33 |         super(ClusTree, self).__init__(
34 |             schema=schema, CLI=config_str, moa_learner=self.moa_learner
35 |         )
36 | 
37 |     def implements_micro_clusters(self) -> bool:
38 |         return True
39 | 
40 |     def implements_macro_clusters(self) -> bool:
41 |         return False
42 | 
43 |     # def predict(self, X):
44 |     #     clusters = self.get_micro_clustering_result()
45 |     #     min_dist = np.inf
46 |     #     closest_center = None
47 |     #     for center in clusters.get_centers():
48 |     #         if np.linalg.norm(center - X) < min_dist:
49 |     #             min_dist = np.linalg.norm(center - X)
50 |     #             closest_center = center
51 |     #     print(closest_center)
52 |     #     return closest_center
53 | 
--------------------------------------------------------------------------------
/src/capymoa/clusterers/_denstream_with_dbscan.py:
--------------------------------------------------------------------------------
1 | from capymoa.base import MOAClusterer
2 | import typing
3 | from moa.clusterers.denstream import WithDBSCAN as _MOA_denstream_with_dbscan
4 | from capymoa.stream import Schema
5 | from capymoa._utils import build_cli_str_from_mapping_and_locals
6 | # import numpy as np
7 | 
8 | 
9 | class Denstream_with_dbscan(MOAClusterer):
10 |     """
11 |     DenStream clustering algorithm with DBSCAN macro-clustering.
12 |     """
13 | 
14 |     def __init__(
15 |         self,
16 |         schema: typing.Union[Schema, None] = None,
17 |         horizon: int = 1000,
18 |         epsilon: float = 0.02,
19 |         beta: float = 0.2,
20 |         mu: int = 1,
21 |         init_points: int = 1000,
22 |         offline_option: float = 2,
23 |         lambda_option: float = 0.25,
24 |         speed: int = 100,
25 |     ):
26 |         """DenStream clusterer.
27 | 28 | :param schema: The schema of the stream 29 | :param horizon: The size of the time window 30 | :param epsilon: The epsilon neighborhood 31 | :param beta: The beta parameter 32 | :param mu: The mu parameter 33 | :param init_points: The number of initial points 34 | :param offline_option: The offline multiplier for epsilon 35 | :param lambda_option: The lambda option 36 | :param speed: Number of incoming data points per time unit 37 | """ 38 | 39 | mapping = { 40 | "horizon": "-h", 41 | "epsilon": "-e", 42 | "beta": "-b", 43 | "mu": "-m", 44 | "init_points": "-i", 45 | "offline_option": "-o", 46 | "lambda_option": "-l", 47 | "speed": "-s", 48 | } 49 | 50 | config_str = build_cli_str_from_mapping_and_locals(mapping, locals()) 51 | self.moa_learner = _MOA_denstream_with_dbscan() 52 | super(Denstream_with_dbscan, self).__init__( 53 | schema=schema, CLI=config_str, moa_learner=self.moa_learner 54 | ) 55 | 56 | def implements_micro_clusters(self) -> bool: 57 | return True 58 | 59 | def implements_macro_clusters(self) -> bool: 60 | return True 61 | 62 | # def predict(self, X): 63 | # clusters = self.get_micro_clustering_result() 64 | # min_dist = np.inf 65 | # closest_center = None 66 | # for center in clusters.get_centers(): 67 | # if np.linalg.norm(center - X) < min_dist: 68 | # min_dist = np.linalg.norm(center - X) 69 | # closest_center = center 70 | # print(closest_center) 71 | # return closest_center 72 | 73 | def __str__(self): 74 | return "Denstream with DBSCAN" 75 | -------------------------------------------------------------------------------- /src/capymoa/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | """CapyMOA comes with some datasets 'out of the box'. Simply import the dataset 2 | and start using it, the data will be downloaded automatically if it is not 3 | already present in the download directory. You can configure where the datasets 4 | are downloaded to by setting an environment variable (See :mod:`capymoa.env`) 5 | 6 | >>> from capymoa.datasets import ElectricityTiny 7 | >>> stream = ElectricityTiny() 8 | >>> stream.next_instance().x 9 | array([0. , 0.056443, 0.439155, 0.003467, 0.422915, 0.414912]) 10 | 11 | Alternatively, you may download the datasets all at once with the command line interface 12 | provided by ``capymoa.datasets``: 13 | 14 | .. code-block:: bash 15 | 16 | python -m capymoa.datasets --help 17 | 18 | """ 19 | 20 | from ._datasets import ( 21 | Bike, 22 | CovtFD, 23 | Covtype, 24 | CovtypeNorm, 25 | CovtypeTiny, 26 | Electricity, 27 | ElectricityTiny, 28 | Fried, 29 | FriedTiny, 30 | Hyper100k, 31 | RBFm_100k, 32 | RTG_2abrupt, 33 | Sensor, 34 | ) 35 | from ._utils import get_download_dir 36 | from . 
import downloader 37 | 38 | __all__ = [ 39 | "Bike", 40 | "CovtFD", 41 | "Covtype", 42 | "CovtypeNorm", 43 | "CovtypeTiny", 44 | "Electricity", 45 | "ElectricityTiny", 46 | "Fried", 47 | "FriedTiny", 48 | "Hyper100k", 49 | "RBFm_100k", 50 | "RTG_2abrupt", 51 | "Sensor", 52 | "downloader", 53 | "get_download_dir", 54 | ] 55 | -------------------------------------------------------------------------------- /src/capymoa/datasets/__main__.py: -------------------------------------------------------------------------------- 1 | """This module defines the command line interface for downloading datasets.""" 2 | 3 | import click 4 | from ._source_list import SOURCE_LIST 5 | from ._utils import get_download_dir, is_already_downloaded, download_extract 6 | from typing import Set 7 | from typing import Optional 8 | 9 | 10 | @click.command() 11 | @click.option( 12 | "--dataset", 13 | "-d", 14 | multiple=True, 15 | help="The dataset to download. If not specified, all datasets will be downloaded.", 16 | ) 17 | @click.option( 18 | "--out", 19 | "-o", 20 | type=click.Path(exists=True, file_okay=False, dir_okay=True), 21 | default=None, 22 | help="Where should the datasets be downloaded to?" 23 | + " Defaults to the environment variable CAPYMOA_DATASETS_DIR or `./data` if not set.", 24 | ) 25 | @click.option( 26 | "--format", 27 | "-f", 28 | type=click.Choice(["arff", "csv"]), 29 | default="arff", 30 | help="The format to download. Defaults to ARFF.", 31 | ) 32 | @click.option( 33 | "--force", 34 | "-F", 35 | is_flag=True, 36 | help="Force download even if the file exists. Defaults to False.", 37 | ) 38 | @click.option( 39 | "--yes", 40 | "-y", 41 | is_flag=True, 42 | help="Skip the confirmation prompt.", 43 | ) 44 | def download_datasets( 45 | out: Optional[str], dataset: Set[str], format: str, force: bool, yes: bool 46 | ): 47 | """Download one or more datasets. 48 | 49 | Example: ``python -m capymoa.datasets -d Sensor -d Hyper100k`` 50 | 51 | An alternative to downloading datasets with this CLI tool is to use the 52 | dataset classes in ``capymoa.datasets``. 53 | """ 54 | if len(dataset) != 0: 55 | filtered_sources = dict(filter(lambda x: x[0] in dataset, SOURCE_LIST.items())) 56 | else: 57 | filtered_sources = SOURCE_LIST 58 | 59 | download_dir_path = get_download_dir(out) 60 | 61 | # List the datasets to be downloaded 62 | click.echo( 63 | f"Downloading the following datasets to {click.format_filename(download_dir_path)}" 64 | ) 65 | for name, source in filtered_sources.items(): 66 | col_name = f" {name:20}" 67 | url = getattr(source, format, None) 68 | if url is None: 69 | col_status = f"Skipped: No {format} available" 70 | fg = "yellow" 71 | elif is_already_downloaded(url, download_dir_path) and not force: 72 | col_status = "Skipped: Already downloaded" 73 | fg = "green" 74 | else: 75 | col_status = url 76 | fg = "blue" 77 | 78 | click.secho(f"{col_name} {col_status}", fg=fg) 79 | 80 | # Are they sure? 
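    # (Prompt for confirmation before downloading; skipped when --yes/-y is passed.)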
81 | if not yes: 82 | click.confirm("Do you want to continue?", abort=True) 83 | click.echo("") 84 | 85 | # Download the datasets 86 | for name, source in filtered_sources.items(): 87 | url = getattr(source, format, None) 88 | if url is None: 89 | click.secho(f"Skipping {name}: No {format} available", fg="yellow") 90 | continue 91 | if is_already_downloaded(url, download_dir_path) and not force: 92 | click.secho(f"Skipping {name}: Already downloaded", fg="green") 93 | continue 94 | 95 | click.echo(f"Downloading and extracting {name} from {url}") 96 | extracted_filename = download_extract(url, download_dir_path) 97 | click.secho(f"Downloaded {name} to {extracted_filename}", fg="green") 98 | 99 | 100 | if __name__ == "__main__": 101 | download_datasets() 102 | -------------------------------------------------------------------------------- /src/capymoa/datasets/downloader.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from abc import ABC, abstractmethod 3 | from pathlib import Path 4 | from tempfile import TemporaryDirectory 5 | from typing import Any, Optional 6 | 7 | import wget 8 | from moa.streams import ArffFileStream 9 | 10 | from capymoa.stream import MOAStream 11 | from capymoa.datasets._utils import extract, get_download_dir 12 | import os 13 | 14 | 15 | class DownloadableDataset(MOAStream, ABC): 16 | _filename: str = None 17 | """Name of the dataset in the capymoa dataset directory""" 18 | _length: int 19 | """Number of instances in the dataset""" 20 | 21 | def __init__( 22 | self, 23 | directory: str = get_download_dir(), 24 | auto_download: bool = True, 25 | CLI: Optional[str] = None, 26 | schema: Optional[str] = None, 27 | ): 28 | assert self._filename is not None, "Filename must be set in subclass" 29 | self._path = self._resolve_dataset( 30 | auto_download, 31 | Path(directory).resolve(), 32 | ) 33 | moa_stream = self.to_stream(self._path) 34 | super().__init__(schema=schema, CLI=CLI, moa_stream=moa_stream) 35 | 36 | def _resolve_dataset(self, auto_download: bool, directory: Path): 37 | directory.mkdir(parents=True, exist_ok=True) 38 | stream = directory / self._filename 39 | 40 | if not stream.exists(): 41 | if auto_download: 42 | with TemporaryDirectory() as working_directory: 43 | working_directory = Path(working_directory) 44 | stream_archive = self.download(working_directory) 45 | tmp_stream = self.extract(stream_archive) 46 | stream = shutil.move(tmp_stream, stream) 47 | else: 48 | raise FileNotFoundError( 49 | f"Dataset {self._filename} not found in {directory}" 50 | ) 51 | 52 | return stream 53 | 54 | def get_path(self): 55 | return self._path 56 | 57 | @abstractmethod 58 | def download(self, working_directory: Path) -> Path: 59 | """Download the dataset and return the path to the downloaded dataset 60 | within the working directory. 61 | 62 | :param working_directory: The directory to download the dataset to. 63 | :return: The path to the downloaded dataset within the working directory. 64 | """ 65 | pass 66 | 67 | @abstractmethod 68 | def extract(self, stream_archive: Path) -> Path: 69 | """Extract the dataset from the archive and return the path to the 70 | extracted dataset. 71 | 72 | :param stream_archive: The path to the archive containing the dataset. 73 | :return: The path to the extracted dataset. 74 | """ 75 | pass 76 | 77 | @abstractmethod 78 | def to_stream(self, stream: Path): 79 | """Convert the dataset to a MOA stream. 80 | 81 | :param stream: The path to the dataset. 82 | :return: A MOA stream. 
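        For ARFF-based datasets this is an ``ArffFileStream``
        (see :class:`DownloadARFFGzip` below).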
83 | """ 84 | pass 85 | 86 | def __len__(self) -> int: 87 | return self._length 88 | 89 | def __str__(self) -> str: 90 | return type(self).__name__ 91 | 92 | 93 | class DownloadARFFGzip(DownloadableDataset): 94 | _remote_url = None 95 | 96 | def download(self, working_directory: Path) -> Path: 97 | assert self._remote_url is not None, "Remote URL must be set in subclass" 98 | 99 | print(f"Downloading {self._filename}") 100 | # wget creates temporary files in the current working directory. We need to 101 | # change the working directory to avoid cluttering the current directory. 102 | wd = os.getcwd() 103 | os.chdir(working_directory) 104 | path = wget.download(self._remote_url, working_directory.as_posix()) 105 | os.chdir(wd) 106 | return Path(path) 107 | 108 | def extract(self, stream_archive: Path) -> Path: 109 | return extract(stream_archive) 110 | 111 | def to_stream(self, stream: Path) -> Any: 112 | return ArffFileStream(stream.as_posix(), -1) 113 | -------------------------------------------------------------------------------- /src/capymoa/drift/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/src/capymoa/drift/__init__.py -------------------------------------------------------------------------------- /src/capymoa/drift/base_detector.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict 3 | from typing_extensions import override 4 | 5 | from jpype import _jpype 6 | 7 | 8 | class BaseDriftDetector(ABC): 9 | """Drift Detector""" 10 | 11 | def __init__(self): 12 | super().__init__() 13 | 14 | self.in_concept_change = None 15 | self.in_warning_zone = None 16 | self.detection_index = [] 17 | self.warning_index = [] 18 | self.data = [] 19 | self.idx = 0 20 | 21 | @abstractmethod 22 | def get_params(self) -> Dict[str, Any]: 23 | """Get the hyper-parameters of the drift detector.""" 24 | 25 | def reset(self, clean_history: bool = False) -> None: 26 | """Reset the drift detector. 27 | 28 | :param clean_history: Whether to reset detection history, defaults to False 29 | """ 30 | self.in_concept_change = False 31 | self.in_warning_zone = False 32 | 33 | if clean_history: 34 | self.detection_index = [] 35 | self.warning_index = [] 36 | self.data = [] 37 | self.idx = 0 38 | 39 | @abstractmethod 40 | def add_element(self, element: float) -> None: 41 | """Update the drift detector with a new input value. 42 | 43 | :param element: A value to update the drift detector with. Usually, 44 | this is the prediction error of a model. 45 | """ 46 | raise NotImplementedError 47 | 48 | def detected_change(self) -> bool: 49 | """Is the detector currently detecting a concept drift?""" 50 | return self.in_concept_change 51 | 52 | def detected_warning(self) -> bool: 53 | """Is the detector currently warning of an upcoming concept drift?""" 54 | return self.in_warning_zone 55 | 56 | 57 | class MOADriftDetector(BaseDriftDetector): 58 | """ 59 | A wrapper class for using MOA (Massive Online Analysis) drift detectors in CapyMOA. 60 | """ 61 | 62 | def __init__(self, moa_detector, CLI=None): 63 | """ 64 | :param moa_detector: The MOA detector object or class identifier. 
65 | :param CLI: The command-line interface (CLI) configuration for the MOA drift detector, defaults to None 66 | """ 67 | super().__init__() 68 | 69 | self.CLI = CLI 70 | 71 | if isinstance(moa_detector, type): 72 | if isinstance(moa_detector, _jpype._JClass): 73 | moa_detector = moa_detector() 74 | else: # this is not a Java object, thus it certainly isn't a MOA learner 75 | raise ValueError("Invalid MOA detector provided.") 76 | 77 | self.moa_detector = moa_detector 78 | 79 | # If the CLI is None, we assume the object has already been configured 80 | # or that default values should be used. 81 | if self.CLI is not None: 82 | self.moa_detector.getOptions().setViaCLIString(CLI) 83 | 84 | self.moa_detector.prepareForUse() 85 | self.moa_detector.resetLearning() 86 | 87 | def __str__(self): 88 | full_name = str(self.moa_detector.getClass().getCanonicalName()) 89 | return full_name.rsplit(".", 1)[1] if "." in full_name else full_name 90 | 91 | def CLI_help(self): 92 | return str(self.moa_detector.getOptions().getHelpString()) 93 | 94 | @override 95 | def add_element(self, element: float) -> None: 96 | self.moa_detector.input(element) 97 | self.data.append(element) 98 | self.idx += 1 99 | 100 | self.in_concept_change = self.moa_detector.getChange() 101 | self.in_warning_zone = self.moa_detector.getWarningZone() 102 | 103 | if self.in_warning_zone: 104 | self.warning_index.append(self.idx) 105 | 106 | if self.in_concept_change: 107 | self.detection_index.append(self.idx) 108 | 109 | @override 110 | def get_params(self) -> Dict[str, Any]: 111 | options = list(self.moa_detector.getOptions().getOptionArray()) 112 | return {opt.getName(): opt.getValueAsCLIString() for opt in options} 113 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | from .adwin import ADWIN 2 | from .cusum import CUSUM 3 | from .ddm import DDM 4 | from .ewma_chart import EWMAChart 5 | from .geometric_ma import GeometricMovingAverage 6 | from .hddm_a import HDDMAverage 7 | from .hddm_w import HDDMWeighted 8 | from .page_hinkley import PageHinkley 9 | from .rddm import RDDM 10 | from .seed import SEED 11 | from .stepd import STEPD 12 | from .abcd import ABCD 13 | 14 | __all__ = [ 15 | "ADWIN", 16 | "CUSUM", 17 | "DDM", 18 | "EWMAChart", 19 | "GeometricMovingAverage", 20 | "HDDMAverage", 21 | "HDDMWeighted", 22 | "PageHinkley", 23 | "RDDM", 24 | "SEED", 25 | "STEPD", 26 | "ABCD", 27 | ] 28 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/abcd_components/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/src/capymoa/drift/detectors/abcd_components/__init__.py -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/abcd_components/std.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Aggregate: 5 | def __init__(self, n, mean, m2): 6 | self.n = n 7 | self._mean = mean 8 | self.m2 = m2 9 | 10 | def variance(self): 11 | if self.n < 2: 12 | return np.nan 13 | return self.m2 / (self.n - 1) 14 | 15 | def mean(self): 16 | return self._mean 17 | 18 | def std(self): 19 | if self.n < 2: 20 | return np.nan 21 | with np.errstate(all="ignore"): 22 | 
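            # np.errstate silences floating-point warnings here: pairwise
            # subtraction can leave m2 (and hence the variance) slightly
            # negative through round-off, and sqrt then quietly yields NaN.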
return np.sqrt(self.variance()) 23 | 24 | 25 | class PairwiseAggregate: 26 | def __init__(self, agg1: Aggregate, agg2: Aggregate): 27 | self.agg1 = agg1 28 | self.agg2 = agg2 29 | 30 | def n(self): 31 | return self.agg1.n, self.agg2.n 32 | 33 | def mean(self): 34 | return self.agg1.mean(), self.agg2.mean() 35 | 36 | def variance(self): 37 | return self.agg1.variance(), self.agg2.variance() 38 | 39 | def std(self): 40 | return self.agg1.std(), self.agg2.std() 41 | 42 | 43 | class PairwiseVariance: 44 | def __init__(self, max_size: int): 45 | self.aggregates = [] 46 | self.max_size = max_size 47 | 48 | def __len__(self): 49 | return len(self.aggregates) 50 | 51 | def update(self, value): 52 | if len(self.aggregates) == 0: 53 | aggregate = Aggregate(n=1, mean=value, m2=0) 54 | self.aggregates.append(aggregate) 55 | last_aggregate = self.aggregates[-1] 56 | count = last_aggregate.n + 1 57 | mean = last_aggregate.mean() 58 | delta = value - mean 59 | new_mean = mean + delta / count 60 | delta2 = value - new_mean 61 | m2 = last_aggregate.m2 + delta * delta2 62 | new_aggregate = Aggregate(n=count, mean=new_mean, m2=m2) 63 | self.aggregates.append(new_aggregate) 64 | if len(self.aggregates) > self.max_size: 65 | self.aggregates = self.aggregates[-self.max_size :] 66 | 67 | def reset(self): 68 | self.aggregates = [] 69 | 70 | def pairwise_aggregate(self, index: int): 71 | agg1 = self.aggregates[index - 1] 72 | agg2 = self.aggregates[-1] 73 | 74 | n_ab = agg2.n 75 | n_a = agg1.n 76 | n_b = n_ab - n_a 77 | 78 | mean_ab = agg2.mean() 79 | mean_a = agg1.mean() 80 | mean_b = (n_ab * mean_ab - n_a * mean_a) / n_b 81 | 82 | delta = mean_b - mean_a 83 | m2_ab = agg2.m2 84 | m2_a = agg1.m2 85 | m2_b = m2_ab - m2_a - delta**2 * (n_a * n_b) / n_ab 86 | 87 | return PairwiseAggregate(agg1, Aggregate(n=n_b, mean=mean_b, m2=m2_b)) 88 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/adwin.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import ( 6 | ADWINChangeDetector as _ADWINChangeDetector, 7 | ) 8 | 9 | 10 | class ADWIN(MOADriftDetector): 11 | """ADWIN Drift Detector 12 | 13 | Example: 14 | -------- 15 | 16 | >>> import numpy as np 17 | >>> from capymoa.drift.detectors import ADWIN 18 | >>> np.random.seed(0) 19 | >>> detector = ADWIN(delta=0.001) 20 | >>> 21 | >>> data_stream = np.random.randint(2, size=2000) 22 | >>> for i in range(999, 2000): 23 | ... data_stream[i] = np.random.randint(4, high=8) 24 | >>> 25 | >>> for i in range(2000): 26 | ... detector.add_element(data_stream[i]) 27 | ... if detector.detected_change(): 28 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 29 | Change detected in data: 4 - at index: 1023 30 | Change detected in data: 5 - at index: 1055 31 | 32 | Reference: 33 | ---------- 34 | 35 | Bifet, Albert, and Ricard Gavalda. "Learning from time-changing data with adaptive windowing." 36 | Proceedings of the 2007 SIAM international conference on data mining. 37 | Society for Industrial and Applied Mathematics, 2007. 
38 | 39 | """ 40 | 41 | def __init__(self, delta: float = 0.002, CLI: Optional[str] = None): 42 | if CLI is None: 43 | CLI = f"-a {delta}" 44 | 45 | super().__init__(moa_detector=_ADWINChangeDetector(), CLI=CLI) 46 | 47 | self.delta = delta 48 | self.get_params() 49 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/cusum.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import CusumDM as _CusumDM 6 | 7 | 8 | class CUSUM(MOADriftDetector): 9 | """CUSUM Drift Detector 10 | 11 | Example usages: 12 | 13 | >>> import numpy as np 14 | >>> np.random.seed(0) 15 | >>> from capymoa.drift.detectors import CUSUM 16 | >>> 17 | >>> detector = CUSUM(delta=0.005, lambda_=60) 18 | >>> 19 | >>> data_stream = np.random.randint(2, size=2000) 20 | >>> for i in range(999, 2000): 21 | ... data_stream[i] = np.random.randint(4, high=8) 22 | >>> 23 | >>> for i in range(2000): 24 | ... detector.add_element(data_stream[i]) 25 | ... if detector.detected_change(): 26 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 27 | Change detected in data: 6 - at index: 1011 28 | Change detected in data: 7 - at index: 1556 29 | 30 | """ 31 | 32 | def __init__( 33 | self, 34 | min_n_instances: int = 30, 35 | delta: float = 0.005, 36 | lambda_: float = 50, 37 | CLI: Optional[str] = None, 38 | ): 39 | if CLI is None: 40 | CLI = f"-n {min_n_instances} -d {delta} -l {lambda_}" 41 | 42 | super().__init__(moa_detector=_CusumDM(), CLI=CLI) 43 | 44 | self.min_n_instances = min_n_instances 45 | self.delta = delta 46 | self.lambda_ = lambda_ 47 | self.get_params() 48 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/ddm.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import DDM as _DDM 6 | 7 | 8 | class DDM(MOADriftDetector): 9 | """Drift-Detection-Method (DDM) Drift Detector 10 | 11 | Example: 12 | -------- 13 | 14 | >>> import numpy as np 15 | >>> from capymoa.drift.detectors import DDM 16 | >>> np.random.seed(0) 17 | >>> 18 | >>> detector = DDM() 19 | >>> 20 | >>> data_stream = np.random.randint(2, size=2000) 21 | >>> for i in range(999, 2000): 22 | ... data_stream[i] = np.random.randint(4, high=8) 23 | >>> 24 | >>> for i in range(2000): 25 | ... detector.add_element(data_stream[i]) 26 | ... if detector.detected_change(): 27 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 28 | Change detected in data: 4 - at index: 1005 29 | 30 | Reference: 31 | ---------- 32 | 33 | Gama, Joao, et al. "Learning with drift detection." Advances in Artificial 34 | Intelligence–SBIA 2004: 17th Brazilian Symposium on Artificial Intelligence, 35 | Sao Luis, Maranhao, Brazil, September 29 - October 1, 2004.
36 | 37 | """ 38 | 39 | def __init__( 40 | self, 41 | min_n_instances: int = 30, 42 | warning_level: float = 2.0, 43 | out_control_level: float = 3.0, 44 | CLI: Optional[str] = None, 45 | ): 46 | if CLI is None: 47 | CLI = f"-n {min_n_instances} -w {warning_level} -o {out_control_level}" 48 | 49 | super().__init__(moa_detector=_DDM(), CLI=CLI) 50 | 51 | self.min_n_instances = min_n_instances 52 | self.warning_level = warning_level 53 | self.out_control_level = out_control_level 54 | self.get_params() 55 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/ewma_chart.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import EWMAChartDM as _EWMAChartDM 6 | 7 | 8 | class EWMAChart(MOADriftDetector): 9 | """EWMA Charts Drift Detector 10 | 11 | Example: 12 | -------- 13 | 14 | >>> import numpy as np 15 | >>> from capymoa.drift.detectors import EWMAChart 16 | >>> np.random.seed(0) 17 | >>> 18 | >>> detector = EWMAChart() 19 | >>> 20 | >>> data_stream = np.random.randint(2, size=2000) 21 | >>> for i in range(999, 2000): 22 | ... data_stream[i] = np.random.randint(4, high=8) 23 | >>> 24 | >>> for i in range(2000): 25 | ... detector.add_element(data_stream[i]) 26 | ... if detector.detected_change(): 27 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 28 | Change detected in data: 5 - at index: 999 29 | 30 | Reference: 31 | ---------- 32 | 33 | Ross, Gordon J., et al. "Exponentially weighted moving average charts for 34 | detecting concept drift." Pattern recognition letters 33.2 (2012): 191-198. 35 | 36 | """ 37 | 38 | def __init__( 39 | self, min_n_instances: int = 30, lambda_: float = 0.2, CLI: Optional[str] = None 40 | ): 41 | if CLI is None: 42 | CLI = f"-n {min_n_instances} -l {lambda_} " 43 | 44 | super().__init__(moa_detector=_EWMAChartDM(), CLI=CLI) 45 | 46 | self.min_n_instances = min_n_instances 47 | self.lambda_ = lambda_ 48 | self.get_params() 49 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/geometric_ma.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import ( 6 | GeometricMovingAverageDM as _GeometricMovingAverageDM, 7 | ) 8 | 9 | 10 | class GeometricMovingAverage(MOADriftDetector): 11 | """Geometric Moving Average Test Drift Detector 12 | 13 | Example: 14 | -------- 15 | 16 | >>> import numpy as np 17 | >>> from capymoa.drift.detectors import GeometricMovingAverage 18 | >>> np.random.seed(0) 19 | >>> 20 | >>> detector = GeometricMovingAverage() 21 | >>> 22 | >>> data_stream = np.random.randint(2, size=2000) 23 | >>> for i in range(999, 2000): 24 | ... data_stream[i] = np.random.randint(4, high=8) 25 | >>> 26 | >>> for i in range(2000): 27 | ... detector.add_element(data_stream[i]) 28 | ... if detector.detected_change(): 29 | ... 
print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 30 | Change detected in data: 4 - at index: 1023 31 | 32 | """ 33 | 34 | def __init__( 35 | self, 36 | min_n_instances: int = 30, 37 | lambda_: float = 1.0, 38 | alpha: float = 0.99, 39 | CLI: Optional[str] = None, 40 | ): 41 | if CLI is None: 42 | CLI = f"-n {min_n_instances} -l {lambda_} -a {alpha}" 43 | 44 | super().__init__(moa_detector=_GeometricMovingAverageDM(), CLI=CLI) 45 | 46 | self.min_n_instances = min_n_instances 47 | self.lambda_ = lambda_ 48 | self.alpha = alpha 49 | self.get_params() 50 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/hddm_a.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import HDDM_A_Test as _HDDM_A_Test 6 | 7 | 8 | class HDDMAverage(MOADriftDetector): 9 | """Average Hoeffding's bounds Drift Detector 10 | 11 | Example usages: 12 | --------------- 13 | 14 | >>> import numpy as np 15 | >>> from capymoa.drift.detectors import HDDMAverage 16 | >>> np.random.seed(0) 17 | >>> 18 | >>> detector = HDDMAverage(drift_confidence=1e-10) 19 | >>> 20 | >>> data_stream = np.random.randint(2, size=2000) 21 | >>> for i in range(999, 2000): 22 | ... data_stream[i] = np.random.randint(4, high=8) 23 | >>> 24 | >>> for i in range(2000): 25 | ... detector.add_element(data_stream[i]) 26 | ... if detector.detected_change(): 27 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 28 | Change detected in data: 5 - at index: 999 29 | Change detected in data: 7 - at index: 1019 30 | Change detected in data: 7 - at index: 1330 31 | 32 | Reference: 33 | ---------- 34 | 35 | Frias-Blanco, Isvani, et al. "Online and non-parametric drift 36 | detection methods based on Hoeffding’s bounds." IEEE Transactions on 37 | Knowledge and Data Engineering 27.3 (2014): 810-823. 38 | 39 | """ 40 | 41 | TEST_TYPES = ["Two-sided", "One-sided"] 42 | 43 | def __init__( 44 | self, 45 | drift_confidence: float = 0.001, 46 | warning_confidence: float = 0.005, 47 | test_type: str = "Two-sided", 48 | CLI: Optional[str] = None, 49 | ): 50 | assert test_type in self.TEST_TYPES, "Wrong test type" 51 | 52 | if CLI is None: 53 | CLI = f"-d {drift_confidence} -w {warning_confidence} -t {test_type}" 54 | 55 | super().__init__(moa_detector=_HDDM_A_Test(), CLI=CLI) 56 | 57 | self.drift_confidence = drift_confidence 58 | self.warning_confidence = warning_confidence 59 | self.test_type = test_type 60 | self.get_params() 61 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/hddm_w.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import HDDM_W_Test as _HDDM_W_Test 6 | 7 | 8 | class HDDMWeighted(MOADriftDetector): 9 | """Weighted Hoeffding's bounds Drift Detector 10 | 11 | Example usages: 12 | --------------- 13 | 14 | >>> import numpy as np 15 | >>> from capymoa.drift.detectors import HDDMWeighted 16 | >>> np.random.seed(0) 17 | >>> 18 | >>> detector = HDDMWeighted(lambda_=0.001) 19 | >>> 20 | >>> data_stream = np.random.randint(2, size=2000) 21 | >>> for i in range(999, 2000): 22 | ... 
data_stream[i] = np.random.randint(4, high=8) 23 | >>> 24 | >>> for i in range(2000): 25 | ... detector.add_element(data_stream[i]) 26 | ... if detector.detected_change(): 27 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 28 | Change detected in data: 6 - at index: 1234 29 | 30 | Reference: 31 | ---------- 32 | 33 | Frias-Blanco, Isvani, et al. "Online and non-parametric drift detection 34 | methods based on Hoeffding’s bounds." IEEE Transactions on Knowledge and 35 | Data Engineering 27.3 (2014): 810-823. 36 | 37 | """ 38 | 39 | TEST_TYPES = ["Two-sided", "One-sided"] 40 | 41 | def __init__( 42 | self, 43 | drift_confidence: float = 0.001, 44 | warning_confidence: float = 0.005, 45 | lambda_: float = 0.05, 46 | test_type: str = "Two-sided", 47 | CLI: Optional[str] = None, 48 | ): 49 | assert test_type in self.TEST_TYPES, "Wrong test type" 50 | 51 | if CLI is None: 52 | CLI = ( 53 | f"-d {drift_confidence} " 54 | f"-w {warning_confidence} " 55 | f"-m {lambda_} " 56 | f"-t {test_type}" 57 | ) 58 | 59 | super().__init__(moa_detector=_HDDM_W_Test(), CLI=CLI) 60 | 61 | self.drift_confidence = drift_confidence 62 | self.warning_confidence = warning_confidence 63 | self.lambda_ = lambda_ 64 | self.test_type = test_type 65 | self.get_params() 66 | 67 | def add_element(self, element: float): 68 | if not isinstance(element, float): 69 | element = float(element) 70 | 71 | self.moa_detector.input(element) 72 | self.data.append(element) 73 | 74 | self.estimation = self.moa_detector.getEstimation() 75 | self.delay = self.moa_detector.getDelay() 76 | self.in_concept_change = self.moa_detector.getChange() 77 | self.in_warning_zone = self.moa_detector.getWarningZone() 78 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/page_hinkley.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import PageHinkleyDM as _PageHinkleyDM 6 | 7 | 8 | class PageHinkley(MOADriftDetector): 9 | """Page-Hinkley Drift Detector 10 | 11 | Example: 12 | -------- 13 | 14 | >>> import numpy as np 15 | >>> from capymoa.drift.detectors import PageHinkley 16 | >>> np.random.seed(0) 17 | >>> 18 | >>> detector = PageHinkley() 19 | >>> 20 | >>> data_stream = np.random.randint(2, size=2000) 21 | >>> for i in range(999, 2000): 22 | ... data_stream[i] = np.random.randint(4, high=8) 23 | >>> 24 | >>> for i in range(2000): 25 | ... detector.add_element(data_stream[i]) 26 | ... if detector.detected_change(): 27 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 28 | Change detected in data: 7 - at index: 1014 29 | Change detected in data: 7 - at index: 1685 30 | 31 | Reference: 32 | ---------- 33 | 34 | Page. 1954. Continuous Inspection Schemes. Biometrika 41, 1/2 (1954), 35 | 100-115. 
36 | 37 | """ 38 | 39 | def __init__( 40 | self, 41 | min_n_instances: int = 30, 42 | delta: float = 0.005, 43 | lambda_: float = 50.0, 44 | alpha: float = 0.9999, 45 | CLI: Optional[str] = None, 46 | ): 47 | if CLI is None: 48 | CLI = f"-n {min_n_instances} -d {delta} -l {lambda_} -a {alpha}" 49 | 50 | super().__init__(moa_detector=_PageHinkleyDM(), CLI=CLI) 51 | 52 | self.min_n_instances = min_n_instances 53 | self.delta = delta 54 | self.lambda_ = lambda_ 55 | self.alpha = alpha 56 | self.get_params() 57 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/rddm.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import RDDM as _RDDM 6 | 7 | 8 | class RDDM(MOADriftDetector): 9 | """Reactive Drift Detection Method Drift Detector 10 | 11 | Example: 12 | -------- 13 | 14 | >>> import numpy as np 15 | >>> from capymoa.drift.detectors import RDDM 16 | >>> np.random.seed(0) 17 | >>> 18 | >>> detector = RDDM() 19 | >>> 20 | >>> data_stream = np.random.randint(2, size=2000) 21 | >>> for i in range(999, 2000): 22 | ... data_stream[i] = np.random.randint(4, high=8) 23 | >>> 24 | >>> for i in range(2000): 25 | ... detector.add_element(data_stream[i]) 26 | ... if detector.detected_change(): 27 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 28 | Change detected in data: 6 - at index: 1003 29 | 30 | Reference: 31 | ---------- 32 | 33 | Barros, R. S., Cabral, D. R., Gonçalves Jr, P. M., & Santos, S. G. (2017). 34 | RDDM: Reactive drift detection method. Expert Systems with Applications, 90, 344-355. 35 | 36 | """ 37 | 38 | def __init__( 39 | self, 40 | min_n_instances: int = 129, 41 | warning_level: float = 1.773, 42 | drift_level: float = 2.258, 43 | max_size_concept: int = 40000, 44 | min_size_concept: int = 7000, 45 | warning_limit: int = 1400, 46 | CLI: Optional[str] = None, 47 | ): 48 | if CLI is None: 49 | CLI = ( 50 | f"-n {min_n_instances} " 51 | f"-w {warning_level} " 52 | f"-o {drift_level} " 53 | f"-x {max_size_concept} " 54 | f"-y {min_size_concept} " 55 | f"-z {warning_limit}" 56 | ) 57 | 58 | super().__init__(moa_detector=_RDDM(), CLI=CLI) 59 | 60 | self.min_n_instances = min_n_instances 61 | self.warning_level = warning_level 62 | self.drift_level = drift_level 63 | self.max_size_concept = max_size_concept 64 | self.min_size_concept = min_size_concept 65 | self.warning_limit = warning_limit 66 | self.get_params() 67 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/seed.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import ( 6 | SEEDChangeDetector as _SEEDChangeDetector, 7 | ) 8 | 9 | 10 | class SEED(MOADriftDetector): 11 | """Seed Drift Detector 12 | 13 | Example: 14 | -------- 15 | 16 | >>> import numpy as np 17 | >>> from capymoa.drift.detectors import SEED 18 | >>> np.random.seed(0) 19 | >>> 20 | >>> detector = SEED() 21 | >>> 22 | >>> data_stream = np.random.randint(2, size=2000) 23 | >>> for i in range(999, 2000): 24 | ... data_stream[i] = np.random.randint(4, high=8) 25 | >>> 26 | >>> for i in range(2000): 27 | ... detector.add_element(data_stream[i]) 28 | ... 
if detector.detected_change(): 29 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 30 | Change detected in data: 4 - at index: 1023 31 | Change detected in data: 6 - at index: 1343 32 | 33 | Reference: 34 | ---------- 35 | 36 | Huang, David Tse Jung, et al. "Detecting volatility shift in data streams." 37 | 2014 IEEE International Conference on Data Mining. IEEE, 2014. 38 | 39 | """ 40 | 41 | def __init__( 42 | self, 43 | delta: float = 0.05, 44 | block_size: int = 32, 45 | epsilon_prime: float = 0.01, 46 | alpha: float = 0.8, 47 | compress_term: int = 75, 48 | CLI: Optional[str] = None, 49 | ): 50 | if CLI is None: 51 | CLI = ( 52 | f"-d {delta} " 53 | f"-b {block_size} " 54 | f"-e {epsilon_prime} " 55 | f"-a {alpha} " 56 | f"-c {compress_term}" 57 | ) 58 | 59 | super().__init__(moa_detector=_SEEDChangeDetector(), CLI=CLI) 60 | 61 | self.delta = delta 62 | self.block_size = block_size 63 | self.epsilon_prime = epsilon_prime 64 | self.alpha = alpha 65 | self.compress_term = compress_term 66 | self.get_params() 67 | -------------------------------------------------------------------------------- /src/capymoa/drift/detectors/stepd.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from capymoa.drift.base_detector import MOADriftDetector 4 | 5 | from moa.classifiers.core.driftdetection import STEPD as _STEPD 6 | 7 | 8 | class STEPD(MOADriftDetector): 9 | """Statistical Test of Equal Proportions Drift Detector 10 | 11 | Example: 12 | -------- 13 | 14 | >>> import numpy as np 15 | >>> from capymoa.drift.detectors import STEPD 16 | >>> np.random.seed(0) 17 | >>> 18 | >>> detector = STEPD() 19 | >>> 20 | >>> data_stream = np.random.randint(2, size=2000) 21 | >>> for i in range(999, 2000): 22 | ... data_stream[i] = np.random.randint(4, high=8) 23 | >>> 24 | >>> for i in range(2000): 25 | ... detector.add_element(data_stream[i]) 26 | ... if detector.detected_change(): 27 | ... print('Change detected in data: ' + str(data_stream[i]) + ' - at index: ' + str(i)) 28 | Change detected in data: 6 - at index: 1001 29 | 30 | Reference: 31 | ---------- 32 | 33 | Nishida, Kyosuke, and Koichiro Yamauchi. "Detecting concept drift using 34 | statistical testing." International conference on discovery science. Berlin, 35 | Heidelberg: Springer Berlin Heidelberg, 2007. 36 | """ 37 | 38 | def __init__( 39 | self, 40 | window_size: int = 30, 41 | alpha_drift: float = 0.003, 42 | alpha_warning: float = 0.05, 43 | CLI: Optional[str] = None, 44 | ): 45 | if CLI is None: 46 | CLI = f"-r {window_size} -o {alpha_drift} -w {alpha_warning}" 47 | 48 | super().__init__(moa_detector=_STEPD(), CLI=CLI) 49 | 50 | self.window_size = window_size 51 | self.alpha_drift = alpha_drift 52 | self.alpha_warning = alpha_warning 53 | self.get_params() 54 | -------------------------------------------------------------------------------- /src/capymoa/env.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | CapyMOA supports a few environment variables that can be used to customize its behavior. 4 | None of these are required, but they can be useful in certain situations. 5 | 6 | * Use ``CAPYMOA_DATASETS_DIR`` to specify a custom directory where datasets will be stored. 7 | (See :func:`capymoa_datasets_dir`) 8 | * Use ``CAPYMOA_JVM_ARGS`` to specify custom JVM arguments. 9 | (See :func:`capymoa_jvm_args`) 10 | * Use ``CAPYMOA_MOA_JAR`` to specify a custom MOA jar file. 
11 | (See :func:`capymoa_moa_jar`) 12 | * Use ``JAVA_HOME`` to specify the path to your Java installation. 13 | """ 14 | 15 | from os import environ 16 | from pathlib import Path 17 | from typing import List 18 | 19 | 20 | def capymoa_datasets_dir() -> Path: 21 | """Return the ``CAPYMOA_DATASETS_DIR`` environment variable or the default value ``./data``. 22 | 23 | 24 | The ``CAPYMOA_DATASETS_DIR`` environment variable can be used to specify a custom 25 | directory where datasets will be stored. Set it to a custom value in bash like this: 26 | 27 | .. code-block:: bash 28 | 29 | export CAPYMOA_DATASETS_DIR=/path/to/datasets 30 | python my_capy_moa_script.py 31 | 32 | We recommend setting this environment variable in your shell configuration file. 33 | 34 | :return: The path to the datasets directory. 35 | """ 36 | dataset_dir = Path(environ.get("CAPYMOA_DATASETS_DIR", "./data")) 37 | dataset_dir.mkdir(exist_ok=True) 38 | return dataset_dir 39 | 40 | 41 | def capymoa_jvm_args() -> List[str]: 42 | """Return the ``CAPYMOA_JVM_ARGS`` environment variable or the default value ``-Xmx8g -Xss10M``. 43 | 44 | The ``CAPYMOA_JVM_ARGS`` environment variable can be used to specify custom JVM 45 | arguments. Set it to a custom value in bash like this: 46 | 47 | .. code-block:: bash 48 | 49 | export CAPYMOA_JVM_ARGS="-Xmx16g -Xss10M" 50 | python my_capy_moa_script.py 51 | 52 | :return: A list of JVM arguments. 53 | """ 54 | return environ.get("CAPYMOA_JVM_ARGS", "-Xmx8g -Xss10M").split() 55 | 56 | 57 | def capymoa_moa_jar() -> Path: 58 | """Return the ``CAPYMOA_MOA_JAR`` environment variable or the built-in MOA jar file. 59 | 60 | **This is an advanced feature that is unnecessary for most users.** 61 | 62 | The ``CAPYMOA_MOA_JAR`` environment variable can be used to specify a custom path to 63 | the MOA jar file. Set it to a custom value in bash like this: 64 | 65 | .. code-block:: bash 66 | 67 | export CAPYMOA_MOA_JAR=/path/to/moa.jar 68 | python my_capy_moa_script.py 69 | 70 | :return: The path to the MOA jar file. 71 | """ 72 | default_moa_jar = Path(__file__).parent / "jar" / "moa.jar" 73 | return Path(environ.get("CAPYMOA_MOA_JAR", default_moa_jar)) 74 | -------------------------------------------------------------------------------- /src/capymoa/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation import ( 2 | prequential_evaluation, 3 | prequential_evaluation_multiple_learners, 4 | prequential_ssl_evaluation, 5 | prequential_evaluation_anomaly, 6 | ClassificationEvaluator, 7 | ClassificationWindowedEvaluator, 8 | RegressionWindowedEvaluator, 9 | RegressionEvaluator, 10 | PredictionIntervalEvaluator, 11 | PredictionIntervalWindowedEvaluator, 12 | AnomalyDetectionEvaluator, 13 | ClusteringEvaluator, 14 | ) 15 | from . 
import results 16 | 17 | __all__ = [ 18 | "prequential_evaluation", 19 | "prequential_ssl_evaluation", 20 | "prequential_evaluation_multiple_learners", 21 | "prequential_evaluation_anomaly", 22 | "ClassificationEvaluator", 23 | "ClassificationWindowedEvaluator", 24 | "RegressionWindowedEvaluator", 25 | "RegressionEvaluator", 26 | "PredictionIntervalEvaluator", 27 | "PredictionIntervalWindowedEvaluator", 28 | "AnomalyDetectionEvaluator", 29 | "ClusteringEvaluator", 30 | "results", 31 | ] 32 | -------------------------------------------------------------------------------- /src/capymoa/evaluation/_progress_bar.py: -------------------------------------------------------------------------------- 1 | """A private module that provides support for progress bars.""" 2 | 3 | from abc import ABC, abstractmethod 4 | from typing import Union 5 | from tqdm.std import tqdm 6 | 7 | 8 | class JavaIProgressBar(ABC): 9 | """A shared interface between Python and Java to support progress bars.""" 10 | 11 | @abstractmethod 12 | def get_total(self) -> int: 13 | """Get the expected total number of iterations.""" 14 | 15 | @abstractmethod 16 | def set_total(self, total: int): 17 | """Set the expected total number of iterations.""" 18 | 19 | @abstractmethod 20 | def get_progress(self) -> int: 21 | """Get the number of iterations that have been completed.""" 22 | 23 | @abstractmethod 24 | def set_progress(self, pos: int): 25 | """Set the number of iterations that have been completed.""" 26 | 27 | @abstractmethod 28 | def update(self, n: int): 29 | """Increment the number of iterations that have been completed.""" 30 | 31 | @abstractmethod 32 | def close(self): 33 | """Close the progress bar.""" 34 | 35 | 36 | class TqdmProgressBar(JavaIProgressBar): 37 | def __init__(self, progress_bar: tqdm): 38 | super().__init__() 39 | self.progress_bar = progress_bar 40 | 41 | def get_total(self) -> int: 42 | return self.progress_bar.total 43 | 44 | def set_total(self, total: int): 45 | self.progress_bar.total = total 46 | 47 | def get_progress(self) -> int: 48 | return self.progress_bar.n 49 | 50 | def set_progress(self, pos: int): 51 | self.update(pos - self.get_progress()) 52 | 53 | def update(self, n: int) -> None: 54 | self.progress_bar.update(n) 55 | 56 | def close(self) -> None: 57 | self.progress_bar.close() 58 | 59 | 60 | def resolve_progress_bar( 61 | progress_bar: Union[bool, tqdm], description: str 62 | ) -> Union[JavaIProgressBar, None]: 63 | """Helper function to turn a ``ProgressBarArg`` type into a ``JavaIProgressBar``.""" 64 | if isinstance(progress_bar, bool) and progress_bar is True: 65 | return TqdmProgressBar(tqdm(desc=description)) 66 | elif progress_bar is False: 67 | return None 68 | elif isinstance(progress_bar, tqdm): 69 | return TqdmProgressBar(progress_bar) 70 | else: 71 | raise TypeError(f"Invalid progress_bar type: {type(progress_bar)}") 72 | -------------------------------------------------------------------------------- /src/capymoa/jar/Home.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/adaptive-machine-learning/CapyMOA/05ed2e83ce48300c951390494a77e6ca7572e8bc/src/capymoa/jar/Home.class -------------------------------------------------------------------------------- /src/capymoa/jar/home.java: -------------------------------------------------------------------------------- 1 | // Simple program to return the java home 2 | class Home { 3 | public static void main(String[] args) { 4 | 
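        // Print the java.home system property, i.e. the installation
        // directory of the JVM running this program.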
System.out.println(System.getProperty("java.home")); 5 | } 6 | } 7 | -------------------------------------------------------------------------------- /src/capymoa/misc.py: -------------------------------------------------------------------------------- 1 | from capymoa._pickle import ( 2 | JPickler as DeprecatedJPickler, 3 | JUnpickler as DeprecatedJUnpickler, 4 | ) 5 | from jpype.pickle import JPickler, JUnpickler 6 | from deprecated import deprecated 7 | from jpype import JException 8 | from typing import BinaryIO 9 | from io import RawIOBase, BufferedIOBase 10 | 11 | 12 | # TODO: Remove this and capymoa._pickle in a future release 13 | @deprecated(version="v0.8.2", reason="Use ``save_model(...)`` instead.") 14 | def legacy_save_model(model, filename): 15 | """Save a model to a file. 16 | 17 | Use :func:`save_model` if possible. 18 | 19 | :param model: The model to save. 20 | :param filename: The file to save the model to. 21 | """ 22 | 23 | with open(filename, "wb") as fd: 24 | DeprecatedJPickler(fd).dump(model) 25 | 26 | 27 | # TODO: Remove this and capymoa._pickle in a future release 28 | @deprecated(version="v0.8.2", reason="Use ``load_model(...)`` instead.") 29 | def legacy_load_model(filename): 30 | """Load a model from a file. 31 | 32 | Use :func:`load_model` if possible. 33 | 34 | :param filename: The file to load the model from. 35 | """ 36 | 37 | with open(filename, "rb") as fd: 38 | return DeprecatedJUnpickler(fd).load() 39 | 40 | 41 | def save_model(model: object, file: BinaryIO) -> None: 42 | """Save a model to a jpype pickle file. 43 | 44 | >>> from capymoa.classifier import AdaptiveRandomForestClassifier 45 | >>> from capymoa.datasets import ElectricityTiny 46 | >>> from tempfile import TemporaryFile 47 | >>> stream = ElectricityTiny() 48 | >>> learner = AdaptiveRandomForestClassifier(schema=stream.get_schema()) 49 | >>> with TemporaryFile() as fd: 50 | ... save_model(learner, fd) 51 | 52 | See https://jpype.readthedocs.io/en/latest/api.html#jpype-pickle-module for 53 | more information. 54 | 55 | :param model: A python object optionally containing Java objects. 56 | :param file: The file-like object to save the model to. 57 | """ 58 | if not file.writable(): 59 | raise ValueError("File must be writable.") 60 | JPickler(file).dump(model) 61 | 62 | 63 | def load_model(file: BinaryIO) -> object: 64 | """Load a model from a jpype pickle file. 65 | 66 | If you are trying to load a model saved with a version of CapyMOA < 0.8.2, 67 | use :func:`legacy_load_model` and :func:`save_model` to reformat the model. 68 | 69 | See also: :func:`save_model`. 70 | 71 | :param file: The file-like object to load the model from. 72 | :return: The loaded model. 73 | """ 74 | if not isinstance(file, (RawIOBase, BufferedIOBase)): 75 | raise ValueError("File must be opened in binary mode.") 76 | if not file.readable(): 77 | raise ValueError("File must be readable.") 78 | try: 79 | return JUnpickler(file).load() 80 | except JException as e: 81 | raise RuntimeError( 82 | "Exception loading model.\n" 83 | "If you are trying to load a model saved with a version of CapyMOA < 0.8.2, " 84 | "use `legacy_load_model` and `save_model` to reformat the model." 85 | ) from e 86 | -------------------------------------------------------------------------------- /src/capymoa/ocl/__init__.py: -------------------------------------------------------------------------------- 1 | """Online Continual Learning (OCL) module. 2 | 3 | OCL is a setting where learners train on a sequence of tasks. 
A task is a 4 | specific concept or data distribution. After training the learner on each task, 5 | we evaluate the learner on all tasks. 6 | 7 | Continual learning is an important problem for deep learning because these models 8 | suffer from catastrophic forgetting, which occurs when a model forgets how to 9 | perform well on previously learned tasks after training on a new task. This is a 10 | consequence of a neural network's distributed representation. The term Continual Learning is often 11 | synonymous with overcoming catastrophic forgetting. Non-deep learning methods do 12 | not suffer from catastrophic forgetting. Care should be taken to distinguish 13 | between online continual learning with and without deep learning. 14 | 15 | Online continual learning (OCL) differs from data stream learning because the 16 | objective is performance on historic tasks rather than adaptation. Unlike 17 | traditional continual learning, OCL restricts training to a single data pass. 18 | 19 | >>> from capymoa.classifier import HoeffdingTree 20 | >>> from capymoa.ocl.datasets import TinySplitMNIST 21 | >>> from capymoa.ocl.evaluation import ocl_train_eval_loop 22 | >>> import numpy as np 23 | >>> scenario = TinySplitMNIST() 24 | >>> learner = HoeffdingTree(scenario.schema) 25 | >>> metrics = ocl_train_eval_loop(learner, scenario.train_loaders(32), scenario.test_loaders(32)) 26 | 27 | The final accuracy is the accuracy on all tasks after finishing training on all 28 | tasks: 29 | 30 | >>> print(f"Final Accuracy: {metrics.accuracy_final:0.2f}") 31 | Final Accuracy: 0.69 32 | 33 | The accuracy on each task after training on each task: 34 | 35 | >>> with np.printoptions(precision=2): 36 | ... print(metrics.accuracy_matrix) 37 | [[0.9 0. 0. 0.3 0. ] 38 | [0.88 0.9 0. 0.12 0. ] 39 | [0.77 0.82 0.62 0.12 0. ] 40 | [0.77 0.82 0.6 0.52 0. ] 41 | [0.77 0.82 0.57 0.52 0.75]] 42 | 43 | Notice that the accuracies in the upper triangle are close to zero because the 44 | learner has not trained on those tasks yet. The diagonal contains the accuracy 45 | on each task after training on that task. The lower triangle contains the 46 | accuracy on each task after training on all tasks. 47 | 48 | >>> print(f"Forward Transfer: {metrics.forward_transfer:0.2f}") 49 | Forward Transfer: 0.05 50 | 51 | >>> print(f"Backward Transfer: {metrics.backward_transfer:0.2f}") 52 | Backward Transfer: -0.07 53 | """ 54 | 55 | from . import base, datasets, evaluation, util, strategy 56 | 57 | __all__ = ["evaluation", "datasets", "strategy", "base", "util"] 58 | -------------------------------------------------------------------------------- /src/capymoa/ocl/ann.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor, nn 3 | 4 | from capymoa.stream import Schema 5 | 6 | 7 | class WNPerceptron(nn.Module): 8 | """A simple one hidden layer feedforward neural network with a weight-normalized output layer. 9 | 10 | The output layer of a neural network is often problematic in continual 11 | learning because of the extreme and shifting class imbalance between tasks. 12 | [Lesort2021]_ suggest mitigating this by using a variant of weight 13 | normalization that parameterizes the weights as a magnitude (set to the unit 14 | vector) and a direction. 15 | 16 | .. [Lesort2021] Lesort, T., George, T., & Rish, I. (2021). Continual 17 | Learning in Deep Networks: An Analysis of the Last Layer.
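
    A minimal usage sketch (the :class:`~capymoa.ocl.datasets.TinySplitMNIST`
    scenario is used purely for illustration; any :class:`~capymoa.stream.Schema`
    works):

    .. code-block:: python

        import torch
        from capymoa.ocl.ann import WNPerceptron
        from capymoa.ocl.datasets import TinySplitMNIST

        scenario = TinySplitMNIST()
        net = WNPerceptron(scenario.schema, hidden_size=50)
        # Forward pass over a dummy batch of eight instances.
        x = torch.zeros(8, scenario.schema.get_num_attributes())
        logits = net(x)  # shape: (8, num_classes)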
18 | """ 19 | 20 | def __init__(self, schema: Schema, hidden_size: int = 50): 21 | super().__init__() 22 | num_classes = schema.get_num_classes() 23 | 24 | self.fc1 = nn.Linear(schema.get_num_attributes(), hidden_size) 25 | self.fc2 = nn.Linear(hidden_size, num_classes, bias=False) 26 | self.fc2 = nn.utils.parametrizations.weight_norm(self.fc2, name="weight") 27 | weight_g = self.fc2.parametrizations.weight.original0 28 | # Set the magnitude to the unit vector 29 | weight_g.requires_grad_(False).fill_(1.0 / (num_classes**0.5)) 30 | 31 | def forward(self, x: Tensor) -> Tensor: 32 | x = torch.relu(self.fc1(x)) 33 | x = self.fc2(x) 34 | return x 35 | -------------------------------------------------------------------------------- /src/capymoa/ocl/base.py: -------------------------------------------------------------------------------- 1 | """Base classes for online continual learning algorithms. 2 | 3 | All OCL learners inherit from :class:`capymoa.base.Classifier`; this module 4 | contains additional base classes for OCL learners that are aware of the task 5 | boundaries and/or the task identities during training and evaluation. 6 | """ 7 | 8 | from abc import ABC, abstractmethod 9 | 10 | 11 | class TaskBoundaryAware(ABC): 12 | """Interface for learners that are aware of the transition between tasks. 13 | 14 | Knowing the transition between tasks is required by some algorithms, but is 15 | a relaxation of the online continual learning setting. A researcher should 16 | be mindful and communicate when a learner is task-aware. 17 | 18 | >>> from capymoa.classifier import NoChange 19 | >>> from capymoa.ocl.datasets import TinySplitMNIST 20 | >>> from capymoa.ocl.base import TaskBoundaryAware 21 | >>> from capymoa.ocl.evaluation import ocl_train_eval_loop 22 | 23 | >>> class MyTaskBoundaryAware(TaskBoundaryAware, NoChange): 24 | ... def set_train_task(self, train_task_id: int): 25 | ... print(f"Training task {train_task_id}") 26 | 27 | >>> scenario = TinySplitMNIST() 28 | >>> learner = MyTaskBoundaryAware(scenario.schema) 29 | >>> _ = ocl_train_eval_loop(learner, scenario.train_loaders(32), scenario.test_loaders(32)) 30 | Training task 0 31 | Training task 1 32 | Training task 2 33 | Training task 3 34 | Training task 4 35 | """ 36 | 37 | @abstractmethod 38 | def set_train_task(self, train_task_id: int): 39 | """Called when a new training task starts. 40 | 41 | :param train_task_id: The ID of the new task. 42 | """ 43 | 44 | 45 | class TaskAware(TaskBoundaryAware): 46 | """Interface for learners that are aware of the task during evaluation. 47 | 48 | Knowing the task during inference greatly simplifies the learning problem. 49 | When using this interface your problem becomes a task-incremental online 50 | continual learning problem. 51 | 52 | >>> from capymoa.classifier import NoChange 53 | >>> from capymoa.ocl.datasets import TinySplitMNIST 54 | >>> from capymoa.ocl.base import TaskAware 55 | >>> from capymoa.ocl.evaluation import ocl_train_eval_loop 56 | 57 | >>> class MyTaskAware(TaskAware, NoChange): 58 | ... def set_train_task(self, train_task_id: int): 59 | ... print(f"Training task {train_task_id}") 60 | ... 61 | ... def set_test_task(self, test_task_id: int): 62 | ... 
print(f"Testing task {test_task_id}") 63 | 64 | >>> scenario = TinySplitMNIST() 65 | >>> learner = MyTaskAware(scenario.schema) 66 | >>> ocl_train_eval_loop(learner, scenario.train_loaders(32), scenario.test_loaders(32)) 67 | Training task 0 68 | Testing task 0 69 | Testing task 1 70 | Testing task 2 71 | Testing task 3 72 | Testing task 4 73 | Training task 1 74 | Testing task 0 75 | Testing task 1 76 | ... 77 | """ 78 | 79 | @abstractmethod 80 | def set_test_task(self, test_task_id: int): 81 | """Called when testing on a task starts. 82 | 83 | :param test_task_id: The ID of the task. 84 | """ 85 | -------------------------------------------------------------------------------- /src/capymoa/ocl/strategy/__init__.py: -------------------------------------------------------------------------------- 1 | from ._experience_replay import ExperienceReplay 2 | from ._slda import SLDA 3 | from ._ncm import NCM 4 | 5 | __all__ = ["ExperienceReplay", "SLDA", "NCM"] 6 | -------------------------------------------------------------------------------- /src/capymoa/ocl/strategy/_ncm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from typing import Tuple 4 | from capymoa.base import BatchClassifier 5 | from capymoa.stream import Schema 6 | from torch import Tensor, nn 7 | 8 | 9 | def _batch_cumulative_mean( 10 | batch: Tensor, count: int, mean: Tensor 11 | ) -> Tuple[int, Tensor]: 12 | """Update cumulative mean and count. 13 | 14 | :param batch: Current batch of data. Shape (batch_size, num_features). 15 | :param count: Current count of samples processed. 16 | :param mean: Current cumulative mean of the data. Shape (num_features,). 17 | :return: Updated count and cumulative mean. 18 | """ 19 | batch_size = batch.size(0) 20 | if batch_size == 0: 21 | return count, mean 22 | new_count = count + batch_size 23 | updated_mean = (count * mean + batch.sum(0)) / new_count 24 | return new_count, updated_mean 25 | 26 | 27 | class NCM(BatchClassifier): 28 | """Nearest Class Mean (NCM). 29 | 30 | NCM is a simple classifier that uses the mean of each class as a prototype. 31 | It calculates the distance from each input to the class means and assigns 32 | the class with the closest mean as the predicted class. 33 | """ 34 | 35 | _dtype = torch.float32 36 | 37 | def __init__( 38 | self, 39 | schema: Schema, 40 | pre_processor: nn.Module = nn.Identity(), 41 | num_features: int | None = None, 42 | device: torch.device | str = torch.device("cpu"), 43 | ): 44 | """Initialize an NCM classifier head. 45 | 46 | :param schema: Describes the shape and type of the data. 47 | :param pre_processor: A pre-processing module to apply to the input 48 | data, defaults to an identity module. 49 | :param num_features: Number of features once pre-processed, defaults to 50 | the number of attributes in the schema. 51 | :param device: Device to run the model on, defaults to CPU.
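
        A minimal training sketch (the random batch below is purely
        illustrative):

        .. code-block:: python

            import numpy as np
            from capymoa.ocl.datasets import TinySplitMNIST
            from capymoa.ocl.strategy import NCM

            scenario = TinySplitMNIST()
            ncm = NCM(scenario.schema)
            x = np.random.rand(16, scenario.schema.get_num_attributes())
            y = np.random.randint(scenario.schema.get_num_classes(), size=16)
            ncm.batch_train(x, y)               # update the per-class means
            proba = ncm.batch_predict_proba(x)  # each row sums to one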
52 | """ 53 | super().__init__(schema) 54 | n_classes = schema.get_num_classes() 55 | n_feats = num_features or schema.get_num_attributes() 56 | self._device = device 57 | self._pre_processor = pre_processor.to(device) 58 | self._class_counts = torch.zeros((n_classes,), device=device, dtype=torch.int64) 59 | self._class_means = torch.zeros((n_classes, n_feats), device=device) 60 | 61 | @torch.no_grad() 62 | def batch_train(self, x: np.ndarray, y: np.ndarray) -> None: 63 | x_ = torch.from_numpy(x).to(self._device, self._dtype) # (batch_size, features) 64 | y_ = torch.from_numpy(y).to(self._device, self._dtype) # (batch_size,) 65 | x_ = self._pre_processor(x_) 66 | 67 | # Update mean and count 68 | for i in range(self.schema.get_num_classes()): 69 | mask = y_ == i 70 | self._class_counts[i], self._class_means[i] = _batch_cumulative_mean( 71 | batch=x_[mask], 72 | count=int(self._class_counts[i].item()), 73 | mean=self._class_means[i], 74 | ) 75 | 76 | @torch.no_grad() 77 | def batch_predict_proba(self, x: np.ndarray) -> np.ndarray: 78 | assert x.ndim == 2, "Input must be a 2D array (batch_size, features)" 79 | x_ = torch.from_numpy(x).to(self._device, self._dtype) 80 | x_ = self._pre_processor(x_) 81 | 82 | # Calculate distances to class means 83 | distances = torch.cdist( 84 | x_.unsqueeze(0), self._class_means.unsqueeze(0) 85 | ).squeeze(0) 86 | 87 | # Convert distances to pseudo-probabilities. Using the inverse weighted 88 | # distance method. 89 | inv_distances = 1 / (1 + distances) 90 | probabilities = inv_distances / inv_distances.sum(dim=1, keepdim=True) 91 | return probabilities.cpu().numpy() 92 | -------------------------------------------------------------------------------- /src/capymoa/prediction_interval/__init__.py: -------------------------------------------------------------------------------- 1 | from ._mean_and_standard_deviation_estimation import MVE 2 | from ._adaptive_prediction_interval import AdaPI 3 | 4 | __all__ = [ 5 | "MVE", 6 | "AdaPI", 7 | ] 8 | -------------------------------------------------------------------------------- /src/capymoa/prediction_interval/_adaptive_prediction_interval.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from capymoa.base import ( 4 | MOAPredictionIntervalLearner, 5 | _extract_moa_learner_CLI, 6 | ) 7 | 8 | from capymoa.regressor import AdaptiveRandomForestRegressor 9 | 10 | from moa.classifiers.predictioninterval import AdaptivePredictionInterval as MOA_AdaPI 11 | 12 | 13 | class AdaPI(MOAPredictionIntervalLearner): 14 | def __init__( 15 | self, 16 | schema=None, 17 | CLI=None, 18 | random_seed=1, 19 | base_learner=None, 20 | confidence_level=0.95, 21 | limit=0.1, 22 | ): 23 | mappings = { 24 | "base_learner": "-l", 25 | "confidence_level": "-c", 26 | "limit": "-t", 27 | } 28 | 29 | config_str = "" 30 | parameters = inspect.signature(self.__init__).parameters 31 | for key in mappings: 32 | if key not in parameters: 33 | continue 34 | this_parameter = parameters[key] 35 | set_value = locals()[key] 36 | if isinstance(set_value, bool): 37 | if set_value: 38 | str_extension = mappings[key] + " " 39 | else: 40 | str_extension = "" 41 | else: 42 | if key == "base_learner": 43 | if base_learner is None: 44 | set_value = _extract_moa_learner_CLI( 45 | AdaptiveRandomForestRegressor(schema) 46 | ) 47 | elif type(base_learner) is str: 48 | set_value = base_learner 49 | else: 50 | set_value = _extract_moa_learner_CLI(base_learner) 51 | 52 | str_extension = f"{mappings[key]} 
{set_value} " 53 | config_str += str_extension 54 | 55 | self.moa_learner = MOA_AdaPI() 56 | 57 | if CLI is None: 58 | self.moa_learner.getOptions().setViaCLIString(config_str) 59 | self.moa_learner.prepareForUse() 60 | self.moa_learner.resetLearning() 61 | 62 | super().__init__( 63 | schema=schema, 64 | CLI=CLI, 65 | random_seed=random_seed, 66 | moa_learner=self.moa_learner, 67 | ) 68 | 69 | def __str__(self): 70 | # Overrides the default class name from MOA 71 | return "AdaptivePredictionInterval" 72 | -------------------------------------------------------------------------------- /src/capymoa/prediction_interval/_mean_and_standard_deviation_estimation.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | from capymoa.base import ( 4 | MOAPredictionIntervalLearner, 5 | _extract_moa_learner_CLI, 6 | ) 7 | 8 | from capymoa.regressor import AdaptiveRandomForestRegressor 9 | 10 | from moa.classifiers.predictioninterval import MVEPredictionInterval as MOA_MVE 11 | 12 | 13 | class MVE(MOAPredictionIntervalLearner): 14 | def __init__( 15 | self, 16 | schema=None, 17 | CLI=None, 18 | random_seed=1, 19 | base_learner=None, 20 | confidence_level=0.95, 21 | ): 22 | mappings = {"base_learner": "-l", "confidence_level": "-c"} 23 | 24 | config_str = "" 25 | parameters = inspect.signature(self.__init__).parameters 26 | for key in mappings: 27 | if key not in parameters: 28 | continue 29 | this_parameter = parameters[key] 30 | set_value = locals()[key] 31 | if isinstance(set_value, bool): 32 | if set_value: 33 | str_extension = mappings[key] + " " 34 | else: 35 | str_extension = "" 36 | else: 37 | if key == "base_learner": 38 | if base_learner is None: 39 | set_value = _extract_moa_learner_CLI( 40 | AdaptiveRandomForestRegressor(schema) 41 | ) 42 | elif type(base_learner) is str: 43 | set_value = base_learner 44 | else: 45 | set_value = _extract_moa_learner_CLI(base_learner) 46 | 47 | str_extension = f"{mappings[key]} {set_value} " 48 | config_str += str_extension 49 | 50 | self.moa_learner = MOA_MVE() 51 | 52 | if CLI is None: 53 | self.moa_learner.getOptions().setViaCLIString(config_str) 54 | self.moa_learner.prepareForUse() 55 | self.moa_learner.resetLearning() 56 | 57 | super().__init__( 58 | schema=schema, 59 | CLI=CLI, 60 | random_seed=random_seed, 61 | moa_learner=self.moa_learner, 62 | ) 63 | 64 | def __str__(self): 65 | # Overrides the default class name from MOA 66 | return "MVEPredictionInterval" 67 | -------------------------------------------------------------------------------- /src/capymoa/regressor/__init__.py: -------------------------------------------------------------------------------- 1 | from ._soknl_base_tree import SOKNLBT 2 | from ._soknl import SOKNL 3 | from ._orto import ORTO 4 | from ._knn import KNNRegressor 5 | from ._fimtdd import FIMTDD 6 | from ._arffimtdd import ARFFIMTDD 7 | from ._adaptive_random_forest import AdaptiveRandomForestRegressor 8 | from ._passive_aggressive_regressor import PassiveAggressiveRegressor 9 | from ._sgd_regressor import SGDRegressor 10 | from ._shrubs_regressor import ShrubsRegressor 11 | 12 | __all__ = [ 13 | "SOKNLBT", 14 | "SOKNL", 15 | "ORTO", 16 | "KNNRegressor", 17 | "FIMTDD", 18 | "ARFFIMTDD", 19 | "AdaptiveRandomForestRegressor", 20 | "PassiveAggressiveRegressor", 21 | "SGDRegressor", 22 | "ShrubsRegressor", 23 | ] 24 | -------------------------------------------------------------------------------- /src/capymoa/regressor/_arffimtdd.py: 
-------------------------------------------------------------------------------- 1 | # Library imports 2 | from typing import Optional, Union 3 | 4 | from capymoa.base import MOARegressor 5 | 6 | from capymoa.splitcriteria import SplitCriterion, _split_criterion_to_cli_str 7 | from capymoa.stream._stream import Schema 8 | from moa.classifiers.trees import ARFFIMTDD as _MOA_ARFFIMTDD 9 | 10 | 11 | class ARFFIMTDD(MOARegressor): 12 | """Modified Fast Incremental Model Tree with Drift Detection, the base 13 | learner for ARF-Reg as described by Ikonomovska et al.""" 14 | 15 | def __init__( 16 | self, 17 | schema: Schema, 18 | subspace_size_size: int = 2, 19 | split_criterion: Union[SplitCriterion, str] = "VarianceReductionSplitCriterion", 20 | grace_period: int = 200, 21 | split_confidence: float = 1.0e-7, 22 | tie_threshold: float = 0.05, 23 | page_hinckley_alpha: float = 0.005, 24 | page_hinckley_threshold: int = 50, 25 | alternate_tree_fading_factor: float = 0.995, 26 | alternate_tree_t_min: int = 150, 27 | alternate_tree_time: int = 1500, 28 | learning_ratio: float = 0.02, 29 | learning_ratio_decay_factor: float = 0.001, 30 | learning_ratio_const: bool = False, 31 | random_seed: Optional[int] = None, 32 | ) -> None: 33 | """ 34 | Construct ARFFIMTDD. 35 | 36 | :param subspace_size_size: Number of features per subset for each node split. Negative values = #features - k 37 | :param split_criterion: Split criterion to use. 38 | :param grace_period: Number of instances a leaf should observe between split attempts. 39 | :param split_confidence: Allowed error in split decision; values close to 0 will take a long time to decide. 40 | :param tie_threshold: Threshold below which a split will be forced to break ties. 41 | :param page_hinckley_alpha: Alpha value to use in the Page Hinckley change detection tests. 42 | :param page_hinckley_threshold: Threshold value used in the Page Hinckley change detection tests. 43 | :param alternate_tree_fading_factor: Fading factor used to decide if an alternate tree should replace an original. 44 | :param alternate_tree_t_min: Tmin value used to decide if an alternate tree should replace an original. 45 | :param alternate_tree_time: The number of instances used to decide if an alternate tree should be discarded. 46 | :param learning_ratio: Learning ratio used for training the Perceptrons in the leaves. 47 | :param learning_ratio_decay_factor: Learning rate decay factor (not used when learning rate is constant). 48 | :param learning_ratio_const: Keep learning rate constant instead of decaying.
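
        A minimal construction sketch (the packaged Fried regression stream is
        used purely for illustration; any regression schema works):

        .. code-block:: python

            from capymoa.datasets import Fried
            from capymoa.evaluation import prequential_evaluation
            from capymoa.regressor import ARFFIMTDD

            stream = Fried()
            learner = ARFFIMTDD(stream.get_schema(), grace_period=100)
            results = prequential_evaluation(stream, learner, max_instances=1000)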
49 | """ 50 | cli = [] 51 | 52 | cli.append(f"-k {subspace_size_size}") 53 | cli.append(f"-s ({_split_criterion_to_cli_str(split_criterion)})") 54 | cli.append(f"-g {grace_period}") 55 | cli.append(f"-c {split_confidence}") 56 | cli.append(f"-t {tie_threshold}") 57 | cli.append(f"-a {page_hinckley_alpha}") 58 | cli.append(f"-h {page_hinckley_threshold}") 59 | cli.append(f"-f {alternate_tree_fading_factor}") 60 | cli.append(f"-y {alternate_tree_t_min}") 61 | cli.append(f"-u {alternate_tree_time}") 62 | cli.append(f"-l {learning_ratio}") 63 | cli.append(f"-d {learning_ratio_decay_factor}") 64 | if learning_ratio_const: cli.append("-p") 65 | 66 | self.moa_learner = _MOA_ARFFIMTDD() 67 | 68 | super().__init__( 69 | schema=schema, 70 | CLI=" ".join(cli), 71 | random_seed=random_seed, 72 | moa_learner=self.moa_learner, 73 | ) 74 | -------------------------------------------------------------------------------- /src/capymoa/regressor/_knn.py: -------------------------------------------------------------------------------- 1 | from capymoa.base import MOARegressor 2 | from moa.classifiers.lazy import kNN as _moa_kNN 3 | 4 | 5 | class KNNRegressor(MOARegressor): 6 | """K Nearest Neighbor for data stream regression with a sliding window. 7 | 8 | The default number of neighbors (k) is set to 3 instead of 10 (as in MOA). 9 | 10 | There is no specific publication for online KNN; please refer to: 11 | 12 | `Bifet, Albert, Ricard Gavalda, Geoffrey Holmes, and Bernhard Pfahringer. 13 | Machine learning for data streams: with practical examples in MOA. MIT press, 2023. 14 | `_ 15 | 16 | Example usage: 17 | 18 | >>> from capymoa.datasets import Fried 19 | >>> from capymoa.regressor import KNNRegressor 20 | >>> from capymoa.evaluation import prequential_evaluation 21 | >>> stream = Fried() 22 | >>> schema = stream.get_schema() 23 | >>> learner = KNNRegressor(schema) 24 | >>> results = prequential_evaluation(stream, learner, max_instances=1000) 25 | >>> results["cumulative"].rmse() 26 | 2.9811398077838542 27 | """ 28 | 29 | def __init__( 30 | self, 31 | schema=None, 32 | CLI=None, 33 | random_seed=1, 34 | k=3, 35 | median=False, 36 | window_size=1000, 37 | ): 38 | """ 39 | Construct a KNN regressor. 40 | 41 | :param k: the number of neighbors. 42 | :param median: whether to use the median instead of the mean to aggregate the final prediction. 43 | :param window_size: the size of the sliding window used to store the instances. 44 | """ 45 | 46 | # Important: create the MOA object before invoking the superclass __init__. 47 | self.moa_learner = _moa_kNN() 48 | super().__init__( 49 | schema=schema, 50 | CLI=CLI, 51 | random_seed=random_seed, 52 | moa_learner=self.moa_learner, 53 | ) 54 | 55 | # Initialize instance attributes with default values; CLI was not set.
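        # Illustrative note (an assumption added for clarity, not from the
        # original source): with the defaults above, the call below builds the
        # MOA CLI string "-k 3  -w 1000"; median=True would insert "-m"
        # between the two options.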
56 | if self.CLI is None: 57 | self.k = k 58 | self.median = median 59 | self.window_size = window_size 60 | self.moa_learner.getOptions().setViaCLIString( 61 | f"-k {self.k} {'-m' if self.median else ''} -w " 62 | f"{self.window_size}" 63 | ) 64 | self.moa_learner.prepareForUse() 65 | self.moa_learner.resetLearning() 66 | 67 | def __str__(self): 68 | # Overrides the default class name from MOA 69 | return "kNNRegressor" 70 | -------------------------------------------------------------------------------- /src/capymoa/regressor/_passive_aggressive_regressor.py: -------------------------------------------------------------------------------- 1 | from capymoa.base import SKRegressor 2 | from sklearn.linear_model import ( 3 | PassiveAggressiveRegressor as _SKPassiveAggressiveRegressor, 4 | ) 5 | from capymoa.stream._stream import Schema 6 | 7 | 8 | class PassiveAggressiveRegressor(SKRegressor): 9 | """Streaming Passive Aggressive regressor 10 | 11 | This wraps :sklearn:`linear_model.PassiveAggressiveRegressor` for 12 | ease of use in the streaming context. Some options are missing because 13 | they are not relevant in the streaming context. 14 | 15 | Reference: 16 | 17 | `Online Passive-Aggressive Algorithms. K. Crammer, O. Dekel, J. Keshet, S. 18 | Shalev-Shwartz, Y. Singer - JMLR (2006) 19 | `_ 20 | 21 | Example Usage: 22 | 23 | >>> from capymoa.datasets import Fried 24 | >>> from capymoa.regressor import PassiveAggressiveRegressor 25 | >>> from capymoa.evaluation import prequential_evaluation 26 | >>> stream = Fried() 27 | >>> schema = stream.get_schema() 28 | >>> learner = PassiveAggressiveRegressor(schema) 29 | >>> results = prequential_evaluation(stream, learner, max_instances=1000) 30 | >>> results["cumulative"].rmse() 31 | 3.700... 32 | """ 33 | 34 | sklearner: _SKPassiveAggressiveRegressor 35 | """The underlying scikit-learn object. See: :sklearn:`linear_model.PassiveAggressiveRegressor`""" 36 | 37 | def __init__( 38 | self, 39 | schema: Schema, 40 | max_step_size: float = 1.0, 41 | fit_intercept: bool = True, 42 | loss: str = "epsilon_insensitive", 43 | average: bool = False, 44 | random_seed=1, 45 | ): 46 | """Construct a passive aggressive regressor. 47 | 48 | :param schema: Stream schema. 49 | :param max_step_size: Maximum step size (regularization). 50 | :param fit_intercept: Whether the intercept should be estimated or not. 51 | If False, the data is assumed to be already centered. 52 | :param loss: The loss function to be used: 53 | 54 | * ``"epsilon_insensitive"``: equivalent to PA-I in the reference paper. 55 | * ``"squared_epsilon_insensitive"``: equivalent to PA-II in the reference 56 | paper. 57 | 58 | :param average: When set to True, computes the averaged SGD weights and 59 | stores the result in the ``sklearner.coef_`` attribute. If set to an int greater 60 | than 1, averaging will begin once the total number of samples 61 | seen reaches ``average``. So ``average=10`` will begin averaging after 62 | seeing 10 samples. 63 | :param random_seed: Seed for the random number generator.
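        For reference (a summary of the updates from the paper cited above,
        added for clarity rather than taken from the original docstring): with
        the epsilon-insensitive loss ``l_t = max(0, |w . x_t - y_t| - epsilon)``,
        PA-I updates the weights by ``w <- w + sign(y_t - w . x_t) * tau_t * x_t``
        with ``tau_t = min(C, l_t / ||x_t||^2)``, while PA-II uses
        ``tau_t = l_t / (||x_t||^2 + 1 / (2 * C))``; here ``C`` corresponds to
        ``max_step_size``.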
64 | """ 65 | 66 | super().__init__( 67 | _SKPassiveAggressiveRegressor( 68 | C=max_step_size, 69 | fit_intercept=fit_intercept, 70 | early_stopping=False, 71 | shuffle=False, 72 | verbose=0, 73 | loss=loss, 74 | warm_start=False, 75 | average=average, 76 | random_state=random_seed, 77 | ), 78 | schema, 79 | random_seed, 80 | ) 81 | 82 | def __str__(self): 83 | return "PassiveAggressiveRegressor" 84 | -------------------------------------------------------------------------------- /src/capymoa/regressor/_sgd_regressor.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Literal 2 | from capymoa.base import SKRegressor 3 | from sklearn.linear_model import ( 4 | SGDRegressor as _SKSGDRegressor, 5 | ) 6 | from capymoa.stream._stream import Schema 7 | 8 | 9 | class SGDRegressor(SKRegressor): 10 | """Streaming stochastic gradient descent regressor. 11 | 12 | This wraps :class:`sklearn.linear_model.SGDRegressor` for 13 | ease of use in the streaming context. Some options are missing because 14 | they are not relevant in the streaming context. Furthermore, the learning rate 15 | schedule is fixed at construction time. 16 | 17 | Example Usage: 18 | 19 | >>> from capymoa.datasets import Fried 20 | >>> from capymoa.regressor import SGDRegressor 21 | >>> from capymoa.evaluation import prequential_evaluation 22 | >>> stream = Fried() 23 | >>> schema = stream.get_schema() 24 | >>> learner = SGDRegressor(schema) 25 | >>> results = prequential_evaluation(stream, learner, max_instances=1000) 26 | >>> results["cumulative"].rmse() 27 | 4.62... 28 | """ 29 | 30 | sklearner: _SKSGDRegressor 31 | """The underlying scikit-learn object""" 32 | 33 | def __init__( 34 | self, 35 | schema: Schema, 36 | loss: Literal[ 37 | "squared_error", 38 | "huber", 39 | "epsilon_insensitive", 40 | "squared_epsilon_insensitive", 41 | ] = "squared_error", 42 | penalty: Optional[Literal["l2", "l1", "elasticnet"]] = "l2", 43 | alpha: float = 0.0001, 44 | l1_ratio: float = 0.15, 45 | fit_intercept: bool = True, 46 | epsilon: float = 0.1, 47 | learning_rate: str = "invscaling", 48 | eta0: float = 0.01, 49 | random_seed: Optional[int] = None, 50 | ): 51 | """Construct a stochastic gradient descent regressor. 52 | 53 | :param schema: Describes the datastream's structure. 54 | :param loss: The loss function to be used. 55 | :param penalty: The penalty (aka regularization term) to be used. 56 | :param alpha: Constant that multiplies the regularization term. 57 | :param l1_ratio: The Elastic Net mixing parameter, with ``0 <= l1_ratio <= 1``. 58 | ``l1_ratio=0`` corresponds to L2 penalty, ``l1_ratio=1`` to L1. 59 | Only used if ``penalty`` is 'elasticnet'. 60 | Values must be in the range ``[0.0, 1.0]``. 61 | :param fit_intercept: Whether the intercept (bias) should be estimated 62 | or not. If False, the data is assumed to be already centered. 63 | :param epsilon: Epsilon in the epsilon-insensitive loss functions; only 64 | if ``loss`` is 'huber', 'epsilon_insensitive', or 65 | 'squared_epsilon_insensitive'. For 'huber', determines the threshold 66 | at which it becomes less important to get the prediction exactly right. 67 | For epsilon-insensitive, any differences between the current prediction 68 | and the correct label are ignored if they are less than this threshold. 69 | :param learning_rate: The learning rate schedule: 'constant', 'invscaling', or 'adaptive'. 70 | :param eta0: The initial learning rate for the 'constant', 'invscaling' or 71 | 'adaptive' schedules.
The default value is 0.01, which is used by 72 | the default 'invscaling' schedule. 73 | :param random_seed: Seed for reproducibility. 74 | """ 75 | 76 | super().__init__( 77 | _SKSGDRegressor( 78 | loss=loss, 79 | penalty=penalty, 80 | alpha=alpha, 81 | l1_ratio=l1_ratio, 82 | fit_intercept=fit_intercept, 83 | epsilon=epsilon, 84 | learning_rate=learning_rate, 85 | eta0=eta0, 86 | random_state=random_seed, 87 | ), 88 | schema, 89 | random_seed, 90 | ) 91 | 92 | def __str__(self): 93 | return "SGDRegressor" 94 | -------------------------------------------------------------------------------- /src/capymoa/splitcriteria.py: -------------------------------------------------------------------------------- 1 | """Module containing split criteria for decision trees. 2 | 3 | Decision trees are built by splitting the data into groups based on a split 4 | criterion. The split criterion is a function that measures the quality of a 5 | split. 6 | """ 7 | 8 | from typing import Optional, Union 9 | import moa.classifiers.core.splitcriteria as moa_split 10 | 11 | 12 | class SplitCriterion: 13 | """Split criteria are used to evaluate the quality of a split in a decision tree.""" 14 | 15 | _java_object: Optional[moa_split.SplitCriterion] = None 16 | 17 | def java_object(self) -> moa_split.SplitCriterion: 18 | """Return the Java object that this class wraps.""" 19 | if self._java_object is None: 20 | raise RuntimeError("No Java object has been created.") 21 | return self._java_object 22 | 23 | 24 | class VarianceReductionSplitCriterion(SplitCriterion): 25 | """Goodness of split criterion based on variance reduction.""" 26 | 27 | def __init__(self): 28 | self._java_object = moa_split.VarianceReductionSplitCriterion() 29 | 30 | 31 | class InfoGainSplitCriterion(SplitCriterion): 32 | """Goodness of split using information gain.""" 33 | 34 | def __init__(self, min_branch_frac: float = 0.01): 35 | """ 36 | Construct InfoGainSplitCriterion. 37 | 38 | :param min_branch_frac: Minimum fraction of weight required down at least two branches. 39 | """ 40 | cli = [] 41 | cli.append(f"-f {min_branch_frac}") 42 | 43 | self._java_object = moa_split.InfoGainSplitCriterion() 44 | self._java_object.getOptions().setViaCLIString(" ".join(cli)) 45 | 46 | 47 | class GiniSplitCriterion(SplitCriterion): 48 | """Goodness of split using Gini impurity.""" 49 | 50 | def __init__(self): 51 | self._java_object = moa_split.GiniSplitCriterion() 52 | 53 | 54 | def _split_criterion_to_cli_str(split_criterion: Union[str, SplitCriterion]) -> str: 55 | """Convert a split criterion to a CLI string. 56 | 57 | Also strips any parentheses or whitespace from the beginning and end of the string.
58 | 59 | >>> _split_criterion_to_cli_str("(InfoGainSplitCriterion -f 0.5)") 60 | 'InfoGainSplitCriterion -f 0.5' 61 | >>> _split_criterion_to_cli_str(InfoGainSplitCriterion(0.5)) 62 | 'InfoGainSplitCriterion -f 0.5' 63 | 64 | :param split_criterion: The split criterion to convert 65 | :return: A CLI string representing the split criterion 66 | """ 67 | if isinstance(split_criterion, SplitCriterion): 68 | java_object = split_criterion.java_object() 69 | cli_options = java_object.getOptions().getAsCLIString() 70 | return f"{java_object.getClass().getSimpleName()} {cli_options}" 71 | elif isinstance(split_criterion, str): 72 | return split_criterion.strip().strip("() ") 73 | else: 74 | raise TypeError( 75 | f"Expected a string or SplitCriterion, got {type(split_criterion)}" 76 | ) 77 | -------------------------------------------------------------------------------- /src/capymoa/ssl/classifier/__init__.py: -------------------------------------------------------------------------------- 1 | from ._osnn import OSNN 2 | 3 | __all__ = ["OSNN"] 4 | -------------------------------------------------------------------------------- /src/capymoa/stream/__init__.py: -------------------------------------------------------------------------------- 1 | from ._stream import ( 2 | Stream, 3 | Schema, 4 | ARFFStream, 5 | stream_from_file, 6 | CSVStream, 7 | NumpyStream, 8 | MOAStream, 9 | ConcatStream, 10 | ) 11 | from .torch import TorchClassifyStream 12 | from . import drift, generator, preprocessing 13 | 14 | __all__ = [ 15 | "Stream", 16 | "Schema", 17 | "stream_from_file", 18 | "ARFFStream", 19 | "TorchClassifyStream", 20 | "CSVStream", 21 | "drift", 22 | "generator", 23 | "preprocessing", 24 | "NumpyStream", 25 | "MOAStream", 26 | "ConcatStream", 27 | ] 28 | -------------------------------------------------------------------------------- /src/capymoa/stream/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | from .pipeline import ( 2 | BasePipeline, 3 | ClassifierPipeline, 4 | ClassifierPipelineElement, 5 | DriftDetectorPipelineElement, 6 | PipelineElement, 7 | RandomSearchClassifierPE, 8 | RegressorPipeline, 9 | RegressorPipelineElement, 10 | TransformerPipelineElement, 11 | ) 12 | from .transformer import Transformer, MOATransformer 13 | 14 | __all__ = [ 15 | "BasePipeline", 16 | "ClassifierPipeline", 17 | "ClassifierPipelineElement", 18 | "DriftDetectorPipelineElement", 19 | "MOATransformer", 20 | "PipelineElement", 21 | "RandomSearchClassifierPE", 22 | "RegressorPipeline", 23 | "RegressorPipelineElement", 24 | "Transformer", 25 | "TransformerPipelineElement", 26 | ] 27 | -------------------------------------------------------------------------------- /src/capymoa/stream/preprocessing/transformer.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | 3 | from abc import ABC, abstractmethod 4 | 5 | from capymoa.stream import Schema, MOAStream 6 | from capymoa.instance import Instance 7 | from moa.streams import FilteredQueueStream 8 | import moa.streams.filters 9 | 10 | 11 | class Transformer(ABC): 12 | @abstractmethod 13 | def transform_instance(self, instance) -> Instance: 14 | raise NotImplementedError 15 | 16 | @abstractmethod 17 | def get_schema(self) -> Schema: 18 | raise NotImplementedError 19 | 20 | @abstractmethod 21 | def restart(self): 22 | raise NotImplementedError 23 | 24 | 25 | class MOATransformer(Transformer): 26 | def __init__( 27 | self, 28 | 
schema=None, 29 | moa_filter: moa.streams.filters.StreamFilter | None = None, 30 | CLI=None, 31 | ): 32 | self.schema = schema 33 | self.CLI = CLI 34 | self.moa_filter = moa_filter 35 | self._last_instance = None 36 | self._last_transformed_instance = None 37 | 38 | if self.CLI is not None: 39 | if self.moa_filter is not None: 40 | self.moa_filter.getOptions().setViaCLIString(CLI) 41 | else: 42 | raise RuntimeError("Must provide a moa_filter to set via CLI.") 43 | 44 | if self.moa_filter is not None: 45 | # Must be called exactly here, because prepareForUse invokes the method that initializes the 46 | # header file of the stream (synthetic ones). 47 | self.moa_filter.prepareForUse() 48 | else: 49 | raise RuntimeError("Must provide a moa_filter to initialize the Schema.") 50 | 51 | if self.schema is None: 52 | if self.moa_filter is not None: 53 | self.schema = Schema(moa_header=self.moa_filter.getHeader()) 54 | else: 55 | raise RuntimeError( 56 | "Must provide a moa_filter to initialize the Schema." 57 | ) 58 | 59 | queue = FilteredQueueStream() 60 | self.filtered_stream = MOAStream( 61 | schema=self.schema, 62 | moa_stream=queue, 63 | CLI=f"-f ({self.moa_filter.getCLICreationString(self.moa_filter.__class__)})", 64 | ) 65 | 66 | def __str__(self): 67 | moa_filter_str = str( 68 | self.moa_filter.getCLICreationString(self.moa_filter.__class__) 69 | ) 70 | if moa_filter_str.endswith(" "): 71 | moa_filter_str = moa_filter_str[:-1] 72 | return f"Transformer({moa_filter_str})" 73 | 74 | def transform_instance(self, instance) -> Instance: 75 | # MOA filters are not stateless. 76 | # This hack avoids transforming an instance twice. 77 | if self._last_instance == instance: 78 | return self._last_transformed_instance 79 | self._last_instance = instance 80 | 81 | self.filtered_stream.moa_stream.addToQueue(instance.java_instance.instance) 82 | new_instance = self.filtered_stream.next_instance() 83 | 84 | self._last_transformed_instance = new_instance 85 | return new_instance 86 | 87 | def get_schema(self): 88 | return self.schema 89 | 90 | def restart(self): 91 | self.moa_filter.restart() 92 | 93 | def get_moa_filter(self): 94 | return self.moa_filter 95 | -------------------------------------------------------------------------------- /src/capymoa/type_alias.py: -------------------------------------------------------------------------------- 1 | from numpy import double 2 | from numpy.typing import NDArray 3 | 4 | FeatureVector = NDArray[double] 5 | """ 6 | Type definition for a feature vector, which is represented as a one dimensional 7 | NumPy array of double precision floating-point numbers. 8 | """ 9 | 10 | LabelIndex = int 11 | """ 12 | Type definition for a class-label index, which is a non-negative integer that is 13 | the index of the label in a list of labels. 14 | """ 15 | 16 | LabelProbabilities = NDArray[double] 17 | """ 18 | Type definition for a prediction probability, which is represented as a one 19 | dimensional NumPy array of double precision floating-point numbers. 20 | """ 21 | 22 | Label = str 23 | """ 24 | Type definition for a class label. 25 | """ 26 | 27 | TargetValue = double 28 | """ 29 | Alias for a dependent variable in a regression task. 30 | """ 31 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """This conftest.py contains pytest configuration and fixtures shared across all tests.
2 | 3 | - https://docs.pytest.org/en/stable/reference/fixtures.html#conftest-py-sharing-fixtures-across-multiple-files 4 | """ 5 | 6 | import os 7 | from capymoa.datasets._source_list import SOURCE_LIST 8 | from capymoa.datasets._utils import ( 9 | get_download_dir, 10 | download_extract, 11 | is_already_downloaded, 12 | ) 13 | 14 | 15 | def pytest_configure(config): 16 | """Ensure that the working directory is the root of the project. 17 | 18 | We added this because previously, the working directory was wherever the 19 | pytest command was run from. This caused issues with relative paths in the 20 | tests. 21 | """ 22 | os.chdir(config.rootpath) 23 | 24 | 25 | def download_required_testfiles(): 26 | csvs = ["ElectricityTiny", "FriedTiny"] 27 | arffs = ["ElectricityTiny", "FriedTiny"] 28 | download_dir = get_download_dir().absolute() 29 | 30 | for dataset in csvs: 31 | url = SOURCE_LIST[dataset].csv 32 | if not is_already_downloaded(url, download_dir): 33 | download_extract(url, download_dir) 34 | 35 | for dataset in arffs: 36 | url = SOURCE_LIST[dataset].arff 37 | if not is_already_downloaded(url, download_dir): 38 | download_extract(url, download_dir) 39 | 40 | 41 | download_required_testfiles() 42 | -------------------------------------------------------------------------------- /tests/ocl/test_datasets.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | from capymoa.ocl import datasets 3 | from capymoa.stream import Stream 4 | import numpy as np 5 | import pytest 6 | import inspect 7 | 8 | ALL_OCL_SCENARIO = [ 9 | cls 10 | for _, cls in inspect.getmembers(datasets) 11 | if inspect.isclass(cls) 12 | and issubclass(cls, datasets._BuiltInCIScenario) 13 | and cls != datasets._BuiltInCIScenario 14 | ] 15 | 16 | 17 | @pytest.mark.parametrize("scenario_type", ALL_OCL_SCENARIO) 18 | def test_ocl_split_datamodule_constructors( 19 | scenario_type: Type[datasets._BuiltInCIScenario], 20 | ): 21 | # Skip all except MNIST since downloading datasets can be slow on CI 22 | if scenario_type != datasets.TinySplitMNIST: 23 | pytest.skip("Skipping non-MNIST scenarios") 24 | 25 | scenario: datasets._BuiltInCIScenario = scenario_type() 26 | assert isinstance(scenario.train_tasks, list) 27 | assert isinstance(scenario.test_tasks, list) 28 | assert isinstance(scenario.train_streams, list) 29 | assert isinstance(scenario.test_streams, list) 30 | assert all(isinstance(task, Stream) for task in scenario.train_streams) 31 | assert all(isinstance(task, Stream) for task in scenario.test_streams) 32 | assert isinstance(scenario.task_schedule, list) 33 | assert len(scenario.task_schedule) == scenario.default_task_count 34 | assert len(scenario.train_tasks) == scenario.default_task_count 35 | assert len(scenario.test_tasks) == scenario.default_task_count 36 | 37 | train_instance = scenario.train_streams[0].next_instance() 38 | test_instance = scenario.test_streams[0].next_instance() 39 | 40 | assert isinstance(train_instance.x, np.ndarray) 41 | assert isinstance(train_instance.y_index, int) 42 | assert isinstance(test_instance.x, np.ndarray) 43 | assert isinstance(test_instance.y_index, int) 44 | 45 | assert len(scenario.train_streams) 46 | assert len(scenario.test_streams) 47 | assert all(len(task) for task in scenario.train_streams) 48 | assert all(len(task) for task in scenario.test_streams) 49 | -------------------------------------------------------------------------------- /tests/ocl/test_strategy.py:
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, asdict 2 | from functools import partial 3 | from typing import Callable, List 4 | 5 | import pytest 6 | 7 | from capymoa.ann import Perceptron 8 | from capymoa.base import Classifier 9 | from capymoa.classifier import Finetune, HoeffdingTree 10 | from capymoa.ocl.datasets import TinySplitMNIST 11 | from capymoa.ocl.evaluation import ocl_train_eval_loop 12 | from capymoa.ocl.strategy import ExperienceReplay, SLDA, NCM 13 | from capymoa.stream import Schema 14 | 15 | import torch 16 | from torch import nn 17 | 18 | approx = partial(pytest.approx, abs=0.1) 19 | 20 | 21 | @dataclass(frozen=True) 22 | class Result: 23 | accuracy_final: float 24 | anytime_accuracy_all_avg: float 25 | ttt_accuracy: float 26 | 27 | 28 | @dataclass(frozen=True) 29 | class Case: 30 | name: str 31 | constructor: Callable[[Schema], Classifier] 32 | expected: Result 33 | batch_size: int = 32 34 | 35 | 36 | def pre_processor() -> nn.Module: 37 | """Create a pre-processor for the schema.""" 38 | torch.manual_seed(0) 39 | return nn.Sequential( 40 | nn.Linear(256, 512), 41 | nn.ReLU(), 42 | ) 43 | 44 | 45 | """ 46 | Add new test cases here. 47 | 48 | Use the `partial` function to create a new function with hyperparameters already 49 | set. 50 | """ 51 | TEST_CASES: List[Case] = [ 52 | Case("HoeffdingTree", HoeffdingTree, Result(69.0, 46.5, 57.0), batch_size=1), 53 | Case("HoeffdingTree", HoeffdingTree, Result(69.0, 46.5, 51.8), batch_size=32), 54 | Case("Finetune", partial(Finetune, model=Perceptron), Result(30.5, 20.7, 2.9)), 55 | Case("SLDA", SLDA, Result(75.5, 48.70, 74.2)), 56 | Case( 57 | "SLDA_with_preprocessor", 58 | lambda s: SLDA(s, pre_processor(), 512), 59 | Result(83.5, 52.7, 75.8), 60 | ), 61 | Case("NCM", NCM, Result(71.5, 46.2, 67.5)), 62 | Case( 63 | "NCM_with_preprocessor", 64 | lambda s: NCM(s, pre_processor(), 512), 65 | Result(69.5, 45.0, 66.8), 66 | ), 67 | Case( 68 | "ExperienceReplay", 69 | lambda schema: ExperienceReplay(Finetune(schema, Perceptron)), 70 | Result(30.0, 20.1, 3.0), 71 | ), 72 | ] 73 | 74 | 75 | @pytest.mark.parametrize("case", TEST_CASES, ids=[test.name for test in TEST_CASES]) 76 | def test_ocl_classifier(case: Case): 77 | scenario = TinySplitMNIST() 78 | learner = case.constructor(scenario.schema) 79 | result = ocl_train_eval_loop( 80 | learner, 81 | scenario.train_loaders(case.batch_size), 82 | scenario.test_loaders(case.batch_size), 83 | ) 84 | actual = Result( 85 | result.accuracy_final * 100, 86 | result.anytime_accuracy_all_avg * 100, 87 | result.ttt.accuracy(), 88 | ) 89 | assert asdict(actual) == approx(asdict(case.expected)), f"Case {case.name} failed." 
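# Illustrative usage note (an assumption about the local workflow, not part of
# the original file): a single case can be run by id with pytest's -k filter,
# e.g. `pytest tests/ocl/test_strategy.py -k SLDA`.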
90 | -------------------------------------------------------------------------------- /tests/test_anomaly_detectors.py: -------------------------------------------------------------------------------- 1 | from capymoa.evaluation import AnomalyDetectionEvaluator 2 | from capymoa.anomaly import ( 3 | HalfSpaceTrees, 4 | OnlineIsolationForest, 5 | Autoencoder, 6 | StreamRHF, 7 | ) 8 | from capymoa.base import AnomalyDetector 9 | from capymoa.base import MOAClassifier 10 | from capymoa.datasets import ElectricityTiny 11 | import pytest 12 | from functools import partial 13 | from typing import Callable, Optional 14 | from capymoa.base import _extract_moa_learner_CLI 15 | 16 | from capymoa.stream._stream import Schema 17 | 18 | 19 | @pytest.mark.parametrize( 20 | "learner_constructor,auc,cli_string", 21 | [ 22 | ( 23 | partial(HalfSpaceTrees, window_size=100, number_of_trees=25, max_depth=15), 24 | 0.54, 25 | None, 26 | ), 27 | ( 28 | partial( 29 | OnlineIsolationForest, 30 | window_size=100, 31 | num_trees=32, 32 | max_leaf_samples=32, 33 | ), 34 | 0.42, 35 | None, 36 | ), 37 | ( 38 | partial(Autoencoder, hidden_layer=2, learning_rate=0.5, threshold=0.6), 39 | 0.57, 40 | None, 41 | ), 42 | (partial(StreamRHF, num_trees=5, max_height=3), 0.72, None), 43 | ], 44 | ids=["HalfSpaceTrees", "OnlineIsolationForest", "Autoencoder", "StreamRHF"], 45 | ) 46 | def test_anomaly_detectors( 47 | learner_constructor: Callable[[Schema], AnomalyDetector], 48 | auc: float, 49 | cli_string: Optional[str], 50 | ): 51 | """A fast-running test on a tiny stream that checks whether a learner's 52 | performance has changed. 53 | 54 | Notice how we use the `partial` function to create a new function with 55 | hyperparameters already set. This allows us to use the same test function 56 | for different learners with different hyperparameters.
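    For example (an illustrative clarification, equivalent to the parametrized
    cases above): ``partial(HalfSpaceTrees, window_size=100)`` returns a
    callable such that ``partial(HalfSpaceTrees, window_size=100)(schema=schema)``
    behaves like ``HalfSpaceTrees(schema=schema, window_size=100)``.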
57 | 58 | :param learner_constructor: A partially applied constructor for the learner 59 | :param auc: Expected AUC score 60 | :param cli_string: Expected CLI string for the learner or None 61 | """ 62 | stream = ElectricityTiny() 63 | evaluator = AnomalyDetectionEvaluator(schema=stream.get_schema()) 64 | 65 | learner: AnomalyDetector = learner_constructor(schema=stream.get_schema()) 66 | 67 | for instance in stream: 68 | score = learner.score_instance(instance) 69 | evaluator.update(instance.y_index, score) 70 | learner.train(instance) 71 | 72 | # Check that the AUC score matches the expected value 73 | actual_auc = evaluator.auc() 74 | assert actual_auc == pytest.approx(auc, abs=0.01), ( 75 | f"Basic Eval: Expected AUC of {auc:0.2f} got {actual_auc:0.2f}" 76 | ) 77 | 78 | # Optionally check the CLI string if it was provided 79 | if isinstance(learner, MOAClassifier) and cli_string is not None: 80 | cli_str = _extract_moa_learner_CLI(learner).strip("()") 81 | assert cli_str == cli_string, "CLI does not match expected value" 82 | -------------------------------------------------------------------------------- /tests/test_datasets.py: -------------------------------------------------------------------------------- 1 | from typing import Sized, Type 2 | import capymoa.datasets as capymoa_datasets 3 | from capymoa.datasets import ElectricityTiny 4 | from tempfile import TemporaryDirectory 5 | import pytest 6 | import numpy as np 7 | import platform 8 | from capymoa.datasets.downloader import DownloadableDataset 9 | from capymoa.stream import stream_from_file 10 | from subprocess import run 11 | import inspect 12 | 13 | _ALL_DOWNLOADABLE_DATASET = [ 14 | cls 15 | for _, cls in inspect.getmembers(capymoa_datasets) 16 | if inspect.isclass(cls) and issubclass(cls, DownloadableDataset) 17 | ] 18 | """Automatically collect all datasets that are subclasses of DownloadableDataset 19 | from the capymoa_datasets module. 20 | """ 21 | 22 | 23 | def test_electricity_tiny_auto_download(): 24 | # Skip on Windows 25 | if platform.system() == "Windows": 26 | # TODO: Explicitly closing streams might help but MOA does not support 27 | # this yet. 28 | pytest.skip("Skipping on Windows, because TemporaryDirectory fails to cleanup.") 29 | 30 | with TemporaryDirectory() as tmp_dir: 31 | # Ensure that the dataset is not downloaded 32 | with pytest.raises(FileNotFoundError): 33 | stream = ElectricityTiny(directory=tmp_dir, auto_download=False) 34 | 35 | stream = ElectricityTiny(directory=tmp_dir) 36 | first_instance: np.ndarray = stream.next_instance().x 37 | 38 | assert first_instance == pytest.approx( 39 | np.array([0, 0.056443, 0.439155, 0.003467, 0.422915, 0.414912]) 40 | ) 41 | 42 | # This should still work because the dataset is downloaded 43 | stream = ElectricityTiny(directory=tmp_dir, auto_download=False) 44 | 45 | 46 | def test_downloader_cli(): 47 | # Skip on Windows 48 | if platform.system() == "Windows": 49 | # TODO: Explicitly closing streams might help but MOA does not support 50 | # this yet.
51 | pytest.skip("Skipping on Windows, because TemporaryDirectory fails to cleanup.") 52 | 53 | with TemporaryDirectory() as tmp_dir: 54 | cmd = [ 55 | "python", 56 | "-m", 57 | "capymoa.datasets", 58 | "--out", 59 | tmp_dir, 60 | "--dataset", 61 | "ElectricityTiny", 62 | "--yes", 63 | ] 64 | run([*cmd, "--format", "csv"], check=True) 65 | run([*cmd, "--format", "arff"], check=True) 66 | 67 | csv_stream = stream_from_file(tmp_dir + "/electricity_tiny.csv") 68 | arff_stream = stream_from_file(tmp_dir + "/electricity_tiny.arff") 69 | 70 | while csv_stream.has_more_instances(): 71 | csv_instance = csv_stream.next_instance() 72 | arff_instance = arff_stream.next_instance() 73 | assert csv_instance.x == pytest.approx(arff_instance.x) 74 | 75 | 76 | def test_electricity_tiny_schema(): 77 | schema = ElectricityTiny().schema 78 | assert schema.get_label_values() == ["0", "1"] 79 | assert schema.get_label_indexes() == [0, 1] 80 | assert schema.get_num_attributes() == 6 81 | assert schema.get_num_classes() == 2 82 | assert schema.is_regression() is False 83 | assert schema.is_classification() is True 84 | 85 | for y_index, y_value in enumerate(schema.get_label_values()): 86 | assert schema.get_index_for_label(y_value) == y_index 87 | assert schema.get_value_for_index(y_index) == y_value 88 | 89 | 90 | @pytest.mark.skip("This test is too slow") 91 | @pytest.mark.parametrize("dataset_type", _ALL_DOWNLOADABLE_DATASET) 92 | def test_all_datasets(dataset_type: Type[DownloadableDataset]): 93 | with TemporaryDirectory() as tmp_dir: 94 | dataset = dataset_type(directory=tmp_dir) 95 | 96 | i = 0 97 | while dataset.has_more_instances(): 98 | dataset.next_instance() 99 | i += 1 100 | 101 | assert str(dataset) 102 | assert isinstance(dataset, Sized), "Dataset must be an instance of Sized" 103 | assert len(dataset) == i, "Dataset length must be correct" 104 | -------------------------------------------------------------------------------- /tests/test_import.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import platform 3 | import os 4 | from capymoa._prepare_jpype import _get_java_home 5 | import tempfile 6 | from pathlib import Path 7 | from capymoa.env import capymoa_moa_jar 8 | import pytest 9 | import shutil 10 | 11 | PYTHON_EXE = os.sys.executable 12 | CMD = [PYTHON_EXE, "-c", "import capymoa"] 13 | CMD_ABOUT = [PYTHON_EXE, "-c", "import capymoa; capymoa.about()"] 14 | 15 | 16 | @pytest.fixture 17 | def env(): 18 | return os.environ.copy() 19 | 20 | 21 | def test_bad_infer_java_home(env): 22 | """Tests reporting errors when java cannot be found.""" 23 | del env["JAVA_HOME"] 24 | env["PATH"] = "" 25 | assert "JAVA_HOME" not in env 26 | result = subprocess.run(CMD_ABOUT, capture_output=True, env=env) 27 | print(result.stdout.decode()) 28 | assert result.returncode != 0 29 | exception = result.stderr.decode().splitlines()[-1] 30 | assert exception == ( 31 | "capymoa._prepare_jpype.CapymoaImportError: Java not found ensure " 32 | "`java -version` runs successfully. Alternatively, you may set the " 33 | "JAVA_HOME environment variable to the path of your Java installation " 34 | "for non-standard installations."
35 | ) 36 | 37 | 38 | def test_good_java_home(env): 39 | env["JAVA_HOME"] = _get_java_home().as_posix() 40 | result = subprocess.run(CMD, capture_output=True, env=env) 41 | assert result.returncode == 0 42 | 43 | 44 | def test_bad_java_home(env): 45 | notfound = Path("/notfound") 46 | env["JAVA_HOME"] = notfound.as_posix() 47 | result = subprocess.run(CMD, capture_output=True, env=env) 48 | assert result.returncode != 0 49 | exception = result.stderr.decode().splitlines()[-1] 50 | assert exception == ( 51 | f"capymoa._prepare_jpype.CapymoaImportError: The JAVA_HOME (`{str(notfound)}`) " 52 | "environment variable is set, but the path does not exist." 53 | ) 54 | 55 | 56 | def test_capymoa_moa_jar(env): 57 | notfound = Path("/notfound") 58 | env["CAPYMOA_MOA_JAR"] = notfound.as_posix() 59 | result = subprocess.run(CMD, capture_output=True, env=env) 60 | assert result.returncode != 0 61 | exception = result.stderr.decode().splitlines()[-1] 62 | assert exception == ( 63 | f"capymoa._prepare_jpype.CapymoaImportError: MOA jar not found at `{str(notfound)}`." 64 | ) 65 | 66 | 67 | def test_nonascii_capymoa(env): 68 | """Jpype and java used to struggle to start if the path to Jars contains 69 | non-ascii characters. This test ensures that this is no longer an issue. 70 | """ 71 | if platform.system() == "Windows": 72 | pytest.skip("Investigate why this fails on Windows and fix it.") 73 | 74 | with tempfile.TemporaryDirectory(suffix="☺") as d: 75 | moa_jar = shutil.copyfile(capymoa_moa_jar(), Path(d) / "moa.jar") 76 | env["CAPYMOA_MOA_JAR"] = moa_jar.as_posix() 77 | result = subprocess.run( 78 | [ 79 | PYTHON_EXE, 80 | "-c", 81 | "from capymoa.env import capymoa_moa_jar; print(capymoa_moa_jar())", 82 | ], 83 | capture_output=True, 84 | env=env, 85 | ) 86 | assert result.returncode == 0 87 | assert result.stdout.decode().splitlines()[-1].strip() == moa_jar.as_posix() 88 | 89 | 90 | def test_capymoa_datasets_dir(env): 91 | with tempfile.TemporaryDirectory() as d: 92 | env["CAPYMOA_DATASETS_DIR"] = d 93 | result = subprocess.run(CMD_ABOUT, capture_output=True, env=env) 94 | assert result.returncode == 0 95 | about = result.stdout.decode() 96 | assert f"CAPYMOA_DATASETS_DIR: {d}" in about 97 | 98 | 99 | def test_capymoa_jvm_args(env): 100 | env["CAPYMOA_JVM_ARGS"] = "-Xmx16g -Xss10M" 101 | result = subprocess.run(CMD_ABOUT, capture_output=True, env=env) 102 | assert result.returncode == 0 103 | about = result.stdout.decode() 104 | assert "CAPYMOA_JVM_ARGS: ['-Xmx16g', '-Xss10M']" in about 105 | -------------------------------------------------------------------------------- /tests/test_misc.py: -------------------------------------------------------------------------------- 1 | from capymoa.classifier import AdaptiveRandomForestClassifier 2 | from capymoa.datasets import ElectricityTiny 3 | from capymoa.misc import legacy_save_model, legacy_load_model, load_model 4 | from tempfile import TemporaryDirectory 5 | import pytest 6 | 7 | 8 | def test_legacy_save_load_model(): 9 | """Tests the legacy save and load model functions. 10 | 11 | Ensures using the new `load_model` function with a legacy model file raises 12 | an exception with a note explaining the error. 13 | """ 14 | 15 | # TODO: This should be removed when `legacy_save_model` and `legacy_load_model` 16 | # are removed. 
17 | stream = ElectricityTiny() 18 | learner = AdaptiveRandomForestClassifier(schema=stream.get_schema()) 19 | with TemporaryDirectory() as tmpdir: 20 | filename = tmpdir + "/model.pkl" 21 | legacy_save_model(learner, filename) 22 | 23 | with pytest.raises(RuntimeError): 24 | with open(filename, "rb") as fd: 25 | load_model(fd) 26 | 27 | legacy_load_model(filename) 28 | -------------------------------------------------------------------------------- /tests/test_moajar.py: -------------------------------------------------------------------------------- 1 | import jpype 2 | from pathlib import Path 3 | from hashlib import sha256 4 | import capymoa 5 | 6 | _MOA_JAR_HASH = "b14be3c1df87aa5bf37f24c9a35258ab1f9a941897e61294701c43c0141dc2b7" 7 | 8 | 9 | def test_imports() -> None: 10 | """Test that the correct moa version is being packaged.""" 11 | assert capymoa 12 | assert jpype.isJVMStarted(), ( 13 | "JVM should be started automatically when importing capymoa" 14 | ) 15 | jar_path = Path(jpype.getClassPath()) 16 | assert jar_path.suffix == ".jar", "MOA jar should be in the class path" 17 | 18 | with open(jar_path, "rb") as f: 19 | jar_hash = sha256(f.read()).hexdigest() 20 | 21 | assert jar_hash == _MOA_JAR_HASH, ( 22 | "MOA jar hash should match the expected hash. " 23 | "Try `invoke refresh-moa` to download the correct version. " 24 | "If you are expecting a new version, update the `_MOA_JAR_HASH` variable." 25 | ) 26 | -------------------------------------------------------------------------------- /tests/test_pi.py: -------------------------------------------------------------------------------- 1 | from capymoa.evaluation import ( 2 | PredictionIntervalEvaluator, 3 | PredictionIntervalWindowedEvaluator, 4 | ) 5 | from capymoa.datasets import Fried 6 | from capymoa.base import PredictionIntervalLearner 7 | from capymoa.prediction_interval import ( 8 | MVE, 9 | AdaPI, 10 | ) 11 | import pytest 12 | from functools import partial 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "learner_constructor,coverage,win_coverage", 17 | [ 18 | (partial(MVE), 98.7, 99.0), 19 | (partial(AdaPI), 97.0, 97.0), 20 | ], 21 | ids=[ 22 | "MVE", 23 | "AdaPI", 24 | ], 25 | ) 26 | def test_PI(learner_constructor, coverage, win_coverage): 27 | """A fast-running test that checks whether a learner's prediction-interval 28 | coverage has changed. 29 | 30 | Notice how we use the `partial` function to create a new function with 31 | hyperparameters already set. This allows us to use the same test function 32 | for different learners with different hyperparameters.
33 | """ 34 | stream = Fried() 35 | evaluator = PredictionIntervalEvaluator(schema=stream.get_schema()) 36 | win_evaluator = PredictionIntervalWindowedEvaluator( 37 | schema=stream.get_schema(), window_size=100 38 | ) 39 | learner: PredictionIntervalLearner = learner_constructor(schema=stream.get_schema()) 40 | 41 | i = 0 42 | while stream.has_more_instances(): 43 | i += 1 44 | if i >= 1000: 45 | break 46 | instance = stream.next_instance() 47 | prediction = learner.predict(instance) 48 | evaluator.update(instance.y_value, prediction) 49 | win_evaluator.update(instance.y_value, prediction) 50 | learner.train(instance) 51 | 52 | actual_coverage = evaluator.coverage() 53 | actual_win_coverage = win_evaluator.coverage()[-1] 54 | assert actual_coverage == pytest.approx(coverage, abs=0.1), ( 55 | f"Basic Eval: Expected {coverage:0.1f} coverage got {actual_coverage: 0.1f} coverage" 56 | ) 57 | assert actual_win_coverage == pytest.approx(win_coverage, abs=0.1), ( 58 | f"Windowed Eval: Expected {win_coverage:0.1f} coverage got {actual_win_coverage:0.1f} coverage" 59 | ) 60 | -------------------------------------------------------------------------------- /tests/test_progress_bar.py: -------------------------------------------------------------------------------- 1 | """Tests to ensure progress bars work correctly.""" 2 | 3 | from typing import Optional 4 | from pytest import CaptureFixture 5 | from capymoa.stream.generator import WaveformGenerator 6 | from capymoa.datasets import ElectricityTiny 7 | from capymoa.classifier import NoChange 8 | from capymoa.anomaly import HalfSpaceTrees 9 | from capymoa.evaluation import ( 10 | prequential_evaluation, 11 | prequential_ssl_evaluation, 12 | prequential_evaluation_anomaly, 13 | prequential_evaluation_multiple_learners, 14 | ) 15 | import pytest 16 | from tqdm import tqdm 17 | 18 | 19 | def assert_pbar(capfd: CaptureFixture, startswith: str): 20 | _, err = capfd.readouterr() 21 | err: str = err.splitlines()[-1] 22 | assert err.startswith(startswith) 23 | 24 | 25 | @pytest.mark.parametrize( 26 | "max_instances,instances", 27 | [ 28 | (100, 100), 29 | (None, 2000), 30 | (3000, 2000), 31 | ], 32 | ) 33 | def test_default( 34 | max_instances: Optional[int], instances: int, capfd: CaptureFixture 35 | ) -> None: 36 | stream = ElectricityTiny() 37 | classifier = NoChange(schema=stream.get_schema()) 38 | prequential_evaluation( 39 | stream, 40 | classifier, 41 | optimise=False, 42 | max_instances=max_instances, 43 | progress_bar=True, 44 | ) 45 | assert_pbar(capfd, "Eval 'NoChange' on 'ElectricityTiny':") 46 | 47 | 48 | def test_ssl(capfd: CaptureFixture) -> None: 49 | stream = ElectricityTiny() 50 | classifier = NoChange(schema=stream.get_schema()) 51 | prequential_ssl_evaluation( 52 | stream, classifier, optimise=False, progress_bar=True, max_instances=100 53 | ) 54 | assert_pbar(capfd, "SSL Eval 'NoChange' on 'ElectricityTiny':") 55 | 56 | 57 | def test_anomaly(capfd: CaptureFixture) -> None: 58 | stream = ElectricityTiny() 59 | classifier = HalfSpaceTrees(schema=stream.get_schema()) 60 | prequential_evaluation_anomaly( 61 | stream, classifier, optimise=False, progress_bar=True, max_instances=100 62 | ) 63 | assert_pbar(capfd, "AD Eval 'HalfSpaceTrees' on 'ElectricityTiny':") 64 | 65 | 66 | def test_multiple_learners(capfd: CaptureFixture) -> None: 67 | stream = ElectricityTiny() 68 | classifiers = { 69 | "a": NoChange(schema=stream.get_schema()), 70 | "b": NoChange(schema=stream.get_schema()), 71 | } 72 | prequential_evaluation_multiple_learners( 73 | 
stream, classifiers, progress_bar=True, max_instances=100 74 | ) 75 | assert_pbar(capfd, "Eval 2 learners on ElectricityTiny:") 76 | 77 | 78 | def test_no_length(capfd: CaptureFixture) -> None: 79 | generator = WaveformGenerator() 80 | classifier = NoChange(schema=generator.get_schema()) 81 | prequential_evaluation( 82 | generator, classifier, optimise=False, max_instances=100, progress_bar=True 83 | ) 84 | assert_pbar(capfd, "Eval 'NoChange' on 'WaveformGenerator':") 85 | 86 | 87 | def test_disabled_progress_bar(capfd: CaptureFixture) -> None: 88 | stream = ElectricityTiny() 89 | classifier = NoChange(schema=stream.get_schema()) 90 | prequential_evaluation(stream, classifier, optimise=False, progress_bar=False) 91 | out, err = capfd.readouterr() 92 | assert out == "" 93 | assert err == "" 94 | 95 | 96 | def test_tqdm(capfd: CaptureFixture) -> None: 97 | stream = ElectricityTiny() 98 | classifier = NoChange(schema=stream.get_schema()) 99 | with tqdm(desc="Custom Message") as progress_bar: 100 | prequential_evaluation( 101 | stream, classifier, optimise=False, progress_bar=progress_bar 102 | ) 103 | assert_pbar(capfd, "Custom Message:") 104 | -------------------------------------------------------------------------------- /tests/test_ssl_classifiers.py: -------------------------------------------------------------------------------- 1 | from capymoa.datasets._datasets import ElectricityTiny, CovtypeTiny 2 | from capymoa.ssl.classifier import OSNN 3 | 4 | import pytest 5 | from capymoa.evaluation.evaluation import prequential_ssl_evaluation 6 | from capymoa.base import ClassifierSSL 7 | from capymoa.stream import Stream 8 | from functools import partial 9 | 10 | 11 | def assert_ssl_evaluation( 12 | learner: ClassifierSSL, 13 | stream: Stream, 14 | expectation: float, 15 | label_probability: float = 0.01, 16 | max_instances: int = 1000, 17 | ): 18 | results = prequential_ssl_evaluation( 19 | stream=stream, 20 | learner=learner, 21 | label_probability=label_probability, 22 | window_size=10, 23 | max_instances=max_instances, 24 | ) 25 | 26 | assert results["cumulative"].accuracy() == pytest.approx(expectation), ( 27 | f"Expected accuracy of {expectation} but got {results['cumulative'].accuracy()}" 28 | + f" for learner {learner} on stream {stream}" 29 | ) 30 | 31 | 32 | @pytest.mark.parametrize( 33 | "learner_constructor, stream_constructor, expectation, label_probability", 34 | [ 35 | (partial(OSNN, optim_steps=10), ElectricityTiny, 46.1, None), 36 | (partial(OSNN, optim_steps=10), CovtypeTiny, 26.3, None), 37 | ], 38 | ids=[ 39 | "OSNN_ElectricityTiny", 40 | "OSNN_CovtypeTiny", 41 | ], 42 | ) 43 | def test_ssl_classifiers( 44 | learner_constructor, stream_constructor, expectation, label_probability 45 | ): 46 | # The optimizer steps are set to 10 to speed up the test 47 | stream = stream_constructor() 48 | learner = learner_constructor(schema=stream.get_schema()) 49 | 50 | if label_probability is None: 51 | label_probability = 0.01 52 | 53 | assert_ssl_evaluation( 54 | learner, 55 | stream, 56 | expectation, 57 | label_probability=label_probability, 58 | ) 59 | --------------------------------------------------------------------------------