├── .github ├── dependabot.yml └── workflows │ └── check.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── assets └── changepoint_example.png ├── examples ├── classification_based_cpd.ipynb ├── configs │ └── test_config_exp.yml └── knn_based_cpd.ipynb ├── pyproject.toml ├── pysatl_cpd ├── __init__.py ├── analysis │ ├── __init__.py │ └── results_analyzer.py ├── core │ ├── __init__.py │ ├── algorithms │ │ ├── __init__.py │ │ ├── abstract_algorithm.py │ │ ├── bayesian │ │ │ ├── __init__.py │ │ │ ├── abstracts │ │ │ │ ├── __init__.py │ │ │ │ ├── idetector.py │ │ │ │ ├── ihazard.py │ │ │ │ ├── ilikelihood.py │ │ │ │ └── ilocalizer.py │ │ │ ├── detectors │ │ │ │ ├── __init__.py │ │ │ │ ├── drop.py │ │ │ │ └── threshold.py │ │ │ ├── hazards │ │ │ │ ├── __init__.py │ │ │ │ └── constant.py │ │ │ ├── likelihoods │ │ │ │ ├── __init__.py │ │ │ │ ├── exponential_conjugate.py │ │ │ │ ├── gaussian.py │ │ │ │ ├── gaussian_conjugate.py │ │ │ │ └── heuristic_gaussian_vs_exponential.py │ │ │ └── localizers │ │ │ │ ├── __init__.py │ │ │ │ └── argmax.py │ │ ├── bayesian_algorithm.py │ │ ├── bayesian_linear_heuristic.py │ │ ├── bayesian_online_algorithm.py │ │ ├── classification │ │ │ ├── __init__.py │ │ │ ├── abstracts │ │ │ │ ├── __init__.py │ │ │ │ ├── iclassifier.py │ │ │ │ ├── iquality_metric.py │ │ │ │ └── istatistic_test.py │ │ │ ├── classifiers │ │ │ │ ├── __init__.py │ │ │ │ ├── decision_tree.py │ │ │ │ ├── knn.py │ │ │ │ ├── logistic_regression.py │ │ │ │ ├── rf.py │ │ │ │ └── svm.py │ │ │ ├── quality_metrics │ │ │ │ ├── __init__.py │ │ │ │ ├── classification │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── accuracy.py │ │ │ │ │ ├── f1.py │ │ │ │ │ └── mcc.py │ │ │ │ └── clustering │ │ │ │ │ └── __init__.py │ │ │ └── test_statistics │ │ │ │ ├── __init__.py │ │ │ │ └── threshold_overcome.py │ │ ├── classification_algorithm.py │ │ ├── density │ │ │ ├── __init__.py │ │ │ └── abstracts │ │ │ │ └── density_based_algorithm.py │ │ ├── graph │ │ │ ├── __init__.py │ │ │ ├── abstracts │ │ │ │ ├── ibuilder.py │ │ │ │ ├── igraph.py │ │ │ │ └── igraph_cpd.py │ │ │ ├── builders │ │ │ │ ├── __init__.py │ │ │ │ ├── list.py │ │ │ │ └── matrix.py │ │ │ ├── graph_cpd.py │ │ │ ├── graph_list.py │ │ │ └── graph_matrix.py │ │ ├── graph_algorithm.py │ │ ├── kliep_algorithm.py │ │ ├── knn │ │ │ ├── __init__.py │ │ │ ├── abstracts │ │ │ │ ├── __init__.py │ │ │ │ └── observation.py │ │ │ ├── classifier.py │ │ │ ├── graph.py │ │ │ └── heap.py │ │ ├── knn_algorithm.py │ │ ├── online_algorithm.py │ │ └── rulsif_algorithm.py │ ├── cpd_core.py │ ├── online_cpd_core.py │ ├── problem.py │ └── scrubber │ │ ├── __init__.py │ │ ├── abstract.py │ │ ├── data_providers.py │ │ └── linear.py ├── cpd_solver.py ├── generator │ ├── __init__.py │ ├── config_parser.py │ ├── dataset_description.py │ ├── distributions.py │ ├── generator.py │ └── saver.py ├── icpd_solver.py ├── labeled_data.py └── online_cpd_solver.py └── tests ├── __init__.py ├── test_configs ├── test_config_1.yml └── test_config_exp.yml ├── test_core ├── __init__.py ├── test_algorithms │ ├── __init__.py │ ├── test_algorithms_utils │ │ ├── __init__.py │ │ └── bayesian │ │ │ ├── __init__.py │ │ │ ├── test_detectors_and_localizers.py │ │ │ ├── test_hazards.py │ │ │ └── test_likelihoods.py │ ├── test_bayesian_algorithm.py │ ├── test_bayesian_linear_heuristic.py │ ├── test_bayesian_online_algorithm.py │ ├── test_classification_algorithms.py │ └── test_graph_algorithm.py ├── test_cpd_core.py ├── test_online_cpd_core.py └── test_scrubber │ ├── __init__.py │ ├── test_dataproviders.py │ └── test_linear_scrubber.py ├── test_generator ├── __init__.py ├── test_distributions.py └── test_generator.py ├── test_labeled_data.py ├── test_online_solver.py └── test_solver.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | groups: 8 | github-actions: 9 | patterns: 10 | - "*" 11 | - package-ecosystem: "pip" 12 | directory: "/" 13 | schedule: 14 | interval: "weekly" 15 | commit-message: 16 | prefix: "deps: " 17 | groups: 18 | pip-dependencies: 19 | patterns: 20 | - "*" 21 | -------------------------------------------------------------------------------- /.github/workflows/check.yaml: -------------------------------------------------------------------------------- 1 | name: Check code and run tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: [ "3.10", "3.11", "3.12", "3.13" ] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Install Poetry 18 | run: | 19 | pipx install poetry==2.1.0 20 | 21 | - name: Install dependencies 22 | run: | 23 | poetry install --with dev 24 | 25 | - name: Lint with ruff 26 | run: | 27 | poetry run ruff check 28 | 29 | - name: Check types 30 | run: | 31 | poetry run mypy 32 | 33 | - name: Run tests 34 | run: | 35 | poetry run pytest --cov=pysatl_cpd 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | .idea/ 163 | /poetry.lock 164 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.5.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - repo: https://github.com/astral-sh/ruff-pre-commit 9 | rev: v0.9.6 10 | hooks: 11 | - id: ruff 12 | args: [ --fix ] 13 | - id: ruff-format 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # PySATL CPD project contributing guide 2 | 3 | Thank you very much if you have decided to contribute to our project. 4 | We follow very simple and clear open-source research community accepted guidelines for contributing. 5 | The guideline instructions divided into sections depending on the part of the project you want to contribute. 6 | 7 | ## Rules for adding commits 8 | 9 | Create a new branch, if you want to add something new. 10 | Recommended naming branch is `/`. 11 | 12 | Commits are added according to conventional commits. 13 | Those `(): `. 14 | 15 | The `` field must take one of these values: 16 | 17 | * `feat` to add new functionality 18 | * `fix` to fix a bug in the project 19 | * `refactor` for code refactoring, such as renaming a variable 20 | * `test` to add tests, refactor them 21 | * `struct` for changes related to a change in the structure of the project (BUT NOT CODE), for example, changing 22 | folder locations 23 | * `ci` for various ci/cd tasks 24 | * `docs` for changes in documentation 25 | * `chore` for changes outside the code, for example, gitignore and reamde updates 26 | 27 | The `` field contains the gist of the changes in the present imperative in English without the dot in at the end, 28 | the first word is a verb with a small letter. 29 | 30 | Examples: 31 | 32 | * Good: "feat: add module for future scrubber implementations" 33 | * Bad: "Added module for future scrubber implementations." 34 | 35 | ## Source code developers guide 36 | 37 | 1. Fork this repository using your GitHub account. 38 | 2. Install `git` and clone your forked copy of the `repo`. 39 | 3. Build project following build instructions in [README.md](./README.md) file, make sure everything is ok. 40 | 4. Run tests following instructions in [README.md](./README.md) file, make sure all tests passing. 41 | 5. Implement new feature or fix existing one in the source code. 42 | 6. Commit your changes. 43 | 7. Open a pull-request. 44 | 8. Wait for review from developers of the project. 45 | 9. Fix major and minor issues if presented. 46 | 10. Get your work merged into `main`! 47 | 48 | ## Rules for collaborators 49 | 50 | ### Basic Tips 51 | 52 | 1. Don't use merge, only rebase (to keep a linear commit history) 53 | 2. Do not change other people's branches unless absolutely necessary 54 | 3. Recheck your commit history before creating a pull request 55 | 4. **Check you're on the right branch**, never commit directly in main 56 | 57 | ### Rules for pull requests 58 | 59 | **Forbidden** to merge your pull request into the branch yourself. 60 | 61 | Each pull request must be reviewed by one of the maintainers 62 | 63 | * Alexey Tatyanenko ([alexdtat](https://github.com/alexdtat)) 64 | * Artemii Patov ([artemiipatov](https://github.com/artemiipatov)) 65 | * Vladimir Kutuev ([vkutuev](https://github.com/vkutuev)) 66 | 67 | If you click on the green button, then **make sure** that it says `REBASE AND MERGE`! 68 | 69 | The review takes place in the form of comments to pull requests, discussions in the team chat and personal 70 | communication. 71 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024-present PySATL Contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PySATL-CPD 2 | 3 | [status-shield]: https://img.shields.io/github/actions/workflow/status/PySATL/pysatl-cpd/.github/workflows/check.yaml?branch=main&event=push&style=for-the-badge&label=Checks 4 | [status-url]: https://github.com/PySATL/pysatl-cpd/blob/main/.github/workflows/check.yaml 5 | [license-shield]: https://img.shields.io/github/license/PySATL/pysatl-cpd.svg?style=for-the-badge&color=blue 6 | [license-url]: LICENSE 7 | 8 | [![Checks][status-shield]][status-url] 9 | [![MIT License][license-shield]][license-url] 10 | 11 | PySATL **Change point detection** subproject (*abbreviated pysatl-cpd*) is a module, designed for detecting anomalies in time series data, which refer to significant deviations from expected patterns or trends. Anomalies can indicate unusual events or changes in a system, making them crucial for monitoring and analysis in various fields such as finance, healthcare, and network security. 12 | 13 | At the moment, the module implements the following CPD algorithms: 14 | * Bayesian algorithm (scrubbing, online and linear heuristic online versions) 15 | * Density based algorithms: 16 | * KLIEP 17 | * RuLSIF 18 | * Graph algorithm 19 | * k-NN based algorithm 20 | * Algorithms, based on classifiers: 21 | * SVM 22 | * KNN 23 | * Decision Tree 24 | * Logistic Regression 25 | * Random Forest 26 | --- 27 | 28 | ## Requirements 29 | 30 | - Python 3.10+ 31 | - Poetry 2.1.0+ 32 | 33 | ## Installation 34 | 35 | Clone the repository: 36 | 37 | ```bash 38 | git clone https://github.com/PySATL/pysatl-cpd 39 | ``` 40 | 41 | Install dependencies: 42 | 43 | ```bash 44 | poetry install 45 | ``` 46 | 47 | ## Change point detection example: 48 | 49 | ```python 50 | from pathlib import Path 51 | 52 | from pysatl_cpd.labeled_data import LabeledCpdData 53 | 54 | # import change point detection solver 55 | from pysatl_cpd.online_cpd_solver import OnlineCpdSolver 56 | from pysatl_cpd.core.problem import CpdProblem 57 | 58 | # import algorithm 59 | from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnline 60 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import GaussianConjugate 61 | from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard 62 | from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector 63 | from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer 64 | 65 | 66 | labeled_data = LabeledCpdData.generate_cp_datasets(Path("examples/configs/test_config_exp.yml"))["example"] 67 | 68 | # specify CPD algorithm with parameters 69 | algorithm = BayesianOnline( 70 | learning_sample_size=5, 71 | likelihood=GaussianConjugate(), 72 | hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), 73 | detector=ThresholdDetector(threshold=0.005), 74 | localizer=ArgmaxLocalizer(), 75 | ) 76 | # make a solver object 77 | solver = OnlineCpdSolver(CpdProblem(True), algorithm, labeled_data) 78 | 79 | 80 | # then run algorithm 81 | cpd_results = solver.run() 82 | 83 | # print the results 84 | print(cpd_results) 85 | # output: 86 | # Located change points: (200;400) 87 | # Expected change point: (200;400) 88 | # Difference: () 89 | # Computation time (sec): 0.2 90 | 91 | # visualize data with located changepoints 92 | cpd_results.visualize() 93 | ``` 94 | ![example_of_output](assets/changepoint_example.png) 95 | 96 | ## Development 97 | 98 | Install requirements 99 | 100 | ```bash 101 | poetry install --with dev 102 | ``` 103 | 104 | ## Pre-commit 105 | 106 | Install pre-commit hooks: 107 | 108 | ```shell 109 | poetry run pre-commit install 110 | ``` 111 | 112 | Starting manually: 113 | 114 | ```shell 115 | poetry run pre-commit run --all-files --color always --verbose --show-diff-on-failure 116 | ``` 117 | 118 | ## License 119 | 120 | This project is licensed under the terms of the **MIT** license. See the [LICENSE](LICENSE) for more information. 121 | -------------------------------------------------------------------------------- /assets/changepoint_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/assets/changepoint_example.png -------------------------------------------------------------------------------- /examples/configs/test_config_exp.yml: -------------------------------------------------------------------------------- 1 | - name: example 2 | distributions: 3 | - type: exponential 4 | length: 200 5 | parameters: 6 | rate: 2.0 7 | - type: beta 8 | length: 200 9 | parameters: 10 | alpha: 1.0 11 | beta: 5.0 12 | - type: uniform 13 | length: 200 14 | parameters: 15 | min: 0 16 | max: 0.5 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pysatl_cpd" 3 | version = "0.1.0" 4 | description = "Batch module for changepoint detection" 5 | authors = [ 6 | "Temerlan Akhmetov ", 7 | "Alexey Tatyanenko ", 8 | "Artemii Patov ", 9 | "Vladimir Kutuev ", 10 | "Aleksei Ivanov ", 11 | "Artem Romanyuk ", 12 | "Aleksandra Listkova ", 13 | ] 14 | license = "MIT" 15 | readme = "README.md" 16 | repository = "https://github.com/PySATL/pysatl-cpd" 17 | 18 | 19 | [tool.poetry.dependencies] 20 | python = "^3.10" 21 | numpy = "^2.0.0" 22 | scipy = "^1.14.0" 23 | matplotlib = "^3.9.1" 24 | scikit-learn = "^1.5.2" 25 | PyQt5 = "^5.15.11" 26 | 27 | [tool.poetry.group.dev.dependencies] 28 | pytest = "^8.2.2" 29 | mypy = "^1.10.1" 30 | ruff = "^0.11.2" 31 | pre-commit = "^4.1.0" 32 | pyyaml = "^6.0.1" 33 | matplotlib = "^3.9.1" 34 | ipykernel = "^6.29.5" 35 | hypothesis = "^6.122.1" 36 | scipy-stubs = "^1.15.2" 37 | types-pyyaml = "^6.0.12" 38 | microsoft-python-type-stubs = {git = "https://github.com/microsoft/python-type-stubs.git"} 39 | pytest-cov = "^6.0.0" 40 | 41 | 42 | [tool.ruff] 43 | line-length = 120 44 | indent-width = 4 45 | respect-gitignore = true 46 | exclude = ["*.ipynb"] 47 | 48 | [tool.ruff.format] 49 | quote-style = "double" 50 | indent-style = "space" 51 | docstring-code-format = true 52 | skip-magic-trailing-comma = false 53 | line-ending = "auto" 54 | 55 | [tool.ruff.lint] 56 | select = ["A", "E", "F", "I", "PL", "RUF", "SIM", "UP", "W"] 57 | ignore = ["PLR0913"] 58 | 59 | [tool.mypy] 60 | files = "pysatl_cpd" 61 | mypy_path = "pysatl_cpd" 62 | strict = true 63 | 64 | 65 | [build-system] 66 | requires = ["poetry-core"] 67 | build-backend = "poetry.core.masonry.api" 68 | -------------------------------------------------------------------------------- /pysatl_cpd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/analysis/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/analysis/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/analysis/results_analyzer.py: -------------------------------------------------------------------------------- 1 | class CpdResultsAnalyzer: 2 | """Class for counting confusion matrix and other metrics on CPD results""" 3 | 4 | @staticmethod 5 | def count_confusion_matrix( 6 | predicted: list[int], actual: list[int], window: tuple[int, int] | None = None 7 | ) -> tuple[int, int, int, int]: 8 | """static method for counting confusion matrix for hypothesis of equality of change points on a window 9 | 10 | :param: predicted: first array or list of change points, determined as prediction 11 | :param: actual: second array or list of change points, determined as actual 12 | :param: window: tuple of two indices (start, stop), determines a window for hypothesis 13 | 14 | :return: tuple of integers (true-positive, true-negative, false-positive, false-negative) 15 | """ 16 | if not predicted and not actual: 17 | raise ValueError("no results and no predictions") 18 | if window is None: 19 | window = (min(predicted + actual), max(predicted + actual)) 20 | predicted_set = set(predicted) 21 | actual_set = set(actual) 22 | tp = tn = fp = fn = 0 23 | for i in range(window[0], window[1]): 24 | if i in predicted_set: 25 | if i in actual_set: 26 | tp += 1 27 | continue 28 | fp += 1 29 | elif i in actual_set: 30 | fn += 1 31 | continue 32 | tn += 1 33 | return tp, tn, fp, fn 34 | 35 | @staticmethod 36 | def count_accuracy(predicted: list[int], actual: list[int], window: tuple[int, int] | None = None) -> float: 37 | """static method for counting accuracy metric for hypothesis of equality of change points on a window 38 | 39 | :param: predicted: first array or list of change points, determined as prediction 40 | :param: actual: second array or list of change points, determined as actual 41 | :param: window: tuple of two indices (start, stop), determines a window for hypothesis 42 | 43 | :return: float, accuracy metric 44 | """ 45 | tp, tn, fp, fn = CpdResultsAnalyzer.count_confusion_matrix(predicted, actual, window) 46 | if tp + tn == 0: 47 | return 0.0 48 | return (tp + tn) / (tp + tn + fp + fn) 49 | 50 | @staticmethod 51 | def count_precision(predicted: list[int], actual: list[int], window: tuple[int, int] | None = None) -> float: 52 | """static method for counting precision metric for hypothesis of equality of change points on a window 53 | 54 | :param: predicted: first array or list of change points, determined as prediction 55 | :param: actual: second array or list of change points, determined as actual 56 | :param: window: tuple of two indices (start, stop), determines a window for hypothesis 57 | 58 | :return: float, precision metric 59 | """ 60 | tp, _, fp, _ = CpdResultsAnalyzer.count_confusion_matrix(predicted, actual, window) 61 | if tp == 0: 62 | return 0.0 63 | return tp / (tp + fp) 64 | 65 | @staticmethod 66 | def count_recall(predicted: list[int], actual: list[int], window: tuple[int, int] | None = None) -> float: 67 | """static method for counting recall metric for hypothesis of equality of change points on a window 68 | 69 | :param: predicted: first array or list of change points, determined as prediction 70 | :param: actual: second array or list of change points, determined as actual 71 | :param: window: tuple of two indices (start, stop), determines a window for hypothesis 72 | 73 | :return: float, recall metric 74 | """ 75 | tp, _, _, fn = CpdResultsAnalyzer.count_confusion_matrix(predicted, actual, window) 76 | if tp == 0: 77 | return 0 78 | return tp / (tp + fn) 79 | -------------------------------------------------------------------------------- /pysatl_cpd/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/abstract_algorithm.py: -------------------------------------------------------------------------------- 1 | from typing import Protocol 2 | 3 | import numpy as np 4 | import numpy.typing as npt 5 | 6 | 7 | class Algorithm(Protocol): 8 | """Protocol for change point detection algorithms' interface""" 9 | 10 | def detect(self, window: npt.NDArray[np.float64]) -> int: 11 | """Function for finding change points in window 12 | 13 | :param window: part of global data for finding change points 14 | :return: the number of change points in the window 15 | """ 16 | ... 17 | 18 | def localize(self, window: npt.NDArray[np.float64]) -> list[int]: 19 | """Function for finding coordinates of change points in window 20 | 21 | :param window: part of global data for finding change points 22 | :return: list of window change points 23 | """ 24 | ... 25 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Bayesian CPD algorithm's customization blocks. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/abstracts/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for abstract base classes for Bayesian CPD algorithm. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/abstracts/idetector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Bayesian CPD algorithm detector's abstract base class. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | 10 | from typing import Protocol 11 | 12 | import numpy as np 13 | import numpy.typing as npt 14 | 15 | 16 | class IDetector(Protocol): 17 | """ 18 | Protocol for detectors that detect a change point with given growth probabilities for run lengths. 19 | """ 20 | 21 | def detect(self, growth_probs: npt.NDArray[np.float64]) -> bool: 22 | """ 23 | Checks whether a changepoint occurred with given growth probabilities at the time. 24 | :param growth_probs: growth probabilities for run lengths at the time. 25 | :return: boolean indicating whether a changepoint occurred 26 | """ 27 | ... 28 | 29 | def clear(self) -> None: 30 | """ 31 | Clears the detector's state. 32 | """ 33 | ... 34 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/abstracts/ihazard.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Bayesian CPD algorithm hazard function's abstract base class. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | 10 | from typing import Protocol 11 | 12 | import numpy as np 13 | import numpy.typing as npt 14 | 15 | 16 | class IHazard(Protocol): 17 | """ 18 | Hazard function protocol. 19 | """ 20 | 21 | def hazard(self, run_lengths: npt.NDArray[np.intp]) -> npt.NDArray[np.float64]: 22 | """ 23 | Calculates the hazard function for given run lengths. 24 | :param run_lengths: run lengths at the time. 25 | :return: hazard function's values for given run lengths. 26 | """ 27 | ... 28 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/abstracts/ilikelihood.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Bayesian CPD algorithm likelihood function's abstract base class and its' extension for a sample's 3 | probability evaluation with estimated prior parameters. 4 | """ 5 | 6 | __author__ = "Alexey Tatyanenko" 7 | __copyright__ = "Copyright (c) 2025 PySATL project" 8 | __license__ = "SPDX-License-Identifier: MIT" 9 | 10 | 11 | from typing import Protocol 12 | 13 | import numpy as np 14 | import numpy.typing as npt 15 | 16 | 17 | class ILikelihood(Protocol): 18 | """ 19 | Likelihood function's protocol. 20 | """ 21 | 22 | def learn(self, learning_sample: npt.NDArray[np.float64]) -> None: 23 | """ 24 | Learns first parameters of a likelihood function on a given sample. 25 | :param learning_sample: a sample for parameter learning. 26 | """ 27 | ... 28 | 29 | def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: 30 | """ 31 | Returns predictive probabilities for a given observation based on stored parameters. 32 | :param observation: an observation from a sample. 33 | :return: predictive probabilities for a given observation. 34 | """ 35 | ... 36 | 37 | def update(self, observation: np.float64) -> None: 38 | """ 39 | Updates parameters of a likelihood function according to the given observation. 40 | :param observation: an observation from a sample. 41 | """ 42 | ... 43 | 44 | def clear(self) -> None: 45 | """ 46 | Clears likelihood function's state. 47 | """ 48 | ... 49 | 50 | 51 | class ILikelihoodWithPriorProbability(ILikelihood, Protocol): 52 | """ 53 | Likelihood which also allows to evaluate how probable is learning sample with learned prior parameters. 54 | """ 55 | 56 | def probability_of_learned_prior(self, sample: npt.NDArray[np.float64]) -> np.float64: 57 | """ 58 | Evaluation of how probable is learning sample with learned prior parameters. 59 | :param sample: a sample for the likelihood. 60 | :return: probability of getting a learning sample with learned prior parameters. 61 | """ 62 | ... 63 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/abstracts/ilocalizer.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Bayesian CPD algorithm localizer's abstract base class. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | 10 | from typing import Protocol 11 | 12 | import numpy as np 13 | import numpy.typing as npt 14 | 15 | 16 | class ILocalizer(Protocol): 17 | """ 18 | Protocol for localizers that localize a change point with given growth probabilities for run lengths. 19 | """ 20 | 21 | def localize(self, growth_probs: npt.NDArray[np.float64]) -> int: 22 | """ 23 | Localizes a change point with given growth probabilities for run lengths. 24 | :param growth_probs: growth probabilities for run lengths at the time. 25 | :return: run length corresponding with a change point. 26 | """ 27 | ... 28 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/detectors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementations of Bayesian CPD algorithm detectors. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/detectors/drop.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of Bayesian CPD algorithm detector analyzing drop of maximal run length's probability. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from typing import Optional 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | 14 | from pysatl_cpd.core.algorithms.bayesian.abstracts.idetector import IDetector 15 | 16 | 17 | class DropDetector(IDetector): 18 | """ 19 | A detector that detects a change point if the instantaneous drop in the probability of the maximum run length 20 | exceeds the threshold. 21 | """ 22 | 23 | def __init__(self, threshold: float): 24 | """ 25 | Initializes the detector with given drop threshold. 26 | :param threshold: threshold for a drop of the maximum run length's probability. 27 | """ 28 | self.__previous_growth_prob: Optional[float] = None 29 | 30 | self._threshold = threshold 31 | assert 0.0 <= self._threshold <= 1.0, "Drop threshold must be in [0.0, 1.0]" 32 | 33 | def detect(self, growth_probs: npt.NDArray[np.float64]) -> bool: 34 | """ 35 | Checks whether a changepoint occurred with given growth probabilities at the time. 36 | :param growth_probs: growth probabilities for run lengths at the time. 37 | :return: boolean indicating whether a changepoint occurred. 38 | """ 39 | if len(growth_probs) == 0: 40 | return False 41 | 42 | last_growth_prob = growth_probs[-1] 43 | if self.__previous_growth_prob is None: 44 | self.__previous_growth_prob = last_growth_prob 45 | return False 46 | 47 | drop = float(self.__previous_growth_prob - last_growth_prob) 48 | 49 | return drop >= self._threshold 50 | 51 | def clear(self) -> None: 52 | """ 53 | Clears the detector's state. 54 | """ 55 | self.__previous_growth_prob = None 56 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/detectors/threshold.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of Bayesian CPD algorithm detector comparing maximal run length's probability with 3 | a threshold. 4 | """ 5 | 6 | __author__ = "Alexey Tatyanenko" 7 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 8 | __license__ = "SPDX-License-Identifier: MIT" 9 | 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | 14 | from pysatl_cpd.core.algorithms.bayesian.abstracts.idetector import IDetector 15 | 16 | 17 | class ThresholdDetector(IDetector): 18 | """ 19 | A detector that detects a change point if the probability of the maximum run length drops below the threshold. 20 | """ 21 | 22 | def __init__(self, threshold: float): 23 | """ 24 | Detects a change point if the probability of the maximum run length drops below the threshold. 25 | :param threshold: lower threshold for the maximum run length's probability. 26 | """ 27 | self._threshold = threshold 28 | assert 0.0 <= self._threshold <= 1.0, "Threshold must be in [0.0, 1.0]" 29 | 30 | def detect(self, growth_probs: npt.NDArray[np.float64]) -> bool: 31 | """ 32 | Detects a change point if the probability of the maximum run length drops below the threshold. 33 | :param growth_probs: growth probabilities for run lengths at the time. 34 | :return: boolean indicating whether a changepoint occurred. 35 | """ 36 | return len(growth_probs) > 0 and growth_probs[-1] < self._threshold 37 | 38 | def clear(self) -> None: 39 | """ 40 | Clears the detector's state (for this detector it does nothing). 41 | """ 42 | pass 43 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/hazards/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementations of Bayesian CPD algorithm hazard functions. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/hazards/constant.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of Bayesian CPD algorithm constant hazard function corresponding to an exponential 3 | distribution. 4 | """ 5 | 6 | __author__ = "Alexey Tatyanenko" 7 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 8 | __license__ = "SPDX-License-Identifier: MIT" 9 | 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | 14 | from pysatl_cpd.core.algorithms.bayesian.abstracts.ihazard import IHazard 15 | 16 | 17 | class ConstantHazard(IHazard): 18 | """ 19 | A constant hazard function, corresponding to an exponential distribution with a given rate. 20 | """ 21 | 22 | def __init__(self, rate: float): 23 | """ 24 | Initializes the constant hazard function with a given rate of an underlying exponential distribution. 25 | :param rate: rate of an underlying exponential distribution. 26 | """ 27 | self._rate = np.float64(rate) 28 | assert self._rate >= 1.0, "Hazard rate cannot be less than 1.0" 29 | 30 | def hazard(self, run_lengths: npt.NDArray[np.intp]) -> npt.NDArray[np.float64]: 31 | """ 32 | Calculates the constant hazard function. 33 | :param run_lengths: run lengths at the time. 34 | :return: hazard function's values for given run lengths. 35 | """ 36 | return np.ones(len(run_lengths)) / self._rate 37 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/likelihoods/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementations of Bayesian CPD algorithm likelihood functions. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/likelihoods/exponential_conjugate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for exponential likelihood function with gamma prior used in Bayesian change point detection. Also contains its' 3 | extension for a sample's probability evaluation with estimated prior parameters. 4 | """ 5 | 6 | __author__ = "Alexey Tatyanenko" 7 | __copyright__ = "Copyright (c) 2025 PySATL project" 8 | __license__ = "SPDX-License-Identifier: MIT" 9 | 10 | from typing import Optional 11 | 12 | import numpy as np 13 | import scipy.stats 14 | from numpy import typing as npt 15 | 16 | from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood, ILikelihoodWithPriorProbability 17 | 18 | 19 | class ExponentialConjugate(ILikelihood): 20 | """ 21 | Class implementing exponential likelihood function with conjugate gamma prior for Bayesian change point detection. 22 | Note: it's support is [0; +inf) 23 | """ 24 | 25 | def __init__(self) -> None: 26 | self._shape_prior: Optional[np.float64] = None 27 | self._scale_prior: Optional[np.float64] = None 28 | 29 | self.__shapes: npt.NDArray[np.float64] = np.array([]) 30 | self.__scales: npt.NDArray[np.float64] = np.array([]) 31 | 32 | def learn(self, learning_sample: npt.NDArray[np.float64]) -> None: 33 | """ 34 | Learns starting prior parameters to model exponential distribution's likelihood function. 35 | :param learning_sample: sample to learn starting prior parameters. 36 | :return: 37 | """ 38 | self._shape_prior = np.float64(learning_sample.shape[0]) 39 | self._scale_prior = np.sum(learning_sample) 40 | 41 | assert self._shape_prior is not None 42 | assert self._scale_prior is not None 43 | 44 | self.__shapes = np.array([self._shape_prior]) 45 | self.__scales = np.array([self._scale_prior]) 46 | 47 | def update(self, observation: np.float64) -> None: 48 | """ 49 | Updates parameters (calculating posterior parameters) after a given new observation. 50 | :param observation: a new observation of time series. 51 | :return: 52 | """ 53 | assert self._shape_prior is not None 54 | assert self._scale_prior is not None 55 | 56 | self.__shapes = np.append([self._shape_prior], (self.__shapes + 1.0)) 57 | self.__scales = np.append([self._scale_prior], (self.__scales + observation)) 58 | 59 | def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: 60 | """ 61 | Calculates predictive posterior probabilities of exponential likelihood for corresponding values of run length. 62 | :param observation: a new observation of time series. 63 | :return: an array of predictive posterior probabilities (densities). 64 | """ 65 | assert self._shape_prior is not None 66 | assert self._scale_prior is not None 67 | 68 | predictive_probabilities = scipy.stats.lomax.pdf( 69 | x=observation, 70 | c=self.__shapes, 71 | loc=0.0, 72 | scale=self.__scales, 73 | ) 74 | 75 | # In case of negative scale parameter corresponding distribution does not exist, so substitution of 76 | # an observation results in a NaN-value. In context of algorithm it can be assumed that this probability is 0. 77 | without_nans = np.nan_to_num(x=predictive_probabilities, nan=0.0) 78 | 79 | return np.array(without_nans) 80 | 81 | def clear(self) -> None: 82 | """ 83 | Clears a current state of the likelihood, setting parameters to default init values. 84 | :return: 85 | """ 86 | self._shape_prior = None 87 | self._scale_prior = None 88 | 89 | self.__shapes = np.array([]) 90 | self.__scales = np.array([]) 91 | 92 | 93 | class ExponentialConjugateWithPriorProbability(ExponentialConjugate, ILikelihoodWithPriorProbability): 94 | """ 95 | Exponential likelihood, supporting a sample's probability evaluation with estimated prior parameters. 96 | """ 97 | 98 | def __init__(self) -> None: 99 | super().__init__() 100 | 101 | def probability_of_learned_prior(self, sample: npt.NDArray[np.float64]) -> np.float64: 102 | """ 103 | Evaluates probability of a sample with learned prior parameters of exponential conjugate likelihood. 104 | :param sample: sample for probability's evaluation. 105 | :return: probability of a sample with learned prior parameters of exponential conjugate likelihood. 106 | """ 107 | assert self._shape_prior is not None 108 | assert self._scale_prior is not None 109 | 110 | probabilities_of_learning_sample = scipy.stats.lomax.pdf( 111 | x=sample, 112 | c=self._shape_prior, 113 | loc=0.0, 114 | scale=self._scale_prior, 115 | ) 116 | 117 | without_nans = np.nan_to_num(x=probabilities_of_learning_sample, nan=0.0) 118 | 119 | probability_of_learning_sample = np.prod(without_nans) 120 | return np.float64(probability_of_learning_sample) 121 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/likelihoods/gaussian.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of Bayesian CPD algorithm gaussian (normal) likelihood function with mean and standard 3 | deviation learning. 4 | """ 5 | 6 | __author__ = "Alexey Tatyanenko" 7 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 8 | __license__ = "SPDX-License-Identifier: MIT" 9 | 10 | import numpy as np 11 | import numpy.typing as npt 12 | from scipy import stats 13 | from typing_extensions import deprecated 14 | 15 | from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood 16 | 17 | 18 | @deprecated("Use GaussianConjugate instead") 19 | class Gaussian(ILikelihood): 20 | """ 21 | Likelihood for Gaussian (a.k.a. normal) distribution, parametrized by mean and standard deviation. 22 | """ 23 | 24 | def __init__(self) -> None: 25 | """ 26 | Initializes the GaussianLikelihood, parametrized by mean and standard deviation (without any concrete values). 27 | """ 28 | self.__means = np.array([]) 29 | self.__standard_deviations = np.array([]) 30 | 31 | self.__sample_sum = 0.0 32 | self.__squared_sample_sum = 0.0 33 | self.__gap_size = 0 34 | 35 | def __update_parameters_lists(self) -> None: 36 | """ 37 | Updates the parameters lists based on accumulated sums, assuming we have at least 2 observations. 38 | """ 39 | assert self.__gap_size > 1 40 | new_mean = self.__sample_sum / self.__gap_size 41 | variance = (self.__squared_sample_sum - (self.__sample_sum**2.0) / self.__gap_size) / (self.__gap_size - 1) 42 | assert variance > 0.0 43 | assert len(self.__means) == len(self.__standard_deviations) 44 | 45 | new_standard_deviation = np.sqrt(variance) 46 | 47 | self.__means = np.append(self.__means, new_mean) 48 | self.__standard_deviations = np.append(self.__standard_deviations, new_standard_deviation) 49 | 50 | def learn(self, learning_sample: npt.NDArray[np.float64]) -> None: 51 | """ 52 | Learns first mean and stander deviations from a given sample. 53 | :param learning_sample: a sample for parameter learning. 54 | :return: 55 | """ 56 | assert len(self.__means) == len(self.__standard_deviations) == 0 57 | assert self.__gap_size == 0 58 | 59 | self.__sample_sum += sum(learning_sample) 60 | for observation in learning_sample: 61 | self.__squared_sample_sum += observation**2.0 62 | 63 | self.__gap_size = len(learning_sample) 64 | # self.__squared_sample_sum += sum(learning_sample ** 2.) 65 | 66 | self.__update_parameters_lists() 67 | 68 | def update(self, observation: np.float64) -> None: 69 | """ 70 | Updates the means and standard deviations lists according to the given observation. 71 | :param observation: an observation from a sample. 72 | :return: 73 | """ 74 | self.__sample_sum += observation 75 | self.__squared_sample_sum += observation**2 76 | self.__gap_size += 1 77 | 78 | self.__update_parameters_lists() 79 | 80 | def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: 81 | """ 82 | Returns predictive probabilities for a given observation based on stored means and standard deviations. 83 | :param observation: an observation from a sample. 84 | :return: predictive probabilities for a given observation. 85 | """ 86 | return np.array(stats.norm(self.__means, self.__standard_deviations).pdf(observation)) 87 | 88 | def clear(self) -> None: 89 | """ 90 | Clears parameters of gaussian likelihood. 91 | :return: 92 | """ 93 | self.__means = np.array([]) 94 | self.__standard_deviations = np.array([]) 95 | 96 | self.__sample_sum = 0.0 97 | self.__squared_sample_sum = 0.0 98 | self.__gap_size = 0 99 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/likelihoods/heuristic_gaussian_vs_exponential.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for prediction model for Bayesian online CPD, which supports heuristic selection of gaussian (normal) or 3 | exponential conjugate likelihood based on estimation from learning sample. 4 | """ 5 | 6 | from typing import Optional 7 | 8 | import numpy as np 9 | from numpy import typing as npt 10 | 11 | from pysatl_cpd.core.algorithms.bayesian.abstracts.ilikelihood import ILikelihood, ILikelihoodWithPriorProbability 12 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.exponential_conjugate import ( 13 | ExponentialConjugateWithPriorProbability, 14 | ) 15 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import GaussianConjugateWithPriorProbability 16 | 17 | 18 | class HeuristicGaussianVsExponential(ILikelihood): 19 | """ 20 | Prediction model class with heuristic selection of gaussian (normal) or exponential conjugate likelihood based on 21 | estimation from learning sample. 22 | """ 23 | 24 | def __init__(self) -> None: 25 | self.__likelihood: Optional[ILikelihoodWithPriorProbability] = None 26 | 27 | def learn(self, learning_sample: npt.NDArray[np.float64]) -> None: 28 | """ 29 | Learns prior parameters for gaussian and exponential likelihoods, evaluates which makes a learning sample more 30 | probable and saves acquired likelihood for further work. 31 | :param learning_sample: a sample to estimate prior parameters and compare likelihoods. 32 | :return: 33 | """ 34 | gaussian = GaussianConjugateWithPriorProbability() 35 | exponential = ExponentialConjugateWithPriorProbability() 36 | 37 | gaussian.learn(learning_sample) 38 | exponential.learn(learning_sample) 39 | 40 | gaussian_probability = gaussian.probability_of_learned_prior(learning_sample) 41 | exponential_probability = exponential.probability_of_learned_prior(learning_sample) 42 | 43 | self.__likelihood = gaussian if gaussian_probability >= exponential_probability else exponential 44 | 45 | def predict(self, observation: np.float64) -> npt.NDArray[np.float64]: 46 | """ 47 | Returns prediction from an underlying likelihood. 48 | :param observation: a new observation of time series. 49 | :return: an array of predictive posterior probabilities (densities). 50 | """ 51 | assert self.__likelihood is not None, "Underlying likelihood must not be None" 52 | 53 | return self.__likelihood.predict(observation) 54 | 55 | def update(self, observation: np.float64) -> None: 56 | """ 57 | Updates an underlying likelihood's state (calculates posterior parameters). 58 | :param observation: a new observation of time series. 59 | :return: 60 | """ 61 | assert self.__likelihood is not None, "Underlying likelihood must not be None" 62 | 63 | self.__likelihood.update(observation) 64 | 65 | def clear(self) -> None: 66 | """ 67 | Sets an underlying likelihood to None. 68 | :return: 69 | """ 70 | self.__likelihood = None 71 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/localizers/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementations of Bayesian CPD algorithm localizers. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian/localizers/argmax.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of Bayesian CPD algorithm localizer selecting the most probable run length. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2024 Alexey Tatyanenko" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import numpy as np 10 | import numpy.typing as npt 11 | 12 | from pysatl_cpd.core.algorithms.bayesian.abstracts.ilocalizer import ILocalizer 13 | 14 | 15 | class ArgmaxLocalizer(ILocalizer): 16 | """ 17 | A localizer that localizes a change point corresponding with the most probable non-max run length. 18 | """ 19 | 20 | def localize(self, growth_probs: npt.NDArray[np.float64]) -> int: 21 | """ 22 | Localizes a change point corresponding with the most probable non-max run length. 23 | :param growth_probs: growth probabilities for run lengths at the time. 24 | :return: the most probable non-max run length corresponding change point; 25 | in case of one-element array returns it. 26 | """ 27 | max_run_length = growth_probs.shape[0] 28 | assert max_run_length > 0, "Run length distribution should not be empty" 29 | 30 | return 0 if max_run_length == 1 else int(growth_probs[:-1].argmax()) 31 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/bayesian_linear_heuristic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for chanhe point detection online algorithm, based on Bayesian online algorithm with heuristic, turning it into 3 | an algorithm with linear time complexity with a cost of some information loss. 4 | """ 5 | 6 | __author__ = "Alexey Tatyanenko" 7 | __copyright__ = "Copyright (c) 2025 PySATL project" 8 | __license__ = "SPDX-License-Identifier: MIT" 9 | 10 | import copy 11 | from typing import Optional 12 | 13 | import numpy as np 14 | from numpy import typing as npt 15 | 16 | from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnline 17 | from pysatl_cpd.core.algorithms.online_algorithm import OnlineAlgorithm 18 | 19 | 20 | class BayesianLinearHeuristic(OnlineAlgorithm): 21 | """An online change point detection algorithm, based on changing the main Bayesian online algorithm instance to the 22 | duplicating time after some time. Note: this heuristic, however makes an algorithm linear on big time series, leads 23 | to some information loss, which may lead to some unstability in output's correctness.""" 24 | 25 | def __init__(self, algorithm: BayesianOnline, time_before_duplicate_start: int, duplicate_preparation_time: int): 26 | """Initializes the Bayesian change point detection algorithm with linear time-complexity heuristc.. 27 | 28 | :param algorithm: The base algorithm instance to use for detection/localization. 29 | :param time_before_duplicate_start: Time steps before starting duplicate algorithm's preparation (training 30 | and Bayesian modeling). 31 | :param duplicate_preparation_time: Time steps required to prepare (train and perform Bayesian modeling) the 32 | duplicating algorithm. 33 | :raises ValueError: If time constraints are not satisfied. 34 | :return: 35 | """ 36 | if not (time_before_duplicate_start > duplicate_preparation_time > 0): 37 | raise ValueError( 38 | "time_before_duplicate_start must be greater than duplicate_preparation_time, which must be positive" 39 | ) 40 | 41 | self.__original_algorithm = copy.deepcopy(algorithm) 42 | self.__time_before_duplicate_start = time_before_duplicate_start 43 | self.__duplicate_preparation_time = duplicate_preparation_time 44 | self.__main_algorithm = copy.deepcopy(algorithm) 45 | self.__duplicating_algorithm: Optional[BayesianOnline] = None 46 | self.__time = 0 47 | self.__last_algorithm_start_time = 0 48 | 49 | @property 50 | def __work_time(self) -> int: 51 | """ 52 | Returns the number of steps since the last algorithm start. 53 | :return: the number of steps since the last algorithm start. 54 | """ 55 | return self.__time - self.__last_algorithm_start_time 56 | 57 | def _handle_duplicate_preparation( 58 | self, observation: np.float64 | npt.NDArray[np.float64], method_name: str 59 | ) -> None: 60 | """ 61 | Manages the creation and training, Bayesian modeling of the duplicating algorithm. 62 | 63 | :param observation: a new observation from a time series. 64 | :param method_name: the method to call on the duplicating algorithm ('detect'/'localize'). 65 | :return: 66 | """ 67 | work_time = self.__work_time 68 | stage_end = self.__time_before_duplicate_start + self.__duplicate_preparation_time 69 | 70 | # Start initializing duplicating algorithm 71 | if work_time == self.__time_before_duplicate_start: 72 | self.__duplicating_algorithm = copy.deepcopy(self.__original_algorithm) 73 | 74 | # Train the duplicating algorithm amd perform a Bayesian modeling during preparation period 75 | elif self.__time_before_duplicate_start < work_time < stage_end: 76 | if self.__duplicating_algorithm is not None: 77 | getattr(self.__duplicating_algorithm, method_name)(observation) 78 | 79 | # Switch to the prepared duplicating algorithm 80 | elif work_time == stage_end: 81 | assert self.__duplicating_algorithm is not None, "Duplicating algorithm must be initialized" 82 | self.__main_algorithm = copy.deepcopy(self.__duplicating_algorithm) 83 | self.__duplicating_algorithm = None 84 | self.__last_algorithm_start_time = self.__time - self.__duplicate_preparation_time 85 | 86 | def detect(self, observation: np.float64 | npt.NDArray[np.float64]) -> bool: 87 | """ 88 | Processes an observation and returns whether a change point was detected by a main algorithm. 89 | :param observation: a new observation from a time series. Note: only univariate data is supported for now. 90 | :return: whether a change point was detected by a main algorithm. 91 | """ 92 | if observation is npt.NDArray[np.float64]: 93 | raise TypeError("Multivariate observations are not supported") 94 | assert self.__main_algorithm is not None, "Main algorithm must be initialized" 95 | 96 | # Run main detection 97 | if self.__main_algorithm.detect(observation): 98 | self.__last_algorithm_start_time = self.__time 99 | self.__duplicating_algorithm = None 100 | self.__time += 1 101 | return True 102 | 103 | # Manage duplicating algorithm training 104 | self._handle_duplicate_preparation(observation, "detect") 105 | self.__time += 1 106 | return False 107 | 108 | def localize(self, observation: np.float64 | npt.NDArray[np.float64]) -> Optional[int]: 109 | """ 110 | Processes an observation and returns the change point if localized by the main algorithm. 111 | :param observation: a new observation from a time series. Note: only univariate data is supported for now. 112 | :return: a change point, if it was localized, None otherwise. 113 | """ 114 | if observation is npt.NDArray[np.float64]: 115 | raise TypeError("Multivariate observations are not supported") 116 | assert self.__main_algorithm is not None, "Main algorithm must be initialized" 117 | 118 | # Run main localization 119 | if (result := self.__main_algorithm.localize(observation)) is not None: 120 | change_point = self.__last_algorithm_start_time + result 121 | self.__last_algorithm_start_time = change_point 122 | self.__duplicating_algorithm = None 123 | self.__time += 1 124 | return change_point 125 | 126 | # Manage duplicating algorithm training 127 | self._handle_duplicate_preparation(observation, "localize") 128 | self.__time += 1 129 | return None 130 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/classification/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/abstracts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/classification/abstracts/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/abstracts/iclassifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Classification CPD algorithm's classifier abstract base class. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from abc import ABC, abstractmethod 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | 14 | 15 | class Classifier(ABC): 16 | """Classifier's abstract base class.""" 17 | 18 | @abstractmethod 19 | def train(self, sample: npt.NDArray[np.float64], barrier: int) -> None: 20 | """Trains binary classifier on the given sample. 21 | The observations before barrier belong to the class 0, after barrier --- to the class 1. 22 | 23 | :param sample: sample for training classifier. 24 | :param barrier: index of observation that splits the given sample. 25 | """ 26 | raise NotImplementedError 27 | 28 | @abstractmethod 29 | def predict(self, sample: npt.NDArray[np.float64]) -> npt.NDArray[np.intp]: 30 | """Classifies the elements of a sample into one of two classes, based on training with the barrier. 31 | 32 | :param sample: sample to classify. 33 | """ 34 | raise NotImplementedError 35 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/abstracts/iquality_metric.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Classification CPD algorithm's quality metric abstract base class. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from abc import ABC, abstractmethod 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | 14 | 15 | class QualityMetric(ABC): 16 | """Quality metric's abstract base class.""" 17 | 18 | @abstractmethod 19 | def assess_barrier(self, classes: npt.NDArray[np.intp], time: int) -> float: 20 | """Evaluates quality function based on classificator in the specified point. 21 | 22 | :param classes: Classes of observations, predicted by the classifier. 23 | :param time: Index of barrier in the given sample to calculate quality. 24 | :return: Quality assessment. 25 | """ 26 | raise NotImplementedError 27 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/abstracts/istatistic_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Classification CPD algorithm's test statistic abstract base class. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from abc import ABC, abstractmethod 10 | 11 | 12 | class TestStatistic(ABC): 13 | """Test statistic's abstract base class.""" 14 | 15 | @abstractmethod 16 | def get_change_points(self, classifier_assessments: list[float]) -> list[int]: 17 | """Separates change points from other points in sample based on some criterion. 18 | 19 | :param classifier_assessments: List of quality assessments evaluated in each point of the sample. 20 | :return: Change points in the current window. 21 | """ 22 | raise NotImplementedError 23 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/classifiers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/classification/classifiers/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/classifiers/decision_tree.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of decision tree classifier for cpd. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from typing import cast 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | import sklearn.tree as sk 14 | 15 | from pysatl_cpd.core.algorithms.classification.abstracts.iclassifier import Classifier 16 | 17 | 18 | class DecisionTreeClassifier(Classifier): 19 | """ 20 | The class implementing decision tree classifier for cpd. 21 | """ 22 | 23 | def __init__(self) -> None: 24 | """ 25 | Initializes a new instance of decision tree classifier for cpd. 26 | """ 27 | self.__model: sk.DecisionTreeClassifier | None = None 28 | 29 | def train(self, sample: npt.NDArray[np.float64], barrier: int) -> None: 30 | """Trains classifier on the given sample. 31 | 32 | :param sample: sample for training classifier. 33 | :param barrier: index of observation that splits the given sample. 34 | """ 35 | classes = np.array([0 if i <= barrier else 1 for i in range(len(sample))]) 36 | self.__model = sk.DecisionTreeClassifier() 37 | self.__model.fit(sample, classes) 38 | 39 | def predict(self, sample: npt.NDArray[np.float64]) -> npt.NDArray[np.intp]: 40 | """Classifies observations in the given sample based on training with barrier. 41 | 42 | :param sample: sample to classify. 43 | """ 44 | assert self.__model is not None 45 | return cast(npt.NDArray[np.intp], self.__model.predict(sample)) 46 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/classifiers/knn.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of knn classifier for cpd. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import typing as tp 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | from sklearn.neighbors import KNeighborsClassifier 14 | 15 | from pysatl_cpd.core.algorithms.classification.abstracts.iclassifier import Classifier 16 | 17 | 18 | class KNNClassifier(Classifier): 19 | """ 20 | The class implementing knn classifier for cpd. 21 | """ 22 | 23 | def __init__( 24 | self, k: int, distance: tp.Literal["manhattan", "euclidean", "minkowski", "hamming"] = "minkowski" 25 | ) -> None: 26 | """ 27 | Initializes a new instance of knn classifier for cpd. 28 | :param k: number of neighbours in the knn graph relative to each point. 29 | :param distance: Metric to use for distance computation. 30 | Default is "minkowski", which results in the standard Euclidean distance when p = 2. 31 | """ 32 | self.__k = k 33 | self.__distance: tp.Literal["manhattan", "euclidean", "minkowski", "hamming"] = distance 34 | self.__model: KNeighborsClassifier | None = None 35 | 36 | def train(self, sample: npt.NDArray[np.float64], barrier: int) -> None: 37 | """Trains classifier on the given sample. 38 | 39 | :param sample: sample for training classifier. 40 | :param barrier: index of observation that splits the given sample. 41 | """ 42 | classes = np.array([0 if i <= barrier else 1 for i in range(len(sample))]) 43 | self.__model = KNeighborsClassifier(n_neighbors=self.__k, metric=self.__distance) 44 | self.__model.fit(sample, classes) 45 | 46 | def predict(self, sample: npt.NDArray[np.float64]) -> npt.NDArray[np.intp]: 47 | """Classifies observations in the given sample based on training with barrier. 48 | 49 | :param sample: sample to classify. 50 | """ 51 | assert self.__model is not None 52 | return self.__model.predict(sample) 53 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/classifiers/logistic_regression.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of classifier based on logistic regression for cpd. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from typing import cast 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | from sklearn.linear_model import LogisticRegression 14 | 15 | from pysatl_cpd.core.algorithms.classification.abstracts.iclassifier import Classifier 16 | 17 | 18 | class LogisticRegressionClassifier(Classifier): 19 | """ 20 | The class implementing classifier based on logistic regression for cpd. 21 | """ 22 | 23 | def __init__(self) -> None: 24 | """ 25 | Initializes a new instance of classifier based on logistic regression for cpd. 26 | """ 27 | self.__model: LogisticRegression | None = None 28 | 29 | def train(self, sample: npt.NDArray[np.float64], barrier: int) -> None: 30 | """Trains classifier on the given sample. 31 | 32 | :param sample: sample for training classifier. 33 | :param barrier: index of observation that splits the given sample. 34 | """ 35 | classes = np.array([0 if i <= barrier else 1 for i in range(len(sample))]) 36 | self.__model = LogisticRegression() 37 | self.__model.fit(sample, classes) 38 | 39 | def predict(self, sample: npt.NDArray[np.float64]) -> npt.NDArray[np.intp]: 40 | """Classifies observations in the given sample based on training with barrier. 41 | 42 | :param sample: sample to classify. 43 | """ 44 | assert self.__model is not None 45 | return cast(npt.NDArray[np.intp], self.__model.predict(sample)) 46 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/classifiers/rf.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of random forest classifier for cpd. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from typing import cast 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | from sklearn.ensemble import RandomForestClassifier 14 | 15 | from pysatl_cpd.core.algorithms.classification.abstracts.iclassifier import Classifier 16 | 17 | 18 | class RFClassifier(Classifier): 19 | """ 20 | The class implementing random forest classifier for cpd. 21 | """ 22 | 23 | def __init__(self) -> None: 24 | """ 25 | Initializes a new instance of RF classifier for cpd. 26 | """ 27 | self.__model: RandomForestClassifier | None = None 28 | 29 | def train(self, sample: npt.NDArray[np.float64], barrier: int) -> None: 30 | """Trains classifier on the given sample. 31 | 32 | :param sample: sample for training classifier. 33 | :param barrier: index of observation that splits the given sample. 34 | """ 35 | classes = np.array([0 if i <= barrier else 1 for i in range(len(sample))]) 36 | self.__model = RandomForestClassifier() 37 | self.__model.fit(sample, classes) 38 | 39 | def predict(self, sample: npt.NDArray[np.float64]) -> npt.NDArray[np.intp]: 40 | """Classifies observations in the given sample based on training with barrier. 41 | 42 | :param sample: sample to classify. 43 | """ 44 | assert self.__model is not None 45 | return cast(npt.NDArray[np.intp], self.__model.predict(sample)) 46 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/classifiers/svm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of svm classifier for cpd. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import typing as tp 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | from sklearn.svm import SVC 14 | 15 | from pysatl_cpd.core.algorithms.classification.abstracts.iclassifier import Classifier 16 | 17 | 18 | class SVMClassifier(Classifier): 19 | """ 20 | The class implementing svm classifier for cpd. 21 | """ 22 | 23 | def __init__(self, kernel: tp.Literal["linear", "poly", "rbf", "sigmoid", "precomputed"] = "rbf") -> None: 24 | """ 25 | Initializes a new instance of svm classifier for cpd. 26 | :param kernel: specifies the kernel type to be used in the algorithm. If none is given, 'rbf' will be used. 27 | """ 28 | self.__kernel: tp.Literal["linear", "poly", "rbf", "sigmoid", "precomputed"] = kernel 29 | self.__model: SVC | None = None 30 | 31 | def train(self, sample: npt.NDArray[np.float64], barrier: int) -> None: 32 | """Trains classifier on the given sample. 33 | 34 | :param sample: sample for training classifier. 35 | :param barrier: index of observation that splits the given sample. 36 | """ 37 | classes = np.array([0 if i <= barrier else 1 for i in range(len(sample))]) 38 | self.__model = SVC(kernel=self.__kernel) 39 | self.__model.fit(sample, classes) 40 | 41 | def predict(self, sample: npt.NDArray[np.float64]) -> npt.NDArray[np.intp]: 42 | """Classifies observations in the given sample based on training with barrier. 43 | 44 | :param sample: sample to classify. 45 | """ 46 | assert self.__model is not None 47 | return tp.cast(npt.NDArray[np.intp], self.__model.predict(sample)) 48 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/quality_metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/classification/quality_metrics/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/quality_metrics/classification/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/classification/quality_metrics/classification/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/quality_metrics/classification/accuracy.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of classifier's quality metric based on accuracy. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import numpy as np 10 | import numpy.typing as npt 11 | 12 | from pysatl_cpd.core.algorithms.classification.abstracts.iquality_metric import QualityMetric 13 | 14 | 15 | class Accuracy(QualityMetric): 16 | """ 17 | The class implementing quality metric based on accuracy. 18 | """ 19 | 20 | def assess_barrier(self, classes: npt.NDArray[np.intp], time: int) -> float: 21 | """Evaluates quality function based on classificator in the specified point. 22 | 23 | :param classes: Classes of observations, predicted by the classifier. 24 | :param time: Index of barrier in the given sample to calculate quality. 25 | :return: Quality assessment. 26 | """ 27 | before = classes[:time] 28 | after = classes[time:] 29 | before_length = time 30 | sample_length = len(classes) 31 | 32 | true_positive = float(after.sum()) 33 | true_negative = before_length - float(before.sum()) 34 | 35 | return (true_positive + true_negative) / sample_length 36 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/quality_metrics/classification/f1.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of classifier's quality metric based on F1 score. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import numpy as np 10 | import numpy.typing as npt 11 | 12 | from pysatl_cpd.core.algorithms.classification.abstracts.iquality_metric import QualityMetric 13 | 14 | 15 | class F1(QualityMetric): 16 | """ 17 | The class implementing quality metric based on F1 score. 18 | """ 19 | 20 | def assess_barrier(self, classes: npt.NDArray[np.intp], time: int) -> float: 21 | """Evaluates quality function based on classificator in the specified point. 22 | 23 | :param classes: Classes of observations, predicted by the classifier. 24 | :param time: Index of barrier in the given sample to calculate quality. 25 | :return: Quality assessment. 26 | """ 27 | before = classes[:time] 28 | after = classes[time:] 29 | after_length = len(after) 30 | 31 | true_positive = float(after.sum()) 32 | false_positive = float(before.sum()) 33 | false_negative = after_length - true_positive 34 | 35 | return 2 * true_positive / (2 * true_positive + false_positive + false_negative) 36 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/quality_metrics/classification/mcc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of classifier's quality metric based on Matthews correlation coefficient. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from math import sqrt 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | 14 | from pysatl_cpd.core.algorithms.classification.abstracts.iquality_metric import QualityMetric 15 | 16 | 17 | class MCC(QualityMetric): 18 | """ 19 | The class implementing quality metric based on Matthews correlation coefficient. 20 | """ 21 | 22 | def assess_barrier(self, classes: npt.NDArray[np.intp], time: int) -> float: 23 | """Evaluates quality function based on classificator in the specified point. 24 | 25 | :param classes: Classes of observations, predicted by the classifier. 26 | :param time: Index of barrier in the given sample to calculate quality. 27 | :return: Quality assessment. 28 | """ 29 | before = classes[:time] 30 | after = classes[time:] 31 | after_length = len(after) 32 | before_length = time 33 | 34 | true_positive = after.sum() 35 | false_positive = before.sum() 36 | true_negative = before_length - false_positive 37 | false_negative = after_length - true_positive 38 | positive = true_positive + false_negative 39 | negative = false_positive + true_negative 40 | pp = true_positive + false_positive 41 | pn = false_negative + true_negative 42 | 43 | if pp == 0 or pn == 0: 44 | return -1.0 45 | 46 | tpr = true_positive / positive 47 | tnr = true_negative / negative 48 | ppv = true_positive / pp 49 | npv = true_negative / pn 50 | fnr = false_negative / positive 51 | fpr = false_positive / negative 52 | fo_rate = false_negative / pn 53 | fdr = false_positive / pp 54 | 55 | return sqrt(tpr * tnr * ppv * npv) - sqrt(fnr * fpr * fo_rate * fdr) 56 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/quality_metrics/clustering/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/classification/quality_metrics/clustering/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/test_statistics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/classification/test_statistics/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification/test_statistics/threshold_overcome.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of test statistic based on threshold overcome. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from pysatl_cpd.core.algorithms.classification.abstracts.istatistic_test import TestStatistic 10 | 11 | 12 | class ThresholdOvercome(TestStatistic): 13 | """ 14 | The class implementing test statistic based on threshold overcome. 15 | """ 16 | 17 | def __init__(self, threshold: float) -> None: 18 | """ 19 | Initializes a new instance of threshold overcome criterion. 20 | 21 | :param threshold: Threshold to overcome to detect the change point. 22 | """ 23 | self.__threshold = threshold 24 | 25 | def get_change_points(self, classifier_assessments: list[float]) -> list[int]: 26 | """Separates change points from other points in sample based on some criterion. 27 | 28 | :param classifier_assessments: List of quality assessments evaluated in each point of the sample. 29 | :return: Change points in the current window. 30 | """ 31 | return [i for i, v in enumerate(classifier_assessments) if v > self.__threshold] 32 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/classification_algorithm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of CPD algorithm based on classification. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | 10 | import numpy as np 11 | import numpy.typing as npt 12 | 13 | from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm 14 | from pysatl_cpd.core.algorithms.classification.abstracts.iclassifier import Classifier 15 | from pysatl_cpd.core.algorithms.classification.abstracts.iquality_metric import QualityMetric 16 | from pysatl_cpd.core.algorithms.classification.abstracts.istatistic_test import TestStatistic 17 | 18 | 19 | class ClassificationAlgorithm(Algorithm): 20 | """ 21 | The class implementing change point detection algorithm based on classification. 22 | """ 23 | 24 | def __init__( 25 | self, classifier: Classifier, quality_metric: QualityMetric, test_statistic: TestStatistic, indent_coeff: float 26 | ) -> None: 27 | """ 28 | Initializes a new instance of classification based change point detection algorithm. 29 | 30 | :param classifier: Classifier for sample classification. 31 | :param quality_metric: Metric to assess independence of the two samples 32 | resulting from splitting the original sample. 33 | :param test_statistic: Criterion to separate change points from other points in sample. 34 | :param indent_coeff: Coefficient for evaluating indent from window borders. 35 | The indentation is calculated by multiplying the given coefficient by the size of window. 36 | """ 37 | self.__classifier = classifier 38 | self.__test_statistic = test_statistic 39 | self.__quality_metric = quality_metric 40 | 41 | self.__shift_coeff = indent_coeff 42 | 43 | self.__change_points: list[int] = [] 44 | self.__change_points_count = 0 45 | 46 | @property 47 | def test_statistic(self) -> TestStatistic: 48 | return self.__test_statistic 49 | 50 | @test_statistic.setter 51 | def test_statistic(self, test_statistic: TestStatistic) -> None: 52 | self.__test_statistic = test_statistic 53 | 54 | def detect(self, window: npt.NDArray[np.float64]) -> int: 55 | """Finds change points in window. 56 | 57 | :param window: part of global data for finding change points. 58 | :return: the number of change points in the window. 59 | """ 60 | self.__process_data(window) 61 | return self.__change_points_count 62 | 63 | def localize(self, window: npt.NDArray[np.float64]) -> list[int]: 64 | """Finds coordinates of change points (localizes them) in window. 65 | 66 | :param window: part of global data for finding change points. 67 | :return: list of window change points. 68 | """ 69 | self.__process_data(window) 70 | return self.__change_points.copy() 71 | 72 | def __process_data(self, window: npt.NDArray[np.float64]) -> None: 73 | """ 74 | Processes a window of data to detect/localize all change points depending on working mode. 75 | 76 | :param window: part of global data for change points analysis. 77 | """ 78 | sample_size = len(window) 79 | if sample_size == 0: 80 | return 81 | 82 | # Examining each point. 83 | # Boundaries are always change points. 84 | first_point = int(sample_size * self.__shift_coeff) 85 | last_point = int(sample_size * (1 - self.__shift_coeff)) 86 | assessments = [] 87 | 88 | for time in range(first_point, last_point): 89 | train_sample, test_sample = ClassificationAlgorithm.__split_sample(window) 90 | self.__classifier.train(train_sample, int(time / 2)) 91 | classes = self.__classifier.predict(test_sample) 92 | 93 | quality = self.__quality_metric.assess_barrier(classes, int(time / 2)) 94 | assessments.append(quality) 95 | 96 | change_points = self.__test_statistic.get_change_points(assessments) 97 | 98 | # Shifting change points coordinates according to their place in window. 99 | self.__change_points = list(map(lambda x: x + first_point, change_points)) 100 | self.__change_points_count = len(change_points) 101 | 102 | # Splits the given sample into train and test samples. 103 | # Strategy: even elements goes to the train sample; odd goes to the test sample 104 | # Soon classification algorithm will be more generalized: the split strategy will be one of the parameters. 105 | @staticmethod 106 | def __split_sample( 107 | sample: npt.NDArray[np.float64], 108 | ) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]: 109 | train_sample = [] 110 | test_sample = [] 111 | 112 | # Univariate distribution case. We need to make 2-dimensional array manually. 113 | if np.ndim(sample) == 1: 114 | sample = np.reshape(sample, (-1, 1)) 115 | 116 | for i, x in enumerate(sample): 117 | if i % 2 == 0: 118 | train_sample.append(x) 119 | else: 120 | test_sample.append(x) 121 | 122 | return np.array(train_sample), np.array(test_sample) 123 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/density/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/density/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/density/abstracts/density_based_algorithm.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from collections.abc import Callable 3 | from typing import TypeAlias 4 | 5 | import numpy as np 6 | import numpy.typing as npt 7 | from scipy.optimize import minimize 8 | 9 | from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm 10 | 11 | _TObjFunc: TypeAlias = Callable[[npt.NDArray[np.float64], npt.NDArray[np.float64]], float] 12 | _TMetrics: TypeAlias = dict[str, int | float] 13 | 14 | 15 | class DensityBasedAlgorithm(Algorithm): 16 | @staticmethod 17 | def _kernel_density_estimation(observation: npt.NDArray[np.float64], bandwidth: float) -> npt.NDArray[np.float64]: 18 | """Perform kernel density estimation on the given observations without fitting a model. 19 | 20 | :param observation: the data points for which to estimate the density. 21 | :param bandwidth: the bandwidth parameter for the kernel density estimation. 22 | 23 | :return: estimated density values for the observations. 24 | """ 25 | n = len(observation) 26 | x_grid = np.linspace(np.min(observation) - 3 * bandwidth, np.max(observation) + 3 * bandwidth, 1000) 27 | kde_values = np.zeros_like(x_grid) 28 | for x in observation: 29 | kde_values += np.exp(-0.5 * ((x_grid - x) / bandwidth) ** 2) 30 | 31 | kde_values /= n * bandwidth * np.sqrt(2 * np.pi) 32 | return kde_values 33 | 34 | def _calculate_weights( 35 | self, 36 | test_value: npt.NDArray[np.float64], 37 | reference_value: npt.NDArray[np.float64], 38 | bandwidth: float, 39 | objective_function: _TObjFunc, 40 | ) -> npt.NDArray[np.float64]: 41 | """Calculate the weights based on the density ratio between test and reference values. 42 | 43 | :param test_value: the test data points. 44 | :param reference_value: the reference data points. 45 | :param bandwidth: the bandwidth parameter for the kernel density estimation. 46 | :param objective_function: the objective function to minimize. 47 | 48 | :return: the calculated density ratios normalized to their mean. 49 | """ 50 | test_density = self._kernel_density_estimation(test_value, bandwidth) 51 | reference_density = self._kernel_density_estimation(reference_value, bandwidth) 52 | 53 | def objective_function_wrapper(alpha: npt.NDArray[np.float64], /) -> float: 54 | """Wrapper for the objective function to calculate the density ratio. 55 | 56 | :param alpha: relative parameter that controls the weighting between the numerator distribution 57 | and the denominator distribution in the density ratio estimation. 58 | 59 | :return: the value of the objective function to minimize. 60 | """ 61 | objective_density_ratio = np.exp(test_density - reference_density - alpha) 62 | return objective_function(objective_density_ratio, alpha) 63 | 64 | res = minimize(objective_function_wrapper, np.zeros(len(test_value)), method="L-BFGS-B") 65 | optimized_alpha: npt.NDArray[np.float64] = res.x 66 | density_ratio: npt.NDArray[np.float64] = np.exp(test_density - reference_density - optimized_alpha) 67 | return density_ratio / np.mean(density_ratio) 68 | 69 | @abstractmethod 70 | def detect(self, window: npt.NDArray[np.float64]) -> int: 71 | # maybe rtype tuple[int] 72 | """Function for finding change points in window 73 | 74 | :param window: part of global data for finding change points 75 | :return: list of right borders of window change points 76 | """ 77 | raise NotImplementedError 78 | 79 | @abstractmethod 80 | def localize(self, window: npt.NDArray[np.float64]) -> list[int]: 81 | """Function for finding coordinates of change points in window 82 | 83 | :param window: part of global data for finding change points 84 | :return: list of window change points 85 | """ 86 | raise NotImplementedError 87 | 88 | @staticmethod 89 | def evaluate_detection_accuracy(true_change_points: list[int], detected_change_points: list[int]) -> _TMetrics: 90 | """Evaluate the accuracy of change point detection. 91 | 92 | :param true_change_points: list of true change point indices. 93 | :param detected_change_points: list of detected change point indices. 94 | 95 | :return: a dictionary with evaluation metrics (precision, recall, F1 score). 96 | """ 97 | true_positive = len(set(true_change_points) & set(detected_change_points)) 98 | false_positive = len(set(detected_change_points) - set(true_change_points)) 99 | false_negative = len(set(true_change_points) - set(detected_change_points)) 100 | 101 | precision = true_positive / (true_positive + false_positive) if true_positive + false_positive > 0 else 0.0 102 | recall = true_positive / (true_positive + false_negative) if true_positive + false_negative > 0 else 0.0 103 | f1_score = (2 * precision * recall / (precision + recall)) if (precision + recall) > 0 else 0.0 104 | 105 | return { 106 | "precision": precision, 107 | "recall": recall, 108 | "f1_score": f1_score, 109 | "true_positive": true_positive, 110 | "false_positive": false_positive, 111 | "false_negative": false_negative, 112 | } 113 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/graph/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/abstracts/ibuilder.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections.abc import Callable, Iterable 3 | from typing import Any 4 | 5 | import numpy as np 6 | import numpy.typing as npt 7 | 8 | from pysatl_cpd.core.algorithms.graph.abstracts.igraph import IGraph 9 | 10 | 11 | class IBuilder(ABC): 12 | def __init__( 13 | self, data: Iterable[np.float64] | Iterable[npt.NDArray[np.float64]], compare: Callable[[Any, Any], bool] 14 | ): 15 | """ 16 | Initialize the builder with data and a comparison function. 17 | 18 | :param data: List of elements to be used in building the graph. 19 | :param compare: Callable that takes two elements and returns a boolean indicating 20 | if an edge should exist between them. 21 | """ 22 | self.data = list(data) 23 | self.compare = compare 24 | self.num_of_edges: int = 0 25 | 26 | @abstractmethod 27 | def build_graph(self) -> IGraph: 28 | """ 29 | Abstract method to build and return a graph representation. 30 | 31 | :return: An instance of IGraph representing the built graph. 32 | """ 33 | raise NotImplementedError 34 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/abstracts/igraph.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | 4 | class IGraph(ABC): 5 | def __init__(self, num_of_edges: int, len_data: int) -> None: 6 | """ 7 | Initialize the IGraph with the number of edges and the length of data. 8 | 9 | :param num_of_edges: Number of edges in the graph. 10 | :param len_data: Number of nodes in the graph. 11 | """ 12 | self.num_of_edges: int = num_of_edges 13 | self.len: int = len_data 14 | 15 | @abstractmethod 16 | def check_edges_exist(self, thao: int) -> int: 17 | """ 18 | Calculate the number of edges that exist between nodes up to a certain index (thao) 19 | and nodes from that index to the end. 20 | 21 | :param thao: Index dividing the nodes into two sets. 22 | :return: Number of edges existing between the two sets of nodes. 23 | """ 24 | raise NotImplementedError 25 | 26 | @abstractmethod 27 | def sum_of_squares_of_degrees_of_nodes(self) -> int: 28 | """ 29 | Calculate the sum of the squares of the degrees of all nodes. 30 | 31 | :return: Sum of the squares of the degrees of the nodes. 32 | """ 33 | raise NotImplementedError 34 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/abstracts/igraph_cpd.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | 3 | from pysatl_cpd.core.algorithms.graph.abstracts.igraph import IGraph 4 | 5 | 6 | class IGraphCPD(ABC): 7 | def __init__(self, graph: IGraph): 8 | """ 9 | Initialize the IGraphCPD with the given graph. 10 | 11 | :param graph: An instance of IGraph representing the graph. 12 | """ 13 | self.graph = graph 14 | self.size = graph.len 15 | 16 | @abstractmethod 17 | def calculation_e(self, thao: int) -> float: 18 | """ 19 | Calculate the mathematical expectation (E) using the given formula. 20 | 21 | :param thao: Index dividing the nodes into two sets. 22 | :return: Calculated expectation value. 23 | """ 24 | raise NotImplementedError 25 | 26 | @abstractmethod 27 | def calculation_var(self, thao: int) -> float: 28 | """ 29 | Calculate the variance using the given formula. 30 | 31 | :param thao: Index dividing the nodes into two sets. 32 | :return: Calculated variance value. 33 | """ 34 | raise NotImplementedError 35 | 36 | @abstractmethod 37 | def calculation_z(self, thao: int) -> float: 38 | """ 39 | Calculate the Z statistic. 40 | 41 | :param thao: Index dividing the nodes into two sets. 42 | :return: Calculated Z statistic. 43 | """ 44 | raise NotImplementedError 45 | 46 | @abstractmethod 47 | def find_changepoint(self, border: float) -> list[int]: 48 | """ 49 | Find change points in the data based on the Z statistic. 50 | 51 | :param border: Threshold value for detecting change points. 52 | :return: List of detected change points. 53 | """ 54 | raise NotImplementedError 55 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/builders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/graph/builders/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/builders/list.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable 2 | from typing import Any 3 | 4 | from pysatl_cpd.core.algorithms.graph.abstracts.ibuilder import IBuilder 5 | from pysatl_cpd.core.algorithms.graph.abstracts.igraph import IGraph 6 | from pysatl_cpd.core.algorithms.graph.graph_list import GraphList 7 | 8 | 9 | class AdjacencyListBuilder(IBuilder): 10 | def __init__(self, data: list[Any], comparing_function: Callable[[Any, Any], bool]): 11 | super().__init__(data, comparing_function) 12 | 13 | def build(self) -> dict[int, list[Any]]: # Adjacency List 14 | """ 15 | Build the adjacency list from the provided data. 16 | 17 | :return: A dictionary representing the adjacency list where keys are node indices and values 18 | are lists of adjacent nodes. 19 | """ 20 | unique_edges = set() 21 | count_nodes = len(self.data) 22 | adjacency_list: dict[int, list[Any]] = {index: [] for index in range(count_nodes)} 23 | for i in range(count_nodes): 24 | for j in range(count_nodes): 25 | if self.compare(self.data[i], self.data[j]) and (i != j): 26 | adjacency_list[i].append(self.data[j]) 27 | edge = tuple(sorted((i, j))) 28 | unique_edges.add(edge) 29 | self.num_of_edges = len(unique_edges) 30 | 31 | # for i in range(0, len(self.data)): 32 | # print(f"{self.data[i]}: {adjacency_list[i]}") 33 | 34 | return adjacency_list 35 | 36 | def build_graph(self) -> IGraph: 37 | graph = self.build() 38 | return GraphList(graph, self.data, self.num_of_edges) 39 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/builders/matrix.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable, Iterable 2 | from typing import Any 3 | 4 | import numpy as np 5 | import numpy.typing as npt 6 | 7 | from pysatl_cpd.core.algorithms.graph.abstracts.ibuilder import IBuilder 8 | from pysatl_cpd.core.algorithms.graph.abstracts.igraph import IGraph 9 | from pysatl_cpd.core.algorithms.graph.graph_matrix import GraphMatrix 10 | 11 | 12 | class AdjacencyMatrixBuilder(IBuilder): 13 | def __init__( 14 | self, 15 | data: Iterable[np.float64] | Iterable[npt.NDArray[np.float64]], 16 | comparing_function: Callable[[Any, Any], bool], 17 | ): 18 | super().__init__(data, comparing_function) 19 | 20 | def build_matrix(self) -> npt.NDArray[np.int8]: # Adjacency Matrix 21 | """ 22 | Build the adjacency matrix from the provided data. 23 | 24 | :return: A NumPy ndarray representing the adjacency matrix where element [i, j] is 1 if 25 | there is an edge between node i and node j, otherwise 0. 26 | """ 27 | count_edges = 0 28 | count_nodes = len(self.data) 29 | adjacency_matrix = np.zeros((count_nodes, count_nodes), dtype=np.int8) 30 | 31 | for i in range(count_nodes): 32 | for j in range(count_nodes): 33 | if self.compare(self.data[i], self.data[j]) and (i != j): 34 | adjacency_matrix[i, j] = 1 35 | count_edges += 1 36 | self.num_of_edges = count_edges // 2 37 | 38 | return adjacency_matrix 39 | 40 | def build_graph(self) -> IGraph: 41 | graph = self.build_matrix() 42 | return GraphMatrix(graph, self.num_of_edges) 43 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/graph_cpd.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from pysatl_cpd.core.algorithms.graph.abstracts.igraph import IGraph 4 | from pysatl_cpd.core.algorithms.graph.abstracts.igraph_cpd import IGraphCPD 5 | 6 | 7 | class GraphCPD(IGraphCPD): 8 | def __init__(self, graph: IGraph): 9 | super().__init__(graph) 10 | 11 | def calculation_e(self, thao: int) -> float: 12 | p1 = ((2 * thao) * (self.size - thao)) / (self.size * (self.size - 1)) 13 | return p1 * self.graph.num_of_edges 14 | 15 | def calculation_var(self, thao: int) -> float: 16 | p1 = ((2 * thao) * (self.size - thao)) / (self.size * (self.size - 1)) 17 | p2 = (4 * thao * (thao - 1) * (self.size - thao) * (self.size - thao - 1)) / ( 18 | self.size * (self.size - 1) * (self.size - 2) * (self.size - 3) 19 | ) 20 | var = ( 21 | p1 * self.graph.num_of_edges 22 | + (0.5 * p1 - p2) * self.graph.sum_of_squares_of_degrees_of_nodes() 23 | + (p2 - p1**2) * self.graph.num_of_edges**2 24 | ) 25 | return var 26 | 27 | def calculation_z(self, thao: int) -> float: 28 | zg = -((self.graph.check_edges_exist(thao) - self.calculation_e(thao)) / math.sqrt(self.calculation_var(thao))) 29 | return zg 30 | 31 | def find_changepoint(self, border: float) -> list[int]: 32 | change_point_list: list[int] = [] 33 | for t in range(1, self.size): 34 | if self.calculation_z(t) > border: 35 | change_point_list.append(t) 36 | return change_point_list 37 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/graph_list.py: -------------------------------------------------------------------------------- 1 | from typing import Any, TypeAlias 2 | 3 | from pysatl_cpd.core.algorithms.graph.abstracts.igraph import IGraph 4 | 5 | _TAdjList: TypeAlias = dict[int, list[Any]] 6 | 7 | 8 | class GraphList(IGraph): 9 | def __init__(self, graph: _TAdjList, data: list[Any], num_of_edges: int) -> None: 10 | """ 11 | Initialize the GraphList with the adjacency list, data, and number of edges. 12 | 13 | :param graph: Adjacency list representing the graph. 14 | :param data: List of elements representing the nodes. 15 | :param num_of_edges: Number of edges in the graph. 16 | """ 17 | super().__init__(num_of_edges, len(data)) 18 | self.graph = graph 19 | self.data = data 20 | 21 | def __getitem__(self, item: int) -> Any: 22 | """ 23 | Get the list of adjacent nodes for a given node. 24 | 25 | :param item: Node index. 26 | :return: List of adjacent nodes. 27 | """ 28 | return self.graph[item] 29 | 30 | def check_edges_exist(self, thao: int) -> int: 31 | count_edges = 0 32 | for node_1 in range(thao): 33 | for node_2 in range(thao, self.len): 34 | if self.data[node_2] in self.graph[node_1]: 35 | count_edges += 1 36 | return count_edges 37 | 38 | def sum_of_squares_of_degrees_of_nodes(self) -> int: 39 | sum_squares = 0 40 | for node in range(0, self.len): 41 | sum_squares += len(self.graph[node]) ** 2 42 | return sum_squares 43 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph/graph_matrix.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy.typing as npt 4 | 5 | from pysatl_cpd.core.algorithms.graph.abstracts.igraph import IGraph 6 | 7 | 8 | class GraphMatrix(IGraph): 9 | def __init__(self, graph: npt.NDArray[Any], num_of_edges: int): 10 | """ 11 | Initialize the GraphMatrix with the adjacency matrix and number of edges. 12 | 13 | :param graph: Adjacency matrix representing the graph. 14 | :param num_of_edges: Number of edges in the graph. 15 | """ 16 | super().__init__(num_of_edges, len(graph)) 17 | self.mtx = graph 18 | 19 | def __getitem__(self, item: int) -> Any: 20 | """ 21 | Get the row of the adjacency matrix for a given node. 22 | 23 | :param item: Node index. 24 | :return: Row of the adjacency matrix corresponding to the node. 25 | """ 26 | return self.mtx[item] 27 | 28 | def check_edges_exist(self, thao: int) -> int: 29 | count_edges = 0 30 | for node_before in range(thao): 31 | for node_after in range(thao, self.len): 32 | if self.mtx[node_before, node_after] == 1: 33 | count_edges += 1 34 | return count_edges 35 | 36 | def sum_of_squares_of_degrees_of_nodes(self) -> int: 37 | sum_squares = 0 38 | for node_1 in range(0, self.len): 39 | node_degree = 0 40 | for node_2 in range(0, self.len): 41 | if self.mtx[node_1, node_2] == 1: 42 | node_degree += 1 43 | node_degree = node_degree**2 44 | sum_squares += node_degree 45 | return sum_squares 46 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/graph_algorithm.py: -------------------------------------------------------------------------------- 1 | from collections.abc import Callable 2 | from typing import Any 3 | 4 | import numpy as np 5 | import numpy.typing as npt 6 | 7 | from .abstract_algorithm import Algorithm 8 | from .graph.builders.matrix import AdjacencyMatrixBuilder 9 | from .graph.graph_cpd import GraphCPD 10 | 11 | 12 | class GraphAlgorithm(Algorithm): 13 | def __init__(self, compare_func: Callable[[Any, Any], bool], threshold: float): 14 | self.compare = compare_func 15 | self.threshold = threshold 16 | 17 | def localize(self, window: npt.NDArray[np.float64]) -> list[int]: 18 | builder = AdjacencyMatrixBuilder(window, self.compare) 19 | graph = builder.build_graph() 20 | cpd = GraphCPD(graph) 21 | num_cpd: list[int] = cpd.find_changepoint(self.threshold) 22 | return num_cpd 23 | 24 | def detect(self, window: npt.NDArray[np.float64]) -> int: 25 | builder = AdjacencyMatrixBuilder(window, self.compare) 26 | graph = builder.build_graph() 27 | cpd = GraphCPD(graph) 28 | num_cpd: list[int] = cpd.find_changepoint(self.threshold) 29 | return len(num_cpd) 30 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/kliep_algorithm.py: -------------------------------------------------------------------------------- 1 | from typing import cast 2 | 3 | import numpy as np 4 | import numpy.typing as npt 5 | from numpy import dtype, float64, ndarray 6 | 7 | from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm 8 | 9 | 10 | class KliepAlgorithm(DensityBasedAlgorithm): 11 | """Kullback-Leibler Importance Estimation Procedure (KLIEP) algorithm 12 | for change point detection. 13 | 14 | KLIEP estimates the density ratio between two distributions and uses 15 | the importance weights for detecting changes in the data distribution. 16 | """ 17 | 18 | def __init__(self, bandwidth: float, regularization_coef: float, threshold: float = 1.1): 19 | """Initialize the KLIEP algorithm. 20 | 21 | Args: 22 | bandwidth (float): bandwidth parameter for density estimation. 23 | regularization_coef (float): regularization parameter. 24 | threshold (float, optional): threshold for detecting change points. 25 | Defaults to 1.1. 26 | """ 27 | self.bandwidth = bandwidth 28 | self.regularization_coef = regularization_coef 29 | self.threshold = np.float64(threshold) 30 | 31 | def _loss_function(self, density_ratio: npt.NDArray[np.float64], alpha: npt.NDArray[np.float64]) -> float: 32 | """Loss function for KLIEP. 33 | 34 | Args: 35 | density_ratio (np.ndarray): estimated density ratio. 36 | alpha (np.ndarray): coefficients for the density ratio. 37 | 38 | Returns: 39 | float: the computed loss value. 40 | """ 41 | return -np.mean(density_ratio) + self.regularization_coef * np.sum(alpha**2) 42 | 43 | def detect(self, window: npt.NDArray[np.float64]) -> int: 44 | """Detect the number of change points in the given data window 45 | using KLIEP. 46 | 47 | Args: 48 | window (Iterable[float]): the data window to detect change points. 49 | 50 | Returns: 51 | int: the number of detected change points. 52 | """ 53 | 54 | window_sample = np.array(window) 55 | weights = self._calculate_weights( 56 | test_value=window_sample, 57 | reference_value=window_sample, 58 | bandwidth=self.bandwidth, 59 | objective_function=self._loss_function, 60 | ) 61 | 62 | return np.count_nonzero(weights > self.threshold) 63 | 64 | def localize(self, window: npt.NDArray[np.float64]) -> list[int]: 65 | """Localize the change points in the given data window using KLIEP. 66 | 67 | Args: 68 | window (Iterable[float]): the data window to localize 69 | change points. 70 | 71 | Returns: 72 | List[int]: the indices of the detected change points. 73 | """ 74 | window_sample = np.array(window) 75 | weights: ndarray[tuple[int, ...], dtype[float64]] = self._calculate_weights( 76 | test_value=window_sample, 77 | reference_value=window_sample, 78 | bandwidth=self.bandwidth, 79 | objective_function=self._loss_function, 80 | ) 81 | 82 | return cast(list[int], np.where(weights > self.threshold)[0].tolist()) 83 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/knn/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/knn/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/knn/abstracts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/algorithms/knn/abstracts/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/knn/abstracts/observation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for abstractions used in heap, needed to clearly distinguish observations made at different times. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from dataclasses import dataclass, field 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | 14 | 15 | @dataclass(order=True) 16 | class Observation: 17 | """ 18 | Abstraction over observation that consists of the time of the point in time series and the value of it. 19 | """ 20 | 21 | time: int 22 | value: np.float64 | npt.NDArray[np.float64] = field(compare=False) 23 | 24 | 25 | @dataclass(order=True) 26 | class Neighbour: 27 | """ 28 | Abstraction over neighbour that consists of the distance to the main point and the observation-neighbour itself. 29 | """ 30 | 31 | distance: float 32 | observation: Observation 33 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/knn/classifier.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of classifier based on nearest neighbours for cpd. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import typing as tp 10 | from math import sqrt 11 | 12 | import numpy as np 13 | import numpy.typing as npt 14 | 15 | from .graph import KNNGraph 16 | 17 | 18 | class KNNClassifier: 19 | """ 20 | The class implementing classifier based on nearest neighbours. 21 | """ 22 | 23 | def __init__( 24 | self, 25 | metric: tp.Callable[[np.float64 | npt.NDArray[np.float64], np.float64 | npt.NDArray[np.float64]], float], 26 | k: int = 7, 27 | delta: float = 1e-12, 28 | ) -> None: 29 | """ 30 | Initializes a new instance of KNN classifier for cpd. 31 | 32 | :param metric: function for calculating distance between points in time series. 33 | :param k: number of neighbours in the knn graph relative to each point. 34 | Default is 7, which is generally the most optimal value (based on the experiments results). 35 | :param delta: delta for comparing float values of the given observations. 36 | """ 37 | self.__k = k 38 | self.__metric = metric 39 | self.__delta = delta 40 | 41 | self.__window: npt.NDArray[np.float64] | None = None 42 | self.__knn_graph: KNNGraph | None = None 43 | 44 | def classify(self, window: npt.NDArray[np.float64]) -> None: 45 | """Applies classificator to the given sample. 46 | 47 | :param window: part of global data for finding change points. 48 | """ 49 | self.__window = window 50 | self.__knn_graph = KNNGraph(window, self.__metric, self.__k, self.__delta) 51 | self.__knn_graph.build() 52 | 53 | def assess_barrier(self, time: int) -> float: 54 | """ 55 | Calculates quality function in specified point. 56 | 57 | :param time: index of point in the given sample to calculate statistics relative to it. 58 | """ 59 | assert self.__window is not None 60 | window_size = len(self.__window) 61 | 62 | assert self.__knn_graph is not None, "Graph should not be None." 63 | 64 | k = self.__k 65 | n = window_size 66 | n_1 = time 67 | n_2 = n - time 68 | 69 | if n <= k: 70 | # Unable to analyze sample due to its size. 71 | # Returns negative number that will be less than the statistics in this case, 72 | # but big enough not to spoil overall statistical picture. 73 | return -k 74 | 75 | h = 4 * (n_1 - 1) * (n_2 - 1) / ((n - 2) * (n - 3)) 76 | 77 | sum_1 = (1 / n) * sum( 78 | self.__knn_graph.check_for_neighbourhood(j, i) 79 | for i in range(window_size) 80 | for j in self.__knn_graph.get_neighbours(i) 81 | ) 82 | 83 | sum_2 = (1 / n) * ( 84 | 2 85 | * sum( 86 | self.__knn_graph.check_for_neighbourhood(m, i) 87 | for j in range(window_size) 88 | for i in self.__knn_graph.get_neighbours(j) 89 | for m in range(j + 1, window_size) 90 | ) 91 | + sum(len(self.__knn_graph.get_neighbours(i)) for i in range(window_size)) 92 | ) 93 | 94 | expectation = 4 * k * n_1 * n_2 / (n - 1) 95 | variance = (expectation / k) * (h * (sum_1 + k - (2 * k**2 / (n - 1))) + (1 - h) * (sum_2 - k**2)) 96 | deviation = sqrt(variance) 97 | 98 | permutation = np.arange(window_size) 99 | random_variable_value = self.__calculate_random_variable(permutation, time, window_size) 100 | 101 | if deviation == 0: 102 | # if the deviation is zero, it likely means that the time is 1 or the data is constant. 103 | # In this case we cannot detect any change-points. 104 | # Thus, we can return negative number that will be less than the statistics in this case. 105 | return -k 106 | 107 | statistics = -(random_variable_value - expectation) / deviation 108 | 109 | return statistics 110 | 111 | def __calculate_random_variable(self, permutation: npt.NDArray[np.intp], t: int, window_size: int) -> int: 112 | """ 113 | Calculates a random variable from a permutation and a fixed point. 114 | 115 | :param permutation: random permutation of observations. 116 | :param t: fixed point that splits the permutation. 117 | :return: value of the random variable. 118 | """ 119 | 120 | def b(i: int, j: int) -> bool: 121 | pi = int(permutation[i]) 122 | pj = int(permutation[j]) 123 | return (pi <= t < pj) or (pj <= t < pi) 124 | 125 | assert self.__knn_graph is not None 126 | s = sum( 127 | (self.__knn_graph.check_for_neighbourhood(i, j) + self.__knn_graph.check_for_neighbourhood(j, i)) * b(i, j) 128 | for i in range(window_size) 129 | for j in range(window_size) 130 | ) 131 | 132 | return s 133 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/knn/graph.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of neareset neighbours graph. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import typing as tp 10 | from collections import deque 11 | 12 | import numpy as np 13 | import numpy.typing as npt 14 | 15 | from .abstracts.observation import Observation 16 | from .heap import NNHeap 17 | 18 | 19 | class KNNGraph: 20 | """ 21 | The class implementing nearest neighbours graph. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | window: npt.NDArray[np.float64], 27 | metric: tp.Callable[[np.float64 | npt.NDArray[np.float64], np.float64 | npt.NDArray[np.float64]], float], 28 | k: int = 7, 29 | delta: float = 1e-12, 30 | ) -> None: 31 | """ 32 | Initializes a new instance of KNN graph. 33 | 34 | :param window: an overall sample the graph is based on. 35 | :param metric: function for calculating the distance between two points in time series. 36 | :param k: number of neighbours in the knn graph relative to each point. 37 | Default is 7, which is generally the most optimal value (based on the experiments results). 38 | :param delta: delta for comparing float values of the given observations. 39 | """ 40 | self.__window: list[Observation] = [Observation(t, v) for t, v in enumerate(window)] 41 | self.__metric: tp.Callable[[Observation, Observation], float] = lambda obs1, obs2: metric( 42 | obs1.value, obs2.value 43 | ) 44 | self.__k = k 45 | self.__delta = delta 46 | 47 | self.__graph: deque[NNHeap] = deque(maxlen=len(self.__window)) 48 | 49 | def build(self) -> None: 50 | """ 51 | Builds KNN graph according to the given parameters. 52 | """ 53 | for i in range(len(self.__window)): 54 | heap = NNHeap(self.__k, self.__metric, self.__window[-i - 1], self.__delta) 55 | heap.build(self.__window) 56 | self.__graph.appendleft(heap) 57 | 58 | def get_neighbours(self, obs_index: int) -> list[int]: 59 | return self.__graph[obs_index].get_neighbours_indices() 60 | 61 | def check_for_neighbourhood(self, first_index: int, second_index: int) -> bool: 62 | """ 63 | Checks if the second observation is among the k nearest neighbours of the first observation. 64 | 65 | :param first_index: index of main observation. 66 | :param second_index: index of possible neighbour. 67 | :return: true if the second point is the neighbour of the first one, false otherwise. 68 | """ 69 | neighbour = self.__window[second_index] 70 | return self.__graph[first_index].find_in_heap(neighbour) 71 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/knn/heap.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of nearest neighbours heap. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import heapq 10 | import typing as tp 11 | from math import isclose 12 | 13 | from .abstracts.observation import Neighbour, Observation 14 | 15 | 16 | class NNHeap: 17 | """ 18 | The class implementing nearest neighbours heap --- helper abstraction for KNN graph. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | size: int, 24 | metric: tp.Callable[[Observation, Observation], float], 25 | main_observation: Observation, 26 | delta: float, 27 | ) -> None: 28 | """ 29 | Initializes a new instance of NNHeap. 30 | 31 | :param size: size of the heap. 32 | :param metric: function for calculating distance between two observations. 33 | :param main_observation: the central point relative to which the nearest neighbours are sought. 34 | :param delta: delta for comparing float values of the given observations. 35 | """ 36 | self.__size = size 37 | self.__metric = metric 38 | self.__main_observation = main_observation 39 | 40 | self.__heap: list[Neighbour] = [] 41 | self.__delta = delta 42 | 43 | def build(self, neighbours: list[Observation]) -> None: 44 | """ 45 | Builds a nearest neighbour heap relative to the main observation with the given neighbours. 46 | 47 | :param neighbours: list of neighbours. 48 | """ 49 | for neighbour in neighbours: 50 | self.__add(neighbour) 51 | 52 | def find_in_heap(self, observation: Observation) -> bool: 53 | """ 54 | Checks if the given observation is among the nearest neighbours of the main observation. 55 | 56 | :param observation: observation to test. 57 | """ 58 | 59 | def predicate(x: Neighbour) -> bool: 60 | return isclose(self.__metric(x.observation, observation), 0.0, rel_tol=self.__delta) and ( 61 | x.observation.time == observation.time 62 | ) 63 | 64 | return any(predicate(i) for i in self.__heap) 65 | 66 | def get_neighbours_indices(self) -> list[int]: 67 | return [n.observation.time for n in self.__heap] 68 | 69 | def __add(self, observation: Observation) -> None: 70 | """ 71 | Adds observation to heap. 72 | 73 | :param observation: observation to add. 74 | """ 75 | if observation is self.__main_observation: 76 | return 77 | 78 | # Sign conversion is needed to convert the smallest element heap to the greatest element heap. 79 | neg_distance = -self.__metric(self.__main_observation, observation) 80 | neighbour = Neighbour(neg_distance, observation) 81 | 82 | if len(self.__heap) == self.__size and neighbour.distance > self.__heap[0].distance: 83 | heapq.heapreplace(self.__heap, neighbour) 84 | elif len(self.__heap) < self.__size: 85 | heapq.heappush(self.__heap, neighbour) 86 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/knn_algorithm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of CPD algorithm based on knn classification. 3 | """ 4 | 5 | __author__ = "Artemii Patov" 6 | __copyright__ = "Copyright (c) 2024 Artemii Patov" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import typing as tp 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | 14 | from pysatl_cpd.core.algorithms.abstract_algorithm import Algorithm 15 | from pysatl_cpd.core.algorithms.classification.abstracts.istatistic_test import TestStatistic 16 | from pysatl_cpd.core.algorithms.knn.classifier import KNNClassifier 17 | 18 | 19 | class KNNAlgorithm(Algorithm): 20 | """ 21 | The class implementing change point detection algorithm based on k-NN classifier. Works only with non-constant data. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | distance_func: tp.Callable[[np.float64 | npt.NDArray[np.float64], np.float64 | npt.NDArray[np.float64]], float], 27 | test_statistic: TestStatistic, 28 | indent_coeff: float, 29 | k: int = 7, 30 | delta: float = 1e-12, 31 | ) -> None: 32 | """ 33 | Initializes a new instance of k-NN based change point detection algorithm. 34 | 35 | :param distance_func: function for calculating the distance between two points in time series. 36 | :param test_statistic: Criterion to separate change points from other points in sample. 37 | :param indent_coeff: Coefficient for evaluating indent from window borders. 38 | The indentation is calculated by multiplying the given coefficient by the size of window. 39 | :param k: number of neighbours in the knn graph relative to each point. 40 | Default is 7, which is generally the most optimal value (based on the experiments results). 41 | :param delta: delta for comparing float values of the given observations. 42 | """ 43 | self.__test_statistic = test_statistic 44 | 45 | self.__shift_coeff = indent_coeff 46 | self.__classifier = KNNClassifier(distance_func, k, delta) 47 | 48 | self.__change_points: list[int] = [] 49 | self.__change_points_count = 0 50 | 51 | @property 52 | def test_statistic(self) -> TestStatistic: 53 | return self.__test_statistic 54 | 55 | @test_statistic.setter 56 | def test_statistic(self, test_statistic: TestStatistic) -> None: 57 | self.__test_statistic = test_statistic 58 | 59 | def detect(self, window: npt.NDArray[np.float64]) -> int: 60 | """Finds change points in window. 61 | 62 | :param window: part of global data for finding change points. 63 | :return: the number of change points in the window. 64 | """ 65 | self.__process_data(window) 66 | return self.__change_points_count 67 | 68 | def localize(self, window: npt.NDArray[np.float64]) -> list[int]: 69 | """Finds coordinates of change points (localizes them) in window. 70 | 71 | :param window: part of global data for finding change points. 72 | :return: list of window change points. 73 | """ 74 | self.__process_data(window) 75 | return self.__change_points.copy() 76 | 77 | def __process_data(self, window: npt.NDArray[np.float64]) -> None: 78 | """ 79 | Processes a window of data to detect/localize all change points depending on working mode. 80 | 81 | :param window: part of global data for change points analysis. 82 | """ 83 | sample_size = len(window) 84 | if sample_size == 0: 85 | return 86 | 87 | self.__classifier.classify(window) 88 | 89 | # Examining each point. 90 | # Boundaries are always change points. 91 | first_point = int(sample_size * self.__shift_coeff) 92 | last_point = int(sample_size * (1 - self.__shift_coeff)) 93 | assessments = [] 94 | 95 | for time in range(first_point, last_point): 96 | quality = self.__classifier.assess_barrier(time) 97 | assessments.append(quality) 98 | 99 | change_points = self.__test_statistic.get_change_points(assessments) 100 | 101 | # Shifting change points coordinates according to their place in window. 102 | self.__change_points = list(map(lambda x: x + first_point, change_points)) 103 | self.__change_points_count = len(change_points) 104 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/online_algorithm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for online change point detection algorithm's interface. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2025 PySATL project" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from typing import Optional, Protocol 10 | 11 | import numpy as np 12 | import numpy.typing as npt 13 | 14 | 15 | class OnlineAlgorithm(Protocol): 16 | """ 17 | Protocol for online change point detection algorithm's interface. 18 | """ 19 | 20 | def detect(self, observation: np.float64 | npt.NDArray[np.float64]) -> bool: 21 | """ 22 | Method for a step of detection of a change point. 23 | :param observation: new observation of a time series. 24 | :return: bool observation whether a change point was detected after processing the new observation. 25 | """ 26 | ... 27 | 28 | def localize(self, observation: np.float64 | npt.NDArray[np.float64]) -> Optional[int]: 29 | """ 30 | Method for a step of localization of a change point. 31 | :param observation: new observation of a time series 32 | :return: absolute location of a change point, acquired after processing the new observation, 33 | or None if there wasn't any. 34 | """ 35 | ... 36 | -------------------------------------------------------------------------------- /pysatl_cpd/core/algorithms/rulsif_algorithm.py: -------------------------------------------------------------------------------- 1 | from typing import cast 2 | 3 | import numpy as np 4 | import numpy.typing as npt 5 | 6 | from pysatl_cpd.core.algorithms.density.abstracts.density_based_algorithm import DensityBasedAlgorithm 7 | 8 | 9 | class RulsifAlgorithm(DensityBasedAlgorithm): 10 | """Relative Unconstrained Least-Squares Importance Fitting (RULSIF) 11 | algorithm for change point detection. 12 | 13 | RULSIF estimates the density ratio between two distributions and uses 14 | the importance weights for detecting changes in the data distribution. 15 | """ 16 | 17 | def __init__(self, bandwidth: float, regularization_coef: float, threshold: float = 1.1): 18 | """Initialize the RULSIF algorithm. 19 | 20 | Args: 21 | bandwidth (float): bandwidth parameter for density estimation. 22 | regularization_coef (float): regularization parameter. 23 | threshold (float, optional): threshold for detecting change points. 24 | Defaults to 1.1. 25 | """ 26 | self.bandwidth = bandwidth 27 | self.regularization_coef = regularization_coef 28 | self.threshold = threshold 29 | 30 | def _loss_function(self, density_ratio: npt.NDArray[np.float64], alpha: npt.NDArray[np.float64]) -> float: 31 | """Loss function for RULSIF. 32 | 33 | Args: 34 | density_ratio (np.ndarray): estimated density ratio. 35 | alpha (np.ndarray): coefficients for the density ratio. 36 | 37 | Returns: 38 | float: the computed loss value. 39 | """ 40 | return np.mean((density_ratio - 1) ** 2) + self.regularization_coef * np.sum(alpha**2) 41 | 42 | def detect(self, window: npt.NDArray[np.float64]) -> int: 43 | """Detect the number of change points in the given data window 44 | using RULSIF. 45 | 46 | Args: 47 | window (Iterable[float]): the data window to detect change points. 48 | 49 | Returns: 50 | int: the number of detected change points. 51 | """ 52 | window_sample = np.array(window) 53 | weights = self._calculate_weights( 54 | test_value=window_sample, 55 | reference_value=window_sample, 56 | bandwidth=self.bandwidth, 57 | objective_function=self._loss_function, 58 | ) 59 | 60 | return np.count_nonzero(weights > self.threshold) 61 | 62 | def localize(self, window: npt.NDArray[np.float64]) -> list[int]: 63 | """Localize the change points in the given data window using RULSIF. 64 | 65 | Args: 66 | window (Iterable[float]): the data window to localize change points. 67 | 68 | Returns: 69 | List[int]: the indices of the detected change points. 70 | """ 71 | window_sample = np.array(window) 72 | weights = self._calculate_weights( 73 | test_value=window_sample, 74 | reference_value=window_sample, 75 | bandwidth=self.bandwidth, 76 | objective_function=self._loss_function, 77 | ) 78 | 79 | return cast(list[int], np.where(weights > self.threshold)[0].tolist()) 80 | -------------------------------------------------------------------------------- /pysatl_cpd/core/cpd_core.py: -------------------------------------------------------------------------------- 1 | __author__ = "Artem Romanyuk, Vladimir Kutuev" 2 | __copyright__ = "Copyright (c) 2025 PySATL project" 3 | __license__ = "SPDX-License-Identifier: MIT" 4 | 5 | from .algorithms.abstract_algorithm import Algorithm 6 | from .scrubber.abstract import Scrubber 7 | 8 | 9 | class CpdCore: 10 | """Change Point Detection core""" 11 | 12 | def __init__( 13 | self, 14 | scrubber: Scrubber, 15 | algorithm: Algorithm, 16 | ) -> None: 17 | """Change Point Detection core algorithm 18 | 19 | :param scrubber: scrubber for dividing data into windows 20 | and subsequent processing of data windows 21 | by change point detection algorithms 22 | :param algorithm: change point detection algorithm 23 | :return: list of found change points 24 | """ 25 | self.scrubber = scrubber 26 | self.algorithm = algorithm 27 | 28 | def localize(self) -> list[int]: 29 | """Find change points 30 | 31 | :return: list of change points 32 | """ 33 | change_points: list[int] = [] 34 | for window in self.scrubber.__iter__(): 35 | window_change_points = self.algorithm.localize(window.values) 36 | change_points.extend(map(lambda i: window.indices[i], window_change_points)) 37 | return change_points 38 | 39 | def detect(self) -> int: 40 | """Count change points 41 | 42 | :return: number of change points 43 | """ 44 | change_points_count = 0 45 | for window in self.scrubber.__iter__(): 46 | change_points_count += self.algorithm.detect(window.values) 47 | return change_points_count 48 | -------------------------------------------------------------------------------- /pysatl_cpd/core/online_cpd_core.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for online-CPD core, which presents access to algorithms as iterators over provdied data. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2025 PySATL project" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from collections.abc import Iterator 10 | 11 | from pysatl_cpd.core.algorithms.online_algorithm import OnlineAlgorithm 12 | from pysatl_cpd.core.scrubber.data_providers import DataProvider 13 | 14 | 15 | class OnlineCpdCore: 16 | """ 17 | Class that presents online CPD-algorithm as detection or localization iterator over the provided data. 18 | """ 19 | 20 | def __init__(self, algorithm: OnlineAlgorithm, data_provider: DataProvider) -> None: 21 | self.algorithm = algorithm 22 | self.data_provider = data_provider 23 | 24 | def detect(self) -> Iterator[bool]: 25 | """ 26 | Iteratively tries to detect a change point in the provided data. 27 | :return: whether a change point after processed observation was detected. 28 | """ 29 | for observation in self.data_provider: 30 | yield self.algorithm.detect(observation) 31 | 32 | def localize(self) -> Iterator[int | None]: 33 | """ 34 | Iteratively tries to localize a change point in the provided data. 35 | :return: change point location, if it was successfully localized, or None, otherwise. 36 | """ 37 | for observation in self.data_provider: 38 | yield self.algorithm.localize(observation) 39 | -------------------------------------------------------------------------------- /pysatl_cpd/core/problem.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class CpdProblem: 6 | """Specification of the solving problem 7 | 8 | :param to_localize: is it necessary to localize change points, defaults to False 9 | """ 10 | 11 | to_localize: bool = True 12 | -------------------------------------------------------------------------------- /pysatl_cpd/core/scrubber/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/core/scrubber/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/core/scrubber/abstract.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Abstract Scrubber description. 3 | """ 4 | 5 | __author__ = "Romanyuk Artem, Vladimir Kutuev" 6 | __copyright__ = "Copyright (c) 2025 PySATL project" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | 10 | from abc import ABC, abstractmethod 11 | from collections.abc import Iterator 12 | from dataclasses import dataclass 13 | 14 | import numpy as np 15 | import numpy.typing as npt 16 | 17 | from pysatl_cpd.core.scrubber.data_providers import DataProvider 18 | 19 | 20 | @dataclass 21 | class ScrubberWindow: 22 | values: npt.NDArray[np.float64] 23 | indices: list[int] 24 | 25 | 26 | class Scrubber(ABC): 27 | """A scrubber for dividing data into windows 28 | and subsequent processing of data windows 29 | by change point detection algorithms 30 | """ 31 | 32 | def __init__(self, data_provider: DataProvider) -> None: 33 | """A scrubber for dividing data into windows 34 | and subsequent processing of data windows 35 | by change point detection algorithms 36 | 37 | """ 38 | self._data_provider = data_provider 39 | 40 | @abstractmethod 41 | def __iter__(self) -> Iterator[ScrubberWindow]: 42 | """Function for dividing data into parts to feed into the change point detection algorithm 43 | 44 | :return: Iterator of data windows for change point detection algorithm 45 | """ 46 | ... 47 | 48 | @property 49 | def data(self) -> Iterator[np.float64] | Iterator[npt.NDArray[np.float64]]: 50 | return iter(self._data_provider) 51 | -------------------------------------------------------------------------------- /pysatl_cpd/core/scrubber/data_providers.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module contains classes providing data from different sources to scrubbers. 3 | """ 4 | 5 | __author__ = "Vladimir Kutuev" 6 | __copyright__ = "Copyright (c) 2025 PySATL project" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from collections.abc import Iterator 10 | from typing import Protocol, runtime_checkable 11 | 12 | import numpy as np 13 | import numpy.typing as npt 14 | 15 | from pysatl_cpd.labeled_data import LabeledCpdData 16 | 17 | 18 | @runtime_checkable 19 | class DataProvider(Protocol): 20 | """Interface for abstracting the scrubber from the data source and its format""" 21 | 22 | def __iter__(self) -> Iterator[np.float64] | Iterator[npt.NDArray[np.float64]]: 23 | """ 24 | :return: an iterator over the data 25 | """ 26 | ... 27 | 28 | 29 | class ListUnivariateProvider(DataProvider): 30 | """Provides data from list of floats""" 31 | 32 | def __init__(self, data: list[float]) -> None: 33 | self._data = data 34 | 35 | def __iter__(self) -> Iterator[np.float64] | Iterator[npt.NDArray[np.float64]]: 36 | return map(np.float64, self._data) 37 | 38 | 39 | class ListMultivariateProvider(DataProvider): 40 | """Provides data from list of NumPy ndarrays""" 41 | 42 | def __init__(self, data: list[npt.NDArray[np.float64]]) -> None: 43 | self._data = data 44 | 45 | def __iter__(self) -> Iterator[np.float64] | Iterator[npt.NDArray[np.float64]]: 46 | return iter(self._data) 47 | 48 | 49 | class LabeledDataProvider(DataProvider): 50 | """Provides data from LabeledData instance""" 51 | 52 | def __init__(self, data: LabeledCpdData) -> None: 53 | self._data = data.raw_data 54 | 55 | def __iter__(self) -> Iterator[np.float64] | Iterator[npt.NDArray[np.float64]]: 56 | return iter(self._data) 57 | -------------------------------------------------------------------------------- /pysatl_cpd/core/scrubber/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for implementation of Linear Scrubber. 3 | """ 4 | 5 | __author__ = "Vladimir Kutuev, Artemii Patov" 6 | __copyright__ = "Copyright (c) 2025 PySATL project" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | from collections.abc import Iterator 10 | from itertools import islice 11 | 12 | import numpy as np 13 | import numpy.typing as npt 14 | 15 | from pysatl_cpd.core.scrubber.data_providers import DataProvider 16 | 17 | from .abstract import Scrubber, ScrubberWindow 18 | 19 | 20 | class LinearScrubber(Scrubber): 21 | """A linear scrubber for dividing data into windows by moving them through data""" 22 | 23 | def __init__( 24 | self, 25 | data_provider: DataProvider, 26 | window_length: int = 100, 27 | shift_factor: float = 1.0 / 3.0, 28 | ): 29 | """A linear scrubber for dividing data into windows by moving them through data 30 | 31 | :param window_length: length of data window 32 | :param shift_factor: how far will the window move relative to the length 33 | """ 34 | super().__init__(data_provider) 35 | self._window_length = window_length 36 | self._shift_factor = shift_factor 37 | 38 | def __iter__(self) -> Iterator[ScrubberWindow]: 39 | window_start = 0 40 | shift = max(1, int(self._window_length * self._shift_factor)) 41 | provided_data_it = iter(self._data_provider) 42 | next_slice = np.array(list(islice(provided_data_it, self._window_length))) 43 | window_data: npt.NDArray[np.float64] = np.array([]) 44 | while next_slice.size > 0: 45 | window_data = ( 46 | np.concat((np.delete(window_data, np.s_[:shift], 0), next_slice), axis=0) 47 | if len(window_data) > 0 48 | else next_slice 49 | ) 50 | window_end = window_start + min(self._window_length, len(window_data)) 51 | yield ScrubberWindow(window_data, list(range(window_start, window_end))) 52 | window_start += shift 53 | window_end += shift 54 | next_slice = np.array(list(islice(provided_data_it, shift))) 55 | -------------------------------------------------------------------------------- /pysatl_cpd/cpd_solver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module contains class for solving change point detection problem. 3 | """ 4 | 5 | __author__ = "Aleksei Ivanov, Artem Romanyuk, Vladimir Kutuev" 6 | __copyright__ = "Copyright (c) 2025 PySATL project" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import time 10 | 11 | from .core.algorithms.abstract_algorithm import Algorithm 12 | from .core.cpd_core import CpdCore 13 | from .core.problem import CpdProblem 14 | from .core.scrubber.abstract import Scrubber 15 | from .core.scrubber.data_providers import LabeledDataProvider 16 | from .icpd_solver import CpdLocalizationResults, ICpdSolver 17 | from .labeled_data import LabeledCpdData 18 | 19 | 20 | class CpdSolver(ICpdSolver): 21 | """Class, that grants a convenient interface to 22 | work with CPD algorithms""" 23 | 24 | def __init__( 25 | self, 26 | scenario: CpdProblem, 27 | algorithm: Algorithm, 28 | algorithm_input: Scrubber | tuple[LabeledCpdData, type[Scrubber]], 29 | ) -> None: 30 | """pysatl_cpd object constructor 31 | 32 | :param: scenario: scenario specify 33 | :param: algorithm: CPD algorithm, that will search for change points 34 | :param: scrubber: scrubber object for splitting data into parts 35 | """ 36 | self._labeled_data: LabeledCpdData | None = None 37 | self._cpd_core: CpdCore 38 | match algorithm_input: 39 | case Scrubber() as scrubber: 40 | self._cpd_core = CpdCore(scrubber, algorithm) 41 | case (data, scrubber_type): 42 | self._labeled_data = data 43 | self._cpd_core = CpdCore(scrubber_type(LabeledDataProvider(data)), algorithm) 44 | 45 | self._scenario = scenario 46 | 47 | def run(self) -> CpdLocalizationResults | int: 48 | """Execute CPD algorithm and return container with its results 49 | 50 | :return: CpdLocalizationResults object, containing algo result CP and expected CP if needed, 51 | or number of detected change points. 52 | """ 53 | time_start = time.perf_counter() 54 | if not self._scenario.to_localize: 55 | return self._cpd_core.detect() 56 | algo_results = self._cpd_core.localize() 57 | time_end = time.perf_counter() 58 | expected_change_points: list[int] | None = None 59 | if isinstance(self._labeled_data, LabeledCpdData): 60 | expected_change_points = self._labeled_data.change_points 61 | data = self._cpd_core.scrubber.data 62 | return CpdLocalizationResults(data, algo_results, expected_change_points, time_end - time_start) 63 | -------------------------------------------------------------------------------- /pysatl_cpd/generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/pysatl_cpd/generator/__init__.py -------------------------------------------------------------------------------- /pysatl_cpd/generator/dataset_description.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from itertools import accumulate 3 | 4 | from .distributions import Distribution 5 | 6 | 7 | class SampleDescription: 8 | """Contains dataset description: 9 | 10 | * sub-samples lengths; 11 | * sub-samples distributions. 12 | 13 | Also can represent it in AsciiDoc format. 14 | """ 15 | 16 | _name: str 17 | _samples_length: list[int] 18 | _samples_distributions: list[Distribution] 19 | 20 | def __init__( 21 | self, 22 | name: str, 23 | samples_length: list[int], 24 | samples_distributions: list[Distribution], 25 | ) -> None: 26 | """ 27 | Creates new DatasetDescription instance. 28 | 29 | :param name: Name for the sample. 30 | :param samples_length: List of sub-samples length. 31 | :param samples_distributions: List of sub-samples distributions. 32 | """ 33 | self._name = name 34 | self._samples_length = samples_length 35 | self._samples_distributions = samples_distributions 36 | assert len(self._samples_length) == len(self._samples_distributions) 37 | 38 | @property 39 | def name(self) -> str: 40 | return self._name 41 | 42 | @property 43 | def changepoints(self) -> list[int]: 44 | return list(accumulate(self._samples_length))[:-1] 45 | 46 | @property 47 | def length(self) -> list[int]: 48 | return self._samples_length 49 | 50 | @property 51 | def distributions(self) -> list[Distribution]: 52 | return self._samples_distributions 53 | 54 | def to_asciidoc(self, image_path: str | None = None) -> str: 55 | """ 56 | Converts `DatasetDescription` instance to string in AsciiDoc format. 57 | This description contain information about sample length, sub-samples lengths and distributions, 58 | changepoints indices in sample. 59 | 60 | Example 61 | ------- 62 | .. code-block:: 63 | 64 | = Sample 20-normal-0-1-20-normal-10-1 65 | 66 | [horizontal] 67 | Sample length:: 40 68 | Sub-samples lengths:: [20, 20] 69 | Change points:: [20] 70 | 71 | == Distributions 72 | 73 | . normal 74 | [horizontal] 75 | mean:: 0.0 76 | variance:: 1.0 77 | . normal 78 | [horizontal] 79 | mean:: 10.0 80 | variance:: 1.0 81 | 82 | :return: Dataset description string in AsciiDoc format. 83 | """ 84 | description = StringIO() 85 | description.write(f"= Sample {self._name}\n\n") 86 | description.write("[horizontal]\n") 87 | description.write(f"Sample length:: {sum(self._samples_length)}\n") 88 | description.write(f"Subsamples lengths:: {self._samples_length}\n") 89 | description.write(f"Change points:: {self.changepoints}\n\n") 90 | description.write("== Distributions\n\n") 91 | for i in range(len(self._samples_length)): 92 | distr = self._samples_distributions[i] 93 | description.write(f". {distr.name}\n") 94 | description.write("[horizontal]\n") 95 | for k, v in distr.params.items(): 96 | description.write(f"{k}:: {v}\n") 97 | 98 | if image_path: 99 | description.write("\n") 100 | description.write(f"image::{image_path}[Sample]\n") 101 | 102 | return description.getvalue() 103 | 104 | 105 | class DatasetDescriptionBuilder: 106 | """Builder for `DatasetDescription` instance.""" 107 | 108 | def __init__(self) -> None: 109 | """Creates new DatasetDescriptionBuilder empty instance.""" 110 | self._distributions: dict[int, tuple[int, Distribution]] = dict() 111 | self._name: str | None = None 112 | 113 | def set_name(self, name: str) -> None: 114 | """Set name for dataset 115 | 116 | :param name: name for dataset""" 117 | self._name = name 118 | 119 | def add_distribution( 120 | self, distribution_type: str, distribution_length: int, distribution_parameters: dict[str, str] 121 | ) -> None: 122 | """Add new distribution to dataset 123 | 124 | :param distribution_type: type of distribution 125 | :param distribution_length: length of distribution in dataset 126 | :param distribution_parameters: special distribution parameters""" 127 | distribution_index = len(self._distributions) 128 | distribution = Distribution.from_str(distribution_type, distribution_parameters) 129 | self._distributions[distribution_index] = (distribution_length, distribution) 130 | 131 | def build(self) -> SampleDescription: 132 | """ 133 | Validate parameters and create `DatasetDescription` instance. 134 | 135 | :return: New `DatasetDescription` instance. 136 | """ 137 | assert self._name 138 | assert len(self._distributions) 139 | lengths, distributions = zip(*self._distributions.values()) 140 | return SampleDescription(self._name, list(lengths), list(distributions)) 141 | -------------------------------------------------------------------------------- /pysatl_cpd/generator/generator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from enum import Enum 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import numpy.typing as npt 7 | 8 | from .config_parser import ConfigParser 9 | from .distributions import Distribution 10 | from .saver import DatasetSaver 11 | 12 | 13 | class Generators(Enum): 14 | SCIPY = "scipy" 15 | 16 | def __str__(self) -> str: 17 | return self.value 18 | 19 | 20 | class DatasetGenerator(ABC): 21 | """ 22 | An interface for dataset generators using different backends (e.g. SciPy or Numpy) 23 | to create a sample with a given distributions and lengths. 24 | """ 25 | 26 | @abstractmethod 27 | def generate_sample(self, distributions: list[Distribution], lengths: list[int]) -> npt.NDArray[np.float64]: 28 | """ 29 | Creates a sample consists of subsamples with given `distributions` and `lengths`. 30 | 31 | :param distributions: List of distributions for subsamples. 32 | :param lengths: List of subsamples lengths. 33 | :return: Created sample. 34 | """ 35 | raise NotImplementedError() 36 | 37 | @staticmethod 38 | def get_generator(generator_backend: Generators) -> "DatasetGenerator": 39 | match generator_backend: 40 | case Generators.SCIPY: 41 | return ScipyDatasetGenerator() 42 | case _: 43 | raise ValueError("Unknown generator") 44 | 45 | def generate_datasets( 46 | self, config_path: Path, saver: DatasetSaver | None = None 47 | ) -> dict[str, tuple[npt.NDArray[np.float64], list[int]]]: 48 | """Generate pairs of dataset and change points by config file 49 | 50 | :param config_path: path to config file 51 | :param saver: saver of saving files (if saver is None, then the data does not need to be saved), 52 | defaults to None 53 | 54 | :return: dictionary with names and pairs of dataset and change points 55 | """ 56 | config_parser: ConfigParser = ConfigParser(config_path) 57 | 58 | datasets = dict() 59 | 60 | for descr in config_parser: 61 | sample = self.generate_sample(descr.distributions, descr.length) 62 | current_point = 0 63 | change_points = [] 64 | for length in descr.length[:-1]: 65 | current_point += length 66 | change_points.append(current_point) 67 | datasets[descr.name] = (sample, change_points) 68 | if saver: 69 | saver.save_sample(sample, descr) 70 | return datasets 71 | 72 | 73 | class ScipyDatasetGenerator(DatasetGenerator): 74 | """ 75 | Dataset generator using SciPy to create samples. 76 | """ 77 | 78 | def generate_sample(self, distributions: list[Distribution], lengths: list[int]) -> npt.NDArray[np.float64]: 79 | return np.concatenate( 80 | [distribution.scipy_sample(length) for distribution, length in zip(distributions, lengths)] 81 | ) 82 | -------------------------------------------------------------------------------- /pysatl_cpd/generator/saver.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Final 3 | 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import numpy.typing as npt 7 | 8 | from pysatl_cpd.generator.dataset_description import SampleDescription 9 | 10 | 11 | class DatasetSaver: 12 | """ 13 | Saves samples and descriptions to specified directory. 14 | """ 15 | 16 | SAMPLE_DATA: Final[str] = "sample.csv" 17 | DESCRIPTION: Final[str] = "sample.adoc" 18 | SAMPLE_IMAGE: Final[str] = "sample.png" 19 | CHANGEPOINTS_DATA: Final[str] = "changepoints.csv" 20 | 21 | _out_dir: Path 22 | _replace: bool 23 | 24 | def __init__(self, out_dir: Path, replace: bool): 25 | """ 26 | :param out_dir: Directory to save samples and descriptions. 27 | :param replace: Whether sample should be saved if it already exists. 28 | """ 29 | if not out_dir.exists(): 30 | out_dir.mkdir() 31 | self._replace = replace 32 | self._out_dir = out_dir 33 | 34 | def save_sample(self, sample: npt.NDArray[np.float64], description: SampleDescription) -> bool: 35 | """ 36 | Save sample, list of changepoints, sample plot and AsciiDoc description. 37 | 38 | :param sample: Sample to save. 39 | :param description: Description of the saving `sample`. 40 | :return: Whether sample and description have been saved to output directory. 41 | """ 42 | sample_dir: Path = self._out_dir.joinpath(description.name) 43 | if sample_dir.exists() and not self._replace: 44 | return False 45 | if not sample_dir.exists(): 46 | sample_dir.mkdir() 47 | # Save generated sample 48 | sample_file: Path = sample_dir.joinpath(DatasetSaver.SAMPLE_DATA) 49 | np.savetxt(sample_file, sample, delimiter=",") 50 | # Save changepoints list 51 | changepoints_file: Path = sample_dir.joinpath(DatasetSaver.CHANGEPOINTS_DATA) 52 | changepoints: list[int] = description.changepoints 53 | with open(changepoints_file, "w") as cf: 54 | for cp in changepoints: 55 | cf.write(f"{cp}\n") 56 | # Save sample plot 57 | image_file: Path = sample_dir.joinpath(DatasetSaver.SAMPLE_IMAGE) 58 | plt.plot(sample) 59 | plt.vlines(x=changepoints, ymin=sample.min(), ymax=sample.max(), colors="orange", ls="--") 60 | plt.savefig(image_file) 61 | plt.close() 62 | # Save description 63 | description_file: Path = sample_dir.joinpath(DatasetSaver.DESCRIPTION) 64 | with open(description_file, "w") as df: 65 | df.write(description.to_asciidoc(DatasetSaver.SAMPLE_IMAGE)) 66 | 67 | return True 68 | -------------------------------------------------------------------------------- /pysatl_cpd/labeled_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module contains wrapper for generated or labeled dataset. 3 | """ 4 | 5 | __author__ = "Artem Romanyuk, Vladimir Kutuev" 6 | __copyright__ = "Copyright (c) 2025 PySATL project" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import os 10 | from collections.abc import Iterator 11 | from pathlib import Path 12 | 13 | import numpy as np 14 | import numpy.typing as npt 15 | 16 | from pysatl_cpd.generator.generator import DatasetGenerator, ScipyDatasetGenerator 17 | from pysatl_cpd.generator.saver import DatasetSaver 18 | 19 | 20 | class LabeledCpdData: 21 | """Class for generating and storing labeled data, 22 | needed in pysatl_cpd""" 23 | 24 | def __init__( 25 | self, 26 | raw_data: npt.NDArray[np.float64], 27 | change_points: list[int], 28 | ) -> None: 29 | """LabeledCPData object constructor 30 | 31 | :param: raw_data: data, that will be passed into CPD algo 32 | :param: change_points: expected results after passing raw_data into CPD algo 33 | """ 34 | self.raw_data = raw_data 35 | self.change_points = change_points 36 | 37 | def __iter__(self) -> Iterator[npt.NDArray[np.float64]]: 38 | """labeledCPData iterator""" 39 | return self.raw_data.__iter__() 40 | 41 | def __str__(self) -> str: 42 | """Shows main info about LabeledCPData object""" 43 | return f"data={self.raw_data}, change_points={self.change_points}" 44 | 45 | def __len__(self) -> int: 46 | return len(self.raw_data) 47 | 48 | @staticmethod 49 | def generate_cp_datasets( 50 | config_path: Path, 51 | generator: DatasetGenerator = ScipyDatasetGenerator(), 52 | to_save: bool = False, 53 | output_directory: Path = Path(), 54 | to_replace: bool = True, 55 | ) -> dict[str, "LabeledCpdData"]: 56 | """Method for generating labeled data, that contains CP with specific 57 | distribution 58 | 59 | :param config_path: path to config file 60 | :param generator: DataGenerator object, defaults to ScipyDatasetGenerator() 61 | :param to_save: is it necessary to save the data, defaults to False 62 | :param output_directory: directory to save data, defaults to Path() 63 | :param to_replace: is it necessary to replace the files in directory 64 | 65 | :return: dict of pairs: name, LabeledCPData (pairs of data and change points)""" 66 | # maybe create default config 67 | if not os.path.exists(config_path): 68 | raise ValueError("Incorrect config path") 69 | if to_save: 70 | datasets = generator.generate_datasets(config_path, DatasetSaver(output_directory, to_replace)) 71 | else: 72 | datasets = generator.generate_datasets(config_path) 73 | labeled_data_dict = dict() 74 | for name in datasets: 75 | data, change_points = datasets[name] 76 | labeled_data_dict[name] = LabeledCpdData(data, change_points) 77 | return labeled_data_dict 78 | 79 | @staticmethod 80 | def read_generated_datasets(datasets_directory: Path) -> dict[str, "LabeledCpdData"]: 81 | """Read already generated datasets from directory 82 | 83 | :param datasets_directory: directory with datasets 84 | :return: dict of pairs: name, LabeledCPData (pairs of data and change points)""" 85 | datasets = dict() 86 | for dataset_directory in os.scandir(datasets_directory): 87 | dataset_files = dict() 88 | with os.scandir(dataset_directory) as entries: 89 | for file in entries: 90 | dataset_files[file.name] = file 91 | if "changepoints.csv" not in dataset_files or "sample.csv" not in dataset_files: 92 | raise ValueError(f"{datasets_directory} is not datasets directory") 93 | with open(dataset_files["sample.csv"]) as sample: 94 | raw_data = sample.readlines() 95 | data: list[npt.NDArray[np.float64]] | npt.NDArray[np.float64] 96 | try: 97 | data = np.array(list(map(np.float64, raw_data))) 98 | except ValueError: 99 | data = np.array([list(map(np.float64, vals.split(","))) for vals in raw_data]) 100 | with open(dataset_files["changepoints.csv"]) as changepoints: 101 | change_points = list(map(int, changepoints.readlines())) 102 | datasets[dataset_directory.name] = LabeledCpdData(data, change_points) 103 | return datasets 104 | -------------------------------------------------------------------------------- /pysatl_cpd/online_cpd_solver.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module contains class for solving change point detection problem with an online CPD algorithm. 3 | """ 4 | 5 | __author__ = "Alexey Tatyanenko" 6 | __copyright__ = "Copyright (c) 2025 PySATL project" 7 | __license__ = "SPDX-License-Identifier: MIT" 8 | 9 | import time 10 | 11 | from pysatl_cpd.core.algorithms.online_algorithm import OnlineAlgorithm 12 | from pysatl_cpd.core.online_cpd_core import OnlineCpdCore 13 | from pysatl_cpd.core.problem import CpdProblem 14 | from pysatl_cpd.core.scrubber.data_providers import DataProvider, LabeledDataProvider 15 | from pysatl_cpd.icpd_solver import CpdLocalizationResults, ICpdSolver 16 | from pysatl_cpd.labeled_data import LabeledCpdData 17 | 18 | 19 | class OnlineCpdSolver(ICpdSolver): 20 | """Class, that grants a convenient interface to 21 | work with online-CPD algorithms""" 22 | 23 | def __init__( 24 | self, 25 | scenario: CpdProblem, 26 | algorithm: OnlineAlgorithm, 27 | algorithm_input: DataProvider | LabeledCpdData, 28 | ) -> None: 29 | """pysatl_cpd object constructor 30 | 31 | :param: scenario: scenario specify 32 | :param: algorithm: online-CPD algorithm, that will search for change points 33 | :param: algorithm_input: data provider or labeled data to construct corresponding data provider. 34 | """ 35 | self._labeled_data: LabeledCpdData | None = None 36 | self._cpd_core: OnlineCpdCore 37 | match algorithm_input: 38 | case LabeledCpdData() as data: 39 | self._labeled_data = data 40 | self._cpd_core = OnlineCpdCore( 41 | data_provider=LabeledDataProvider(data), 42 | algorithm=algorithm, 43 | ) 44 | case DataProvider() as data_provider: 45 | self._cpd_core = OnlineCpdCore( 46 | data_provider=data_provider, 47 | algorithm=algorithm, 48 | ) 49 | 50 | self._scenario = scenario 51 | 52 | def run(self) -> CpdLocalizationResults | int: 53 | """Execute online-CPD algorithm and return container with its results 54 | 55 | :return: CpdLocalizationResults object, containing algo result CP and expected CP if needed 56 | """ 57 | time_start = time.perf_counter() 58 | if not self._scenario.to_localize: 59 | return sum(self._cpd_core.detect()) 60 | 61 | algo_results = [cp for cp in self._cpd_core.localize() if cp is not None] 62 | 63 | time_end = time.perf_counter() 64 | expected_change_points: list[int] | None = None 65 | if isinstance(self._labeled_data, LabeledCpdData): 66 | expected_change_points = self._labeled_data.change_points 67 | data = iter(self._cpd_core.data_provider) 68 | return CpdLocalizationResults(data, algo_results, expected_change_points, time_end - time_start) 69 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_configs/test_config_1.yml: -------------------------------------------------------------------------------- 1 | - name: 20-normal-0-1-no-change-point 2 | distributions: 3 | - type: normal 4 | length: 20 5 | parameters: 6 | mean: 0 7 | variance: 1 8 | - name: 20-exponential-1-no-change-point 9 | distributions: 10 | - type: exponential 11 | length: 20 12 | parameters: 13 | rate: 1 14 | - name: 20-weibull-1-1-no-change-point 15 | distributions: 16 | - type: weibull 17 | length: 20 18 | parameters: 19 | shape: 1 20 | scale: 1 21 | - name: 20-uniform-0-1-no-change-point 22 | distributions: 23 | - type: uniform 24 | length: 20 25 | parameters: 26 | min: 0 27 | max: 1 28 | - name: 20-beta-1-1-no-change-point 29 | distributions: 30 | - type: beta 31 | length: 20 32 | parameters: 33 | alpha: 1 34 | beta: 1 35 | - name: 20-gamma-1-1-no-change-point 36 | distributions: 37 | - type: gamma 38 | length: 20 39 | parameters: 40 | alpha: 1 41 | beta: 1 42 | - name: 20-t-2-no-change-point 43 | distributions: 44 | - type: t 45 | length: 20 46 | parameters: 47 | n: 2 48 | - name: 20-lognorm-1-no-change-point 49 | distributions: 50 | - type: lognorm 51 | length: 20 52 | parameters: 53 | s: 1 54 | - name: 20-multivariate_normal-0-1-no-change-point 55 | distributions: 56 | - type: multivariate_normal 57 | length: 20 58 | parameters: 59 | mean: '["0.0", "1.0"]' 60 | - name: 100-normal-0-1-no-change-point 61 | distributions: 62 | - type: normal 63 | length: 100 64 | parameters: 65 | mean: 0 66 | variance: 1 67 | - name: 20-normal-0-1-20-normal-10-1 68 | distributions: 69 | - type: normal 70 | length: 20 71 | parameters: 72 | mean: 0 73 | variance: 1 74 | - type: normal 75 | length: 20 76 | parameters: 77 | mean: 10 78 | variance: 1 79 | - name: 20-multivariate_normal-0-0-20-multivariate_normal-10-10 80 | distributions: 81 | - type: multivariate_normal 82 | length: 20 83 | parameters: 84 | mean: '["0", "0"]' 85 | - type: multivariate_normal 86 | length: 20 87 | parameters: 88 | mean: '["10", "10"]' 89 | -------------------------------------------------------------------------------- /tests/test_configs/test_config_exp.yml: -------------------------------------------------------------------------------- 1 | - name: exp 2 | distributions: 3 | - type: exponential 4 | length: 1000 5 | parameters: 6 | rate: 2.0 7 | - type: beta 8 | length: 500 9 | parameters: 10 | alpha: 1.0 11 | beta: 5.0 12 | -------------------------------------------------------------------------------- /tests/test_core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/tests/test_core/__init__.py -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/tests/test_core/test_algorithms/__init__.py -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/test_algorithms_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/tests/test_core/test_algorithms/test_algorithms_utils/__init__.py -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/test_algorithms_utils/bayesian/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/tests/test_core/test_algorithms/test_algorithms_utils/bayesian/__init__.py -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/test_algorithms_utils/bayesian/test_detectors_and_localizers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pysatl_cpd.core.algorithms.bayesian.detectors.drop import DropDetector 5 | from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector 6 | from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer 7 | 8 | 9 | @pytest.fixture( 10 | params=[pytest.param((ThresholdDetector, 0.8), id="Threshold"), pytest.param((DropDetector, 0.1), id="Drop")], 11 | scope="function", 12 | ) 13 | def detector(request): 14 | cls, threshold = request.param 15 | return cls(threshold) 16 | 17 | 18 | def generate_test_data(has_cp=True): 19 | run_length_probs = np.full(100, 1.0) 20 | run_length_probs[-1] = 50 if has_cp else 500 21 | 22 | return run_length_probs / run_length_probs.sum() 23 | 24 | 25 | class TestDetectors: 26 | def test_detection(self, detector): 27 | before_cp = generate_test_data(has_cp=False) 28 | after_cp = generate_test_data(has_cp=True) 29 | print(before_cp[-1], after_cp[-1]) 30 | assert not detector.detect(before_cp), ( 31 | "Detector should not react in case of stable high probability of max run length" 32 | ) 33 | assert detector.detect(after_cp), ( 34 | "Detector should react in case of significant abrupt drop of probability of max run length" 35 | ) 36 | 37 | def test_clear(self, detector): 38 | cp_data = generate_test_data(has_cp=True) 39 | 40 | first_result = detector.detect(cp_data) 41 | detector.clear() 42 | 43 | second_result = detector.detect(cp_data) 44 | 45 | assert first_result == second_result, "A state was not cleared correctly" 46 | 47 | 48 | class TestDropDetectorSpecific: 49 | def test_gradual_change(self): 50 | detector = DropDetector(0.1) 51 | run_lengths = np.full(100, 1.0) 52 | 53 | for value in range(500, 490, -1): 54 | run_lengths[-1] = value 55 | data = run_lengths / run_lengths.sum() 56 | assert not detector.detect(data), "Drop detector should not react on a gradual probability decrease" 57 | 58 | 59 | class TestArgmaxLocalizer: 60 | def test_localization(self): 61 | change_point = 5 62 | run_lengths = np.full(11, 0.05) 63 | run_lengths[change_point] = 0.5 64 | localizer = ArgmaxLocalizer() 65 | result = localizer.localize(run_lengths) 66 | assert result == change_point, f"Expected change at {change_point}, got {result}" 67 | -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/test_algorithms_utils/bayesian/test_hazards.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard 5 | 6 | 7 | class TestConstantHazard: 8 | @pytest.mark.parametrize("hazard_rate,max_run_length", [(1.1, 50), (10, 100), (200, 250), (500.325251, 500)]) 9 | def test_constant_hazard_for_constants(self, hazard_rate, max_run_length): 10 | constant_hazard = ConstantHazard(hazard_rate) 11 | run_lengths = np.arange(max_run_length, dtype=np.intp) 12 | hazard_probs = constant_hazard.hazard(run_lengths) 13 | assert hazard_probs.shape[0] == max_run_length, ( 14 | f"Expected {max_run_length} probabilities, got {hazard_probs.shape[0]}" 15 | ) 16 | assert np.all(hazard_probs == 1 / hazard_rate), f"Hazard probabilities must be {1 / hazard_rate}" 17 | -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/test_algorithms_utils/bayesian/test_likelihoods.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.exponential_conjugate import ( 5 | ExponentialConjugate, 6 | ExponentialConjugateWithPriorProbability, 7 | ) 8 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import ( 9 | GaussianConjugate, 10 | GaussianConjugateWithPriorProbability, 11 | ) 12 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.heuristic_gaussian_vs_exponential import ( 13 | HeuristicGaussianVsExponential, 14 | ) 15 | 16 | 17 | @pytest.fixture(scope="module") 18 | def set_seed(): 19 | np.random.seed(42) 20 | 21 | 22 | @pytest.fixture( 23 | params=[ 24 | (GaussianConjugate, {"pre_loc": 0, "pre_scale": 1, "post_loc": 5, "post_scale": 2}), 25 | (ExponentialConjugate, {"pre_scale": 1 / 0.5, "post_scale": 1 / 2}), 26 | (HeuristicGaussianVsExponential, {"pre_scale": 1 / 0.5, "post_loc": 5, "post_scale": 2}), 27 | ], 28 | ids=["Gaussian", "Exponential", "HeuristicGaussianVsExponential"], 29 | ) 30 | def likelihood_config(request): 31 | return request.param 32 | 33 | 34 | @pytest.fixture 35 | def test_data(likelihood_config, set_seed): 36 | likelihood_cls, params = likelihood_config 37 | size = 500 38 | change_point = 250 39 | 40 | match likelihood_cls(): 41 | case GaussianConjugate(): 42 | data = np.concatenate( 43 | [ 44 | np.random.normal(params["pre_loc"], params["pre_scale"], change_point), 45 | np.random.normal(params["post_loc"], params["post_scale"], size - change_point), 46 | ] 47 | ) 48 | case ExponentialConjugate(): 49 | data = np.concatenate( 50 | [ 51 | np.random.exponential(params["pre_scale"], change_point), 52 | np.random.exponential(params["post_scale"], size - change_point), 53 | ] 54 | ) 55 | case HeuristicGaussianVsExponential(): 56 | data = np.concatenate( 57 | [ 58 | np.random.exponential(params["pre_scale"], change_point), 59 | np.random.normal(params["post_loc"], params["post_scale"], size - change_point), 60 | ] 61 | ) 62 | case _: 63 | raise ValueError("Unsupported likelihood") 64 | 65 | return data 66 | 67 | 68 | class TestConjugateLikelihood: 69 | @pytest.fixture(autouse=True) 70 | def setup(self, test_data, likelihood_config): 71 | self.likelihood_cls = likelihood_config[0] 72 | self.data = test_data 73 | self.size = 500 74 | self.change_point = 250 75 | self.learning_steps = 50 76 | 77 | def test_learning_and_update(self): 78 | likelihood = self.likelihood_cls() 79 | likelihood.learn(self.data[: self.learning_steps]) 80 | 81 | metrics = {"after_learn": None, "before_cp": None, "after_cp": None} 82 | 83 | for time in range(self.learning_steps, self.size): 84 | observation = np.float64(self.data[time]) 85 | pred_probs = likelihood.predict(observation) 86 | 87 | assert len(pred_probs) == time - self.learning_steps + 1 88 | 89 | current_mean = np.mean(pred_probs) 90 | if time == self.learning_steps + 1: 91 | metrics["after_learn"] = current_mean 92 | elif time == self.change_point - 1: 93 | metrics["before_cp"] = current_mean 94 | elif time == self.change_point + 1: 95 | metrics["after_cp"] = current_mean 96 | 97 | likelihood.update(observation) 98 | 99 | assert not np.isclose(metrics["after_learn"], metrics["before_cp"], atol=0.05) 100 | assert not np.isclose(metrics["before_cp"], metrics["after_cp"], atol=0.05) 101 | 102 | @pytest.mark.parametrize("data_size", [51, 100], ids=["small", "medium"]) 103 | def test_clear(self, data_size): 104 | likelihood = self.likelihood_cls() 105 | test_data = self.data[:data_size] 106 | 107 | likelihood.learn(test_data[:-2]) 108 | first = likelihood.predict(np.float64(test_data[-1])) 109 | 110 | likelihood.clear() 111 | likelihood.learn(test_data[:-2]) 112 | second = likelihood.predict(np.float64(test_data[-1])) 113 | 114 | np.testing.assert_array_equal(first, second) 115 | 116 | 117 | class TestPriorProbabilityOfSample: 118 | @pytest.fixture(autouse=True) 119 | def setup_teardown(self): 120 | np.random.seed(42) 121 | self.data_size = 20 122 | 123 | @pytest.fixture( 124 | params=[("exponential", "normal"), ("normal", "exponential")], 125 | ids=[ 126 | "exponential data", 127 | "normal_data", 128 | ], 129 | ) 130 | def test_scenario(self, request): 131 | return request.param 132 | 133 | @pytest.fixture 134 | def datasets(self): 135 | return { 136 | "exponential": np.random.exponential(size=self.data_size), 137 | "normal": np.random.normal(size=self.data_size), 138 | } 139 | 140 | def test_probabilities_of_samples(self, test_scenario, datasets): 141 | target_likelihood, compared_likelihood = test_scenario 142 | target_data = datasets[target_likelihood] 143 | 144 | target_likelihood = ( 145 | ExponentialConjugateWithPriorProbability() 146 | if target_likelihood == "exponential" 147 | else GaussianConjugateWithPriorProbability() 148 | ) 149 | compare_likelihood = ( 150 | ExponentialConjugateWithPriorProbability() 151 | if compared_likelihood == "exponential" 152 | else GaussianConjugateWithPriorProbability() 153 | ) 154 | 155 | target_likelihood.learn(target_data) 156 | compare_likelihood.learn(target_data) 157 | 158 | target_prob = target_likelihood.probability_of_learned_prior(target_data) 159 | compare_prob = compare_likelihood.probability_of_learned_prior(target_data) 160 | 161 | assert target_prob > compare_prob, ( 162 | f"{target_likelihood} likelihood should have higher probability " 163 | f"for {target_likelihood} data than {compare_likelihood} likelihood" 164 | ) 165 | -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/test_bayesian_algorithm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector 5 | from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard 6 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import ( 7 | GaussianConjugate, 8 | ) 9 | from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer 10 | from pysatl_cpd.core.algorithms.bayesian_algorithm import BayesianAlgorithm 11 | 12 | 13 | def set_seed(): 14 | np.random.seed(1) 15 | 16 | 17 | def construct_bayesian_algorithm(): 18 | return BayesianAlgorithm( 19 | learning_steps=50, 20 | likelihood=GaussianConjugate(), 21 | hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), 22 | detector=ThresholdDetector(threshold=0.04), 23 | localizer=ArgmaxLocalizer(), 24 | ) 25 | 26 | 27 | @pytest.fixture(scope="function") 28 | def data_params(): 29 | return { 30 | "num_of_tests": 10, 31 | "size": 500, 32 | "change_point": 250, 33 | "tolerable_deviation": 25, 34 | } 35 | 36 | 37 | @pytest.fixture 38 | def generate_data(data_params): 39 | def _generate_data(): 40 | set_seed() 41 | return np.concatenate( 42 | [ 43 | np.random.normal(loc=0, scale=1, size=data_params["change_point"]), 44 | np.random.normal(loc=5, scale=2, size=data_params["size"] - data_params["change_point"]), 45 | ] 46 | ) 47 | 48 | return _generate_data 49 | 50 | 51 | @pytest.fixture(scope="function") 52 | def outer_bayesian_algorithm(): 53 | return construct_bayesian_algorithm() 54 | 55 | 56 | @pytest.fixture 57 | def inner_algorithm_factory(): 58 | def _factory(): 59 | return construct_bayesian_algorithm() 60 | 61 | return _factory 62 | 63 | 64 | class TestBayesianAlgorithm: 65 | def test_consecutive_detection(self, outer_bayesian_algorithm, generate_data, data_params): 66 | for _ in range(data_params["num_of_tests"]): 67 | data = generate_data() 68 | result = outer_bayesian_algorithm.detect(data) 69 | assert result, "There was undetected change point in data" 70 | 71 | def test_correctness_of_consecutive_detection( 72 | self, outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params 73 | ): 74 | for _ in range(data_params["num_of_tests"]): 75 | data = generate_data() 76 | inner_algorithm = inner_algorithm_factory() 77 | outer_result = outer_bayesian_algorithm.detect(data) 78 | inner_result = inner_algorithm.detect(data) 79 | assert outer_result == inner_result, "Consecutive and independent detection should give same results" 80 | 81 | def test_consecutive_localization(self, outer_bayesian_algorithm, generate_data, data_params): 82 | for _ in range(data_params["num_of_tests"]): 83 | data = generate_data() 84 | result = outer_bayesian_algorithm.localize(data) 85 | assert ( 86 | len(result) > 0 87 | and data_params["change_point"] - data_params["tolerable_deviation"] 88 | <= result[0] 89 | <= data_params["change_point"] + data_params["tolerable_deviation"] 90 | ), "Incorrect change point localization" 91 | 92 | def test_correctness_of_consecutive_localization( 93 | self, outer_bayesian_algorithm, inner_algorithm_factory, generate_data, data_params 94 | ): 95 | for _ in range(data_params["num_of_tests"]): 96 | data = generate_data() 97 | inner_algorithm = inner_algorithm_factory() 98 | outer_result = outer_bayesian_algorithm.localize(data) 99 | inner_result = inner_algorithm.localize(data) 100 | assert outer_result == inner_result, "Consecutive and independent localization should give same results" 101 | -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/test_bayesian_linear_heuristic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector 5 | from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard 6 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.heuristic_gaussian_vs_exponential import ( 7 | HeuristicGaussianVsExponential, 8 | ) 9 | from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer 10 | from pysatl_cpd.core.algorithms.bayesian_linear_heuristic import BayesianLinearHeuristic 11 | from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnline 12 | from pysatl_cpd.core.problem import CpdProblem 13 | from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider 14 | from pysatl_cpd.online_cpd_solver import OnlineCpdSolver 15 | 16 | 17 | def generate_no_change_exponential(rate, n=40000, seed=None): 18 | np.random.seed(seed) 19 | return np.random.exponential(scale=1 / rate, size=n) 20 | 21 | 22 | def generate_no_change_normal(mean, std, n=40000, seed=None): 23 | np.random.seed(seed) 24 | return np.random.normal(loc=mean, scale=std, size=n) 25 | 26 | 27 | def generate_change_exp_to_exp(rate1, rate2, change_point, n=40000, seed=None): 28 | np.random.seed(seed) 29 | part1 = np.random.exponential(scale=1 / rate1, size=change_point) 30 | part2 = np.random.exponential(scale=1 / rate2, size=n - change_point) 31 | return np.concatenate([part1, part2]) 32 | 33 | 34 | def generate_change_norm_to_norm(mean1, std1, mean2, std2, change_point, n=40000, seed=None): 35 | np.random.seed(seed) 36 | part1 = np.random.normal(loc=mean1, scale=std1, size=change_point) 37 | part2 = np.random.normal(loc=mean2, scale=std2, size=n - change_point) 38 | return np.concatenate([part1, part2]) 39 | 40 | 41 | def generate_change_exp_to_norm(rate, mean, std, change_point, n=40000, seed=None): 42 | np.random.seed(seed) 43 | part1 = np.random.exponential(scale=1 / rate, size=change_point) 44 | part2 = np.random.normal(loc=mean, scale=std, size=n - change_point) 45 | return np.concatenate([part1, part2]) 46 | 47 | 48 | def generate_change_norm_to_exp(mean, std, rate, change_point, n=40000, seed=None): 49 | np.random.seed(seed) 50 | part1 = np.random.normal(loc=mean, scale=std, size=change_point) 51 | part2 = np.random.exponential(scale=1 / rate, size=n - change_point) 52 | return np.concatenate([part1, part2]) 53 | 54 | 55 | @pytest.fixture 56 | def setup_algorithm(): 57 | base_algorithm = BayesianOnline( 58 | learning_sample_size=20, 59 | likelihood=HeuristicGaussianVsExponential(), 60 | hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), 61 | detector=ThresholdDetector(threshold=0.04), 62 | localizer=ArgmaxLocalizer(), 63 | ) 64 | heuristic_algorithm = BayesianLinearHeuristic( 65 | algorithm=base_algorithm, time_before_duplicate_start=275, duplicate_preparation_time=225 66 | ) 67 | return base_algorithm, heuristic_algorithm 68 | 69 | 70 | @pytest.mark.parametrize( 71 | "data_generator, params, true_cp", 72 | [ 73 | (generate_no_change_exponential, {"rate": 2.0}, None), 74 | (generate_no_change_normal, {"mean": 0.0, "std": 1.0}, None), 75 | (generate_change_exp_to_exp, {"rate1": 2.0, "rate2": 0.5, "change_point": 10000}, 10000), 76 | ( 77 | generate_change_norm_to_norm, 78 | {"mean1": 0.0, "std1": 1.0, "mean2": 5.0, "std2": 1.0, "change_point": 15000}, 79 | 15000, 80 | ), 81 | (generate_change_exp_to_norm, {"rate": 2.0, "mean": 5.0, "std": 1.0, "change_point": 20000}, 20000), 82 | (generate_change_norm_to_exp, {"mean": 0.0, "std": 1.0, "rate": 0.5, "change_point": 25000}, 25000), 83 | ], 84 | ) 85 | def test_cpd_detection(setup_algorithm, data_generator, params, true_cp): 86 | _, heuristic_algorithm = setup_algorithm 87 | 88 | data = data_generator(**params, n=40000, seed=42) 89 | data_provider = ListUnivariateProvider(list(data)) 90 | 91 | solver_heuristic = OnlineCpdSolver( 92 | scenario=CpdProblem(True), algorithm=heuristic_algorithm, algorithm_input=data_provider 93 | ) 94 | result_heuristic = solver_heuristic.run() 95 | 96 | if true_cp is None: 97 | print(result_heuristic.result) 98 | assert len(result_heuristic.result) < len(data) / 500, "There shouldn't be too much change points" 99 | else: 100 | assert any(true_cp - 25 <= cp <= true_cp + 25 for cp in result_heuristic.result), ( 101 | f"No detected change point near {true_cp} in heuristic result" 102 | ) 103 | 104 | 105 | def test_time_comparison(setup_algorithm): 106 | base_algorithm, heuristic_algorithm = setup_algorithm 107 | 108 | data = generate_change_exp_to_exp(rate1=2.0, rate2=0.5, change_point=10000, n=40000, seed=42) 109 | data_provider = ListUnivariateProvider(list(data)) 110 | 111 | solver_heuristic = OnlineCpdSolver( 112 | scenario=CpdProblem(True), algorithm=heuristic_algorithm, algorithm_input=data_provider 113 | ) 114 | time_heuristic = solver_heuristic.run().time_sec 115 | 116 | solver_base = OnlineCpdSolver(scenario=CpdProblem(True), algorithm=base_algorithm, algorithm_input=data_provider) 117 | time_base = solver_base.run().time_sec 118 | 119 | print(time_heuristic, time_base) 120 | assert time_heuristic < time_base, f"Heuristic time ({time_heuristic}) >= base time ({time_base})" 121 | -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/test_classification_algorithms.py: -------------------------------------------------------------------------------- 1 | from itertools import product 2 | 3 | import numpy as np 4 | import numpy.typing as npt 5 | import pytest 6 | 7 | import pysatl_cpd.generator.distributions as dstr 8 | from pysatl_cpd.core.algorithms.classification.classifiers.decision_tree import DecisionTreeClassifier 9 | from pysatl_cpd.core.algorithms.classification.classifiers.knn import KNNClassifier 10 | from pysatl_cpd.core.algorithms.classification.classifiers.rf import RFClassifier 11 | from pysatl_cpd.core.algorithms.classification.classifiers.svm import SVMClassifier 12 | from pysatl_cpd.core.algorithms.classification.quality_metrics.classification.f1 import F1 13 | from pysatl_cpd.core.algorithms.classification.quality_metrics.classification.mcc import MCC 14 | from pysatl_cpd.core.algorithms.classification.test_statistics.threshold_overcome import ThresholdOvercome 15 | from pysatl_cpd.core.algorithms.classification_algorithm import ClassificationAlgorithm 16 | from pysatl_cpd.core.algorithms.knn_algorithm import KNNAlgorithm 17 | from pysatl_cpd.core.scrubber.data_providers import LabeledDataProvider 18 | from pysatl_cpd.core.scrubber.linear import LinearScrubber 19 | from pysatl_cpd.cpd_solver import CpdProblem, CpdSolver 20 | from pysatl_cpd.labeled_data import LabeledCpdData 21 | 22 | K = 7 23 | CM_THRESHOLD = 4.5 24 | INDENT_COEFF = 0.25 25 | SHIFT_FACTOR = 0.5 26 | WINDOW_SIZE = 48 27 | SIZE = 200 28 | CP_N = 100 29 | TOLERABLE_DEVIATION = WINDOW_SIZE / 2 30 | EXPECTED_CP = 100 31 | CLASSIFIERS = ["knn", "svm", "rf", "dt"] 32 | METRICS = ["mcc"] 33 | 34 | 35 | def assert_result(actual): 36 | def in_interval(cp): 37 | return EXPECTED_CP - TOLERABLE_DEVIATION <= cp <= EXPECTED_CP + TOLERABLE_DEVIATION 38 | 39 | assert (len(actual) > 0 and all(in_interval(cp) for cp in actual)), "Incorrect change point localization" 40 | 41 | 42 | def build_classification_alg(classifier_name, metric_name): 43 | match metric_name: 44 | case "f1": 45 | quality_metric = F1() 46 | threshold = 0.85 47 | case "mcc": 48 | quality_metric = MCC() 49 | threshold = 0.85 50 | case _: 51 | raise NotImplementedError("No such metric yet.") 52 | 53 | match classifier_name: 54 | case "knn": 55 | classifier = KNNClassifier(K) 56 | case "svm": 57 | classifier = SVMClassifier() 58 | case "dt": 59 | classifier = DecisionTreeClassifier() 60 | case "rf": 61 | classifier = RFClassifier() 62 | case _: 63 | raise NotImplementedError("No such classifier yet.") 64 | 65 | return ClassificationAlgorithm(classifier=classifier, 66 | quality_metric=quality_metric, 67 | test_statistic=ThresholdOvercome(threshold), 68 | indent_coeff=INDENT_COEFF) 69 | 70 | 71 | def build_solver(alg, data): 72 | data_provider = LabeledDataProvider(LabeledCpdData(data, [EXPECTED_CP])) 73 | scrubber = LinearScrubber(data_provider, WINDOW_SIZE, SHIFT_FACTOR) 74 | return CpdSolver(CpdProblem(to_localize=True), algorithm=alg, algorithm_input=scrubber) 75 | 76 | 77 | @pytest.fixture(scope="session") 78 | def univariate_data(): 79 | np.random.seed(1) 80 | left_distr = dstr.Distribution.from_str( 81 | str(dstr.Distributions.UNIFORM), 82 | {"min": "2.0", "max": "2.1"}) 83 | right_distr = dstr.Distribution.from_str( 84 | str(dstr.Distributions.UNIFORM), 85 | {"min": "0.0", "max": "0.1"}) 86 | return np.concatenate( 87 | [ 88 | left_distr.scipy_sample(EXPECTED_CP), 89 | right_distr.scipy_sample(SIZE - EXPECTED_CP), 90 | ] 91 | ) 92 | 93 | 94 | @pytest.fixture(scope="session") 95 | def multivariate_data(): 96 | np.random.seed(1) 97 | left_distr = dstr.Distribution.from_str( 98 | str(dstr.Distributions.MULTIVARIATIVE_NORMAL), 99 | {"mean": str([0.0] * 10)}) 100 | right_distr = dstr.Distribution.from_str( 101 | str(dstr.Distributions.MULTIVARIATIVE_NORMAL), 102 | {"mean": str([5.0] * 10)}) 103 | return np.concatenate( 104 | [ 105 | left_distr.scipy_sample(EXPECTED_CP), 106 | right_distr.scipy_sample(SIZE - EXPECTED_CP) 107 | ] 108 | ) 109 | 110 | 111 | class TestClassificationCpd: 112 | @pytest.mark.parametrize( 113 | "classifier_name, metric", 114 | list(product(CLASSIFIERS, METRICS)), 115 | ) 116 | def test_classification_cpd_univariate(self, classifier_name, metric, univariate_data): 117 | alg = build_classification_alg(classifier_name, metric) 118 | solver = build_solver(alg, univariate_data) 119 | actual = solver.run().result 120 | assert_result(actual) 121 | 122 | @pytest.mark.parametrize( 123 | "classifier_name, metric", 124 | list(product(CLASSIFIERS, METRICS)), 125 | ) 126 | def test_classification_cpd_multivariate(self, classifier_name, metric, multivariate_data): 127 | alg = build_classification_alg(classifier_name, metric) 128 | solver = build_solver(alg, multivariate_data) 129 | actual = solver.run().result 130 | assert_result(actual) 131 | 132 | 133 | class TestKnnCpd: 134 | @pytest.fixture(scope="function") 135 | def knn_cpd_univariate(self): 136 | def metric(obs1: float, obs2: float) -> float: 137 | return abs(obs1 - obs2) 138 | 139 | return KNNAlgorithm(distance_func=metric, 140 | test_statistic=ThresholdOvercome(CM_THRESHOLD), 141 | indent_coeff=INDENT_COEFF, 142 | k=K) 143 | 144 | @pytest.fixture(scope="function") 145 | def knn_cpd_multivariate(self): 146 | def metric(obs1: npt.NDArray[np.float64], obs2: npt.NDArray[np.float64]) -> float: 147 | return float(np.linalg.norm(obs1 - obs2)) 148 | 149 | return KNNAlgorithm(distance_func=metric, 150 | test_statistic=ThresholdOvercome(CM_THRESHOLD), 151 | indent_coeff=INDENT_COEFF, 152 | k=K) 153 | 154 | def test_knn_cpd_univariate(self, knn_cpd_univariate, univariate_data): 155 | solver = build_solver(knn_cpd_univariate, univariate_data) 156 | actual = solver.run().result 157 | assert_result(actual) 158 | 159 | def test_knn_cpd_multivariate(self, knn_cpd_multivariate, multivariate_data): 160 | solver = build_solver(knn_cpd_multivariate, multivariate_data) 161 | actual = solver.run().result 162 | assert_result(actual) 163 | -------------------------------------------------------------------------------- /tests/test_core/test_algorithms/test_graph_algorithm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pysatl_cpd.core.algorithms.graph_algorithm import GraphAlgorithm 4 | 5 | 6 | def custom_comparison(node1, node2): 7 | arg = 5 8 | return abs(node1 - node2) <= arg 9 | 10 | 11 | class TestGraphAlgorithm: 12 | @pytest.mark.parametrize( 13 | "alg_param,data,expected", 14 | (((custom_comparison, 1.5), (50, 55, 60, 48, 52, 70, 75, 80, 90, 85, 95, 100, 50), [5]),), 15 | ) 16 | def test_localize(self, alg_param, data, expected): 17 | algorithm = GraphAlgorithm(*alg_param) 18 | assert algorithm.localize(data) == expected 19 | 20 | @pytest.mark.parametrize( 21 | "alg_param,data,expected", 22 | (((custom_comparison, 1.5), (50, 55, 60, 48, 52, 70, 75, 80, 90, 85, 95, 100, 50), 1),), 23 | ) 24 | def test_detect(self, alg_param, data, expected): 25 | algorithm = GraphAlgorithm(*alg_param) 26 | assert algorithm.detect(data) == expected 27 | -------------------------------------------------------------------------------- /tests/test_core/test_cpd_core.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pysatl_cpd.core.algorithms.graph_algorithm import GraphAlgorithm 4 | from pysatl_cpd.core.cpd_core import CpdCore 5 | from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider 6 | from pysatl_cpd.core.scrubber.linear import LinearScrubber 7 | 8 | 9 | def custom_comparison(node1, node2): 10 | arg = 5 11 | return abs(node1 - node2) <= arg 12 | 13 | 14 | class TestCPDCore: 15 | @pytest.mark.parametrize( 16 | "data,alg_class,alg_param,expected", 17 | ( 18 | ( 19 | [50, 55, 60, 48, 52, 70, 75, 80, 90, 85, 95, 100, 50], 20 | GraphAlgorithm, 21 | (custom_comparison, 1.5), 22 | [5], 23 | ), 24 | ), 25 | ) 26 | def test_run(self, data, alg_class, alg_param, expected): 27 | scrubber = LinearScrubber(ListUnivariateProvider(data)) 28 | algorithm = alg_class(*alg_param) 29 | 30 | core = CpdCore(scrubber, algorithm) 31 | assert core.localize() == expected 32 | -------------------------------------------------------------------------------- /tests/test_core/test_online_cpd_core.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector 5 | from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard 6 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import GaussianConjugate 7 | from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer 8 | from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnline 9 | from pysatl_cpd.core.online_cpd_core import OnlineCpdCore 10 | from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider 11 | 12 | DATA_PARAMS = { 13 | "num_of_tests": 10, 14 | "size": 500, 15 | "change_point": 250, 16 | "tolerable_deviation": 25, 17 | } 18 | 19 | 20 | @pytest.fixture(scope="session") 21 | def data_params(): 22 | return DATA_PARAMS 23 | 24 | 25 | def construct_bayesian_online_algorithm(): 26 | return BayesianOnline( 27 | learning_sample_size=50, 28 | likelihood=GaussianConjugate(), 29 | hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), 30 | detector=ThresholdDetector(threshold=0.04), 31 | localizer=ArgmaxLocalizer(), 32 | ) 33 | 34 | 35 | @pytest.fixture 36 | def algorithm(): 37 | return construct_bayesian_online_algorithm() 38 | 39 | 40 | @pytest.fixture(params=[True, False], ids=["with_cp", "without_cp"]) 41 | def dataset(request, data_params): 42 | np.random.seed(42 + request.param_index) 43 | if request.param: 44 | return np.concatenate( 45 | [ 46 | np.random.normal(0, 1, data_params["change_point"]), 47 | np.random.normal(5, 2, data_params["size"] - data_params["change_point"]), 48 | ] 49 | ) 50 | return np.random.normal(0, 1, data_params["size"]) 51 | 52 | 53 | @pytest.fixture 54 | def online_core(dataset): 55 | return OnlineCpdCore( 56 | algorithm=construct_bayesian_online_algorithm(), data_provider=ListUnivariateProvider(list(dataset)) 57 | ) 58 | 59 | 60 | class TestOnlineCpdCore: 61 | @pytest.mark.parametrize("test_iteration", range(DATA_PARAMS["num_of_tests"])) 62 | @pytest.mark.parametrize("mode", ["detect", "localize"]) 63 | def test_core_functionality(self, algorithm, online_core, dataset, data_params, mode, test_iteration): 64 | core_iterator = getattr(online_core, mode)() 65 | algo_method = getattr(algorithm, mode) 66 | 67 | for time_point in range(data_params["size"]): 68 | observation = dataset[time_point] 69 | algo_result = algo_method(observation) 70 | core_result = next(core_iterator) 71 | 72 | assert algo_result == core_result, ( 73 | f"Different results at {time_point} between manual {mode} and core {mode} iteration" 74 | ) 75 | -------------------------------------------------------------------------------- /tests/test_core/test_scrubber/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/tests/test_core/test_scrubber/__init__.py -------------------------------------------------------------------------------- /tests/test_core/test_scrubber/test_dataproviders.py: -------------------------------------------------------------------------------- 1 | __author__ = "Vladimir Kutuev" 2 | __copyright__ = "Copyright (c) 2025 PySATL project" 3 | __license__ = "SPDX-License-Identifier: MIT" 4 | 5 | from hypothesis import given, strategies 6 | 7 | from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider 8 | 9 | 10 | class TestDataProviders: 11 | @given(strategies.lists(strategies.floats(allow_nan=False), min_size=0, max_size=100)) 12 | def test_list_univariate(self, data: list[float]): 13 | provider = ListUnivariateProvider(data) 14 | provided_data = list(provider.__iter__()) 15 | assert len(data) == len(provided_data) 16 | assert all(map(lambda t: t[0] == t[1], zip(data, provided_data))) 17 | -------------------------------------------------------------------------------- /tests/test_core/test_scrubber/test_linear_scrubber.py: -------------------------------------------------------------------------------- 1 | import hypothesis.strategies as st 2 | import numpy as np 3 | from hypothesis import given, settings 4 | 5 | from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider 6 | from pysatl_cpd.core.scrubber.linear import LinearScrubber 7 | 8 | 9 | class TestLinearScrubber: 10 | @settings(max_examples=1000) 11 | @given(st.integers(0, 100), st.integers(1, 100), st.floats(0.01, 1)) 12 | def test_get_windows(self, data_length, window_length, shift_factor): 13 | data = [float(i) for i in range(data_length)] 14 | scrubber = LinearScrubber(ListUnivariateProvider(data), window_length, shift_factor) 15 | cur_index = 0 16 | for window in iter(scrubber): 17 | assert len(window.values) == len(window.indices) 18 | assert np.array_equal(window.values, np.fromiter(data[cur_index : cur_index + window_length], np.float64)) 19 | cur_index += max(1, int(window_length * shift_factor)) 20 | 21 | @settings(max_examples=1000) 22 | @given(st.integers(0, 100), st.integers(1, 100), st.floats(0.01, 1), st.integers(0, 100)) 23 | def test_restart(self, data_length, window_length, shift_factor, window_start): 24 | data = [i for i in range(data_length)] 25 | scrubber = LinearScrubber(ListUnivariateProvider(data), window_length, shift_factor) 26 | fst = list(scrubber) 27 | snd = list(scrubber) 28 | assert len(fst) == len(snd) 29 | assert all( 30 | map(lambda w: w[0].indices == w[1].indices and np.array_equal(w[0].values, w[1].values), zip(fst, snd)) 31 | ) 32 | -------------------------------------------------------------------------------- /tests/test_generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PySATL/pysatl-cpd/9f496f4cdf1401d3d405e28a86e82ab848bb6b52/tests/test_generator/__init__.py -------------------------------------------------------------------------------- /tests/test_generator/test_distributions.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import pysatl_cpd.generator.distributions as dstr 4 | 5 | 6 | class TestDistributions: 7 | @pytest.mark.parametrize( 8 | "distribution, params, error", 9 | [ 10 | (dstr.Distributions.NORMAL, {"mean": "0"}, ValueError), 11 | (dstr.Distributions.NORMAL, {"mean": "0", "var": "1"}, KeyError), 12 | (dstr.Distributions.NORMAL, {"mean": "0", "variance": "1", "x": "5"}, ValueError), 13 | (dstr.Distributions.NORMAL, {"mean": "0", "variance": "-1"}, ValueError), 14 | (dstr.Distributions.EXPONENTIAL, {}, ValueError), 15 | (dstr.Distributions.EXPONENTIAL, {"rt": "1"}, KeyError), 16 | (dstr.Distributions.EXPONENTIAL, {"rate": "1", "x": "5"}, ValueError), 17 | (dstr.Distributions.EXPONENTIAL, {"rate": "-1"}, ValueError), 18 | (dstr.Distributions.WEIBULL, {"shape": "0"}, ValueError), 19 | (dstr.Distributions.WEIBULL, {"shape": "0", "var": "1"}, KeyError), 20 | (dstr.Distributions.WEIBULL, {"shape": "1", "scale": "1", "x": "5"}, ValueError), 21 | (dstr.Distributions.WEIBULL, {"shape": "-1", "scale": "1"}, ValueError), 22 | (dstr.Distributions.WEIBULL, {"shape": "1", "scale": "-1"}, ValueError), 23 | (dstr.Distributions.UNIFORM, {"min": "0"}, ValueError), 24 | (dstr.Distributions.UNIFORM, {"min": "-1", "MAX": "1"}, KeyError), 25 | (dstr.Distributions.UNIFORM, {"min": "-1", "max": "1", "x": "5"}, ValueError), 26 | (dstr.Distributions.UNIFORM, {"min": "1", "max": "-1"}, ValueError), 27 | (dstr.Distributions.BETA, {"alpha": "1"}, ValueError), 28 | (dstr.Distributions.BETA, {"alpha": "1", "x": "1"}, KeyError), 29 | (dstr.Distributions.BETA, {"alpha": "1", "beta": "1", "x": "5"}, ValueError), 30 | (dstr.Distributions.BETA, {"alpha": "-1", "beta": "1"}, ValueError), 31 | (dstr.Distributions.BETA, {"alpha": "1", "beta": "-1"}, ValueError), 32 | (dstr.Distributions.GAMMA, {"alpha": "1"}, ValueError), 33 | (dstr.Distributions.GAMMA, {"alpha": "1", "x": "1"}, KeyError), 34 | (dstr.Distributions.GAMMA, {"alpha": "1", "beta": "1", "x": "5"}, ValueError), 35 | (dstr.Distributions.GAMMA, {"alpha": "-1", "beta": "1"}, ValueError), 36 | (dstr.Distributions.GAMMA, {"alpha": "1", "beta": "-1"}, ValueError), 37 | (dstr.Distributions.T, {}, ValueError), 38 | (dstr.Distributions.T, {"N": "1"}, KeyError), 39 | (dstr.Distributions.T, {"n": "1", "x": "5"}, ValueError), 40 | (dstr.Distributions.T, {"n": "-1"}, ValueError), 41 | (dstr.Distributions.LOGNORM, {}, ValueError), 42 | (dstr.Distributions.LOGNORM, {"S": "1"}, KeyError), 43 | (dstr.Distributions.LOGNORM, {"s": "1", "x": "5"}, ValueError), 44 | (dstr.Distributions.LOGNORM, {"s": "-1"}, ValueError), 45 | (dstr.Distributions.MULTIVARIATIVE_NORMAL, {}, ValueError), 46 | (dstr.Distributions.MULTIVARIATIVE_NORMAL, {"Mean": "[0.0, 0.0]"}, KeyError), 47 | (dstr.Distributions.MULTIVARIATIVE_NORMAL, {"mean": "[0.0, 0.0]", "x": "5"}, ValueError), 48 | (dstr.Distributions.MULTIVARIATIVE_NORMAL, {"mean": "[]"}, ValueError), 49 | ], 50 | ) 51 | def test_distribution_params_validation_fail(self, distribution, params, error): 52 | sample_len = 100 53 | with pytest.raises(error): 54 | d = dstr.Distribution.from_str(str(distribution), params) 55 | assert len(d.scipy_sample(sample_len)) == sample_len 56 | 57 | @pytest.mark.parametrize( 58 | "distribution, params", 59 | [ 60 | (dstr.Distributions.NORMAL, {"mean": "0", "variance": "1"}), 61 | (dstr.Distributions.EXPONENTIAL, {"rate": "1"}), 62 | (dstr.Distributions.WEIBULL, {"shape": "1", "scale": "1"}), 63 | (dstr.Distributions.UNIFORM, {"min": "0", "max": "1"}), 64 | (dstr.Distributions.BETA, {"alpha": "1", "beta": "1"}), 65 | (dstr.Distributions.GAMMA, {"alpha": "1", "beta": "1"}), 66 | (dstr.Distributions.T, {"n": "1"}), 67 | (dstr.Distributions.LOGNORM, {"s": "1"}), 68 | (dstr.Distributions.MULTIVARIATIVE_NORMAL, {"mean": "[0.0, 0.1]"}), 69 | ], 70 | ) 71 | def test_distribution_generate(self, distribution, params): 72 | sample_len = 100 73 | d = dstr.Distribution.from_str(str(distribution), params) 74 | assert len(d.scipy_sample(sample_len)) == sample_len 75 | -------------------------------------------------------------------------------- /tests/test_generator/test_generator.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from os import walk 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from pysatl_cpd.generator.generator import ScipyDatasetGenerator 8 | from pysatl_cpd.generator.saver import DatasetSaver 9 | 10 | 11 | class TestGenerator: 12 | config_path = "tests/test_configs/test_config_1.yml" 13 | 14 | @pytest.mark.parametrize( 15 | "config_path_str,generator,configurations", 16 | ( 17 | ( 18 | config_path, 19 | ScipyDatasetGenerator(), 20 | { 21 | "20-normal-0-1-20-normal-10-1": [40, [20]], 22 | "20-multivariate_normal-0-0-20-multivariate_normal-10-10": [40, [20]], 23 | "20-normal-0-1-no-change-point": [20, []], 24 | "20-exponential-1-no-change-point": [20, []], 25 | "20-weibull-1-1-no-change-point": [20, []], 26 | "20-uniform-0-1-no-change-point": [20, []], 27 | "20-beta-1-1-no-change-point": [20, []], 28 | "20-gamma-1-1-no-change-point": [20, []], 29 | "20-t-2-no-change-point": [20, []], 30 | "20-lognorm-1-no-change-point": [20, []], 31 | "20-multivariate_normal-0-1-no-change-point": [20, []], 32 | "100-normal-0-1-no-change-point": [100, []], 33 | }, 34 | ), 35 | ), 36 | ) 37 | def test_generate_datasets(self, config_path_str, generator, configurations) -> None: 38 | generated = generator.generate_datasets(Path(config_path_str)) 39 | for name in configurations: 40 | data_length = len(generated[name][0]) 41 | assert data_length == configurations[name][0] 42 | assert generated[name][1] == configurations[name][1] 43 | 44 | @pytest.mark.parametrize( 45 | "config_path_str,generator,configurations", 46 | ( 47 | ( 48 | config_path, 49 | ScipyDatasetGenerator(), 50 | { 51 | "20-normal-0-1-20-normal-10-1": [40, [20]], 52 | "20-normal-0-1-no-change-point": [20, []], 53 | "100-normal-0-1-no-change-point": [100, []], 54 | }, 55 | ), 56 | ), 57 | ) 58 | def test_generate_datasets_save(self, config_path_str, generator, configurations) -> None: 59 | with tempfile.TemporaryDirectory() as tempdir: 60 | saver = DatasetSaver(Path(tempdir), True) 61 | generated = generator.generate_datasets(Path(config_path_str), saver) 62 | for name in configurations: 63 | data_length = sum(1 for _ in generated[name][0]) 64 | assert data_length == configurations[name][0] 65 | assert generated[name][1] == configurations[name][1] 66 | 67 | directory = [file_names for (_, _, file_names) in walk(tempdir)] 68 | for file_names in directory[1:]: 69 | assert sorted(file_names) == sorted(["changepoints.csv", "sample.adoc", "sample.png", "sample.csv"]) 70 | -------------------------------------------------------------------------------- /tests/test_labeled_data.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from os import walk 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | from pysatl_cpd.labeled_data import LabeledCpdData 9 | 10 | 11 | class TestLabeledCPData: 12 | config_path = "tests/test_configs/test_config_1.yml" 13 | data = LabeledCpdData([1, 2, 3], [4, 5, 6]) 14 | 15 | def test_init(self) -> None: 16 | assert self.data.raw_data == [1, 2, 3] 17 | assert self.data.change_points == [4, 5, 6] 18 | 19 | def test_iter(self) -> None: 20 | assert list(self.data.__iter__()) == [1, 2, 3] 21 | 22 | @pytest.mark.parametrize( 23 | "config_path_str,expected_change_points_list,expected_lengths", 24 | ( 25 | ( 26 | config_path, 27 | { 28 | "20-normal-0-1-20-normal-10-1": [20], 29 | "20-normal-0-1-no-change-point": [], 30 | "100-normal-0-1-no-change-point": [], 31 | }, 32 | { 33 | "20-normal-0-1-20-normal-10-1": 40, 34 | "20-normal-0-1-no-change-point": 20, 35 | "100-normal-0-1-no-change-point": 100, 36 | }, 37 | ), 38 | ), 39 | ) 40 | def test_generate_datasets(self, config_path_str, expected_change_points_list, expected_lengths) -> None: 41 | generated = LabeledCpdData.generate_cp_datasets(Path(config_path_str)) 42 | for name in expected_lengths: 43 | data_length = len(generated[name].raw_data) 44 | assert data_length == expected_lengths[name] 45 | assert generated[name].change_points == expected_change_points_list[name] 46 | 47 | @pytest.mark.parametrize( 48 | "config_path_str,expected_change_points_list,expected_lengths", 49 | ( 50 | ( 51 | config_path, 52 | { 53 | "20-normal-0-1-20-normal-10-1": [20], 54 | "20-normal-0-1-no-change-point": [], 55 | "100-normal-0-1-no-change-point": [], 56 | }, 57 | { 58 | "20-normal-0-1-20-normal-10-1": 40, 59 | "20-normal-0-1-no-change-point": 20, 60 | "100-normal-0-1-no-change-point": 100, 61 | }, 62 | ), 63 | ), 64 | ) 65 | def test_generate_datasets_save(self, config_path_str, expected_change_points_list, expected_lengths) -> None: 66 | with tempfile.TemporaryDirectory() as tempdir: 67 | generated = LabeledCpdData.generate_cp_datasets( 68 | Path(config_path_str), to_save=True, output_directory=Path(tempdir) 69 | ) 70 | for name in expected_lengths: 71 | data_length = len(generated[name].raw_data) 72 | assert data_length == expected_lengths[name] 73 | assert generated[name].change_points == expected_change_points_list[name] 74 | 75 | directory = [file_names for (_, _, file_names) in walk(tempdir)] 76 | for file_names in directory[1:]: 77 | assert sorted(file_names) == sorted(["changepoints.csv", "sample.adoc", "sample.png", "sample.csv"]) 78 | 79 | @pytest.mark.parametrize( 80 | "config_path_str", 81 | (config_path,), 82 | ) 83 | def test_read_generated_datasets(self, config_path_str): 84 | with tempfile.TemporaryDirectory() as tempdir: 85 | generated = LabeledCpdData.generate_cp_datasets( 86 | Path(config_path_str), to_save=True, output_directory=Path(tempdir) 87 | ) 88 | read = LabeledCpdData.read_generated_datasets(Path(tempdir)) 89 | for name in generated: 90 | assert read[name].raw_data.shape == generated[name].raw_data.shape 91 | assert np.array_equal(read[name].raw_data, generated[name].raw_data) 92 | assert read[name].change_points == generated[name].change_points 93 | -------------------------------------------------------------------------------- /tests/test_online_solver.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | 4 | from pysatl_cpd.core.algorithms.bayesian.detectors.threshold import ThresholdDetector 5 | from pysatl_cpd.core.algorithms.bayesian.hazards.constant import ConstantHazard 6 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.exponential_conjugate import ExponentialConjugate 7 | from pysatl_cpd.core.algorithms.bayesian.likelihoods.gaussian_conjugate import GaussianConjugate 8 | from pysatl_cpd.core.algorithms.bayesian.localizers.argmax import ArgmaxLocalizer 9 | from pysatl_cpd.core.algorithms.bayesian_online_algorithm import BayesianOnline 10 | from pysatl_cpd.core.problem import CpdProblem 11 | from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider 12 | from pysatl_cpd.icpd_solver import CpdLocalizationResults 13 | from pysatl_cpd.labeled_data import LabeledCpdData 14 | from pysatl_cpd.online_cpd_solver import OnlineCpdSolver 15 | 16 | DATA_PARAMS = { 17 | "num_tests": 10, 18 | "size": 500, 19 | "change_point": 250, 20 | "tolerable_deviation": 25, 21 | } 22 | 23 | 24 | @pytest.fixture(scope="session") 25 | def data_params(): 26 | return DATA_PARAMS 27 | 28 | 29 | @pytest.fixture 30 | def data_generator(data_params): 31 | def _generate(has_cp, test_iteration): 32 | seed = 42 + test_iteration 33 | np.random.seed(seed) 34 | if has_cp: 35 | return np.concatenate( 36 | [ 37 | np.random.normal(0, 1, data_params["change_point"]), 38 | np.random.normal(5, 2, data_params["size"] - data_params["change_point"]), 39 | ] 40 | ) 41 | return np.random.normal(0, 1, data_params["size"]) 42 | 43 | return _generate 44 | 45 | 46 | @pytest.fixture 47 | def labeled_data_factory(data_params): 48 | def _factory(data, has_cp): 49 | return LabeledCpdData(raw_data=data, change_points=[data_params["change_point"]] if has_cp else None) 50 | 51 | return _factory 52 | 53 | 54 | @pytest.fixture 55 | def solver_factory(): 56 | def _factory(data_input, with_localization): 57 | return OnlineCpdSolver( 58 | algorithm=BayesianOnline( 59 | learning_sample_size=50, 60 | likelihood=GaussianConjugate(), 61 | hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), 62 | detector=ThresholdDetector(threshold=0.04), 63 | localizer=ArgmaxLocalizer(), 64 | ), 65 | algorithm_input=data_input, 66 | scenario=CpdProblem(with_localization), 67 | ) 68 | 69 | return _factory 70 | 71 | 72 | def pytest_generate_tests(metafunc): 73 | if "test_iteration" in metafunc.fixturenames: 74 | metafunc.parametrize("test_iteration", range(DATA_PARAMS["num_tests"])) 75 | 76 | 77 | class TestOnlineCpdSolver: 78 | @pytest.mark.parametrize( 79 | "has_cp,with_localization,is_labeled", 80 | [ 81 | (False, True, True), 82 | (True, True, True), 83 | (False, True, False), 84 | (True, True, False), 85 | (False, False, True), 86 | (True, False, True), 87 | (False, False, False), 88 | (True, False, False), 89 | ], 90 | ) 91 | def test_all_scenarios( 92 | self, 93 | data_generator, 94 | labeled_data_factory, 95 | solver_factory, 96 | has_cp, 97 | with_localization, 98 | is_labeled, 99 | test_iteration, 100 | data_params, 101 | ): 102 | raw_data = data_generator(has_cp, test_iteration) 103 | 104 | data_input = labeled_data_factory(raw_data, has_cp) if is_labeled else ListUnivariateProvider(raw_data.tolist()) 105 | 106 | solver = solver_factory(data_input, with_localization) 107 | result = solver.run() 108 | 109 | if with_localization: 110 | assert isinstance(result, CpdLocalizationResults), "Localization result must be CpdLocalizationResults" 111 | if has_cp: 112 | assert len(result.result) == 1, "There must be only one change point" 113 | assert abs(result.result[0] - data_params["change_point"]) <= data_params["tolerable_deviation"], ( 114 | "Change point must lie in tolerable interval" 115 | ) 116 | if is_labeled: 117 | assert result.expected_result == [data_params["change_point"]], ( 118 | "Labeled change point must be equal to generated one" 119 | ) 120 | else: 121 | assert result.expected_result is None, "Expected result must be None for not labeled data" 122 | else: 123 | assert result.result == [], "There must be no change points" 124 | else: 125 | assert isinstance(result, int), "Detection result must be a number of detected change points" 126 | assert result == (1 if has_cp else 0), ( 127 | "Number of change points must be equal to expected in the generated data" 128 | ) 129 | 130 | def test_exponential_with_negatives(self, data_params): 131 | np.random.seed(42) 132 | data = np.concatenate( 133 | [ 134 | np.random.exponential(1 / 2, data_params["change_point"]), 135 | np.random.normal(0, 1, data_params["size"] - data_params["change_point"]), 136 | ] 137 | ) 138 | 139 | algorithm = BayesianOnline( 140 | learning_sample_size=20, 141 | likelihood=ExponentialConjugate(), 142 | hazard=ConstantHazard(rate=1.0 / (1.0 - 0.5 ** (1.0 / 500))), 143 | detector=ThresholdDetector(threshold=0.04), 144 | localizer=ArgmaxLocalizer(), 145 | ) 146 | 147 | data_provider = ListUnivariateProvider(list(data)) 148 | 149 | cpd = OnlineCpdSolver( 150 | scenario=CpdProblem(True), 151 | algorithm=algorithm, 152 | algorithm_input=data_provider, 153 | ) 154 | 155 | cpd.run() 156 | -------------------------------------------------------------------------------- /tests/test_solver.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from os import walk 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | from pysatl_cpd.analysis.results_analyzer import CpdResultsAnalyzer 9 | from pysatl_cpd.core.algorithms.graph_algorithm import GraphAlgorithm 10 | from pysatl_cpd.core.problem import CpdProblem 11 | from pysatl_cpd.core.scrubber.data_providers import ListUnivariateProvider 12 | from pysatl_cpd.core.scrubber.linear import LinearScrubber 13 | from pysatl_cpd.cpd_solver import CpdLocalizationResults, CpdSolver, LabeledCpdData 14 | 15 | 16 | def custom_comparison(node1, node2): # TODO: Remove it everywhere 17 | arg = 1 18 | return abs(node1 - node2) <= arg 19 | 20 | 21 | class TestCpdSolver: 22 | def test_cpd_localization_no_changepoint(self) -> None: 23 | data = [1, 2, 3, 4] 24 | problem = CpdProblem(True) 25 | algorithm = GraphAlgorithm(custom_comparison, 4) 26 | scrubber = LinearScrubber(ListUnivariateProvider(data)) 27 | solver = CpdSolver(problem, algorithm, scrubber) 28 | cpd_result = solver.run() 29 | assert isinstance(cpd_result, CpdLocalizationResults) 30 | assert cpd_result.result == [] 31 | assert cpd_result.expected_result is None 32 | 33 | def test_cpd_localization_labeled_data(self) -> None: 34 | data = LabeledCpdData(np.array([1, 2, 3, 4], dtype=np.float64), [4, 5, 6, 7]) 35 | problem = CpdProblem(True) 36 | algorithm = GraphAlgorithm(custom_comparison, 4) 37 | solver = CpdSolver(problem, algorithm, (data, LinearScrubber)) 38 | cpd_result = solver.run() 39 | assert isinstance(cpd_result, CpdLocalizationResults) 40 | assert cpd_result.result == [] 41 | assert cpd_result.expected_result == [4, 5, 6, 7] 42 | assert cpd_result.result_diff == [4, 5, 6, 7] 43 | 44 | 45 | class TestCPDResultsAnalyzer: 46 | @pytest.mark.parametrize( 47 | "result1, result2, window, expected", 48 | [ 49 | ([4, 5, 6, 7], [3, 5, 6], None, (2, 1, 1, 1)), 50 | ([4, 5, 6, 7], [3, 5, 6], (5, 6), (1, 0, 0, 0)), 51 | ([4, 5, 6, 7], [3, 5, 6], (0, 100), (2, 97, 2, 1)), 52 | ([4, 5, 6, 7], [3, 5, 6], (6, 6), (0, 0, 0, 0)), 53 | ([3, 5, 6, 7], [4, 5, 6], None, (2, 1, 1, 1)), 54 | ([], [4, 5, 6], None, (0, 0, 0, 2)), 55 | ([3, 5, 6, 7], [], None, (0, 4, 3, 0)), 56 | ], 57 | ) 58 | def test_count_confusion_matrix(self, result1, result2, window, expected): 59 | assert CpdResultsAnalyzer.count_confusion_matrix(result1, result2, window) == expected 60 | 61 | def test_count_confusion_matrix_exception_case(self): 62 | with pytest.raises(ValueError): 63 | CpdResultsAnalyzer.count_confusion_matrix([], []) 64 | 65 | @pytest.mark.parametrize( 66 | "result1, result2, window, expected", 67 | [ 68 | ([4, 5, 6, 7], [3, 5, 6], None, 0.6), 69 | ([4, 5, 6, 7], [3, 5, 6], (5, 6), 1.0), 70 | ([4, 5, 6, 7], [3, 5, 6], (6, 6), 0.0), 71 | ], 72 | ) 73 | def test_count_accuracy(self, result1, result2, window, expected): 74 | assert CpdResultsAnalyzer.count_accuracy(result1, result2, window) == expected 75 | 76 | @pytest.mark.parametrize( 77 | "result1, result2, window, expected", 78 | [ 79 | ([4, 5, 6, 7], [3, 5, 6], None, 2 / 3), 80 | ([4, 5, 6, 7], [3, 5, 6], (5, 6), 1.0), 81 | ([4, 5, 6, 7], [3, 5, 6], (6, 6), 0.0), 82 | ], 83 | ) 84 | def test_count_precision(self, result1, result2, window, expected): 85 | assert CpdResultsAnalyzer.count_precision(result1, result2, window) == expected 86 | 87 | @pytest.mark.parametrize( 88 | "result1, result2, window, expected", 89 | [ 90 | ([4, 5, 6, 7], [3, 5, 6], None, 2 / 3), 91 | ([4, 5, 6, 7], [3, 5, 6], (5, 6), 1.0), 92 | ([4, 5, 6, 7], [3, 5, 6], (6, 6), 0.0), 93 | ], 94 | ) 95 | def test_count_recall(self, result1, result2, window, expected): 96 | assert CpdResultsAnalyzer.count_recall(result1, result2, window) == expected 97 | 98 | 99 | class TestCpdLocalizationResults: 100 | data = [np.float64(1)] * 15 101 | cont_default1 = CpdLocalizationResults(iter(data), [1, 2, 3], [2, 3, 4], 10) 102 | cont_default2 = CpdLocalizationResults(iter(data), [1, 2, 3, 6, 8], [2, 3, 4, 6], 20) 103 | cont_no_expected = CpdLocalizationResults(iter(data), [1, 2, 3], None, 5) 104 | 105 | def test_result_diff(self) -> None: 106 | assert self.cont_default1.result_diff == [1, 4] 107 | assert self.cont_default2.result_diff == [1, 4, 8] 108 | 109 | def test_result_diff_exception_case(self) -> None: 110 | with pytest.raises(ValueError): 111 | print(self.cont_no_expected.result_diff) 112 | 113 | def test_str_cp_container(self) -> None: 114 | assert ( 115 | str(self.cont_default1) 116 | == """Located change points: (1;2;3) 117 | Expected change point: (2;3;4) 118 | Difference: (1;4) 119 | Computation time (sec): 10""" 120 | ) 121 | 122 | assert ( 123 | str(self.cont_default2) 124 | == """Located change points: (1;2;3;6;8) 125 | Expected change point: (2;3;4;6) 126 | Difference: (1;4;8) 127 | Computation time (sec): 20""" 128 | ) 129 | 130 | assert ( 131 | str(self.cont_no_expected) 132 | == """Located change points: (1;2;3) 133 | Computation time (sec): 5""" 134 | ) 135 | 136 | @pytest.mark.parametrize( 137 | "data,name", 138 | ( 139 | (cont_default1, "d_1"), 140 | (cont_default2, "d_2"), 141 | (cont_no_expected, "cne"), 142 | ), 143 | ) 144 | def test_visualize(self, data, name) -> None: 145 | with tempfile.TemporaryDirectory() as tempdir: 146 | data.visualize(False, Path(tempdir), name) 147 | assert [f"{name}.png"] in [file_names for (_, _, file_names) in walk(tempdir)] 148 | 149 | def test_metric_exception_case(self): 150 | with pytest.raises(ValueError): 151 | self.cont_no_expected.count_confusion_matrix() 152 | --------------------------------------------------------------------------------