├── .github └── workflows │ └── publish-documentation.yml ├── .gitignore ├── .gitlab-ci.yml ├── LICENSE.txt ├── README.md ├── distancematrix ├── __init__.py ├── calculator.py ├── consumer │ ├── __init__.py │ ├── abstract_consumer.py │ ├── contextmanager.py │ ├── contextual_matrix_profile.py │ ├── distance_matrix.py │ ├── matrix_profile_lr.py │ ├── multidimensional_matrix_profile_lr.py │ ├── radius_profile.py │ └── threshold_counter.py ├── generator │ ├── __init__.py │ ├── abstract_generator.py │ ├── euclidean.py │ ├── filter_generator.py │ └── znorm_euclidean.py ├── insights.py ├── interrupt_util.py ├── math_tricks.py ├── ostinato.py ├── ringbuffer.py ├── tests │ ├── __init__.py │ ├── consumer │ │ ├── __init__.py │ │ ├── test_contextmanager.py │ │ ├── test_contextual_matrix_profile.py │ │ ├── test_distance_matrix.py │ │ ├── test_matrix_profile_lr.py │ │ ├── test_multidimensional_matrix_profile_lr.py │ │ ├── test_radius_profile.py │ │ └── test_threshold_counter.py │ ├── generator │ │ ├── __init__.py │ │ ├── mock_generator.py │ │ ├── test_euclidean.py │ │ ├── test_filter_generator.py │ │ └── test_znorm_euclidean.py │ ├── test_calculator.py │ ├── test_insights.py │ ├── test_math_tricks.py │ ├── test_ostinato.py │ ├── test_ringbuffer.py │ ├── test_util.py │ └── test_valmod.py ├── util.py └── valmod.py ├── docs ├── .gitignore ├── Example_matrix_profile.ipynb ├── Makefile ├── conf.py ├── doc_environment.yml ├── examples.rst ├── index.rst ├── install.md └── make.bat ├── setup.py └── test_environment.yml /.github/workflows/publish-documentation.yml: -------------------------------------------------------------------------------- 1 | name: Publish documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | # Inspired by https://github.com/conda-incubator/setup-miniconda#usage-examples 10 | run-tests: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up test environment 15 | uses: conda-incubator/setup-miniconda@v2 
16 | with: 17 | environment-file: test_environment.yml 18 | activate-environment: test_env 19 | - name: Run tests 20 | shell: bash -l {0} 21 | run: | 22 | nosetests -v --nocapture 23 | 24 | publish-docs: 25 | needs: run-tests 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v2 29 | - name: Set up doc building environment 30 | uses: conda-incubator/setup-miniconda@v2 31 | with: 32 | environment-file: docs/doc_environment.yml 33 | activate-environment: doc_env 34 | - name: Build documentation 35 | shell: bash -l {0} 36 | working-directory: ./docs 37 | run: | 38 | make html 39 | - name: Push to gh-pages branch 40 | uses: JamesIves/github-pages-deploy-action@4.1.4 41 | with: 42 | branch: gh-pages # The branch the action should deploy to. 43 | folder: docs/_build/html # The folder the action should deploy. 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDE 2 | .idea/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | bin/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # Installer logs 27 | pip-log.txt 28 | pip-delete-this-directory.txt 29 | 30 | # Unit test / coverage reports 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Jupyter notebook checkpoints 38 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | # Based on: https://henningtimm.gitlab.io/post/gitlab_ci_and_conda/ 2 | 3 | image: continuumio/miniconda3:latest 4 | 5 | unittests: 6 | script: 7 | - apt-get update -q -y 8 | - apt-get install -y 
build-essential 9 | - conda env create -f test_environment.yml 10 | - source activate test_env 11 | - nosetests -v --nocapture -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Dieter De Paepe, Sofie Van Hoecke 4 | Ghent University - imec, Belgium 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Series Distance Matrix 2 | 3 | This library implements the [Series Distance Matrix framework](https://doi.org/10.1016/j.engappai.2020.103487), 4 | a flexible component-based framework that bundles various [Matrix Profile](https://www.cs.ucr.edu/~eamonn/MatrixProfile.html) 5 | related techniques. 
6 | These techniques can be used for (time) series mining and analysis. 7 | Some example applications include: 8 | - motif discovery: finding the best (imperfect) matching subsequence pair in a larger series 9 | - discord discovery: finding the most dissimilar subsequence in a larger series 10 | - finding repeating subsequences in one or more series (common and consensus motifs) 11 | - visualizing series 12 | - finding changing patterns 13 | - ... 14 | 15 | The **Series Distance Matrix** framework was designed to integrate the various 16 | Matrix Profile variants that were established over the years. 17 | It does this by splitting the generation and consumption of 18 | the all-pair subsequence distances, 19 | putting the focus on the distance matrix itself. 20 | This allows for easier and more flexible experiments by 21 | freely combining components and eliminates the need 22 | to re-implement algorithms to combine techniques in an efficient way. 23 | 24 | 25 | Following core techniques are implemented: 26 | - Z-normalized Euclidean distance (including noise elimination) 27 | - Euclidean distance 28 | - (Left/Right) Matrix Profile 29 | - Multidimensional Matrix Profile 30 | - Contextual Matrix Profile 31 | - Radius Profile 32 | - Streaming and batch calculation 33 | 34 | 35 | Following Matrix Profile related techniques are implemented: 36 | - Valmod: find the top-1 motif in a series for each subsequence length in a given range 37 | - Ostinato: find the top-1 (k of n) consensus motif in a collection of series 38 | - Anytime Ostinato: find the radius profile for a collection of series 39 | 40 | 41 | ## Basic Usage 42 | 43 | Calculate a standard Matrix Profile using z-normalized Euclidean distance over a single series. 
44 | 45 | ```python 46 | import numpy as np 47 | from distancematrix.generator.znorm_euclidean import ZNormEuclidean 48 | from distancematrix.consumer.matrix_profile_lr import MatrixProfileLR 49 | from distancematrix.calculator import AnytimeCalculator 50 | 51 | data = np.random.randn(10000) 52 | m = 100 # Subsequence length 53 | 54 | calc = AnytimeCalculator(m, data) 55 | gen_0 = calc.add_generator(0, ZNormEuclidean()) 56 | cons_mp = calc.add_consumer([0], MatrixProfileLR()) 57 | calc.calculate_columns() 58 | 59 | matrix_profile = cons_mp.matrix_profile() 60 | ``` 61 | 62 | Calculate a Matrix Profile and (common-10) Radius Profile over a single series using Euclidean distance. 63 | A combined calculation is more efficient, as it can reuse the calculated distances. 64 | 65 | ```python 66 | import numpy as np 67 | from distancematrix.generator.euclidean import Euclidean 68 | from distancematrix.consumer.radius_profile import RadiusProfile 69 | from distancematrix.consumer.matrix_profile_lr import MatrixProfileLR 70 | from distancematrix.calculator import AnytimeCalculator 71 | 72 | data = np.random.randn(10000) 73 | m = 100 # Subsequence length 74 | 75 | calc = AnytimeCalculator(m, data) 76 | gen_0 = calc.add_generator(0, Euclidean()) # Generator 0 works on channel 0 77 | cons_mp = calc.add_consumer([0], MatrixProfileLR()) # Consumer consumes generator 0 78 | cons_rp = calc.add_consumer([0], RadiusProfile(10, m//2)) # Consumer consumes generator 0 79 | calc.calculate_columns() 80 | 81 | matrix_profile = cons_mp.matrix_profile() 82 | radius_profile = cons_rp.values 83 | ``` 84 | 85 | Calculate a partial multidimensional Matrix Profile over two data channels. 86 | Partial calculations return approximated results but are significantly faster, 87 | they are particularly interesting in interactive workflows, as they can be resumed. 
88 | 89 | ```python 90 | import numpy as np 91 | from distancematrix.generator.znorm_euclidean import ZNormEuclidean 92 | from distancematrix.consumer.multidimensional_matrix_profile_lr import MultidimensionalMatrixProfileLR 93 | from distancematrix.consumer.matrix_profile_lr import MatrixProfileLR 94 | from distancematrix.calculator import AnytimeCalculator 95 | 96 | data = np.random.randn(2, 10000) 97 | m = 100 # Subsequence length 98 | 99 | calc = AnytimeCalculator(m, data) 100 | gen_0 = calc.add_generator(0, ZNormEuclidean()) # Generator 0 works on channel 0 101 | gen_1 = calc.add_generator(1, ZNormEuclidean()) # Generator 1 works on channel 1 102 | cons_mmp = calc.add_consumer([0, 1], MultidimensionalMatrixProfileLR()) # Consumer consumes generator 0 & 1 103 | 104 | # Calculate only 1/4 of all distances: faster, but returns approximated results 105 | calc.calculate_diagonals(partial=0.25) 106 | multidimensional_matrix_profile = cons_mmp.md_matrix_profile() 107 | 108 | # Calculate the next quarter, so in total 1/2 of all distances are processed. 109 | calc.calculate_diagonals(partial=0.5) 110 | multidimensional_matrix_profile = cons_mmp.md_matrix_profile() 111 | ``` 112 | 113 | ## Documentation 114 | 115 | Documentation for the latest version is available [online](https://predict-idlab.github.io/seriesdistancematrix). 116 | 117 | Building the documentation locally is done using Sphinx. 
Navigate to the `docs` folder, activate the conda environment 118 | defined in the environment file, and run: 119 | 120 | ```bash 121 | make html 122 | ``` 123 | 124 | ## Installing 125 | 126 | Using pip: 127 | ```bash 128 | pip install seriesdistancematrix 129 | ``` 130 | 131 | Alternatively, clone this repositor and run: 132 | ```bash 133 | python setup.py clean build install 134 | ``` 135 | 136 | For local development (this allows you to edit code without having to reinstall the library): 137 | ```bash 138 | python setup.py develop 139 | ``` 140 | 141 | ## Academic Usage 142 | 143 | When using this library for academic purposes, please cite: 144 | ``` 145 | @article{series_distance_matrix, 146 | title = "A generalized matrix profile framework with support for contextual series analysis", 147 | journal = "Engineering Applications of Artificial Intelligence", 148 | volume = "90", 149 | pages = "103487", 150 | year = "2020", 151 | issn = "0952-1976", 152 | doi = "https://doi.org/10.1016/j.engappai.2020.103487", 153 | url = "http://www.sciencedirect.com/science/article/pii/S0952197620300087", 154 | author = "De Paepe, Dieter and Vanden Hautte, Sander and Steenwinckel, Bram and De Turck, Filip and Ongenae, Femke and Janssens, Olivier and Van Hoecke, Sofie" 155 | } 156 | ``` 157 | -------------------------------------------------------------------------------- /distancematrix/__init__.py: -------------------------------------------------------------------------------- 1 | import distancematrix.consumer 2 | import distancematrix.generator 3 | from distancematrix.calculator import AnytimeCalculator 4 | from distancematrix.calculator import StreamingCalculator 5 | 6 | __version__ = "0.3.1" # Also update ../setup.py! 
class AbstractConsumer(ABC):
    """
    Interface for components that receive (consume) distances from a distance matrix
    as they are being calculated, either per diagonal or per column.
    """

    @abstractmethod
    def initialise(self, dims, query_subseq, series_subseq):
        """
        Prepare this consumer to receive distances.

        :param dims: number of dimensions (data channels) that will be delivered
        :param query_subseq: number of query subsequences (rows of the distance matrix)
        :param series_subseq: number of series subsequences (columns of the distance matrix)
        :return: None
        """
        ...

    @abstractmethod
    def process_diagonal(self, diagonal_index, values):
        """
        Receive one calculated diagonal of the distance matrix.

        Fewer values than the full diagonal may be delivered when not all data is
        available yet (typical while streaming, before the foreseen space is filled).

        :param diagonal_index: diagonal index in range ]-num_query_subseq, num_series_subseq[,
          where 0 denotes the main diagonal
        :param values: array of shape (num_dimensions, num_values_on_diagonal) with the distances
        :return: None
        """
        ...

    @abstractmethod
    def process_column(self, column_index, values):
        """
        Receive one calculated column of the distance matrix.

        Fewer values than the full column may be delivered when not all data is
        available yet (typical while streaming, before the foreseen space is filled).

        :param column_index: column index in range [0, series_subseq[
        :param values: array of shape (num_dimensions, num_values_on_column) with the distances
        :return: None
        """
        ...


class AbstractStreamingConsumer(AbstractConsumer):
    """Consumer that additionally supports a distance matrix shifting as new data streams in."""

    @abstractmethod
    def shift_query(self, amount):
        """
        Notify this consumer that the distance matrix shifted along the query axis.

        :param amount: number of subsequences that were shifted
        :return: None
        """
        ...

    @abstractmethod
    def shift_series(self, amount):
        """
        Notify this consumer that the distance matrix shifted along the series axis.

        :param amount: number of subsequences that were shifted
        :return: None
        """
        ...
42 | 43 | :param amount: amount of values shifted 44 | :return: the amount of values that the contextual distance matrix should shift along the query axis 45 | """ 46 | raise RuntimeError("This generator does not support query shifting.") 47 | 48 | def shift_series(self, amount: int) -> int: 49 | """ 50 | Informs the manager that the distance matrix has shifted along the series axis. 51 | 52 | :param amount: amount of values shifted 53 | :return: the amount of values that the contextual distance matrix should shift along the series axis 54 | """ 55 | raise RuntimeError("This generator does not support series shifting.") 56 | 57 | 58 | class GeneralStaticManager(AbstractContextManager): 59 | """ 60 | General purpose context manager for contextual matrix profile. This manager does not support streaming data. 61 | """ 62 | 63 | def __init__(self, series_contexts, query_contexts=None): 64 | """ 65 | Creates a new context manager. 66 | 67 | :param series_contexts: an iterable of ranges, each range defines one context. You can also 68 | use lists of ranges, to specify non-consecutive contexts. 
69 | :param query_contexts: iterable of ranges, defaults to None, meaning to use the same contexts as the series 70 | """ 71 | _verify_ranges([r for i, r in _enumerate_flattened(series_contexts)]) 72 | 73 | if query_contexts is None: 74 | query_contexts = series_contexts 75 | else: 76 | _verify_ranges([r for i, r in _enumerate_flattened(query_contexts)]) 77 | 78 | self._series_contexts = np.array( 79 | [(r.start, r.stop, i) for i, r in _filter_empty(_enumerate_flattened(series_contexts))], dtype=int) 80 | self._query_contexts = np.array( 81 | [(r.start, r.stop, i) for i, r in _filter_empty(_enumerate_flattened(query_contexts))], dtype=int) 82 | 83 | self._qc_sorted_start = self._query_contexts[np.argsort(self._query_contexts[:, 0])] 84 | self._qc_sorted_stop = self._query_contexts[np.argsort(self._query_contexts[:, 1])] 85 | 86 | def context_matrix_shape(self) -> (int, int): 87 | num_query_contexts = np.max(self._query_contexts[:, 2]) + 1 88 | num_series_contexts = np.max(self._series_contexts[:, 2]) + 1 89 | 90 | return num_query_contexts, num_series_contexts 91 | 92 | def series_contexts(self, start, stop): 93 | return self._series_contexts[np.logical_and( 94 | self._series_contexts[:, 0] < stop, # Start of context is before stop 95 | self._series_contexts[:, 1] > start # End of context is after start 96 | )] 97 | 98 | def query_contexts(self, start, stop): 99 | if start <= self._qc_sorted_start[0, 0] and stop >= self._qc_sorted_stop[-1, 1]: 100 | return self._query_contexts 101 | 102 | if start == 0: 103 | # All contexts that start before stop 104 | contexts = self._qc_sorted_start[0: np.searchsorted(self._qc_sorted_start[:, 0], stop)] 105 | return filter(lambda c: c[1] > 0, contexts) 106 | elif stop >= self._qc_sorted_stop[-1, 1]: 107 | # All contexts that end after start 108 | contexts = self._qc_sorted_stop[np.searchsorted(self._qc_sorted_stop[:, 1], start, side="right"):] 109 | return filter(lambda c: c[0] < stop, contexts) 110 | else: 111 | return 
self._query_contexts[np.logical_and( 112 | self._query_contexts[:, 0] < stop, # Start of context is before stop 113 | self._query_contexts[:, 1] > start # End of context is after start 114 | )] 115 | 116 | 117 | def _verify_ranges(ranges): 118 | for r in ranges: 119 | if r.step != 1: 120 | raise RuntimeError("Only ranges with step 1 supported.") 121 | if r.start < 0: 122 | raise RuntimeError("Range start should not be negative.") 123 | 124 | 125 | def _enumerate_flattened(l): 126 | """ 127 | Converts a list of elements and lists into tuples (index, element), so that elements in nested lists 128 | have the same index. 129 | 130 | Eg: [1, [2,3], 4] => (0, 1), (1, 2), (1, 3), (2, 4) 131 | """ 132 | for i, el in enumerate(l): 133 | if isinstance(el, collections.abc.Iterable) and not isinstance(el, range): 134 | for r in el: 135 | yield i, r 136 | else: 137 | yield i, el 138 | 139 | 140 | def _filter_empty(iter): 141 | for i, r in iter: 142 | if r.start < r.stop: 143 | yield (i, r) 144 | -------------------------------------------------------------------------------- /distancematrix/consumer/contextual_matrix_profile.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from distancematrix.ringbuffer import RingBuffer 4 | from distancematrix.consumer.abstract_consumer import AbstractStreamingConsumer 5 | from distancematrix.consumer.contextmanager import AbstractContextManager 6 | 7 | 8 | class ContextualMatrixProfile(AbstractStreamingConsumer): 9 | """ 10 | A consumer that constructs the contextual matrix profile. The contextual matrix profile is formed by 11 | taking the minimum of rectangles across the full distance matrix (where the matrix profile takes the 12 | minimum across columns). 13 | 14 | This consumer supports streaming if the provided context manager does. 
15 | """ 16 | 17 | def __init__(self, context_manager: AbstractContextManager, rb_scale_factor=2.): 18 | """ 19 | Creates a new consumer that calculates a contextual matrix profile, 20 | according to the contexts defined by the manager. 21 | 22 | :param context_manager: object responsible for defining the spans of each context over the query and series axis 23 | :param rb_scale_factor: scaling factor used for RingBuffers in case of streaming data (should be >= 1), 24 | this allows choosing a balance between less memory (low values) and reduced data copying (higher values) 25 | """ 26 | if rb_scale_factor < 1.: 27 | raise ValueError("rb_scale_factor should be >= 1, it was: " + str(rb_scale_factor)) 28 | 29 | self._num_series_subseq = None 30 | self._num_query_subseq = None 31 | self._range = None 32 | 33 | self._contexts = context_manager 34 | self._query_shift = 0 35 | self._series_shift = 0 36 | 37 | self._distance_matrix = None 38 | self._match_index_series = None 39 | self._match_index_query = None 40 | 41 | self._rb_scale_factor = rb_scale_factor 42 | 43 | def initialise(self, dims, query_subseq, series_subseq): 44 | self._num_series_subseq = series_subseq 45 | self._num_query_subseq = query_subseq 46 | self._range = np.arange(0, max(series_subseq, query_subseq), dtype=int) 47 | 48 | num_query_contexts, num_series_contexts = self._contexts.context_matrix_shape() 49 | 50 | self._distance_matrix = RingBuffer(np.full((num_query_contexts, num_series_contexts), np.Inf, dtype=float), 51 | scaling_factor=self._rb_scale_factor) 52 | self._match_index_series = RingBuffer(np.full((num_query_contexts, num_series_contexts), -1, dtype=int), 53 | scaling_factor=self._rb_scale_factor) 54 | self._match_index_query = RingBuffer(np.full((num_query_contexts, num_series_contexts), -1, dtype=int), 55 | scaling_factor=self._rb_scale_factor) 56 | 57 | def process_diagonal(self, diag, values): 58 | values = values[0] 59 | num_values = len(values) 60 | 61 | if diag >= 0: 62 | 
values_idx1_start = diag 63 | context0_idxs = self._contexts.query_contexts(0, num_values) 64 | else: 65 | values_idx1_start = 0 66 | context0_idxs = self._contexts.query_contexts(-diag, self._num_query_subseq) 67 | 68 | for c0_start, c0_end, c0_identifier in context0_idxs: 69 | # We now have a sub-sequence (ss) defined by the first context on the query axis 70 | # In absolute coordinates, start/end of this subsequence on 2nd axis (series axis) 71 | ss1_start = min(max(0, c0_start + diag), self._num_series_subseq) 72 | ss1_end = min(self._num_series_subseq, min(self._num_query_subseq, c0_end) + diag) 73 | 74 | if ss1_start == ss1_end: 75 | continue 76 | 77 | context1_idxs = self._contexts.series_contexts(ss1_start, ss1_end) 78 | 79 | for c1_start, c1_end, c1_identifier in context1_idxs: 80 | # In absolute coordinates, start/end of the subsequence on 2nd axis defined by both contexts 81 | sss1_start = max(ss1_start, c1_start) 82 | sss1_end = min(ss1_end, c1_end) 83 | 84 | # Values that belong to both contexts 85 | sss_values = values[sss1_start - values_idx1_start: sss1_end - values_idx1_start] 86 | 87 | # Compare if better than current 88 | min_sss_value = np.min(sss_values) 89 | is_better = min_sss_value < self._distance_matrix[c0_identifier, c1_identifier] 90 | 91 | if is_better: 92 | self._distance_matrix[c0_identifier, c1_identifier] = min_sss_value 93 | rel_indices = np.argmin(sss_values) 94 | sss0_start = sss1_start - diag 95 | self._match_index_query[c0_identifier, c1_identifier] = rel_indices + sss0_start + self._query_shift 96 | self._match_index_series[c0_identifier, c1_identifier] = rel_indices + sss1_start + self._series_shift 97 | 98 | def process_column(self, column_index, values): 99 | values = values[0] 100 | context1_idxs = self._contexts.series_contexts(column_index, column_index + 1) 101 | 102 | for _, _, c1_identifier in context1_idxs: 103 | query_contexts = self._contexts.query_contexts(0, self._num_query_subseq) 104 | 105 | for c0_start, 
c0_end, c0_identifier in query_contexts: 106 | subseq = values[c0_start: c0_end] 107 | best_value = np.min(subseq) 108 | 109 | if best_value < self._distance_matrix[c0_identifier, c1_identifier]: 110 | self._distance_matrix[c0_identifier, c1_identifier] = best_value 111 | self._match_index_query[c0_identifier, c1_identifier] = np.argmin(subseq) + c0_start + self._query_shift 112 | self._match_index_series[c0_identifier, c1_identifier] = column_index + self._series_shift 113 | 114 | def shift_series(self, amount): 115 | context_shift = self._contexts.shift_series(amount) 116 | self._series_shift += amount 117 | 118 | if context_shift > 0: 119 | height = self._distance_matrix.max_shape[0] 120 | self._distance_matrix.push(np.full((height, context_shift), np.Inf, dtype=float)) 121 | self._match_index_series.push(np.full((height, context_shift), -1, dtype=int)) 122 | self._match_index_query.push(np.full((height, context_shift), -1, dtype=int)) 123 | 124 | def shift_query(self, amount): 125 | context_shift = self._contexts.shift_query(amount) 126 | self._query_shift += amount 127 | 128 | if context_shift > 0: 129 | # Note: This could be more efficient using a 2D Ringbuffer. 
130 | height = min(context_shift, self._distance_matrix.max_shape[0]) 131 | self._distance_matrix.view = np.roll(self._distance_matrix.view, context_shift, axis=0) 132 | self._distance_matrix[-height:, :] = np.Inf 133 | self._match_index_series.view = np.roll(self._match_index_series.view, context_shift, axis=0) 134 | self._match_index_series[-height:, :] = -1 135 | self._match_index_query.view = np.roll(self._match_index_query.view, context_shift, axis=0) 136 | self._match_index_query[-height:, :] = -1 137 | 138 | @property 139 | def match_index_query(self): 140 | return self._match_index_query.view 141 | 142 | @property 143 | def match_index_series(self): 144 | return self._match_index_series.view 145 | 146 | @property 147 | def distance_matrix(self): 148 | return self._distance_matrix.view 149 | -------------------------------------------------------------------------------- /distancematrix/consumer/distance_matrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from ..util import diag_indices_of 3 | from .abstract_consumer import AbstractStreamingConsumer 4 | 5 | 6 | class DistanceMatrix(AbstractStreamingConsumer): 7 | def __init__(self): 8 | """ 9 | Creates a new consumer that will store the complete distance matrix. 10 | This consumer supports streaming. 11 | 12 | Note that the distance matrix requires quadratic memory, so it is unsuited for long time series. 
13 | """ 14 | 15 | self.distance_matrix = None 16 | 17 | def initialise(self, dims, query_subseq, series_subseq): 18 | if dims != 1: 19 | raise RuntimeError("Input should be 1D") 20 | 21 | self.distance_matrix = np.full((query_subseq, series_subseq), np.nan, dtype=float) 22 | 23 | def process_diagonal(self, diagonal_index, values): 24 | num_values = values.shape[1] 25 | indices = diag_indices_of(self.distance_matrix, diagonal_index) 26 | indices = (indices[0][:num_values], indices[1][:num_values]) 27 | self.distance_matrix[indices] = values 28 | 29 | def process_column(self, column_index, values): 30 | num_values = values.shape[1] 31 | self.distance_matrix[:num_values, column_index] = values 32 | 33 | def shift_series(self, amount): 34 | if amount == 0: 35 | return 36 | 37 | self.distance_matrix = np.roll(self.distance_matrix, -amount, axis=1) 38 | self.distance_matrix[:, -amount:] = np.nan 39 | 40 | def shift_query(self, amount): 41 | if amount == 0: 42 | return 43 | 44 | self.distance_matrix = np.roll(self.distance_matrix, -amount, axis=0) 45 | self.distance_matrix[-amount:, :] = np.nan 46 | -------------------------------------------------------------------------------- /distancematrix/consumer/multidimensional_matrix_profile_lr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from distancematrix.ringbuffer import RingBuffer 3 | 4 | from .abstract_consumer import AbstractStreamingConsumer 5 | 6 | 7 | class MultidimensionalMatrixProfileLR(AbstractStreamingConsumer): 8 | """ 9 | A consumer that builds the multidimensional matrix profile. This consumer takes in distance measures from 10 | multiple channels (dimensions) at the same time and tracks the best distance, the index of this match and 11 | the dimensions used in this match. 
12 | More specifically, if the input has N data channels, this consumer will select for each number of channels 13 | (1, 2, ..., N), the channels containing the best match, index and dimensions. It will not track matches for 14 | any possible combination of channels. 15 | 16 | This consumer keeps track of the left and right multidimensional profile, and can be used to create the 17 | (normal) multidimensional profile from it. The left profile, index and dimensions 18 | at index i contain information about a match whose index is less than or equal to i, while the right 19 | profile, index and dimensions track information about a match whose index is larger than i. 20 | 21 | The profile is an array with shape (num_dimensions, num_distances). The value at row i, j contains the best averaged 22 | distances encountered at index j for any i+1 dimensions. The index is similar, but tracks the index of the query 23 | series that had the best match. 24 | 25 | The dimensions being tracked is a list of length num_dimensions. Entry i of this list contains an 26 | (i+1, num_distances) array that lists the indices of the dimensions that contained the best match. 27 | 28 | This consumer supports streaming. 29 | """ 30 | 31 | def __init__(self, rb_scale_factor=2.): 32 | """ 33 | Creates a new consumer that calculates the left and right matrix profile, the corresponding 34 | indices and the used dimensions over multiple dimensions (data channels). 
35 | 36 | :param rb_scale_factor: scaling factor used for RingBuffers in case of streaming data (should be >= 1), 37 | this allows choosing a balance between less memory (low values) and reduced data copying (higher values) 38 | """ 39 | 40 | if rb_scale_factor < 1.: 41 | raise ValueError("rb_scale_factor should be >= 1, it was: " + str(rb_scale_factor)) 42 | 43 | self._num_subseq = None 44 | self._range = None 45 | self._n_dim = None 46 | 47 | self._md_matrix_profile_left = None 48 | self._md_profile_index_left = None 49 | self._md_profile_dimension_left = None 50 | 51 | self._md_matrix_profile_right = None 52 | self._md_profile_index_right = None 53 | self._md_profile_dimension_right = None 54 | 55 | self._series_shift = 0 56 | self._query_shift = 0 57 | 58 | self._rb_scale_factor = rb_scale_factor 59 | 60 | def initialise(self, dims, query_subseq, series_subseq): 61 | self._n_dim = dims 62 | self._num_subseq = series_subseq 63 | self._range = RingBuffer(np.arange(0, self._num_subseq, dtype=int), 64 | scaling_factor=self._rb_scale_factor) 65 | 66 | self._md_matrix_profile_left = RingBuffer(np.full((dims, self._num_subseq), np.inf, dtype=float), 67 | scaling_factor=self._rb_scale_factor) 68 | self._md_profile_index_left = RingBuffer(np.full((dims, self._num_subseq), -1, dtype=int), 69 | scaling_factor=self._rb_scale_factor) 70 | self._md_profile_dimension_left = \ 71 | [RingBuffer(np.full((i + 1, self._num_subseq), -1, dtype=int), 72 | scaling_factor=self._rb_scale_factor) for i in range(dims)] 73 | 74 | self._md_matrix_profile_right = RingBuffer(np.full((dims, self._num_subseq), np.inf, dtype=float), 75 | scaling_factor=self._rb_scale_factor) 76 | self._md_profile_index_right = RingBuffer(np.full((dims, self._num_subseq), -1, dtype=int), 77 | scaling_factor=self._rb_scale_factor) 78 | self._md_profile_dimension_right = \ 79 | [RingBuffer(np.full((i + 1, self._num_subseq), -1, dtype=int), 80 | scaling_factor=self._rb_scale_factor) for i in range(dims)] 81 | 82 | 
def process_diagonal(self, diag, values): 83 | n_dim, num_values = values.shape 84 | shift_diff = self._series_shift - self._query_shift 85 | 86 | values_sort_order = np.argsort(values, axis=0) 87 | values_sorted = np.sort(values, axis=0) 88 | values_cumsum = np.zeros(num_values) 89 | 90 | if diag + shift_diff >= 0: 91 | # left MP 92 | if diag >= 0: 93 | for dim in range(n_dim): 94 | values_cumsum += values_sorted[dim, :] 95 | values_mean_over_dim = values_cumsum / (dim + 1) 96 | 97 | self._update_matrix_profile(values_mean_over_dim, 98 | self._range[:num_values], 99 | values_sort_order[:dim + 1, :], 100 | self._md_matrix_profile_left[dim, diag:diag + num_values], 101 | self._md_profile_index_left[dim, diag:diag + num_values], 102 | self._md_profile_dimension_left[dim][:, diag:diag + num_values]) 103 | else: 104 | for dim in range(n_dim): 105 | values_cumsum += values_sorted[dim, :] 106 | values_mean_over_dim = values_cumsum / (dim + 1) 107 | 108 | self._update_matrix_profile(values_mean_over_dim, 109 | self._range[-diag:-diag + num_values], 110 | values_sort_order[:dim + 1, :], 111 | self._md_matrix_profile_left[dim, :num_values], 112 | self._md_profile_index_left[dim, :num_values], 113 | self._md_profile_dimension_left[dim][:, :num_values]) 114 | else: 115 | # right MP 116 | if diag >= 0: 117 | for dim in range(n_dim): 118 | values_cumsum += values_sorted[dim, :] 119 | values_mean_over_dim = values_cumsum / (dim + 1) 120 | 121 | self._update_matrix_profile(values_mean_over_dim, 122 | self._range[num_values], 123 | values_sort_order[:dim + 1, :], 124 | self._md_matrix_profile_right[dim, diag:diag + num_values], 125 | self._md_profile_index_right[dim, diag:diag + num_values], 126 | self._md_profile_dimension_right[dim][:, diag:diag + num_values]) 127 | else: 128 | for dim in range(n_dim): 129 | values_cumsum += values_sorted[dim, :] 130 | values_mean_over_dim = values_cumsum / (dim + 1) 131 | 132 | self._update_matrix_profile(values_mean_over_dim, 133 | 
self._range[-diag:-diag + num_values], 134 | values_sort_order[:dim + 1, :], 135 | self._md_matrix_profile_right[dim, :num_values], 136 | self._md_profile_index_right[dim, :num_values], 137 | self._md_profile_dimension_right[dim][:, :num_values]) 138 | 139 | if diag >= 0: 140 | for dim in range(n_dim): 141 | values_cumsum += values_sorted[dim, :] 142 | values_mean_over_dim = values_cumsum / (dim + 1) 143 | 144 | self._update_matrix_profile(values_mean_over_dim, 145 | self._range[:num_values], 146 | values_sort_order[:dim + 1, :], 147 | self._md_matrix_profile_left[dim, diag:diag + num_values], 148 | self._md_profile_index_left[dim, diag:diag + num_values], 149 | self._md_profile_dimension_left[dim][:, diag:diag + num_values]) 150 | 151 | else: 152 | for dim in range(n_dim): 153 | values_cumsum += values_sorted[dim, :] 154 | values_mean_over_dim = values_cumsum / (dim + 1) 155 | 156 | self._update_matrix_profile(values_mean_over_dim, 157 | self._range[-diag:-diag + num_values], 158 | values_sort_order[:dim + 1, :], 159 | self._md_matrix_profile_right[dim, :num_values], 160 | self._md_profile_index_right[dim, :num_values], 161 | self._md_profile_dimension_right[dim][:, :num_values]) 162 | 163 | def _update_matrix_profile(self, new_distances, new_distance_indices, new_distance_dimensions, 164 | matrix_profile, matrix_profile_index, matrix_profile_dims): 165 | update_pos = new_distances < matrix_profile 166 | matrix_profile[update_pos] = new_distances[update_pos] 167 | matrix_profile_index[update_pos] = new_distance_indices[update_pos] 168 | matrix_profile_dims[:, update_pos] = new_distance_dimensions[:, update_pos] 169 | 170 | def process_column(self, column_index, values): 171 | n_dim, num_values = values.shape 172 | shift_diff = self._series_shift - self._query_shift 173 | 174 | border = max(0, column_index + 1 + shift_diff) 175 | 176 | values_sorted = np.sort(values, axis=0) 177 | values_cumsum = np.zeros(num_values) 178 | 179 | for dim in range(n_dim): 180 | 
values_cumsum += values_sorted[dim, :] 181 | 182 | if border > 0: 183 | min_position_l = np.argmin(values_cumsum[:border]) 184 | new_min_value = values_cumsum[min_position_l] / (dim + 1) 185 | 186 | if new_min_value < self._md_matrix_profile_left[dim, column_index]: 187 | self._md_matrix_profile_left[dim, column_index] = new_min_value 188 | self._md_profile_index_left[dim, column_index] = min_position_l + self._query_shift 189 | self._md_profile_dimension_left[dim][:, column_index] =\ 190 | np.argsort(values[:, min_position_l])[:dim + 1] 191 | 192 | # Check if column crosses into the lower triangle of the distance matrix 193 | if num_values > border: 194 | min_position_r = np.argmin(values_cumsum[border:]) + border 195 | new_min_value = values_cumsum[min_position_r] / (dim + 1) 196 | 197 | # In case of shifting, a lower value could already be present 198 | if new_min_value < self._md_matrix_profile_right[dim, column_index]: 199 | self._md_matrix_profile_right[dim, column_index] = new_min_value 200 | self._md_profile_index_right[dim, column_index] = min_position_r + self._query_shift 201 | self._md_profile_dimension_right[dim][:, column_index] =\ 202 | np.argsort(values[:, min_position_r])[:dim + 1] 203 | 204 | def shift_query(self, amount): 205 | if amount == 0: 206 | return 207 | 208 | self._query_shift += amount 209 | self._range.push(np.arange(self._range[-1] + 1, self._range[-1] + 1 + amount)) 210 | 211 | def shift_series(self, amount): 212 | if amount == 0: 213 | return 214 | 215 | self._series_shift += amount 216 | 217 | push_values = np.full((self._n_dim, amount), np.inf) 218 | self._md_matrix_profile_left.push(push_values) 219 | self._md_matrix_profile_right.push(push_values) 220 | 221 | push_values[:] = -1 222 | self._md_profile_index_left.push(push_values) 223 | self._md_profile_index_right.push(push_values) 224 | 225 | for dim in range(self._n_dim): 226 | self._md_profile_dimension_left[dim].push(push_values[:dim + 1, :]) 227 | 
self._md_profile_dimension_right[dim].push(push_values[:dim + 1, :]) 228 | 229 | def md_matrix_profile(self): 230 | """ 231 | Merges the left and right multidimensional matrix profile, to create the multidimensional matrix profile. 232 | :return: ndarray of shape (num_dimensions, num_subsequences) 233 | """ 234 | left_best = self._md_matrix_profile_left.view < self._md_matrix_profile_right.view 235 | return np.where( 236 | left_best, 237 | self._md_matrix_profile_left.view, 238 | self._md_matrix_profile_right.view 239 | ) 240 | 241 | def md_profile_index(self): 242 | """ 243 | Merges the left and right multidimensional matrix profile index, to create the multidimensional matrix profile 244 | index. 245 | :return: ndarray of shape (num_dimensions, num_subsequences) 246 | """ 247 | left_best = self._md_matrix_profile_left.view < self._md_matrix_profile_right.view 248 | return np.where( 249 | left_best, 250 | self._md_profile_index_left.view, 251 | self._md_profile_index_right.view 252 | ) 253 | 254 | def md_profile_dimensions(self): 255 | """ 256 | Merges the left and right dimensions, to create the dimensions for the multidimensional matrix profile. 257 | :return: list of length num_dimensions, where the entry at index i is an ndarray of shape 258 | (i+1, num_subsequences). 
259 | """ 260 | profile_dimension = [np.full((i + 1, self._num_subseq), -1, dtype=int) for i in range(self._n_dim)] 261 | 262 | for dim in range(self._n_dim): 263 | left_best = self._md_matrix_profile_left[dim, :] < self._md_matrix_profile_right[dim, :] 264 | profile_dimension[dim] = np.where( 265 | left_best, 266 | self._md_profile_dimension_left[dim].view, 267 | self._md_profile_dimension_right[dim].view 268 | ) 269 | 270 | return profile_dimension 271 | 272 | @property 273 | def md_matrix_profile_left(self): 274 | return self._md_matrix_profile_left.view 275 | 276 | @property 277 | def md_matrix_profile_right(self): 278 | return self._md_matrix_profile_right.view 279 | 280 | @property 281 | def md_profile_index_left(self): 282 | return self._md_profile_index_left.view 283 | 284 | @property 285 | def md_profile_index_right(self): 286 | return self._md_profile_index_right.view 287 | 288 | @property 289 | def md_profile_dimension_left(self): 290 | return [buffer.view for buffer in self._md_profile_dimension_left] 291 | 292 | @property 293 | def md_profile_dimension_right(self): 294 | return [buffer.view for buffer in self._md_profile_dimension_right] 295 | -------------------------------------------------------------------------------- /distancematrix/consumer/radius_profile.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | import numpy as np 4 | 5 | from distancematrix.consumer.abstract_consumer import AbstractConsumer 6 | from distancematrix.insights import lowest_value_idxs 7 | 8 | 9 | class RadiusProfile0(AbstractConsumer): 10 | """ 11 | Consumer that calculates (common-k) radius profiles. 12 | 13 | The (common-k) radius profile tracks the distance between each subsequence and its k-th best match. 14 | It can be used to find subsequences with at least k repetitions (so called common motifs). 15 | 16 | This class has been optimised for finding matches without ignoring trivial matches. 
17 | In other words, it is not possible to define an exclusion zone for the matches. 18 | """ 19 | def __init__(self, track_indices): 20 | """ 21 | Creates a new radius profile consumer that tracks the distance between each subsequence and its 22 | k-th best matches. 23 | 24 | Note that the resulting radius profile will contain distances as if the given track_indices were sorted. 25 | 26 | :param track_indices: values of k to track 27 | """ 28 | self.track_indices = np.array(track_indices, ndmin=1, dtype=int) 29 | 30 | if self.track_indices.ndim != 1: 31 | raise ValueError('Track_indices should be scalar or one-dimensional.') 32 | if len(self.track_indices) == 0: 33 | raise ValueError('At least one track index needed.') 34 | if np.any(self.track_indices < 0): 35 | raise ValueError('Only positive track_indices allowed.') 36 | 37 | self.track_indices.sort() 38 | self.values = None 39 | 40 | def initialise(self, dims, query_subseq, series_subseq): 41 | self.values = np.full((len(self.track_indices), series_subseq), np.nan, dtype=float) 42 | 43 | def process_diagonal(self, diag, values): 44 | raise NotImplementedError 45 | 46 | def process_column(self, column_index, values): 47 | values = values[0] 48 | 49 | sorted_values = np.empty(len(values) + 1, dtype=float) 50 | sorted_values[:-1] = np.sort(values) 51 | sorted_values[-1] = np.nan 52 | 53 | self.values[:, column_index] = np.take(sorted_values, self.track_indices, mode="clip") 54 | 55 | 56 | class RadiusProfile(AbstractConsumer): 57 | """ 58 | Consumer that calculates (common-k) radius profiles. 59 | 60 | The (common-k) radius profile tracks the distance between each subsequence and its k-th best match. 61 | It can be used to find subsequences with at least k repetitions (so called common motifs). 
62 | """ 63 | def __init__(self, track_indices: Union[int, List[int]], exclude_distance: int): 64 | """ 65 | Creates a new radius profile consumer that tracks the distance between each subsequence and its 66 | k-th best matches. 67 | 68 | Note that the resulting radius profile will contain distances as if the given track_indices were sorted. 69 | 70 | .. seealso:: If excludedistance is zero, 71 | consider using :class:`distancematrix.consumer.radius_profile.RadiusProfile0` 72 | 73 | :param track_indices: values of k to track 74 | :param exclude_distance: trivial match exclusion distance, typical subsequence length / 2. 75 | """ 76 | self.track_indices = np.array(track_indices, ndmin=1, dtype=int) 77 | 78 | if self.track_indices.ndim != 1: 79 | raise ValueError('Track_indices should be scalar or one-dimensional.') 80 | if len(self.track_indices) == 0: 81 | raise ValueError('At least one track index needed.') 82 | if np.any(self.track_indices < 0): 83 | raise ValueError('Only positive track_indices allowed.') 84 | if type(exclude_distance) is not int or exclude_distance < 0: 85 | raise RuntimeError('Exclude distance should be positive integer.') 86 | 87 | self.track_indices.sort() 88 | self.exclusion = exclude_distance 89 | self.values = None 90 | 91 | def initialise(self, dims, query_subseq, series_subseq): 92 | self.values = np.full((len(self.track_indices), series_subseq), np.nan, dtype=float) 93 | 94 | def process_diagonal(self, diag, values): 95 | raise NotImplementedError 96 | 97 | def process_column(self, column_index, values): 98 | values = values[0] 99 | 100 | iterator = lowest_value_idxs(values, self.exclusion) 101 | tracker_idx = 0 102 | 103 | # Iterate from best match to worst, ignoring trivial matches 104 | for i, low_value_idx in enumerate(iterator): 105 | # If we are interested in the i-th match 106 | if i == self.track_indices[tracker_idx]: 107 | self.values[tracker_idx, column_index] = values[low_value_idx] 108 | tracker_idx += 1 109 | 110 | # Abort 
if we found all matches we are tracking 111 | if tracker_idx >= len(self.track_indices): 112 | return 113 | 114 | return 115 | -------------------------------------------------------------------------------- /distancematrix/consumer/threshold_counter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .abstract_consumer import AbstractConsumer 4 | 5 | 6 | class ThresholdCounter(AbstractConsumer): 7 | """ 8 | Consumer that counts the number of values in each column of the distancematrix that are below 9 | or equal to specified thresholds. 10 | 11 | This consumer counts values as they are passed and does not extrapolate or keep information about which 12 | values were already counted. Specifically: partial calculations will result in counts of the produced values, 13 | and passing the same diagonals multiple time could result in double counts. 14 | """ 15 | 16 | def __init__(self, thresholds): 17 | """ 18 | Creates a new counter. 
19 | 20 | :param thresholds: scalar or 1D array of threshold values 21 | """ 22 | self.thresholds = np.array(thresholds, ndmin=1, dtype=float) 23 | if self.thresholds.ndim != 1: 24 | raise ValueError('Thresholds should be scalar or one-dimensional.') 25 | self.counts = None 26 | 27 | def initialise(self, dims, query_subseq, series_subseq): 28 | self.counts = np.full((len(self.thresholds), series_subseq), 0, dtype=int) 29 | 30 | def process_diagonal(self, diag, values): 31 | values = values[0] 32 | num_values = len(values) 33 | 34 | if diag >= 0: 35 | self.counts[:, diag:diag + num_values] += values <= self.thresholds[:, None] 36 | else: 37 | self.counts[:, :num_values] += values <= self.thresholds[:, None] 38 | 39 | def process_column(self, column_index, values): 40 | values = values[0] 41 | 42 | self.counts[:, column_index] = np.count_nonzero(values <= self.thresholds[:, None], axis=1) 43 | 44 | 45 | class DistancedThresholdCounter(AbstractConsumer): 46 | """ 47 | Consumer that counts the number of values in each column of the distancematrix that are below 48 | or equal to specified thresholds, with the added restriction of only counting elements that are at least 49 | a number of values apart from each other. 50 | 51 | This consumer does not support diagonal calculations. 52 | """ 53 | 54 | def __init__(self, thresholds, exclusion): 55 | """ 56 | Creates a new counter. 
57 | 58 | :param thresholds: scalar or 1D array of threshold values 59 | :param exclusion: number of required spaces in between counted values 60 | """ 61 | self.thresholds = np.array(thresholds, ndmin=1, dtype=float) 62 | if self.thresholds.ndim != 1: 63 | raise ValueError('Thresholds should be scalar or one-dimensional.') 64 | self.thresholds.sort() 65 | self.exclusion = exclusion 66 | self.counts = None 67 | 68 | def initialise(self, dims, query_subseq, series_subseq): 69 | self.counts = np.full((len(self.thresholds), series_subseq), 0, dtype=int) 70 | 71 | def process_diagonal(self, diag, values): 72 | raise NotImplementedError("Diagonal processing is not supported.") 73 | 74 | def process_column(self, column_index, values): 75 | values = values[0] 76 | 77 | threshold_idx = 0 78 | current_thresh = self.thresholds[threshold_idx] 79 | 80 | # Todo: check performance if this is a class variable instead of a local one 81 | exclusions = np.zeros(len(values), dtype=bool) 82 | order = np.argsort(values) 83 | 84 | # Iterate over value indices from smallest to largest value 85 | for i in order: 86 | value = values[i] 87 | while value > current_thresh: 88 | threshold_idx += 1 89 | if threshold_idx == len(self.thresholds): 90 | return 91 | current_thresh = self.thresholds[threshold_idx] 92 | 93 | if not exclusions[i]: 94 | self.counts[threshold_idx:, column_index] += 1 95 | exclusions[max(0, i - self.exclusion):i + self.exclusion + 1] = True 96 | 97 | return 98 | -------------------------------------------------------------------------------- /distancematrix/generator/__init__.py: -------------------------------------------------------------------------------- 1 | from distancematrix.generator.euclidean import Euclidean 2 | from distancematrix.generator.znorm_euclidean import ZNormEuclidean 3 | from distancematrix.generator.filter_generator import FilterGenerator 4 | -------------------------------------------------------------------------------- 
# ---- distancematrix/generator/abstract_generator.py ----
from abc import ABC, abstractmethod


class AbstractGenerator(ABC):
    """Factory for bound generators that lazily compute parts of a distance matrix."""

    @abstractmethod
    def prepare(self, m, series, query=None):
        """
        Create a bound non-streaming generator for the given series and query sequences.

        :param m: the size of the subsequences used to calculate distances between series and query
        :param series: 1D array, used as the horizontal axis of a distance matrix
        :param query: 1D array, used as the vertical axis of a distance matrix, or None to indicate a self-join
        :return: a bound generator
        """
        pass

    @abstractmethod
    def prepare_streaming(self, m, series_window, query_window=None):
        """
        Create a bound generator that supports streaming data.
        The generator will need to receive data before any distances can be calculated.

        :param m: the size of the subsequences used to calculate distances between series and query
        :param series_window: number of values to keep in memory for series, the length of the
          horizontal axis of the distance matrix will be equal to (series_window - m + 1)
        :param query_window: number of values to keep in memory for query, the length of the
          vertical axis of the distance matrix will be equal to (query_window - m + 1),
          or None to indicate a self-join.
        :return: a bound generator that supports streaming
        """
        pass


class AbstractBoundGenerator(ABC):
    """Generator bound to concrete data, able to compute diagonals and columns of the distance matrix."""

    @abstractmethod
    def calc_diagonal(self, diag):
        """
        Calculates all distances of the distance matrix diagonal with the given index for the available data.

        If diag is zero, this calculates the main diagonal, running from the top left to the bottom right.
        Any positive value represents a diagonal above the main diagonal, and a negative value represents
        a diagonal below the main diagonal.

        :param diag: the diagonal index
        :return: 1D array, containing all values
        """
        pass

    @abstractmethod
    def calc_column(self, column):
        """
        Calculates all distances of the distance matrix on the specified column for the available data.

        :param column: the column index (starting at 0)
        :return: 1D array, containing all values
        """
        pass


class AbstractBoundStreamingGenerator(ABC):
    """Bound generator whose underlying series/query data can grow over time."""

    @abstractmethod
    def append_series(self, values):
        """
        Adds more data points to the series sequence (and the query in case of a self-join).
        Older data points will be dropped if the series would become larger than the foreseen capacity.

        :param values: 1D array, the new values to append to the series
        :return: None
        """

    @abstractmethod
    def append_query(self, values):
        """
        Adds more data points to the query sequence.
        Older data points will be dropped if the query would become larger than the foreseen capacity.

        :param values: 1D array, the new values to append to the query
        :return: None
        """


# ---- distancematrix/generator/euclidean.py ----
import numpy as np

from distancematrix.util import diag_length
from distancematrix.util import sliding_window_view
from distancematrix.ringbuffer import RingBuffer
from distancematrix.generator.abstract_generator import AbstractGenerator
from distancematrix.generator.abstract_generator import AbstractBoundStreamingGenerator

# Squared distances below this value are clamped to zero to compensate for floating point rounding.
EPSILON = 1e-15


class Euclidean(AbstractGenerator):
    """
    Class capable of efficiently calculating parts of the euclidean distance matrix between two series,
    where each entry in the distance matrix equals the euclidean distance between 2 subsequences of both series.

    This generator can handle streaming data.
    """

    def __init__(self, rb_scale_factor=2.):
        """
        Creates a new instance.
23 | 24 | :param rb_scale_factor: scaling factor used for RingBuffers in case of streaming data (should be >= 1), 25 | this allows choosing a balance between less memory (low values) and reduced data copying (higher values) 26 | """ 27 | if rb_scale_factor < 1.: 28 | raise ValueError("rb_scale_factor should be >= 1, it was: " + str(rb_scale_factor)) 29 | 30 | self._rb_scale_factor = rb_scale_factor 31 | 32 | def prepare_streaming(self, m, series_window, query_window=None): 33 | series = RingBuffer(None, (series_window,), dtype=float, scaling_factor=self._rb_scale_factor) 34 | 35 | if query_window is not None: 36 | query = RingBuffer(None, (query_window,), dtype=float, scaling_factor=self._rb_scale_factor) 37 | self_join = False 38 | else: 39 | query = series 40 | self_join = True 41 | 42 | return BoundStreamingEuclidean(m, series, query, self_join) 43 | 44 | def prepare(self, m, series, query=None): 45 | if series.ndim != 1: 46 | raise RuntimeError("Series should be 1D") 47 | if query is not None and query.ndim != 1: 48 | raise RuntimeError("Query should be 1D") 49 | 50 | series = RingBuffer(series, dtype=float, scaling_factor=1) 51 | if query is not None: 52 | query = RingBuffer(query, dtype=float, scaling_factor=1) 53 | self_join = False 54 | else: 55 | query = series 56 | self_join = True 57 | return BoundStreamingEuclidean(m, series, query, self_join) 58 | 59 | 60 | class BoundStreamingEuclidean(AbstractBoundStreamingGenerator): 61 | def __init__(self, m, series, query, self_join): 62 | self.m = m 63 | self.series = series 64 | self.query = query 65 | self.self_join = self_join 66 | 67 | self.first_row = None 68 | self.first_row_backlog = 0 # The number of values not yet processed for the first row cache 69 | self.prev_calc_column_index = None 70 | self.prev_calc_column_sq_dist = None 71 | 72 | def append_series(self, values): 73 | if len(values) == 0: 74 | return 75 | 76 | data_dropped = self.series.push(values) 77 | num_dropped = len(values) - 
(self.series.max_shape[0] - self.series.view.shape[0]) 78 | self.first_row_backlog += len(values) 79 | 80 | if self.prev_calc_column_index is not None and num_dropped > 0: 81 | self.prev_calc_column_index -= num_dropped 82 | 83 | if self.self_join: 84 | if data_dropped: 85 | self.first_row = None # The first row was dropped by new data 86 | self.prev_calc_column_index = None 87 | 88 | def append_query(self, values): 89 | if self.self_join: 90 | raise RuntimeError("Cannot append query data in case of a self join.") 91 | 92 | if len(values) == 0: 93 | return 94 | 95 | if self.query.push(values): 96 | self.first_row = None # The first row was dropped by new data 97 | self.prev_calc_column_index = None 98 | 99 | def calc_diagonal(self, diag): 100 | dl = diag_length(len(self.query.view), len(self.series.view), diag) 101 | cumsum = np.zeros(dl + 1, dtype=float) 102 | 103 | if diag >= 0: 104 | # Eg: for diag = 2: 105 | # D = (y0 - x2)², (y1 - x3)², (y2 - x4)²... 106 | # cumsum = 0, D0, D0+D1, D0+D1+D2, ... 107 | cumsum[1:] = np.cumsum(np.square(self.query[:dl] - self.series[diag: diag + dl])) 108 | else: 109 | # Eg: for diag = -2: 110 | # D = (y2 - x0)², (y3 - x1)², (y4 - x2)²... 111 | # cumsum = 0, D0, D0+D1, D0+D1+D2, ... 
            cumsum[1:] = np.cumsum(np.square(self.query[-diag: -diag + dl] - self.series[:dl]))

        # Difference of cumulative sums yields each window's sum of squared differences.
        return np.sqrt(cumsum[self.m:] - cumsum[:len(cumsum) - self.m])

    def calc_column(self, column):
        if self.prev_calc_column_index != column - 1 or column == 0:
            # Previous column not cached or data for incremental calculation not available: full calculation
            sq_dist = _euclidean_distance_squared(self.query.view, self.series[column:column + self.m])
        else:
            # Previous column cached, reuse it
            if self.first_row is None:
                self.first_row = RingBuffer(_euclidean_distance_squared(self.series.view, self.query[0: self.m]),
                                            shape=(self.series.max_shape[0] - self.m + 1,))
                self.first_row_backlog = 0
            elif self.first_row_backlog > 0:
                # Series has been updated since last calculation of first_row
                elems_to_recalc = self.first_row_backlog + self.m - 1
                self.first_row.push(_euclidean_distance_squared(self.series[-elems_to_recalc:], self.query[0: self.m]))
                self.first_row_backlog = 0

            # Incremental update: remove the squared term that slid out of each window and
            # add the one that slid in, relative to the previous column's values.
            sq_dist = self.prev_calc_column_sq_dist  # work in same array
            sq_dist[1:] = (self.prev_calc_column_sq_dist[:-1]
                           - np.square(self.series[column - 1] - self.query[:len(self.query.view)-self.m])
                           + np.square(self.series[column + self.m - 1] - self.query[self.m:]))
            sq_dist[0] = self.first_row[column]

        self.prev_calc_column_sq_dist = sq_dist
        self.prev_calc_column_index = column

        return np.sqrt(sq_dist)


def _euclidean_distance_squared(series, sequence):
    """
    Calculates the squared euclidean distance between the given sequence and each possible subsequence of the series
    (using a sliding window of the same length as the sequence).

    :param series: 1D numpy array of length n
    :param sequence: 1D numpy array of length m
    :return: a 1D numpy array of length n-m+1 containing the squared euclidean distance
    """
    if series.ndim != 1:
        raise RuntimeError("Series should be 1D")
    if sequence.ndim != 1:
        raise RuntimeError("Sequence should be 1D")

    m = len(sequence)

    sliding_view = sliding_window_view(series, [m])

    # (X - Y)^2 = X^2 - 2XY + Y^2
    # Here, einsum is used to calculate dot products over sliding window to prevent memory copying.
    # Using the normal euclidean distance calculation over the sliding window (x - y)^2 would result in copying
    # each window, which leads to memory errors for long series.
    dist = np.einsum('ij,ij->i', sliding_view, sliding_view)  # Dot product of every window with itself
    dist -= 2 * np.einsum('ij,j->i', sliding_view, sequence)  # Dot product of every window with sequence
    dist += np.dot(sequence, sequence)  # Dot product of sequence with itself
    dist[dist < EPSILON] = 0  # Avoid very small negative numbers due to rounding

    # Simple implementation, this takes double as long to calculate as the einsum approach, though it contains
    # no approximations. For very long series (100k when testing), suddenly takes 10 times as long, most likely
    # due to cpu caching that cannot contain the entire series (could be circumvented by batching):
    # num_sub_seq = len(series) - m + 1
    # dist = np.zeros(num_sub_seq)
    # for i in range(m):
    #     dist += np.square(series[i:num_sub_seq + i] - sequence[i])

    return dist


# ---- distancematrix/generator/filter_generator.py ----
import numpy as np

from distancematrix.util import sliding_window_view
from distancematrix.ringbuffer import RingBuffer
from distancematrix.generator.abstract_generator import AbstractGenerator
from distancematrix.generator.abstract_generator import AbstractBoundGenerator
from distancematrix.generator.abstract_generator import AbstractBoundStreamingGenerator


def is_not_finite(data, subseq_length):
    """
    Marks infinite or nan values as invalid.

    :param data: array of data values to check
    :param subseq_length: subsequence length (unused; part of the invalid_data_function interface)
    :return: boolean array of the same size as data, True for invalid (non-finite) entries
    """
    return ~np.isfinite(data)


class FilterGenerator(AbstractGenerator):
    # Wraps another generator: zeroes invalid input values before they reach the wrapped
    # generator and marks distances computed from invalid data as positive infinity.
    def __init__(self, generator, invalid_data_function=is_not_finite, rb_scale_factor=2.):
        """
        Creates a new generator by wrapping another generator.

        :param generator: the generator whose results and input data will be filtered
        :param invalid_data_function: a function that takes in the original data (series or query) and
        subsequence length and returns a boolean array of the same size that has a True value for any invalid values.
        These values will be replaced by zeros before reaching the wrapped generator. Any distance values
        that were calculated using invalid data points will be positive infinite values.
            :param invalid_subseq_function: optional - a function that takes in the original data (series or query) and
            subsequence length and returns a boolean array of size matching the number of subsequences that has
            a True value for any invalid subsequence. Invalid subsequences will have positive infinite values
            as distance.
        """
        if rb_scale_factor < 1.:
            raise ValueError("rb_scale_factor should be >= 1, it was: " + str(rb_scale_factor))

        self._rb_scale_factor = rb_scale_factor
        self._generator = generator
        self._invalid_data_function = invalid_data_function

    def prepare_streaming(self, m, series_window, query_window=None):
        """
        Creates a bound streaming generator that filters the bound generator of the wrapped generator.

        :param m: subsequence length
        :param series_window: length of the series window
        :param query_window: length of the query window, or None for a self-join
        :return: a BoundStreamingFilterGenerator
        """
        gen = self._generator.prepare_streaming(m, series_window, query_window)

        num_s_subseq = series_window - m + 1
        # A missing query window signals a self-join; propagate that with None.
        if query_window is None:
            num_q_subseq = None
        else:
            num_q_subseq = query_window - m + 1

        return BoundStreamingFilterGenerator(gen, m, num_s_subseq, num_q_subseq,
                                             self._invalid_data_function, self._rb_scale_factor)

    def prepare(self, m, series, query=None):
        """
        Creates a bound (non-streaming) generator: invalid data points are zeroed before they
        reach the wrapped generator, and boolean masks of invalid subsequences are computed up front.

        :param m: subsequence length
        :param series: 1D series data
        :param query: 1D query data, or None for a self-join
        :return: a BoundFilterGenerator
        """
        new_series, invalid_series_subseq = _correct_data_and_create_masks(series, m, self._invalid_data_function)

        if query is not None:
            new_query, invalid_query_subseq = _correct_data_and_create_masks(query, m, self._invalid_data_function)
            num_q_subseq = len(query) - m + 1
        else:
            # Self-join: the query equals the series, so the series mask is reused.
            new_query = None
            invalid_query_subseq = invalid_series_subseq
            num_q_subseq = len(series) - m + 1

        generator = self._generator.prepare(m, new_series, new_query)
        return BoundFilterGenerator(generator, m, num_q_subseq, invalid_series_subseq, invalid_query_subseq)


class BoundFilterGenerator(AbstractBoundGenerator):
    """
    Wrapper around other generators that will replace values in the distance matrix marked as invalid
    by positive infinity. It can also perform a data pre-processing step before data reaches the wrapped generator,
    by setting values marked as invalid to zero, this can be useful for example to remove nan values for a generator
    that does not support nan values.
    """

    def __init__(self, generator, m, num_q_subseq, invalid_series_subseq, invalid_query_subseq):
        """
        Creates a new generator by wrapping another (bound) generator.

        :param generator: the bound generator whose calculated distances will be filtered
        :param m: subsequence length
        :param num_q_subseq: number of query subsequences (used to size all-infinity columns)
        :param invalid_series_subseq: boolean mask marking invalid series subsequences, or None
        :param invalid_query_subseq: boolean mask marking invalid query subsequences, or None
        """
        self.generator = generator

        self.m = m
        self.num_q_subseq = num_q_subseq

        self.invalid_series_subseq = invalid_series_subseq
        self.invalid_query_subseq = invalid_query_subseq

    def calc_diagonal(self, diag):
        """
        Calculates the distances of the given diagonal, setting any entry that involves an
        invalid series or query subsequence to positive infinity.

        :param diag: diagonal index (>= 0 starts at series index diag, < 0 starts at query index -diag)
        :return: 1D array of distances
        """
        distances = self.generator.calc_diagonal(diag)

        if diag >= 0:
            # Diagonal starts at series subsequence `diag` and query subsequence 0.
            if self.invalid_series_subseq is not None:
                distances[self.invalid_series_subseq[diag: diag+len(distances)]] = np.Inf
            if self.invalid_query_subseq is not None:
                distances[self.invalid_query_subseq[:len(distances)]] = np.Inf
        else:
            # Diagonal starts at series subsequence 0 and query subsequence `-diag`.
            if self.invalid_series_subseq is not None:
                distances[self.invalid_series_subseq[:len(distances)]] = np.Inf
            if self.invalid_query_subseq is not None:
                distances[self.invalid_query_subseq[-diag: -diag+len(distances)]] = np.Inf

        return distances

    def calc_column(self, column):
        """
        Calculates the distances of the given column, replacing entries for invalid query
        subsequences by positive infinity. An invalid series subsequence yields an
        all-infinity column without invoking the wrapped generator at all.

        :param column: series subsequence index
        :return: 1D array of distances of length num_q_subseq
        """
        if self.invalid_series_subseq is not None and self.invalid_series_subseq[column]:
            # The whole column is invalid: skip the wrapped generator entirely.
            return np.full(self.num_q_subseq, np.Inf)

        distances = self.generator.calc_column(column)

        if self.invalid_query_subseq is not None:
            distances[self.invalid_query_subseq] = np.Inf

        return distances


class BoundStreamingFilterGenerator(BoundFilterGenerator, AbstractBoundStreamingGenerator):
    """
    Wrapper around other generators that will replace values in the distance matrix marked as invalid
    by positive infinity. It can also perform a data pre-processing step before data reaches the wrapped generator,
    by setting values marked as invalid to zero, this can be useful for example to remove nan values for a generator
    that does not support nan values.
    """

    def __init__(self, generator, m, num_s_subseq, num_q_subseq, invalid_data_function, rb_scale_factor):
        """
        Creates a new generator by wrapping another generator.
135 | 136 | :param generator: the generator whose results and input data will be filtered 137 | :param invalid_data_function: optional - a function that takes in the original data (series or query) and 138 | subsequence length and returns a boolean array of the same size that has a True value for any invalid values. 139 | These values will be replaced by zeros before reaching the wrapped generator. Any distance values 140 | that were calculated using invalid data points will be positive infinite values. 141 | """ 142 | 143 | self._invalid_data_function = invalid_data_function 144 | 145 | invalid_s_subseq_buffer = RingBuffer(None, shape=(num_s_subseq,), 146 | dtype=bool, scaling_factor=rb_scale_factor) 147 | 148 | self.invalid_series = RingBuffer(None, shape=(num_s_subseq + m - 1,), 149 | dtype=bool, scaling_factor=rb_scale_factor) 150 | 151 | if num_q_subseq is None: 152 | self.self_join = True 153 | invalid_q_subseq_buffer = invalid_s_subseq_buffer 154 | num_q_subseq = num_s_subseq 155 | self.invalid_query = self.invalid_series 156 | else: 157 | self.self_join = False 158 | 159 | invalid_q_subseq_buffer = RingBuffer(None, shape=(num_q_subseq,), 160 | dtype=bool, scaling_factor=rb_scale_factor) 161 | self.invalid_query = RingBuffer(None, shape=(num_q_subseq + m - 1,), 162 | dtype=bool, scaling_factor=rb_scale_factor) 163 | 164 | super().__init__(generator, m, num_q_subseq, invalid_s_subseq_buffer, invalid_q_subseq_buffer) 165 | 166 | def append_series(self, values): 167 | invalid_points = _apply_data_validation(values, self.m, self._invalid_data_function) 168 | self.invalid_series.push(invalid_points) 169 | 170 | if np.any(invalid_points): 171 | values = values.copy() 172 | values[invalid_points] = 0 173 | 174 | if len(self.invalid_series.view) >= self.m: 175 | rel_values = self.invalid_series[-(len(values) + self.m - 1):] 176 | self.invalid_series_subseq.push(np.any(sliding_window_view(rel_values, (self.m,)), axis=-1)) 177 | 178 | 
self.generator.append_series(values) 179 | 180 | def append_query(self, values): 181 | if self.self_join: 182 | raise RuntimeError("Cannot append to query for a self-join.") 183 | 184 | invalid_points = _apply_data_validation(values, self.m, self._invalid_data_function) 185 | self.invalid_query.push(invalid_points) 186 | 187 | if np.any(invalid_points): 188 | values = values.copy() 189 | values[invalid_points] = 0 190 | 191 | if len(self.invalid_query.view) >= self.m: 192 | rel_values = self.invalid_query[-(len(values) + self.m - 1):] 193 | self.invalid_query_subseq.push(np.any(sliding_window_view(rel_values, (self.m,)), axis=-1)) 194 | 195 | self.generator.append_query(values) 196 | 197 | def calc_column(self, column): 198 | if self.invalid_series_subseq[column]: 199 | return np.full(len(self.invalid_query_subseq.view), np.Inf) 200 | 201 | distances = self.generator.calc_column(column) 202 | distances[self.invalid_query_subseq.view] = np.Inf 203 | 204 | return distances 205 | 206 | 207 | def _apply_data_validation(data, m, invalid_data_function): 208 | """ 209 | Returns a boolean array of the same size as data. 210 | 211 | :param data: 212 | :param m: 213 | :param invalid_data_function: 214 | :return: 215 | """ 216 | invalid_data = invalid_data_function(data, m) 217 | if invalid_data.shape != data.shape: 218 | raise RuntimeError("Invalid_data_function's output does not have expected dimension.") 219 | 220 | return invalid_data 221 | 222 | 223 | def _correct_data_and_create_masks(data, m, invalid_data_function): 224 | """ 225 | Runs invalid_data_function and invalid_subseq_function, if they are defined. 226 | Any invalid data points are set to zero value and returned in a copied array. 227 | A boolean array is created to mark all invalid subsequence indices (= True values). 
228 | 229 | :param data: 1D-array 230 | :param m: subsequence length 231 | :return: tuple of: data or a modified copy of data; None or a boolean 1D array containing at least 1 True 232 | (= invalid subsequence) value 233 | """ 234 | invalid_data = invalid_data_function(data, m) 235 | if invalid_data.shape != data.shape: 236 | raise RuntimeError("Invalid_data_function's output does not have expected dimension.") 237 | 238 | 239 | # invalid_data = invalid_data and np.any(invalid_data) 240 | # invalid_subseq = invalid_subseq and np.any(invalid_subseq) 241 | 242 | new_data = data 243 | invalid_mask = None 244 | if invalid_data is not None: 245 | new_data = data.copy() 246 | new_data[invalid_data] = 0 247 | invalid_mask = _invalid_data_to_invalid_subseq(invalid_data, m) 248 | 249 | return new_data, invalid_mask 250 | 251 | def _invalid_data_to_invalid_subseq(invalid_data, subseq_length): 252 | """ 253 | Converts a boolean array marking invalid data points to a boolean array marking invalid subsequences. 254 | (A subsequence is invalid if it contained any invalid data point.) 
255 | 256 | :param invalid_data: 1D array of booleans, True indicating invalid data points 257 | :param subseq_length: subsequence length 258 | :return: 1D boolean array of length num-subsequences 259 | """ 260 | data_length = invalid_data.shape[0] 261 | result = np.zeros(data_length - subseq_length + 1, dtype=bool) 262 | 263 | impacted = 0 264 | for i in range(0, subseq_length - 1): 265 | if invalid_data[i]: 266 | impacted = subseq_length 267 | if impacted: 268 | impacted -= 1 269 | 270 | for i in range(subseq_length-1, data_length): 271 | if invalid_data[i]: 272 | impacted = subseq_length 273 | if impacted: 274 | result[i - subseq_length + 1] = True 275 | impacted -= 1 276 | 277 | return result 278 | -------------------------------------------------------------------------------- /distancematrix/insights.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def lowest_value_idxs(array, exclude_distance): 5 | """ 6 | Creates a generator that returns the indices of the lowest elements, where each index differs by at least 7 | exclude_distance from every previously returned index. Non-finite values are ignored. 
@contextmanager
def interrupt_catcher():
    """
    A context that allows for gracefully terminating a calculation by catching interrupts
    and providing a method to check whether an interrupt has occurred.

    :return: None
    """
    caught = False

    def _on_sigint(signum, frame):
        nonlocal caught
        caught = True

    # Swap in our own SIGINT handler for the duration of the context.
    previous_handler = signal.getsignal(signal.SIGINT)
    signal.signal(signal.SIGINT, _on_sigint)

    try:
        # The yielded callable reports whether an interrupt happened since entering.
        yield lambda: caught
    finally:
        # Restore the original SIGINT handler.
        signal.signal(signal.SIGINT, previous_handler)
def sliding_mean_var(series, m):
    """
    Calculates the sliding mean and variance over the series using a window of size m.
    The series should only contain finite values.

    :param series: 1D numpy array
    :param m: sliding window size (Python or numpy integer > 0)
    :return: tuple of 2 arrays, each of size (len(series) - m + 1)
    """
    # Also accept numpy integer types: m frequently originates from numpy computations,
    # and e.g. np.int64 is not an instance of the builtin int, which previously caused
    # a spurious RuntimeError for perfectly valid window sizes.
    if m <= 0 or not isinstance(m, (int, np.integer)):
        raise RuntimeError('m should be an integer > 0.')

    if series.ndim != 1:
        raise RuntimeError('series should be one dimensional')

    if not np.isfinite(series).all():
        raise RuntimeError('Provided series contains nan or infinite values.')

    sliding_view = sliding_window_view(series, [m])
    return np.mean(sliding_view, axis=1), np.var(sliding_view, axis=1)
        This instance will keep track of a data stream (with dimensions matching those of
        series) and a stream of moving mean and standard deviation using a window of length m.

        :param series: Starting data of the data stream
        :param m: window size for mean and variance
        """
        if m > series.shape[-1]:
            raise RuntimeError("M should be <= series.shape[-1].")

        # Raw data window; the mean/std buffers below are kept in sync on every append.
        self._data_buffer = RingBuffer(series)
        self._m = m

        sliding_avg, sliding_std = sliding_mean_std(series, m)
        self._mean_buffer = RingBuffer(sliding_avg)
        self._std_buffer = RingBuffer(sliding_std)

    def append(self, data):
        """
        Appends the given data to the data stream and updates the mean and std streams accordingly.

        :param data: array with dimensions matching those of the tracked series,
            new data points are appended on the last dimension
        :return: None
        """
        data_length = data.shape[-1]

        if data_length == 0:
            return

        self._data_buffer.push(data)
        # NOTE(review): max(-self._m - 1 - data_length, 0) always evaluates to 0 (the first
        # argument is negative for any valid m), so the sliding statistics are recomputed over
        # the *entire* visible data buffer on every append, and the pushed values refill the
        # mean/std buffers completely. This is correct but not incremental; an incremental
        # update would only need the last (data_length + m - 1) data points.
        new_means, new_stds = sliding_mean_std(self._data_buffer[max(-self._m - 1 - data_length, 0):], self._m)
        self._mean_buffer.push(new_means)
        self._std_buffer.push(new_stds)

        # Original implementation below, this approach might still be interesting if the current approach proves to be
        # too slow in practice. One issue that remains to be solved (why this method was replaced) is that
        # a mid-signal constant window will not result in variance of 0. One approach might be to simply check
        # for constant signals. A starting point might be:
        # https://stackoverflow.com/questions/1066758/find-length-of-sequences-of-identical-values-in-a-numpy-array-run-length-encodi?rq=1
        # The numerical stability test gives a use case where this method fails.
        #
        # buffer_length = self._data_buffer.view.shape[-1]
        # if data_length >= buffer_length:
        #     sliding_avg, sliding_var = sliding_mean_var(data[..., -buffer_length:], self._m)
        #     self._mean_buffer.push(sliding_avg)
        #     self._var_buffer.push(sliding_var)
        # else:
        #     # Sliding variance formula: http://jonisalonen.com/2014/efficient-and-accurate-rolling-standard-deviation/
        #     # First steps of derivation: http://jonisalonen.com/2013/deriving-welfords-method-for-computing-variance/
        #     # (For non-online calculation, the formula used in sliding_mean_var is faster)
        #
        #     old_mean = self._mean_buffer.view[..., -1]
        #     old_var = self._var_buffer.view[..., -1]
        #     values_to_remove = self._data_buffer.view[..., -self._m: min(-1, -self._m + data_length)]
        #     values_to_add = data[..., :values_to_remove.shape[-1]]
        #     new_means = old_mean + np.cumsum(- values_to_remove + values_to_add) / self._m
        #     old_means = np.concatenate((np.atleast_1d(old_mean), new_means[..., :-1]))
        #     new_vars = old_var + np.cumsum((values_to_add - values_to_remove) * (
        #         values_to_add - new_means + values_to_remove - old_means) / self._m)
        #     new_vars[new_vars < 1e-12] = 0.  # Unreliable!
115 | # 116 | # self._mean_buffer.push(new_means) 117 | # self._var_buffer.push(new_vars) 118 | # 119 | # if data_length >= self._m: 120 | # sliding_avg, sliding_var = sliding_mean_var(data, self._m) 121 | # self._mean_buffer.push(sliding_avg) 122 | # self._var_buffer.push(sliding_var) 123 | # 124 | # self._data_buffer.push(data) 125 | 126 | @property 127 | def data(self): 128 | return self._data_buffer.view 129 | 130 | @property 131 | def mean(self): 132 | return self._mean_buffer.view 133 | 134 | @property 135 | def std(self): 136 | return self._std_buffer.view 137 | -------------------------------------------------------------------------------- /distancematrix/ostinato.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import numpy as np 3 | 4 | from distancematrix import AnytimeCalculator 5 | from distancematrix.generator import ZNormEuclidean 6 | from distancematrix.generator.znorm_euclidean import BoundZNormEuclidean, _CONSTANT_SUBSEQ_THRESHOLD 7 | from distancematrix.consumer import MatrixProfileLR 8 | from distancematrix.math_tricks import sliding_mean_std 9 | from distancematrix.ringbuffer import RingBuffer 10 | 11 | CMResult = namedtuple('CMResult', ['radius', 'series_index', 'subseq_index']) 12 | 13 | 14 | def find_consensus_motif(series_list, m: int) -> CMResult: 15 | """ 16 | Finds the top-1 consensus motif and corresponding distance for the given collection of series. 17 | The consensus motif is the subsequence (extracted from one of the series), 18 | that has a match to a subsequence from each other series within a certain distance, 19 | where that distance is minimal. 20 | 21 | This method implements the Ostinato algorithm, described in 22 | "Matrix Profile XV: Exploiting Time Series Consensus Motifs to Find Structure in Time Series Sets" 23 | by K. Kamgar, S. Gharghabi and E. Keogh. 
24 | 25 | :param series_list: list of 1-dimensional arrays 26 | :param m: length of the consensus motif 27 | :return: tuple containing radius, series index and subsequence index of the consensus motif 28 | """ 29 | if len(series_list) < 2: 30 | raise RuntimeError("At least 2 series are required.") 31 | if m < 3: 32 | raise RuntimeError("Motif length should be >= 3.") 33 | for series in series_list: 34 | series = np.array(series) 35 | if len(series) < m: 36 | raise RuntimeError("One or more series are shorter than the desired motif length.") 37 | if series.ndim != 1: 38 | raise RuntimeError("One or more series are not one dimensional.") 39 | 40 | best_result = CMResult(np.inf, -1, -1) 41 | num_series = len(series_list) 42 | 43 | # Create a distance calculator for each series pair, but reuse mu/std calculations per series. 44 | # Step 1: mu/std calculation 45 | cached_generators = {} 46 | mus = [] 47 | stds = [] 48 | stdsz = [] 49 | for series in series_list: 50 | mu, std = sliding_mean_std(series, m) 51 | mus.append(RingBuffer(mu, scaling_factor=1.)) 52 | stds.append(RingBuffer(std, scaling_factor=1.)) 53 | stdsz.append(RingBuffer(std > _CONSTANT_SUBSEQ_THRESHOLD, scaling_factor=1.)) 54 | 55 | # Step 2: create the distance calculator 56 | for i, series1 in enumerate(series_list): 57 | for j, series2 in enumerate(series_list): 58 | if i == j: 59 | continue 60 | gen = BoundZNormEuclidean(m, RingBuffer(series1, scaling_factor=1.), RingBuffer(series2, scaling_factor=1.), 61 | False, 0., mus[i], stds[i], stdsz[i], mus[j], stds[j], stdsz[j]) 62 | cached_generators[i, j] = gen 63 | 64 | # Look for the consensus motif: iterator over all series 65 | for series_idx in range(num_series): 66 | next_series_idx = (series_idx + 1) % num_series 67 | active_series = series_list[series_idx] 68 | 69 | # Calculate a full matrix profile between the series and the next series 70 | dist_calc = cached_generators[(series_idx, next_series_idx)] 71 | num_subseq = len(active_series) - m + 1 72 
| mp = np.empty(num_subseq, dtype=float) 73 | for col in range(num_subseq): 74 | mp[col] = np.min(dist_calc.calc_column(col)) 75 | 76 | # Order the subsequences of the series from lowest to highest distances (as given by the Matrix Profile) 77 | candidates = np.argsort(mp) 78 | 79 | # Iterate over all candidate subsequences, starting from those that had the best match to next_series. 80 | for subseq_idx in candidates: 81 | candidate_radius = mp[subseq_idx] 82 | aborted = False 83 | 84 | # Abort if the distance (to next_series) is worse than best result so far 85 | if candidate_radius >= best_result.radius: 86 | break 87 | 88 | # Check distance of the candidate subsequence to all other series. 89 | for other_series_idx in range(num_series): 90 | # Skip the current and next_series, as we already considered those. 91 | if other_series_idx in [series_idx, next_series_idx]: 92 | continue 93 | 94 | # Calculates the distance from the candidate subsequence to all subsequences in other_series. 95 | other_gen = cached_generators[(series_idx, other_series_idx)] 96 | distances = other_gen.calc_column(subseq_idx) 97 | min_distance = np.min(distances) 98 | candidate_radius = max(candidate_radius, min_distance) 99 | 100 | # Abort search if distance is greater than best so far. 101 | if candidate_radius >= best_result.radius: 102 | aborted = True 103 | break 104 | 105 | # Store the current candidate as best result so far. 106 | if not aborted and candidate_radius < best_result.radius: 107 | best_result = CMResult(candidate_radius, series_idx, subseq_idx) 108 | 109 | return best_result 110 | 111 | 112 | def find_consensus_motif_subset(series_list, m: int, k: int) -> CMResult: 113 | """ 114 | Finds the top-1 k of n consensus motif and corresponding distance for the given collection of series. 
115 | The consensus motif is the subsequence (extracted from one of the series), 116 | that has a match to a subsequence from k other series within a certain distance, 117 | where that distance is minimal. 118 | 119 | This method implements the k of n Ostinato algorithm, described in 120 | "Matrix Profile XV: Exploiting Time Series Consensus Motifs to Find Structure in Time Series Sets" 121 | by K. Kamgar, S. Gharghabi and E. Keogh. 122 | 123 | Note: this algorithm has not yet been optimized for speed. 124 | (Instead, consider using the Anytime Ostinato algorithm.) 125 | 126 | :param series_list: list of 1-dimensional arrays 127 | :param m: length of the consensus motif 128 | :return: tuple containing radius, series index and subsequence index of the consensus motif 129 | """ 130 | if len(series_list) < 2: 131 | raise RuntimeError("At least 2 series are required.") 132 | if m < 3: 133 | raise RuntimeError("Motif length should be >= 3.") 134 | if k < 2 or k > len(series_list): 135 | raise RuntimeError("Number of considered series should be >= 2 and <= len(series).") 136 | for series in series_list: 137 | series = np.array(series) 138 | if len(series) < m: 139 | raise RuntimeError("One or more series are shorter than the desired motif length.") 140 | if series.ndim != 1: 141 | raise RuntimeError("One or more series are not one dimensional.") 142 | 143 | best_result = CMResult(np.inf, -1, -1) 144 | num_series = len(series_list) 145 | num_ignored_series = num_series - k 146 | 147 | # Using streaming generators avoids having to recalculate the means/stds for calculating 148 | # distance between the series and a single subsequence 149 | cached_generators = [] 150 | for series in series_list: 151 | gen = ZNormEuclidean().prepare_streaming(m, m, len(series)) 152 | gen.append_query(series) 153 | cached_generators.append(gen) 154 | 155 | for series_idx in range(num_series): 156 | active_series = series_list[series_idx] 157 | num_subseqs = len(active_series) - m + 1 158 | 159 
| # Calculate for each subsequence in active_series the best match to the next 160 | # (num_ignored_series + 1) series 161 | next_mps = np.empty((num_ignored_series + 1, num_subseqs)) 162 | for i in range(num_ignored_series + 1): 163 | next_series_idx = (series_idx + 1 + i) % num_series 164 | next_series = series_list[next_series_idx] 165 | next_mps[i, :] = _calculate_mp(m, active_series, next_series) 166 | 167 | candidates = np.argsort(np.min(next_mps, axis=0)) 168 | 169 | # Iterate over all candidate subsequences, starting from those that had the best match to any next_series. 170 | for subseq_idx in candidates: 171 | aborted = False 172 | 173 | # We track the (num_ignored_series + 1) biggest radii found, 174 | # where only the smallest value determines the actual radius for the subsequence 175 | # (since we can ignore the other values). 176 | candidate_radii: np.ndarray = next_mps[:, subseq_idx].copy() 177 | 178 | # Iterate over all other, not yet calculated, series 179 | for j in range(num_series - num_ignored_series - 2): 180 | other_series_idx = (series_idx + num_ignored_series + 2) % num_series 181 | 182 | candidate_radii.sort() 183 | if candidate_radii[0] >= best_result.radius: 184 | aborted = True 185 | break 186 | 187 | # Calculates the distance from the candidate subsequence to all subsequences in other_series. 
188 | other_gen = cached_generators[other_series_idx] 189 | other_gen.append_series(active_series[subseq_idx: subseq_idx + m]) 190 | min_distance = np.min(other_gen.calc_column(0)) 191 | candidate_radii[0] = max(candidate_radii[0], min_distance) 192 | 193 | if not aborted: 194 | best_radius = np.min(candidate_radii) 195 | if best_radius < best_result.radius: 196 | best_result = CMResult(best_radius, series_idx, subseq_idx) 197 | 198 | return best_result 199 | 200 | 201 | def _calculate_mp(m, series, query) -> np.array: 202 | """Calculates the z-norm-based Matrix Profile.""" 203 | 204 | # Todo: MP_LR will have unneeded overhead, change to lightweight MP (only MP, no idx, no left/right) 205 | calc = AnytimeCalculator(m, series, query) 206 | calc.add_generator(0, ZNormEuclidean()) 207 | cons = calc.add_consumer([0], MatrixProfileLR()) 208 | calc.calculate_columns() 209 | return cons.matrix_profile() 210 | 211 | 212 | class _MPReverse(MatrixProfileLR): 213 | def __init__(self): 214 | super().__init__() 215 | 216 | def initialise(self, dims, query_subseq, series_subseq): 217 | super().initialise(dims, series_subseq, query_subseq) 218 | 219 | def process_diagonal(self, diag, values): 220 | super().process_diagonal(-diag, values) 221 | 222 | 223 | class OstinatoAnytime(object): 224 | """ 225 | Implementation of the Anytime Ostinato algorithm, which can be used to find the radius profile 226 | for a collection of series. Since it is an anytime algorithm, the user can choose between more accurate results 227 | or a shorter runtime. 228 | 229 | The radius profile contains for each subsequence the minimum distance needed to match a subsequence 230 | from all other series. 231 | Given the radius profile, the top-k minimal values correspond to the top-k consensus motifs. 232 | 233 | This algorithm is described in 234 | "Mining Recurring Patterns in Real-Valued Time Series using the Radius Profile" 235 | by D. De Paepe and S. Van Hoecke. 
236 | """ 237 | def __init__(self, series, m: int) -> None: 238 | """ 239 | Creates a new instance that can be used to find the radius profile for the given series. 240 | 241 | :param series: the series for which to calculate the radius profile, a list of 1-D series 242 | :param m: subsequence length 243 | """ 244 | num_series = len(series) 245 | 246 | self.calculators = [] 247 | self.mps = [[] for i in range(num_series)] 248 | 249 | for i in range(num_series): 250 | for j in range(i + 1, num_series): 251 | calc = AnytimeCalculator(m, series[j], series[i]) 252 | calc.add_generator(0, ZNormEuclidean()) 253 | 254 | self.mps[j].append(calc.add_consumer([0], MatrixProfileLR())) 255 | self.mps[i].append(calc.add_consumer([0], _MPReverse())) 256 | self.calculators.append(calc) 257 | 258 | def calculate(self, fraction: float): 259 | """ 260 | Calculates a given fraction of all distances. 261 | 262 | Experiments show that even for low fractions, the resulting radius profile will give representative 263 | approximate results. The runtime of this method scales linear with the fraction. 264 | 265 | :param fraction: fraction of values to calculate, value in [0 .. 1] 266 | """ 267 | for calc in self.calculators: 268 | calc.calculate_diagonals(fraction) 269 | 270 | def get_radii(self, k_best: int = None): 271 | """ 272 | Retrieves the radius profile for each series. 273 | If the calculation was not performed completely, the returned profiles will overestimate the real 274 | radius profile. 
275 | 276 | :param k_best: If specified, calculates the radius using only the k_best best matching series 277 | (instead of all series) 278 | """ 279 | radii = [] 280 | 281 | for serie_consumers in self.mps: 282 | serie_mps = [cons.matrix_profile() for cons in serie_consumers] 283 | 284 | if k_best is None: 285 | radii.append(np.max(serie_mps, axis=0)) 286 | else: 287 | radii.append(np.sort(serie_mps, axis=0)[k_best-1, :]) 288 | 289 | return radii 290 | -------------------------------------------------------------------------------- /distancematrix/ringbuffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from math import ceil 3 | 4 | 5 | class RingBuffer(object): 6 | """ 7 | A data structure that represents a sliding window over a data stream. Data can be pushed onto the buffer, 8 | thereby discarding the oldest data. The buffer is not resizable. 9 | 10 | Data is pushed onto the last dimension (in case of multidimensional data). 11 | 12 | Users should always reference the buffer instance, not the buffer view, as the view will be replaced 13 | as data is pushed onto the buffer. For user comfort, indexing and slicing on the buffer instance will 14 | immediately access the buffer view. 15 | """ 16 | 17 | def __init__(self, data, shape=None, dtype=None, scaling_factor=2.) -> None: 18 | """ 19 | Creates a new RingBuffer. 
20 | 21 | :param data: data to initialize the buffer, data may be smaller or larger than shape, may be None to 22 | initialize an empty buffer 23 | :param shape: the shape of the buffer, if None, uses the shape of data 24 | :param dtype: the datatype for the buffer, if None, uses the dtype of data 25 | :param scaling_factor: determines internal buffer size (window size x scaling_factor) 26 | """ 27 | super().__init__() 28 | 29 | if data is None and shape is None: 30 | raise RuntimeError("Data and shape may not both be None.") 31 | 32 | if data is None and dtype is None: 33 | raise RuntimeError("Data and dtype may not both be None.") 34 | 35 | if data is not None: 36 | data = np.asarray(data) 37 | 38 | if not shape: 39 | shape = list(data.shape) 40 | if not dtype: 41 | dtype = data.dtype 42 | 43 | self.max_shape = tuple(shape) 44 | self._view_start = 0 # Where view of the buffer starts 45 | self._view_max_length = shape[-1] # Max length (last dimension) of the exposed view 46 | self._view_length = 0 # Current length of the exposed view 47 | 48 | buffer_shape = list(shape) 49 | buffer_shape[-1] = ceil(scaling_factor * shape[-1]) 50 | self._buffer = np.empty(buffer_shape, dtype) 51 | 52 | self.view = self._buffer[..., self._view_start: self._view_start + self._view_length] 53 | if data is not None: 54 | self.push(data) 55 | 56 | def push(self, data) -> int: 57 | """ 58 | Appends the given data to the buffer, discarding the oldest values. 59 | Data is appended to the last dimension of the data window. 
        :param data: the data to append, all dimensions except the last should match those of the window
        :return: The number of data points (per dimension) dropped from the sliding window by this operation
        """
        data = np.atleast_1d(data)
        if not data.shape[:-1] == self._buffer.shape[:-1]:
            raise RuntimeError("Data shape does not match buffer size.")

        data_len = data.shape[-1]

        if data_len == 0:
            return 0

        # If the view does not have its target capacity, first fill until it does
        # (no data points are dropped while the window is still growing).
        if self._view_length < self._view_max_length:
            delta = min(data_len, self._view_max_length - self._view_length)
            self._buffer[..., self._view_length: self._view_length+delta] = data[..., :delta]
            self._view_length += delta
            self.view = self._buffer[..., :self._view_length]

            if data_len == delta:
                return 0

            # The buffer (its view) is now filled, continue the normal flow to process the remaining data.
            data = data[..., delta:]
            data_len = data.shape[-1]

        # The view is at target capacity at this point, we will start "dropping" data.
        # NOTE(review): from here on the reported number of dropped points equals data_len;
        # when data_len exceeds the window capacity this also counts discarded *new* points
        # as dropped -- confirm this matches callers' expectations.

        # Case 1: the new data still fits in the pre-allocated memory past the current view:
        # simply advance the view (no copying of old data needed).
        if self._view_start + self._view_max_length + data_len <= self._buffer.shape[-1]:
            self._view_start += data_len
            self.view = self._buffer[..., self._view_start:self._view_start + self._view_max_length]
            self.view[..., -data_len:] = data

        # Case 2: the data does not fit in the remaining memory, but is less than the view capacity:
        # we reset the view, copy enough old data to fill to capacity, and append the new data
        elif data_len < self._view_max_length:
            mem_len = self._view_max_length - data_len
            self._buffer[..., :mem_len] = \
                self._buffer[..., self._view_start+data_len:self._view_start+self._view_max_length]
            self._buffer[..., mem_len:self._view_max_length] = data
            self._view_start = 0
            self.view = self._buffer[..., self._view_start:self._view_start + self._view_max_length]

        # Case 3: the data does not fit in the remaining memory, and can (over)fill the view capacity:
        # we reset the view and copy a part of the new data equal to the view capacity.
107 | else: 108 | self._buffer[..., :self._view_max_length] = data[..., -self._view_max_length:] 109 | self._view_start = 0 110 | self.view = self._buffer[..., self._view_start:self._view_start + self._view_max_length] 111 | 112 | return data_len 113 | 114 | def __setitem__(self, key, value): 115 | self.view.__setitem__(key, value) 116 | 117 | def __getitem__(self, key): 118 | return self.view.__getitem__(key) 119 | 120 | def __delitem__(self, key): 121 | self.view.__delitem__(key) -------------------------------------------------------------------------------- /distancematrix/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/predict-idlab/seriesdistancematrix/c0e666d036f24184511e766cee9fdfa55f41df97/distancematrix/tests/__init__.py -------------------------------------------------------------------------------- /distancematrix/tests/consumer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/predict-idlab/seriesdistancematrix/c0e666d036f24184511e766cee9fdfa55f41df97/distancematrix/tests/consumer/__init__.py -------------------------------------------------------------------------------- /distancematrix/tests/consumer/test_contextmanager.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import numpy.testing as npt 3 | from itertools import zip_longest 4 | 5 | from distancematrix.consumer.contextmanager import GeneralStaticManager 6 | 7 | 8 | class TestGeneralStaticManager(TestCase): 9 | def test_does_not_return_empty_contexts(self): 10 | r = [range(1, 5), range(0, 0), range(5, 10)] 11 | m = GeneralStaticManager(r) 12 | 13 | _assert_equal_iteration(m.series_contexts(0, 1), []) 14 | _assert_equal_iteration(m.series_contexts(0, 4), [(1, 5, 0)]) 15 | _assert_equal_iteration(m.series_contexts(0, 8), [(1, 5, 0), (5, 10, 2)]) 16 | 
_assert_equal_iteration(m.series_contexts(0, 12), [(1, 5, 0), (5, 10, 2)]) 17 | _assert_equal_iteration(m.series_contexts(5, 12), [(5, 10, 2)]) 18 | 19 | _assert_equal_iteration(m.query_contexts(0, 1), []) 20 | _assert_equal_iteration(m.query_contexts(0, 4), [(1, 5, 0)]) 21 | _assert_equal_iteration(m.query_contexts(0, 8), [(1, 5, 0), (5, 10, 2)]) 22 | _assert_equal_iteration(m.query_contexts(0, 12), [(1, 5, 0), (5, 10, 2)]) 23 | _assert_equal_iteration(m.query_contexts(5, 12), [(5, 10, 2)]) 24 | 25 | 26 | def _assert_equal_iteration(actual, expected, msg=''): 27 | """ 28 | Assert function similar to TestCase.assertSequenceEqual, but that actually treats 2D numpy arrays as iterables. 29 | """ 30 | sentinel = object() 31 | for actual_value, expected_value in zip_longest(actual, expected, fillvalue=sentinel): 32 | if sentinel is actual_value: 33 | raise AssertionError("Actual iterator is shorter, does not include " + str(expected_value)) 34 | 35 | if sentinel is expected_value: 36 | raise AssertionError("Actual iterator is longer, contained " + str(actual_value)) 37 | 38 | npt.assert_equal(actual_value, expected_value, msg) 39 | -------------------------------------------------------------------------------- /distancematrix/tests/consumer/test_distance_matrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from unittest import TestCase 3 | import numpy.testing as npt 4 | 5 | from distancematrix.util import diag_indices_of 6 | from distancematrix.consumer.distance_matrix import DistanceMatrix 7 | 8 | 9 | class TestContextualMatrixProfile(TestCase): 10 | 11 | def setUp(self): 12 | self.dist_matrix = np.array([ 13 | [8.67, 1.10, 1.77, 1.26, 1.91, 4.29, 6.32, 4.24, 4.64, 5.06, 6.41, 4.07, 4.67, 9.32, 5.09], 14 | [4.33, 4.99, 0.14, 2.79, 2.10, 6.26, 9.40, 4.14, 5.53, 4.26, 8.21, 5.91, 6.83, 9.26, 6.19], 15 | [0.16, 9.05, 1.35, 4.78, 7.01, 4.36, 5.24, 8.81, 7.90, 5.84, 8.90, 7.88, 3.37, 4.70, 6.94], 16 | 
[0.94, 8.70, 3.87, 6.29, 0.32, 1.79, 5.80, 2.61, 1.43, 6.32, 1.62, 0.20, 2.28, 7.11, 2.15], 17 | [9.90, 4.51, 2.11, 2.83, 5.52, 8.55, 6.90, 0.24, 1.58, 4.26, 8.75, 3.71, 9.93, 8.33, 0.38], 18 | [7.30, 5.84, 9.63, 1.95, 3.76, 3.61, 9.42, 5.56, 5.09, 7.07, 1.90, 4.78, 1.06, 0.69, 3.67], 19 | [2.17, 8.37, 3.99, 4.28, 4.37, 2.86, 8.61, 3.39, 8.37, 6.95, 6.57, 1.79, 7.40, 4.41, 7.64], 20 | [6.26, 0.29, 6.44, 8.84, 1.24, 2.52, 6.25, 3.07, 5.55, 3.19, 8.16, 5.32, 9.01, 0.39, 9.], 21 | [4.67, 8.88, 3.05, 3.06, 2.36, 8.34, 4.91, 5.46, 9.25, 9.78, 0.03, 5.64, 5.10, 3.58, 6.92], 22 | [1.01, 0.91, 6.28, 7.79, 0.68, 5.50, 6.72, 5.11, 0.80, 9.30, 9.77, 4.71, 3.26, 7.29, 6.26]]) 23 | 24 | def mock_initialise(self, dm): 25 | dm.initialise(1, self.dist_matrix.shape[0], self.dist_matrix.shape[1]) 26 | 27 | def test_process_diagonal(self): 28 | dm = DistanceMatrix() 29 | self.mock_initialise(dm) 30 | 31 | for diag in range(-self.dist_matrix.shape[0] + 1, self.dist_matrix.shape[1]): 32 | diag_ind = diag_indices_of(self.dist_matrix, diag) 33 | dm.process_diagonal(diag, np.atleast_2d(self.dist_matrix[diag_ind])) 34 | 35 | npt.assert_equal(dm.distance_matrix, self.dist_matrix) 36 | 37 | def test_process_diagonal_partial_calculation(self): 38 | dm = DistanceMatrix() 39 | self.mock_initialise(dm) 40 | 41 | correct = np.full_like(self.dist_matrix, np.nan, dtype=float) 42 | 43 | for diag in range(-8, self.dist_matrix.shape[1], 3): 44 | diag_ind = diag_indices_of(self.dist_matrix, diag) 45 | dm.process_diagonal(diag, np.atleast_2d(self.dist_matrix[diag_ind])) 46 | correct[diag_ind] = self.dist_matrix[diag_ind] 47 | 48 | npt.assert_equal(dm.distance_matrix, correct) 49 | 50 | def test_process_column(self): 51 | dm = DistanceMatrix() 52 | self.mock_initialise(dm) 53 | 54 | for column in range(0, self.dist_matrix.shape[1]): 55 | dm.process_column(column, np.atleast_2d(self.dist_matrix[:, column])) 56 | 57 | npt.assert_equal(dm.distance_matrix, self.dist_matrix) 58 | 59 | def 
test_process_column_partial_calculation(self): 60 | dm = DistanceMatrix() 61 | self.mock_initialise(dm) 62 | 63 | correct = np.full_like(self.dist_matrix, np.nan, dtype=float) 64 | 65 | for column in [2, 3, 4, 5, 10, 11, 12]: 66 | dm.process_column(column, np.atleast_2d(self.dist_matrix[:, column])) 67 | correct[:, column] = self.dist_matrix[:, column] 68 | 69 | npt.assert_equal(dm.distance_matrix, correct) 70 | 71 | def test_streaming_process_column(self): 72 | dm = DistanceMatrix() 73 | dm.initialise(1, 5, 5) 74 | 75 | dm.process_column(0, np.atleast_2d(self.dist_matrix[0, 0])) 76 | dm.process_column(1, np.atleast_2d(self.dist_matrix[:2, 1])) 77 | expected = np.full((5, 5), np.nan) 78 | expected[0, 0] = self.dist_matrix[0, 0] 79 | expected[:2, 1] = self.dist_matrix[:2, 1] 80 | npt.assert_equal(dm.distance_matrix, expected) 81 | 82 | for column in range(0, 5): 83 | dm.process_column(column, np.atleast_2d(self.dist_matrix[:5, :5][:, column])) 84 | npt.assert_equal(dm.distance_matrix, self.dist_matrix[:5, :5]) 85 | 86 | dm.shift_query(1) 87 | dm.shift_series(3) 88 | 89 | correct = np.full((5, 5), np.nan) 90 | correct[0:4, 0:2] = self.dist_matrix[1:5, 3:5] 91 | npt.assert_equal(dm.distance_matrix, correct) 92 | 93 | for column in range(0, 5): 94 | dm.process_column(column, np.atleast_2d(self.dist_matrix[1:6, 3:8][:, column])) 95 | npt.assert_equal(dm.distance_matrix, self.dist_matrix[1:6, 3:8]) 96 | 97 | dm.shift_query(2) 98 | dm.shift_series(1) 99 | dm.process_column(4, np.atleast_2d(self.dist_matrix[3:8, 8])) 100 | 101 | correct = np.full((5, 5), np.nan) 102 | correct[0:3, 0:4] = self.dist_matrix[3:6, 4:8] 103 | correct[:, 4] = self.dist_matrix[3:8, 8] 104 | npt.assert_equal(dm.distance_matrix, correct) 105 | 106 | def test_streaming_process_diagonal(self): 107 | dm = DistanceMatrix() 108 | dm.initialise(1, 5, 5) 109 | 110 | dm.process_diagonal(0, np.atleast_2d(self.dist_matrix[0, 0])) 111 | diag_ind = diag_indices_of(self.dist_matrix[:3, :3], 1) 112 | 
dm.process_diagonal(1, np.atleast_2d(np.atleast_2d(self.dist_matrix[diag_ind]))) 113 | expected = np.full((5, 5), np.nan) 114 | expected[0, 0] = self.dist_matrix[0, 0] 115 | expected[0, 1] = self.dist_matrix[0, 1] 116 | expected[1, 2] = self.dist_matrix[1, 2] 117 | npt.assert_equal(dm.distance_matrix, expected) 118 | 119 | for diag in range(-4,5): 120 | diag_ind = diag_indices_of(self.dist_matrix[:5, :5], diag) 121 | dm.process_diagonal(diag, np.atleast_2d(self.dist_matrix[diag_ind])) 122 | 123 | npt.assert_equal(dm.distance_matrix, self.dist_matrix[:5, :5]) 124 | 125 | dm.shift_query(2) 126 | dm.shift_series(1) 127 | expected = self.dist_matrix[2:7, 1:6].copy() 128 | expected[-2:, :] = np.nan 129 | expected[:, -1:] = np.nan 130 | npt.assert_equal(dm.distance_matrix, expected) 131 | 132 | for diag in range(-4,5): 133 | diag_ind = diag_indices_of(self.dist_matrix[:5, :5], diag) 134 | dm.process_diagonal(diag, np.atleast_2d(self.dist_matrix[diag_ind])) 135 | npt.assert_equal(dm.distance_matrix, self.dist_matrix[:5, :5]) 136 | -------------------------------------------------------------------------------- /distancematrix/tests/consumer/test_radius_profile.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from unittest import TestCase 3 | import numpy.testing as npt 4 | from itertools import takewhile 5 | 6 | from distancematrix.consumer.radius_profile import RadiusProfile0 7 | from distancematrix.consumer.radius_profile import RadiusProfile 8 | from distancematrix.insights import lowest_value_idxs 9 | 10 | 11 | class TestRadiusProfile0(TestCase): 12 | def setUp(self): 13 | self.dm = np.array([ 14 | [8.67, 1.10, 1.77, 1.26, 1.91, 4.29, 6.32, 4.24, 4.64, 5.06, 6.41, 4.07, 4.67, 9.32, 5.09], 15 | [4.33, 4.99, 0.14, 2.79, 2.10, 6.26, 9.40, 4.14, 5.53, 4.26, 8.21, 5.91, 6.83, 9.26, 6.19], 16 | [0.16, 9.05, 1.35, 4.78, 7.01, 4.36, 5.24, 8.81, 7.90, 5.84, 8.90, 7.88, 3.37, 4.70, 6.94], 17 | [0.94, 8.70, 3.87, 
6.29, 0.32, 1.79, 5.80, 2.61, 1.43, 6.32, 1.62, 0.20, 2.28, 7.11, 2.15], 18 | [9.90, 4.51, 2.11, 2.83, 5.52, 8.55, 6.90, 0.24, 1.58, 4.26, 8.75, 3.71, 9.93, 8.33, 0.38], 19 | [7.30, 5.84, 9.63, 1.95, 3.76, 3.61, 9.42, 5.56, 5.09, 7.07, 1.90, 4.78, 1.06, 0.69, 3.67], 20 | [2.17, 8.37, 3.99, 4.28, 4.37, 2.86, 8.61, 3.39, 8.37, 6.95, 6.57, 1.79, 7.40, 4.41, 7.64], 21 | [6.26, 0.29, 6.44, 8.84, 1.24, 2.52, 6.25, 3.07, 5.55, 3.19, 8.16, 5.32, 9.01, 0.39, 9.], 22 | [4.67, 8.88, 3.05, 3.06, 2.36, 8.34, 4.91, 5.46, 9.25, 9.78, 0.03, 5.64, 5.10, 3.58, 6.92], 23 | [1.01, 0.91, 6.28, 7.79, 0.68, 5.50, 6.72, 5.11, 0.80, 9.30, 9.77, 4.71, 3.26, 7.29, 6.26]]) 24 | 25 | 26 | @staticmethod 27 | def bruteforce_calc(array, target_idxs): 28 | target_idxs = sorted(target_idxs) 29 | result = np.zeros((len(target_idxs), array.shape[1]), dtype=float) 30 | 31 | for col in range(array.shape[1]): 32 | sorted_col_values = np.sort(array[:, col]) 33 | for i, target_idx in enumerate(target_idxs): 34 | if target_idx < len(sorted_col_values): 35 | result[i, col] = sorted_col_values[target_idx] 36 | else: 37 | result[i, col] = np.nan 38 | 39 | return result 40 | 41 | def test_process_diagonal(self): 42 | tracker = RadiusProfile0(0) 43 | tracker.initialise(1, self.dm.shape[0], self.dm.shape[1]) 44 | 45 | with self.assertRaises(NotImplementedError): 46 | tracker.process_diagonal(0, np.zeros(10)) 47 | 48 | def test_process_column_single_value(self): 49 | tracker = RadiusProfile0(2) 50 | tracker.initialise(1, self.dm.shape[0], self.dm.shape[1]) 51 | 52 | for column in range(0, self.dm.shape[1]): 53 | tracker.process_column(column, np.atleast_2d(self.dm[:, column])) 54 | 55 | npt.assert_equal(tracker.values, self.bruteforce_calc(self.dm, [2])) 56 | 57 | def test_process_column_multiple_value(self): 58 | track_idxs = [2, 5, 0, 9, len(self.dm)] 59 | tracker = RadiusProfile0(track_idxs) 60 | tracker.initialise(1, self.dm.shape[0], self.dm.shape[1]) 61 | 62 | for column in range(0, self.dm.shape[1]): 63 | 
tracker.process_column(column, np.atleast_2d(self.dm[:, column])) 64 | 65 | npt.assert_equal(tracker.values, self.bruteforce_calc(self.dm, track_idxs)) 66 | npt.assert_equal(tracker.values[-1, :], np.full(self.dm.shape[1], np.nan)) 67 | 68 | 69 | class TestRadiusProfile(TestCase): 70 | def setUp(self): 71 | self.dm = np.array([ 72 | [8.67, 1.10, 1.77, 1.26, 1.91, 4.29, 6.32, 4.24, 4.64, 5.06, 6.41, 4.07, 4.67, 9.32, 5.09], 73 | [4.33, 4.99, 0.14, 2.79, 2.10, 6.26, 9.40, 4.14, 5.53, 4.26, 8.21, 5.91, 6.83, 9.26, 6.19], 74 | [0.16, 9.05, 1.35, 4.78, 7.01, 4.36, 5.24, 8.81, 7.90, 5.84, 8.90, 7.88, 3.37, 4.70, 6.94], 75 | [0.94, 8.70, 3.87, 6.29, 0.32, 1.79, 5.80, 2.61, 1.43, 6.32, 1.62, 0.20, 2.28, 7.11, 2.15], 76 | [9.90, 4.51, 2.11, 2.83, 5.52, 8.55, 6.90, 0.24, 1.58, 4.26, 8.75, 3.71, 9.93, 8.33, 0.38], 77 | [7.30, 5.84, 9.63, 1.95, 3.76, 3.61, 9.42, 5.56, 5.09, 7.07, 1.90, 4.78, 1.06, 0.69, 3.67], 78 | [2.17, 8.37, 3.99, 4.28, 4.37, 2.86, 8.61, 3.39, 8.37, 6.95, 6.57, 1.79, 7.40, 4.41, 7.64], 79 | [6.26, 0.29, 6.44, 8.84, 1.24, 2.52, 6.25, 3.07, 5.55, 3.19, 8.16, 5.32, 9.01, 0.39, 9.], 80 | [4.67, 8.88, 3.05, 3.06, 2.36, 8.34, 4.91, 5.46, 9.25, 9.78, 0.03, 5.64, 5.10, 3.58, 6.92], 81 | [1.01, 0.91, 6.28, 7.79, 0.68, 5.50, 6.72, 5.11, 0.80, 9.30, 9.77, 4.71, 3.26, 7.29, 6.26]]) 82 | 83 | @staticmethod 84 | def bruteforce_calc(array, target_idxs, exclusion): 85 | target_idxs = np.array(target_idxs) 86 | target_idxs.sort() 87 | 88 | result = np.zeros((len(target_idxs), array.shape[1]), dtype=float) 89 | 90 | for col in range(array.shape[1]): 91 | for i, target_idx in enumerate(target_idxs): 92 | lowest_idxs = list(lowest_value_idxs(array[:, col], exclusion)) # takes care of the exclusion distance 93 | lowest_values = array[:, col][lowest_idxs] 94 | 95 | if target_idx < len(lowest_values): 96 | result[i, col] = lowest_values[target_idx] 97 | else: 98 | result[i, col] = np.nan 99 | 100 | return result 101 | 102 | def test_process_diagonal(self): 103 | tracker = 
RadiusProfile(2, 2) 104 | tracker.initialise(1, self.dm.shape[0], self.dm.shape[1]) 105 | 106 | with self.assertRaises(NotImplementedError): 107 | tracker.process_diagonal(0, np.zeros(10)) 108 | 109 | def test_process_column_single_value(self): 110 | tracker = RadiusProfile(2, 2) 111 | tracker.initialise(1, self.dm.shape[0], self.dm.shape[1]) 112 | 113 | for column in range(0, self.dm.shape[1]): 114 | tracker.process_column(column, np.atleast_2d(self.dm[:, column])) 115 | 116 | npt.assert_equal(tracker.values, self.bruteforce_calc(self.dm, [2], 2)) 117 | 118 | def test_process_column_multiple_value(self): 119 | track_idxs = [2, 0, len(self.dm)] 120 | tracker = RadiusProfile(track_idxs, 1) 121 | tracker.initialise(1, self.dm.shape[0], self.dm.shape[1]) 122 | 123 | for column in range(0, self.dm.shape[1]): 124 | tracker.process_column(column, np.atleast_2d(self.dm[:, column])) 125 | 126 | npt.assert_equal(tracker.values, self.bruteforce_calc(self.dm, track_idxs, 1)) 127 | -------------------------------------------------------------------------------- /distancematrix/tests/consumer/test_threshold_counter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from unittest import TestCase 3 | import numpy.testing as npt 4 | from itertools import takewhile 5 | 6 | from distancematrix.util import diag_indices_of 7 | from distancematrix.insights import lowest_value_idxs 8 | from distancematrix.consumer.threshold_counter import ThresholdCounter 9 | from distancematrix.consumer.threshold_counter import DistancedThresholdCounter 10 | 11 | 12 | class TestThresholdCounter(TestCase): 13 | def setUp(self): 14 | self.dm = np.array([ 15 | [8.67, 1.10, 1.77, 1.26, 1.91, 4.29, 6.32, 4.24, 4.64, 5.06, 6.41, 4.07, 4.67, 9.32, 5.09], 16 | [4.33, 4.99, 0.14, 2.79, 2.10, 6.26, 9.40, 4.14, 5.53, 4.26, 8.21, 5.91, 6.83, 9.26, 6.19], 17 | [0.16, 9.05, 1.35, 4.78, 7.01, 4.36, 5.24, 8.81, 7.90, 5.84, 8.90, 7.88, 3.37, 4.70, 6.94], 18 | 
[0.94, 8.70, 3.87, 6.29, 0.32, 1.79, 5.80, 2.61, 1.43, 6.32, 1.62, 0.20, 2.28, 7.11, 2.15], 19 | [9.90, 4.51, 2.11, 2.83, 5.52, 8.55, 6.90, 0.24, 1.58, 4.26, 8.75, 3.71, 9.93, 8.33, 0.38], 20 | [7.30, 5.84, 9.63, 1.95, 3.76, 3.61, 9.42, 5.56, 5.09, 7.07, 1.90, 4.78, 1.06, 0.69, 3.67], 21 | [2.17, 8.37, 3.99, 4.28, 4.37, 2.86, 8.61, 3.39, 8.37, 6.95, 6.57, 1.79, 7.40, 4.41, 7.64], 22 | [6.26, 0.29, 6.44, 8.84, 1.24, 2.52, 6.25, 3.07, 5.55, 3.19, 8.16, 5.32, 9.01, 0.39, 9.], 23 | [4.67, 8.88, 3.05, 3.06, 2.36, 8.34, 4.91, 5.46, 9.25, 9.78, 0.03, 5.64, 5.10, 3.58, 6.92], 24 | [1.01, 0.91, 6.28, 7.79, 0.68, 5.50, 6.72, 5.11, 0.80, 9.30, 9.77, 4.71, 3.26, 7.29, 6.26]]) 25 | 26 | @staticmethod 27 | def bruteforce_count(array, threshold_array): 28 | result = np.zeros((len(threshold_array), array.shape[1]), dtype=int) 29 | 30 | for i, threshold in enumerate(threshold_array): 31 | for col in range(array.shape[1]): 32 | result[i, col] = np.count_nonzero(array[:, col] <= threshold) 33 | 34 | return result 35 | 36 | def test_process_diagonal_single_threshold(self): 37 | threshold = 2.83 38 | counter = ThresholdCounter(threshold) 39 | counter.initialise(1, self.dm.shape[0], self.dm.shape[1]) 40 | 41 | for diag in range(-self.dm.shape[0] + 1, self.dm.shape[1]): 42 | diag_ind = diag_indices_of(self.dm, diag) 43 | counter.process_diagonal(diag, np.atleast_2d(self.dm[diag_ind])) 44 | 45 | npt.assert_equal(counter.counts, self.bruteforce_count(self.dm, [threshold])) 46 | 47 | def test_process_diagonal_multiple_thresholds(self): 48 | thresholds = [-1, 2.12, 2.83, 6] 49 | counter = ThresholdCounter(thresholds) 50 | counter.initialise(1, self.dm.shape[0], self.dm.shape[1]) 51 | 52 | for diag in range(-self.dm.shape[0] + 1, self.dm.shape[1]): 53 | diag_ind = diag_indices_of(self.dm, diag) 54 | counter.process_diagonal(diag, np.atleast_2d(self.dm[diag_ind])) 55 | 56 | npt.assert_equal(counter.counts, self.bruteforce_count(self.dm, thresholds)) 57 | 58 | def 
test_process_column_single_threshold(self): 59 | threshold = 5.09 60 | counter = ThresholdCounter(threshold) 61 | counter.initialise(1, self.dm.shape[0], self.dm.shape[1]) 62 | 63 | for column in range(0, self.dm.shape[1]): 64 | counter.process_column(column, np.atleast_2d(self.dm[:, column])) 65 | 66 | npt.assert_equal(counter.counts, self.bruteforce_count(self.dm, [threshold])) 67 | 68 | def test_process_column_multiple_thresholds(self): 69 | thresholds = [-1, 0.68, 4.67, 5] 70 | counter = ThresholdCounter(thresholds) 71 | counter.initialise(1, self.dm.shape[0], self.dm.shape[1]) 72 | 73 | for column in range(0, self.dm.shape[1]): 74 | counter.process_column(column, np.atleast_2d(self.dm[:, column])) 75 | 76 | npt.assert_equal(counter.counts, self.bruteforce_count(self.dm, thresholds)) 77 | 78 | 79 | class TestDistancedThresholdCounter(TestCase): 80 | def setUp(self): 81 | self.dm = np.array([ 82 | [8.67, 1.10, 1.77, 1.26, 1.91, 4.29, 6.32, 4.24, 4.64, 5.06, 6.41, 4.07, 4.67, 9.32, 5.09], 83 | [4.33, 4.99, 0.14, 2.79, 2.10, 6.26, 9.40, 4.14, 5.53, 4.26, 8.21, 5.91, 6.83, 9.26, 6.19], 84 | [0.16, 9.05, 1.35, 4.78, 7.01, 4.36, 5.24, 8.81, 7.90, 5.84, 8.90, 7.88, 3.37, 4.70, 6.94], 85 | [0.94, 8.70, 3.87, 6.29, 0.32, 1.79, 5.80, 2.61, 1.43, 6.32, 1.62, 0.20, 2.28, 7.11, 2.15], 86 | [9.90, 4.51, 2.11, 2.83, 5.52, 8.55, 6.90, 0.24, 1.58, 4.26, 8.75, 3.71, 9.93, 8.33, 0.38], 87 | [7.30, 5.84, 9.63, 1.95, 3.76, 3.61, 9.42, 5.56, 5.09, 7.07, 1.90, 4.78, 1.06, 0.69, 3.67], 88 | [2.17, 8.37, 3.99, 4.28, 4.37, 2.86, 8.61, 3.39, 8.37, 6.95, 6.57, 1.79, 7.40, 4.41, 7.64], 89 | [6.26, 0.29, 6.44, 8.84, 1.24, 2.52, 6.25, 3.07, 5.55, 3.19, 8.16, 5.32, 9.01, 0.39, 9.], 90 | [4.67, 8.88, 3.05, 3.06, 2.36, 8.34, 4.91, 5.46, 9.25, 9.78, 0.03, 5.64, 5.10, 3.58, 6.92], 91 | [1.01, 0.91, 6.28, 7.79, 0.68, 5.50, 6.72, 5.11, 0.80, 9.30, 9.77, 4.71, 3.26, 7.29, 6.26]]) 92 | 93 | @staticmethod 94 | def bruteforce_count(array, threshold_array, exclusion): 95 | threshold_array = 
np.array(threshold_array) 96 | threshold_array.sort() 97 | 98 | result = np.zeros((len(threshold_array), array.shape[1]), dtype=int) 99 | 100 | for i, threshold in enumerate(threshold_array): 101 | for col in range(array.shape[1]): 102 | _iter = lowest_value_idxs(array[:, col], exclusion) # takes care of the exclusion distance 103 | value_iter = takewhile(lambda i: array[i, col] <= threshold, _iter) # takes care of the threshold 104 | result[i, col] = len(list(value_iter)) 105 | 106 | return result 107 | 108 | def test_process_diagonal_single_threshold(self): 109 | threshold = 2.83 110 | counter = DistancedThresholdCounter(threshold, 2) 111 | counter.initialise(1, self.dm.shape[0], self.dm.shape[1]) 112 | 113 | with self.assertRaises(NotImplementedError): 114 | counter.process_diagonal(0, np.zeros(10)) 115 | 116 | def test_process_column_single_threshold(self): 117 | threshold = 5.09 118 | counter = DistancedThresholdCounter(threshold, 2) 119 | counter.initialise(1, self.dm.shape[0], self.dm.shape[1]) 120 | 121 | for column in range(0, self.dm.shape[1]): 122 | counter.process_column(column, np.atleast_2d(self.dm[:, column])) 123 | 124 | npt.assert_equal(counter.counts, self.bruteforce_count(self.dm, [threshold], 2)) 125 | 126 | def test_process_column_multiple_thresholds(self): 127 | thresholds = [-1, 0.68, 4.67, 5] 128 | counter = DistancedThresholdCounter(thresholds, 2) 129 | counter.initialise(1, self.dm.shape[0], self.dm.shape[1]) 130 | 131 | for column in range(0, self.dm.shape[1]): 132 | counter.process_column(column, np.atleast_2d(self.dm[:, column])) 133 | 134 | npt.assert_equal(counter.counts, self.bruteforce_count(self.dm, thresholds, 2)) 135 | -------------------------------------------------------------------------------- /distancematrix/tests/generator/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/predict-idlab/seriesdistancematrix/c0e666d036f24184511e766cee9fdfa55f41df97/distancematrix/tests/generator/__init__.py -------------------------------------------------------------------------------- /distancematrix/tests/generator/mock_generator.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from distancematrix.util import diag_indices_of 4 | from distancematrix.generator.abstract_generator import AbstractGenerator 5 | from distancematrix.generator.abstract_generator import AbstractBoundStreamingGenerator 6 | 7 | 8 | class MockGenerator(AbstractGenerator): 9 | """ 10 | Mock generator for testing purposes. Simply returns distances from a given distance matrix. 11 | """ 12 | 13 | def __init__(self, dist_matrix): 14 | """ 15 | Creates a new mock generator that will return distances from the provided distance matrix. 16 | 17 | :param dist_matrix: distances to return. 18 | """ 19 | self._dist_matrix = dist_matrix 20 | 21 | # Storage for parameters used for prepare and prepare_streaming 22 | self.m = None 23 | self.series_window = None 24 | self.query_window = None 25 | self.series = None 26 | self.query = None 27 | self.bound_gen = None 28 | 29 | def prepare_streaming(self, m, series_window, query_window=None): 30 | self.m = m 31 | self.series_window = series_window 32 | self.query_window = query_window 33 | 34 | if query_window is None: 35 | query_window = series_window 36 | self_join = True 37 | else: 38 | self_join = False 39 | 40 | s_subseqs = series_window - m + 1 41 | q_subseqs = query_window - m + 1 42 | self.bound_gen = BoundMockGenerator(self._dist_matrix, s_subseqs, q_subseqs, 43 | self_join, -series_window, -query_window) 44 | 45 | return self.bound_gen 46 | 47 | def prepare(self, m, series, query=None): 48 | self.m = m 49 | self.series = series 50 | self.query = query 51 | 52 | s_win = len(series) - m + 1 53 | if query is None: 54 | q_win = s_win 55 | 
self_join = True 56 | else: 57 | q_win = len(query) - m + 1 58 | self_join = False 59 | 60 | self.bound_gen = BoundMockGenerator(self._dist_matrix, s_win, q_win, self_join, 0, 0) 61 | return self.bound_gen 62 | 63 | 64 | class BoundMockGenerator(AbstractBoundStreamingGenerator): 65 | """ 66 | Mock generator for testing purposes. Simply returns distances from a given distance matrix. 67 | """ 68 | def __init__(self, dist_matrix, s_win, q_win, self_join, s_view_index, q_view_index): 69 | """ 70 | Creates a new mock generator that will return distances from the provided distance matrix. 71 | 72 | :param dist_matrix: 2D matrix, base distance values to use, a view will be used to determine 73 | which values to return for mocked calculations 74 | :param s_win: window size of the view over the series axis 75 | :param q_win: window size of the view over the query axis 76 | :param self_join: are we doing a self-join (does adding series data also implicitly add query data) 77 | :param s_view_index: start index of the view of dist_matrix (for series) 78 | :param q_view_index: start index of the view of dist_matrix (for query) 79 | """ 80 | self._dist_matrix = dist_matrix 81 | self._s_win = s_win 82 | self._q_win = q_win 83 | self._self_join = self_join 84 | 85 | self._s_index = s_view_index 86 | self._q_index = q_view_index 87 | 88 | self.appended_series = np.empty((0,), dtype=float) 89 | self.appended_query = np.empty((0,), dtype=float) 90 | 91 | def calc_diagonal(self, diag): 92 | view = self._dist_matrix[ 93 | max(self._q_index, 0): max(self._q_index + self._q_win, 0), 94 | max(self._s_index, 0): max(self._s_index + self._s_win, 0) 95 | ] 96 | return view[diag_indices_of(view, diag)] 97 | 98 | def calc_column(self, column): 99 | view = self._dist_matrix[ 100 | max(self._q_index, 0): max(self._q_index + self._q_win, 0), 101 | max(self._s_index, 0): max(self._s_index + self._s_win, 0) 102 | ] 103 | return view[:, column] 104 | 105 | def append_series(self, values): 106 | 
self.appended_series = np.concatenate([self.appended_series, values]) 107 | self._s_index += len(values) 108 | if self._self_join: 109 | self._q_index += len(values) 110 | 111 | def append_query(self, values): 112 | if self._self_join: 113 | raise RuntimeError("Should not append query if self-joining.") 114 | 115 | self.appended_query = np.concatenate([self.appended_query, values]) 116 | self._q_index += len(values) 117 | -------------------------------------------------------------------------------- /distancematrix/tests/generator/test_euclidean.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from unittest import TestCase 3 | import numpy.testing as npt 4 | 5 | from distancematrix.util import diag_indices 6 | from distancematrix.generator.euclidean import Euclidean 7 | 8 | 9 | class TestEuclidean(TestCase): 10 | def setUp(self): 11 | self.series = np.array( 12 | [0.5578463, 0.4555404, 0.18124978, 0.252396, 0.60623881, 13 | 0.5546021, 0.13714127, 0.903246, 0.03695094, 0.23420792, 14 | 0.27482897, 0.57765821, 0.23571178, 0.65772705, 0.00292154, 15 | 0.87258653, 0.29869269, 0.91492178, 0.69096235, 0.6786107, 16 | 0.85413687, 0.19725933, 0.39460891, 0.32650366, 0.35188833, 17 | 0.92658149, 0.07503563, 0.37864432, 0.9415974, 0.62313779]) 18 | 19 | self.query = np.array( 20 | [0.03737861, 0.53931239, 0.06194507, 0.0938707, 0.95875364, 21 | 0.09495936, 0.12392364, 0.81358582, 0.56507776, 0.61620183, 22 | 0.24720462, 0.83886639, 0.38130506, 0.13693176, 0.90555723, 23 | 0.23274948, 0.31526678, 0.28504739, 0.45200344, 0.9867946]) 24 | 25 | def test_calc_diagonal(self): 26 | m = 5 27 | euclid = Euclidean().prepare(m, self.series, self.query) 28 | _verify_diagonals_correct(self.series, self.query, m, euclid) 29 | 30 | def test_calc_column_no_cache(self): 31 | m = 5 32 | euclid = Euclidean().prepare(m, self.series, self.query) 33 | _verify_columns_correct(self.series, self.query, m, euclid, True) 34 | 35 | def 
test_calc_column_cache(self): 36 | m = 5 37 | euclid = Euclidean().prepare(m, self.series, self.query) 38 | _verify_columns_correct(self.series, self.query, m, euclid, False) 39 | 40 | 41 | class TestEuclideanSelfJoin(TestCase): 42 | def setUp(self): 43 | self.series = np.array( 44 | [0.5578463, 0.4555404, 0.18124978, 0.252396, 0.60623881, 45 | 0.5546021, 0.13714127, 0.903246, 0.03695094, 0.23420792, 46 | 0.27482897, 0.57765821, 0.23571178, 0.65772705, 0.00292154, 47 | 0.87258653, 0.29869269, 0.91492178, 0.69096235, 0.6786107, 48 | 0.85413687, 0.19725933, 0.39460891, 0.32650366, 0.35188833, 49 | 0.92658149, 0.07503563, 0.37864432, 0.9415974, 0.62313779]) 50 | 51 | def test_calc_diagonal(self): 52 | m = 5 53 | euclid = Euclidean().prepare(m, self.series) 54 | _verify_diagonals_correct(self.series, self.series, m, euclid) 55 | 56 | def test_calc_column_no_cache(self): 57 | m = 5 58 | euclid = Euclidean().prepare(m, self.series) 59 | _verify_columns_correct(self.series, self.series, m, euclid, True) 60 | 61 | def test_calc_column_cache(self): 62 | m = 5 63 | euclid = Euclidean().prepare(m, self.series) 64 | _verify_columns_correct(self.series, self.series, m, euclid, False) 65 | 66 | 67 | class TestStreamingEuclidean(TestCase): 68 | def setUp(self): 69 | self.series = np.array( 70 | [0.5578463, 0.4555404, 0.18124978, 0.252396, 0.60623881, 71 | 0.5546021, 0.13714127, 0.903246, 0.03695094, 0.23420792, 72 | 0.27482897, 0.57765821, 0.23571178, 0.65772705, 0.00292154, 73 | 0.87258653, 0.29869269, 0.91492178, 0.69096235, 0.6786107, 74 | 0.85413687, 0.19725933, 0.39460891, 0.32650366, 0.35188833, 75 | 0.92658149, 0.07503563, 0.37864432, 0.9415974, 0.62313779]) 76 | 77 | self.query = np.array( 78 | [0.03737861, 0.53931239, 0.06194507, 0.0938707, 0.95875364, 79 | 0.09495936, 0.12392364, 0.81358582, 0.56507776, 0.61620183, 80 | 0.24720462, 0.83886639, 0.38130506, 0.13693176, 0.90555723, 81 | 0.23274948, 0.31526678, 0.28504739, 0.45200344, 0.9867946]) 82 | 83 | def 
test_calc_diagonal(self): 84 | m = 5 85 | euclid = Euclidean().prepare_streaming(m, 20, 15) 86 | 87 | euclid.append_series(self.series[:10]) 88 | euclid.append_query(self.query[:5]) 89 | _verify_diagonals_correct(self.series[:10], self.query[:5], m, euclid) 90 | 91 | euclid.append_series(self.series[10: 15]) 92 | euclid.append_query(self.query[5: 10]) 93 | _verify_diagonals_correct(self.series[:15], self.query[:10], m, euclid) 94 | 95 | euclid.append_series(self.series[15: 25]) 96 | euclid.append_query(self.query[10: 20]) 97 | _verify_diagonals_correct(self.series[5: 25], self.query[5: 20], m, euclid) 98 | 99 | euclid.append_series(self.series[25:30]) 100 | _verify_diagonals_correct(self.series[10: 30], self.query[5: 20], m, euclid) 101 | 102 | def test_calc_column_no_cache(self): 103 | m = 5 104 | euclid = Euclidean().prepare_streaming(m, 20, 15) 105 | 106 | euclid.append_series(self.series[:10]) 107 | euclid.append_query(self.query[:5]) 108 | _verify_columns_correct(self.series[:10], self.query[:5], m, euclid, True) 109 | 110 | euclid.append_series(self.series[10: 15]) 111 | euclid.append_query(self.query[5: 10]) 112 | _verify_columns_correct(self.series[:15], self.query[:10], m, euclid, True) 113 | 114 | euclid.append_series(self.series[15: 25]) 115 | euclid.append_query(self.query[10: 18]) 116 | _verify_columns_correct(self.series[5: 25], self.query[3: 18], m, euclid, True) 117 | 118 | euclid.append_query(self.query[18: 20]) 119 | _verify_columns_correct(self.series[5: 25], self.query[5: 20], m, euclid, True) 120 | 121 | euclid.append_series(self.series[25:30]) 122 | _verify_columns_correct(self.series[10: 30], self.query[5: 20], m, euclid, True) 123 | 124 | def test_calc_column_cache(self): 125 | m = 5 126 | euclid = Euclidean().prepare_streaming(m, 20, 15) 127 | 128 | euclid.append_series(self.series[:10]) 129 | euclid.append_query(self.query[:5]) 130 | _verify_columns_correct(self.series[:10], self.query[:5], m, euclid, False) 131 | 132 | 
    def test_streaming_updates_cached_row(self):
        # Verifies that a cached column stays correct after the streaming window
        # shifts: appending data must update/invalidate any cached state.
        # Override series & query to ensure there are no constant subsequences
        self.series = np.array(
            [0.5546021, 0.13714127, 0.903246, 0.03695094, 0.23420792,
             0.27482897, 0.57765821, 0.23571178, 0.65772705, 0.00292154,
             0.87258653, 0.29869269, 0.91492178, 0.69096235, 0.6786107])

        self.query = np.array(
            [0.03737861, 0.53931239, 0.06194507, 0.0938707, 0.95875364,
             0.09495936, 0.12392364, 0.81358582, 0.56507776, 0.61620183])

        gen = Euclidean().prepare_streaming(5, 10, 10)
        gen.append_series(self.series[:10])
        gen.append_query(self.query[:10])
        # Brute force matrix over more series data than currently visible, so
        # the shifted windows below can be compared against absolute columns.
        bf_dist_matrix = _bruteforce_euclidean_distance_matrix(self.series[:15], self.query[:10], 5)

        # Test shifted behaviour
        # Series capacity is 10, so appending one value shifts the visible
        # window by one: local column 0 then maps to brute force column 1.
        npt.assert_allclose(bf_dist_matrix[:, 0], gen.calc_column(0))
        gen.append_series(self.series[10:11])
        npt.assert_allclose(bf_dist_matrix[:, 1], gen.calc_column(0))

        # Test shifted but off-by-one behaviour
        # After a second single-value append the total shift is 2, so local
        # column 4 maps to brute force column 6.
        npt.assert_allclose(bf_dist_matrix[:, 4], gen.calc_column(3))
        gen.append_series(self.series[11:12])
        npt.assert_allclose(bf_dist_matrix[:, 6], gen.calc_column(4))
0.4555404, 0.18124978, 0.252396, 0.60623881, 177 | 0.5546021, 0.13714127, 0.903246, 0.03695094, 0.23420792, 178 | 0.27482897, 0.57765821, 0.23571178, 0.65772705, 0.00292154, 179 | 0.87258653, 0.29869269, 0.91492178, 0.69096235, 0.6786107, 180 | 0.85413687, 0.19725933, 0.39460891, 0.32650366, 0.35188833, 181 | 0.92658149, 0.07503563, 0.37864432, 0.9415974, 0.62313779]) 182 | 183 | def test_calc_diagonal(self): 184 | m = 5 185 | euclid = Euclidean().prepare_streaming(m, 20) 186 | 187 | euclid.append_series(self.series[:10]) 188 | _verify_diagonals_correct(self.series[:10], self.series[:10], m, euclid) 189 | 190 | euclid.append_series(self.series[10: 15]) 191 | _verify_diagonals_correct(self.series[:15], self.series[:15], m, euclid) 192 | 193 | euclid.append_series(self.series[15: 25]) 194 | _verify_diagonals_correct(self.series[5: 25], self.series[5: 25], m, euclid) 195 | 196 | euclid.append_series(self.series[25:30]) 197 | _verify_diagonals_correct(self.series[10: 30], self.series[10: 30], m, euclid) 198 | 199 | def test_calc_column_no_cache(self): 200 | m = 5 201 | euclid = Euclidean().prepare_streaming(m, 20) 202 | 203 | euclid.append_series(self.series[:10]) 204 | _verify_columns_correct(self.series[:10], self.series[:10], m, euclid, True) 205 | 206 | euclid.append_series(self.series[10: 15]) 207 | _verify_columns_correct(self.series[:15], self.series[:15], m, euclid, True) 208 | 209 | euclid.append_series(self.series[15: 25]) 210 | _verify_columns_correct(self.series[5: 25], self.series[5: 25], m, euclid, True) 211 | 212 | euclid.append_series(self.series[25:30]) 213 | _verify_columns_correct(self.series[10: 30], self.series[10: 30], m, euclid, True) 214 | 215 | def test_calc_column_cache(self): 216 | m = 5 217 | euclid = Euclidean().prepare_streaming(m, 20) 218 | 219 | euclid.append_series(self.series[:10]) 220 | _verify_columns_correct(self.series[:10], self.series[:10], m, euclid, False) 221 | 222 | euclid.append_series(self.series[10: 15]) 223 | 
_verify_columns_correct(self.series[:15], self.series[:15], m, euclid, False) 224 | 225 | euclid.append_series(self.series[15: 25]) 226 | _verify_columns_correct(self.series[5: 25], self.series[5: 25], m, euclid, False) 227 | 228 | euclid.append_series(self.series[25:30]) 229 | _verify_columns_correct(self.series[10: 30], self.series[10: 30], m, euclid, False) 230 | 231 | 232 | def _verify_diagonals_correct(series, query, m, euclid): 233 | h = len(query) - m + 1 234 | w = len(series) - m + 1 235 | bf_distance_matrix = _bruteforce_euclidean_distance_matrix(series, query, m) 236 | 237 | for i in range(-h + 1, w): 238 | result = euclid.calc_diagonal(i) 239 | expected = bf_distance_matrix[diag_indices(h, w, i)] 240 | npt.assert_allclose(result, expected) 241 | 242 | 243 | def _verify_columns_correct(series, query, m, euclid, backwards): 244 | w = len(series) - m + 1 245 | bf_distance_matrix = _bruteforce_euclidean_distance_matrix(series, query, m) 246 | 247 | if backwards: 248 | r = range(w - 1, -1, -1) 249 | else: 250 | r = range(w) 251 | 252 | for i in r: 253 | result = euclid.calc_column(i) 254 | expected = bf_distance_matrix[:, i] 255 | npt.assert_allclose(result, expected, err_msg="Mismatch for row {row}".format(row=i)) 256 | 257 | 258 | def _bruteforce_euclidean_distance_matrix(series, query, m): 259 | num_cols = len(series) - m + 1 260 | num_rows = len(query) - m + 1 261 | distance_matrix = np.zeros((num_rows, num_cols)) 262 | 263 | for row in range(num_rows): 264 | for col in range(num_cols): 265 | distance_matrix[row, col] = _euclidean_distance( 266 | query[row: row + m], 267 | series[col: col + m]) 268 | 269 | return distance_matrix 270 | 271 | 272 | def _euclidean_distance(s1, s2): 273 | return np.sqrt(np.sum(np.square(s1 - s2))) 274 | -------------------------------------------------------------------------------- /distancematrix/tests/generator/test_znorm_euclidean.py: -------------------------------------------------------------------------------- 1 | 
    def setUp(self):
        # Series deliberately starts with a run of constant values (2's); for
        # z-normalized generators this exercises the zero-variance subsequence
        # edge case. Remaining values are arbitrary noise.
        self.series = np.array(
            [0.2488674, 0.1547179, 2, 2, 2,
             2, 2, 2, 0.02841, 0.371845,
             0.5578463, 0.4555404, 0.18124978, 0.252396, 0.60623881,
             0.5546021, 0.13714127, 0.903246, 0.03695094, 0.23420792,
             0.27482897, 0.57765821, 0.23571178, 0.65772705, 0.00292154,
             0.87258653, 0.29869269, 0.91492178, 0.69096235, 0.6786107,
             0.85413687, 0.19725933, 0.39460891, 0.32650366, 0.35188833,
             0.92658149, 0.07503563, 0.37864432, 0.9415974, 0.62313779])

        # Query also opens with a constant run (6's) for the same reason.
        self.query = np.array(
            [6., 6., 6., 6., 6.,
             0.03737861, 0.53931239, 0.06194507, 0.0938707, 0.95875364,
             0.09495936, 0.12392364, 0.81358582, 0.56507776, 0.61620183,
             0.24720462, 0.83886639, 0.38130506, 0.13693176, 0.90555723,
             0.23274948, 0.31526678, 0.28504739, 0.45200344, 0.9867946])

        # Subsequence length used by every test in this class.
        self.m = 5
self.bruteforce_matrix(self.m, self.series, self.query) 52 | _verify_columns_correct(bf_dist_matrix, gen, False) 53 | 54 | def test_non_streaming_self_join_calc_diagonal(self): 55 | gen = self.create_generator().prepare(self.m, self.series) 56 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series, self.series) 57 | _verify_diagonals_correct(bf_dist_matrix, gen) 58 | 59 | def test_non_streaming_self_join_calc_column_no_cache(self): 60 | gen = self.create_generator().prepare(self.m, self.series) 61 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series, self.series) 62 | _verify_columns_correct(bf_dist_matrix, gen, True) 63 | 64 | def test_non_streaming_self_join_calc_column_cache(self): 65 | gen = self.create_generator().prepare(self.m, self.series, self.series) 66 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series, self.series) 67 | _verify_columns_correct(bf_dist_matrix, gen, False) 68 | 69 | def test_streaming_calc_diagonal(self): 70 | gen = self.create_generator().prepare_streaming(self.m, 20, 15) 71 | 72 | gen.append_series(self.series[:10]) 73 | gen.append_query(self.query[:5]) 74 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:10], self.query[:5]) 75 | _verify_diagonals_correct(bf_dist_matrix, gen) 76 | 77 | gen.append_series(self.series[10: 15]) 78 | gen.append_query(self.query[5: 10]) 79 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:15], self.query[:10]) 80 | _verify_diagonals_correct(bf_dist_matrix, gen) 81 | 82 | gen.append_query(self.query[10: 15]) 83 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:15], self.query[:15]) 84 | _verify_diagonals_correct(bf_dist_matrix, gen) 85 | 86 | gen.append_series(self.series[15: 25]) 87 | gen.append_query(self.query[15: 25]) 88 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[5: 25], self.query[10: 25]) 89 | _verify_diagonals_correct(bf_dist_matrix, gen) 90 | 91 | gen.append_series(self.series[25:40]) 92 | bf_dist_matrix = 
self.bruteforce_matrix(self.m, self.series[20: 40], self.query[10: 25]) 93 | _verify_diagonals_correct(bf_dist_matrix, gen) 94 | 95 | def test_streaming_calc_column_no_cache(self): 96 | gen = self.create_generator().prepare_streaming(self.m, 20, 15) 97 | 98 | gen.append_series(self.series[:10]) 99 | gen.append_query(self.query[:5]) 100 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:10], self.query[:5]) 101 | _verify_columns_correct(bf_dist_matrix, gen, True) 102 | 103 | gen.append_series(self.series[10: 15]) 104 | gen.append_query(self.query[5: 10]) 105 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:15], self.query[:10]) 106 | _verify_columns_correct(bf_dist_matrix, gen, True) 107 | 108 | gen.append_query(self.query[10: 15]) 109 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:15], self.query[:15]) 110 | _verify_columns_correct(bf_dist_matrix, gen, True) 111 | 112 | gen.append_series(self.series[15: 25]) 113 | gen.append_query(self.query[15: 25]) 114 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[5: 25], self.query[10: 25]) 115 | _verify_columns_correct(bf_dist_matrix, gen, True) 116 | 117 | gen.append_series(self.series[25:40]) 118 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[20: 40], self.query[10: 25]) 119 | _verify_columns_correct(bf_dist_matrix, gen, True) 120 | 121 | def test_streaming_calc_column_cache(self): 122 | gen = self.create_generator().prepare_streaming(self.m, 20, 15) 123 | 124 | gen.append_series(self.series[:10]) 125 | gen.append_query(self.query[:5]) 126 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:10], self.query[:5]) 127 | _verify_columns_correct(bf_dist_matrix, gen, False) 128 | 129 | gen.append_series(self.series[10: 15]) 130 | gen.append_query(self.query[5: 10]) 131 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:15], self.query[:10]) 132 | _verify_columns_correct(bf_dist_matrix, gen, False) 133 | 134 | 
    def test_streaming_updates_cached_row(self):
        # Verifies that cached column state stays correct after the streaming
        # window shifts: appending values must update/invalidate the cache.
        # Override series & query to ensure there are no constant subsequences
        self.series = np.array(
            [0.5546021, 0.13714127, 0.903246, 0.03695094, 0.23420792,
             0.27482897, 0.57765821, 0.23571178, 0.65772705, 0.00292154,
             0.87258653, 0.29869269, 0.91492178, 0.69096235, 0.6786107])

        self.query = np.array(
            [0.03737861, 0.53931239, 0.06194507, 0.0938707, 0.95875364,
             0.09495936, 0.12392364, 0.81358582, 0.56507776, 0.61620183])

        gen = self.create_generator().prepare_streaming(self.m, 10, 10)
        gen.append_series(self.series[:10])
        gen.append_query(self.query[:10])
        # Brute force matrix covers more series data than currently visible so
        # shifted local columns below can be compared to absolute columns.
        bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:15], self.query[:10])

        # Test shifted behaviour
        # Series capacity is 10, so a single-value append shifts the visible
        # window by one: local column 0 then maps to brute force column 1.
        npt.assert_allclose(bf_dist_matrix[:, 0], gen.calc_column(0))
        gen.append_series(self.series[10:11])
        npt.assert_allclose(bf_dist_matrix[:, 1], gen.calc_column(0))

        # Test shifted but off-by-one behaviour
        # After a second append the total shift is 2: local column 4 maps to
        # brute force column 6.
        npt.assert_allclose(bf_dist_matrix[:, 4], gen.calc_column(3))
        gen.append_series(self.series[11:12])
        npt.assert_allclose(bf_dist_matrix[:, 6], gen.calc_column(4))
gen.append_series(self.series[:10]) 177 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:10], self.series[:10]) 178 | _verify_diagonals_correct(bf_dist_matrix, gen) 179 | 180 | gen.append_series(self.series[10: 15]) 181 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:15], self.series[:15]) 182 | _verify_diagonals_correct(bf_dist_matrix, gen) 183 | 184 | gen.append_series(self.series[15: 16]) 185 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:16], self.series[:16]) 186 | _verify_diagonals_correct(bf_dist_matrix, gen) 187 | 188 | gen.append_series(self.series[16:40]) 189 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[20: 40], self.series[20: 40]) 190 | _verify_diagonals_correct(bf_dist_matrix, gen) 191 | 192 | def test_streaming_self_join_calc_column_no_cache(self): 193 | gen = self.create_generator().prepare_streaming(self.m, 20) 194 | 195 | gen.append_series(self.series[:10]) 196 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:10], self.series[:10]) 197 | _verify_columns_correct(bf_dist_matrix, gen, True) 198 | 199 | gen.append_series(self.series[10: 15]) 200 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:15], self.series[:15]) 201 | _verify_columns_correct(bf_dist_matrix, gen, True) 202 | 203 | gen.append_series(self.series[15: 16]) 204 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:16], self.series[:16]) 205 | _verify_columns_correct(bf_dist_matrix, gen, True) 206 | 207 | gen.append_series(self.series[16:40]) 208 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[20: 40], self.series[20: 40]) 209 | _verify_columns_correct(bf_dist_matrix, gen, True) 210 | 211 | def test_streaming_self_join_calc_column_cache(self): 212 | gen = self.create_generator().prepare_streaming(self.m, 20) 213 | 214 | gen.append_series(self.series[:10]) 215 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:10], self.series[:10]) 216 | 
_verify_columns_correct(bf_dist_matrix, gen, False) 217 | 218 | gen.append_series(self.series[10: 15]) 219 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:15], self.series[:15]) 220 | _verify_columns_correct(bf_dist_matrix, gen, False) 221 | 222 | gen.append_series(self.series[15: 16]) 223 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[:16], self.series[:16]) 224 | _verify_columns_correct(bf_dist_matrix, gen, False) 225 | 226 | gen.append_series(self.series[16:40]) 227 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series[20: 40], self.series[20: 40]) 228 | _verify_columns_correct(bf_dist_matrix, gen, False) 229 | 230 | def test_non_streaming_calc_single(self): 231 | gen = self.create_generator().prepare(self.m, self.series, self.query) 232 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series, self.query) 233 | num_cols = len(self.series) - self.m + 1 234 | num_rows = len(self.query) - self.m + 1 235 | result = np.full((num_rows, num_cols), np.nan, dtype=float) 236 | for col in range(num_cols): 237 | for row in range(num_rows): 238 | result[row, col] = gen.calc_single(row, col) 239 | 240 | npt.assert_allclose(result, bf_dist_matrix, atol=1e-10) 241 | 242 | def test_numerical_stability(self): 243 | self.series = np.array([9.859169023394657, 18.026092617400675, 1.6423838253843416e-24, 0.0, 0.0]) 244 | self.m = 3 245 | gen = self.create_generator().prepare(self.m, self.series, self.series) 246 | 247 | col0 = gen.calc_column(0) 248 | col1 = gen.calc_column(1) 249 | col2 = gen.calc_column(2) 250 | 251 | # These assertions failed when using FFT convolve 252 | npt.assert_(np.max(col0) <= 2 * np.sqrt(self.m), f"Max value was: {np.max(col0)}") 253 | npt.assert_(np.max(col1) <= 2 * np.sqrt(self.m), f"Max value was: {np.max(col1)}") 254 | npt.assert_(np.max(col2) <= 2 * np.sqrt(self.m), f"Max value was: {np.max(col2)}") 255 | 256 | # These assertions check that calculations for a single value (not using bulk-calculated dot 
products) 257 | # do not differ. They focus on the checking the more complex dot-product based calculation. 258 | for col_i, col in enumerate([col0, col1, col2]): 259 | for row_i in range(3): 260 | npt.assert_allclose(col[row_i], gen.calc_single(row_i, col_i), atol=1e-10) 261 | 262 | # These assertions check against the more simple euclidean-of-znormalized calculation. 263 | bf_dist_matrix = self.bruteforce_matrix(self.m, self.series, self.series) 264 | npt.assert_allclose(col0, bf_dist_matrix[:, 0], atol=1e-10) 265 | npt.assert_allclose(col1, bf_dist_matrix[:, 1], atol=1e-10) 266 | npt.assert_allclose(col2, bf_dist_matrix[:, 2], atol=1e-10) 267 | 268 | 269 | class TestZnormEuclidean(AbstractGeneratorTest, TestCase): 270 | def create_generator(self): 271 | return ZNormEuclidean() 272 | 273 | def bruteforce_matrix(self, m, series, query): 274 | return _bruteforce_zeuclidean_distance_matrix(series, query, m, 0.) 275 | 276 | 277 | class TestZnormEuclideanNoiseElimination(AbstractGeneratorTest, TestCase): 278 | def create_generator(self): 279 | return ZNormEuclidean(noise_std=0.2) 280 | 281 | def bruteforce_matrix(self, m, series, query): 282 | return _bruteforce_zeuclidean_distance_matrix(series, query, m, 0.2) 283 | 284 | 285 | def _verify_diagonals_correct(bf_distance_matrix, zeuclid): 286 | h, w = bf_distance_matrix.shape 287 | 288 | for i in range(-h + 1, w): 289 | result = zeuclid.calc_diagonal(i) 290 | expected = bf_distance_matrix[diag_indices(h, w, i)] 291 | npt.assert_allclose(result, expected, atol=1e-10) 292 | 293 | 294 | def _verify_columns_correct(bf_distance_matrix, euclid, backwards): 295 | w = bf_distance_matrix.shape[1] 296 | 297 | if backwards: 298 | r = range(w - 1, -1, -1) 299 | else: 300 | r = range(w) 301 | 302 | for i in r: 303 | result = euclid.calc_column(i) 304 | expected = bf_distance_matrix[:, i] 305 | npt.assert_allclose(result, expected, atol=1e-10, err_msg="Mismatch for row {row}".format(row=i)) 306 | 307 | 308 | def 
_bruteforce_zeuclidean_distance_matrix(series, query, m, noise_std=0.): 309 | num_cols = len(series) - m + 1 310 | num_rows = len(query) - m + 1 311 | distance_matrix = np.zeros((num_rows, num_cols)) 312 | 313 | for row in range(num_rows): 314 | for col in range(num_cols): 315 | distance_matrix[row, col] = _euclidean_znorm_distance( 316 | query[row: row + m], 317 | series[col: col + m], 318 | m, 319 | noise_std 320 | ) 321 | 322 | return distance_matrix 323 | 324 | 325 | def _euclidean_znorm_distance(s1, s2, m, noise_std=0.): 326 | sq_dist = np.sum( 327 | np.square(_znorm(s1) - _znorm(s2))) 328 | 329 | if noise_std != 0.: 330 | std1 = np.std(s1) 331 | std2 = np.std(s2) 332 | 333 | if std1 != 0. or std2 != 0.: 334 | max_std = np.maximum(np.std(s1), np.std(s2)) 335 | sq_dist -= (2 * (m + 1) * np.square(noise_std) / 336 | np.square(max_std)) 337 | sq_dist = np.maximum(sq_dist, 0) 338 | 339 | return np.sqrt(sq_dist) 340 | 341 | 342 | def _znorm(a): 343 | std = np.std(a) 344 | if std < 1e-6: 345 | std = 1 346 | return (a - np.mean(a)) / std 347 | -------------------------------------------------------------------------------- /distancematrix/tests/test_insights.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from unittest import TestCase 3 | import numpy.testing as npt 4 | 5 | from distancematrix.insights import lowest_value_idxs 6 | from distancematrix.insights import highest_value_idxs 7 | 8 | 9 | class TestSlidingMeanStd(TestCase): 10 | def test_lowest_value_idxs(self): 11 | a = np.array([1, 5, 3, 9, 4, 7, 6, 0, 2, 8], dtype=float) 12 | 13 | npt.assert_equal(list(lowest_value_idxs(a, 0)), np.argsort(a)) 14 | npt.assert_equal(list(lowest_value_idxs(a, 1)), [7, 0, 2, 4, 9]) 15 | npt.assert_equal(list(lowest_value_idxs(a, 2)), [7, 0, 4]) 16 | npt.assert_equal(list(lowest_value_idxs(a, 3)), [7, 0]) 17 | 18 | def test_highest_value_idxs(self): 19 | a = np.array([4, 8, 6, 1, 0, 3, 7, 9, 2, 5], dtype=float) 20 
def _windows(data, m):
    # All contiguous length-m windows of data, in order.
    return (data[start:start + m] for start in range(len(data) - m + 1))


def brute_sliding_mean(data, m):
    """Reference implementation: mean of every length-m sliding window."""
    return np.array([np.mean(w) for w in _windows(data, m)])


def brute_sliding_var(data, m):
    """Reference implementation: variance of every length-m sliding window."""
    return np.array([np.var(w) for w in _windows(data, m)])


def brute_sliding_std(data, m):
    """Reference implementation: standard deviation of every length-m sliding window."""
    return np.array([np.std(w) for w in _windows(data, m)])
100., 100., 100., 90.1, 79.01, 65.47, 54.24, 25.05, 15.01, 0., 0.]) 31 | 32 | # For a subsequence length of 24, this data array provided a lot of approximation errors for various techniques 33 | # that were tested to calculate sliding variance/std. 34 | STD_VAR_STABILITY_DATA = np.array([ 35 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 36 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 37 | 12., 12., 12., 12., 12., 43.33, 69.39, 76.01, 76.03, 75.19, 82.21, 91.37, 86.44, 88.09, 38 | 88.56, 98.88, 91.62, 93.97, 90.81, 88.25, 95.3, 100., 95.96, 98.13, 97.57, 94.02, 95.24, 92.59, 39 | 98.98, 100., 100., 100., 97.88, 96.33, 98.07, 95.18, 93.52, 79.99, 37.08, 13.9, 17.43, 12., 40 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 41 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 42 | 12., 12., 12., 12., 12., 12., 58.58, 70.16, 83.06, 82.79, 85.38, 100., 100., 100., 43 | 100., 100., 85.97, 56.18, 12., 12., 18.69, 12., 12., 13.9, 13.94, 25.69, 34.33, 65.06, 44 | 80.1, 85.65, 84.57, 83.74, 94.75, 100., 100., 100., 100., 100., 100., 100., 100., 100., 45 | 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 46 | 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 47 | 90.1, 79.01, 65.47, 54.24, 25.05, 15.01, 12., 12., 12., 12., 12., 12., 12., 12., 48 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 49 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 50 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 51 | 12., 12., 15.94, 42.61, 71.12, 100., 100., 100., 100., 100., 100., 100., 100., 100., 52 | 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 53 | 100., 100., 100., 120., 120., 120., 120., 120., 120., 120., 14.69, 12., 12., 12., 54 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 55 | 12., 12., 
12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 56 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 57 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 58 | 12., 12., 15.19, 14.81, 22.67, 31.61, 32.21, 39.68, 47.36, 52.63, 61.79, 62.49, 67.66, 120., 59 | 120., 120., 120., 109.44, 87.13, 51.72, 55.24, 57.78, 62.97, 66.43, 120., 120., 120., 120., 60 | 110.46, 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 12., 61 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 62 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 63 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 64 | 12., 12., 12., 12., 12., 12., 31.04, 52.73, 49.78, 57.56, 66.5, 66.92, 75.89, 88.17, 65 | 97.98, 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 66 | 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 100., 67 | 100., 100., 100., 49.6, 45.2, 13.15, 12., 12., 12., 12., 12., 12., 12., 12., 68 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 69 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 12., 70 | 12., 12., 12., 12., 12., 12., 12., 12., 12., 12.]) 71 | 72 | 73 | class TestSlidingMeanStdVar(TestCase): 74 | def test_sliding_mean_std(self): 75 | random_gen = np.random.RandomState(0) 76 | 77 | data_array = [ 78 | np.array([5.15, 2.15, 1.05, -9.2, 0.01, 7.14, 4.18, 10.2, 3.25, 14.1, -9.85, 5.12, 0.11, 0.14, 0.98]), 79 | np.array([0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., -50, -50, -50, -50, -50, -50]), 80 | np.array([1e8, 1.6e9, 0.9e8, 6e4, 5.6e2, 9.9e6, 9e7, 6.48e4, 9.2e4, 1e8, 3.14e7]), 81 | random_gen.rand(1000) 82 | ] 83 | m = 5 84 | 85 | for data in data_array: 86 | correct_mean = [np.mean(data[i:i + m]) for i in range(len(data) - m + 1)] 87 | correct_std = [np.std(data[i:i + m]) for i in range(len(data) - m + 1)] 88 | 89 | mean, std 
= math_tricks.sliding_mean_std(data, m) 90 | 91 | npt.assert_allclose(mean, correct_mean) 92 | npt.assert_allclose(std, correct_std) 93 | 94 | def test_sliding_mean_numerical_stability(self): 95 | npt.assert_allclose( 96 | math_tricks.sliding_mean_std(MEAN_STABILITY_DATA, 24)[0], 97 | brute_sliding_mean(MEAN_STABILITY_DATA, 24), ) 98 | 99 | def test_sliding_std_numerical_stability(self): 100 | npt.assert_allclose( 101 | math_tricks.sliding_mean_std(STD_VAR_STABILITY_DATA, 24)[1], 102 | brute_sliding_std(STD_VAR_STABILITY_DATA, 24)) 103 | 104 | def test_sliding_var_numerical_stability(self): 105 | npt.assert_allclose( 106 | math_tricks.sliding_mean_var(STD_VAR_STABILITY_DATA, 24)[1], 107 | brute_sliding_var(STD_VAR_STABILITY_DATA, 24)) 108 | 109 | 110 | class TestStreamingStatistics(TestCase): 111 | def test_different_m(self): 112 | data = np.array([ 113 | 5.15, 2.15, 1.05, -9.2, 0.01, 7.14, 4.18, 10.2, 3.25, 14.1, 114 | -9.85, 5.12, 0.11, 0.14, 0.98, 0., 0., 0., 0., 0., 115 | 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 116 | 1., -50, -50, -50, -50, -50, -50, 1e8, 1.6e9, 0.9e8, 117 | 6e4, 5.6e2, 9.9e6, 9e7, 6.48e4, 9.2e4, 1e8, 3.14e7, 42., 1. 118 | ]) 119 | 120 | self._test_for_params(data, 10, 5) 121 | self._test_for_params(data, 10, 4) 122 | self._test_for_params(data, 10, 3) 123 | self._test_for_params(data, 10, 2) 124 | self._test_for_params(data, 10, 1) 125 | self._test_for_params(data, 5, 5) 126 | self._test_for_params(data, 5, 4) 127 | self._test_for_params(data, 5, 3) 128 | self._test_for_params(data, 5, 2) 129 | self._test_for_params(data, 5, 1) 130 | 131 | def test_different_stepsize(self): 132 | data = np.array([ 133 | 5.15, 2.15, 1.05, -9.2, 0.01, 7.14, 4.18, 10.2, 3.25, 14.1, 134 | -9.85, 5.12, 0.11, 0.14, 0.98, 0., 0., 0., 0., 0., 135 | 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 136 | 1., -50, -50, -50, -50, -50, -50, 1e8, 1.6e9, 0.9e8, 137 | 6e4, 5.6e2, 9.9e6, 9e7, 6.48e4, 9.2e4, 1e8, 3.14e7, 42., 1. 
    def _test_for_params(self, data, data_len, m, stepsize=1):
        # Feeds `data` through a StreamingStats with window capacity `data_len`
        # and subsequence length `m`, appending `stepsize` values at a time,
        # and checks data/mean/std after every append against numpy references.
        start = 0
        ss = math_tricks.StreamingStats(data[:data_len], m)

        # Initial state before any streaming updates.
        npt.assert_equal(ss.data, data[start: start + data_len])
        npt.assert_allclose(ss.mean, [np.mean(data[start + i: start + i + m]) for i in range(data_len - m + 1)])
        npt.assert_allclose(ss.std, [np.std(data[start + i: start + i + m]) for i in range(data_len - m + 1)])

        # Stream the remainder of the data; the visible window slides by
        # `stepsize` with each append.
        while start + data_len + stepsize < len(data):
            ss.append(data[start + data_len: start + data_len + stepsize])
            start += stepsize
            npt.assert_equal(ss.data, data[start: start + data_len])
            npt.assert_allclose(
                ss.mean, [np.mean(data[start + i: start + i + m]) for i in range(data_len - m + 1)],
                atol=2e-15, err_msg="Different for window starting at " + str(start))

            expected_std = [np.std(data[start + i: start + i + m]) for i in range(data_len - m + 1)]
            npt.assert_allclose(
                ss.std, expected_std,
                atol=2e-15,
                err_msg="Different for window starting at " + str(start) + ": " + str(ss.std - expected_std))
| 4 | import numpy as np 5 | import numpy.testing as npt 6 | 7 | from distancematrix.generator import ZNormEuclidean 8 | from distancematrix.consumer import MatrixProfileLR 9 | from distancematrix.calculator import AnytimeCalculator 10 | from distancematrix.ostinato import find_consensus_motif, CMResult 11 | 12 | 13 | class TestOstinato(TestCase): 14 | def test_exact_match(self): 15 | # Each series contains a shifted/scaled version of [1, 1, 0, 2, 2] 16 | series_list = np.array([ 17 | np.array([0.04, 0.45, 0.45, 0.00, 0.90, 0.90, 0.74, 0.72, 0.48, 0.82, 0.49, 0.36, 0.02, 0.37, 0.21]), 18 | np.array([0.08, 0.19, 0.25, 0.59, 0.50, 0.72, 0.16, 0.45, 1.49, 1.49, 0.49, 2.49, 2.49, 0.92, 0.16]), 19 | np.array([0.29, 0.42, 0.96, 1.68, 1.68, 1.00, 2.36, 2.36, 0.14, 0.22, 0.51, 0.45, 0.01, 0.66, 0.53]), 20 | np.array([0.84, 0.01, 0.01, 0.00, 0.02, 0.02, 0.51, 0.53, 0.91, 0.94, 0.47, 0.36, 0.28, 0.15, 0.08]) 21 | ]) 22 | 23 | correct_subseq_idx = [1, 8, 3, 1] 24 | 25 | for perm in permutations(range(len(series_list))): 26 | perm = list(perm) # Tuple to list for indexing 27 | calc_result = find_consensus_motif(series_list[perm], 5) 28 | bf_result = find_consensus_motif_bruteforce(series_list[perm], 5) 29 | 30 | npt.assert_almost_equal(bf_result.radius, 0) 31 | npt.assert_equal(bf_result.series_index, 0) 32 | npt.assert_equal(bf_result.subseq_index, correct_subseq_idx[perm[0]]) 33 | 34 | npt.assert_almost_equal(calc_result.radius, 0) 35 | npt.assert_equal(calc_result.series_index, 0) 36 | npt.assert_equal(calc_result.subseq_index, correct_subseq_idx[perm[0]]) 37 | 38 | def test_near_match(self): 39 | # Fourth series contains shifted/scaled [1, 1, 1, 2, 2], 40 | # all other series contain shifted/scaled versions with slight noise. 
41 | series_list = np.array([ 42 | np.array([0.04, 0.40, 0.50, 0.45, 0.90, 0.90, 0.74, 0.72, 0.48, 0.82, 0.49, 0.36, 0.02, 0.37, 0.21]), 43 | np.array([0.08, 0.19, 0.25, 0.59, 0.50, 0.72, 0.16, 0.45, 1.53, 1.44, 1.49, 2.49, 2.49, 0.92, 0.16]), 44 | np.array([0.29, 0.42, 0.96, 1.68, 1.78, 1.58, 2.36, 2.36, 0.14, 0.22, 0.51, 0.45, 0.01, 0.66, 0.53]), 45 | np.array([0.84, 0.01, 0.01, 0.01, 0.02, 0.02, 0.51, 0.53, 0.91, 0.94, 0.47, 0.36, 0.28, 0.15, 0.08]) 46 | ]) 47 | 48 | for perm in permutations(range(len(series_list))): 49 | perm = list(perm) # Tuple to list for indexing 50 | calc_result = find_consensus_motif(series_list[perm], 5) 51 | bf_result = find_consensus_motif_bruteforce(series_list[perm], 5) 52 | 53 | npt.assert_almost_equal(calc_result.radius, bf_result.radius) 54 | npt.assert_equal(bf_result.series_index, perm.index(3)) 55 | npt.assert_equal(calc_result.series_index, perm.index(3)) 56 | npt.assert_equal(bf_result.subseq_index, 1) 57 | npt.assert_equal(calc_result.subseq_index, 1) 58 | 59 | def test_on_random_data(self): 60 | data = np.array([ 61 | [0.292, 0.183, 0.509, 0.128, 0.718, 0.054, 0.7, 0.532, 0.178, 0.076, 0.46, 0.027, 0.882, 0.288, 0.746], 62 | [0.57, 0.539, 0.239, 0.328, 0.784, 0.614, 0.288, 0.696, 0.12, 0.337, 0.54, 0.401, 0.589, 0.461, 0.666], 63 | [0.454, 0.487, 0.687, 0.981, 0.24, 0.863, 0.458, 0.203, 0.798, 0.917, 0.336, 0.562, 0.266, 0.325, 0.818], 64 | [0.749, 0.886, 0.095, 0.335, 0.247, 0.403, 0.063, 0.047, 0.804, 0.976, 0.836, 0.065, 0.27, 0.59, 0.747], 65 | [0.196, 0.924, 0.968, 0.19, 0.999, 0.31, 0.908, 0.576, 0.521, 0.246, 0.444, 0.319, 0.781, 0.628, 0.183], 66 | [0.136, 0.444, 0.115, 0.954, 0.231, 0.876, 0.566, 0.886, 0.898, 0.287, 0.544, 0.365, 0.108, 0.345, 0.03], 67 | [0.813, 0.324, 0.465, 0.459, 0.565, 0.28, 0.334, 0.169, 0.479, 0.957, 0.621, 0.026, 0.998, 0.732, 0.365], 68 | [0.176, 0.072, 0.288, 0.915, 0.867, 0.215, 0.566, 0.555, 0.602, 0.943, 0.786, 0.404, 0.271, 0.579, 0.362], 69 | [0.7, 0.113, 0.159, 0.701, 0.476, 0.216, 
0.359, 0.613, 0.358, 0.871, 0.888, 0.668, 0.604, 0.574, 0.555], 70 | [0.745, 0.298, 0.213, 0.669, 0.303, 0.737, 0.93, 0.998, 0.529, 0.215, 0.839, 0.666, 0.669, 0.583, 0.168]]) 71 | 72 | calc_result = find_consensus_motif(data, 5) 73 | bf_result = find_consensus_motif_bruteforce(data, 5) 74 | 75 | npt.assert_almost_equal(calc_result.radius, bf_result.radius) 76 | npt.assert_equal(calc_result.series_index, bf_result.series_index) 77 | npt.assert_equal(calc_result.subseq_index, bf_result.subseq_index) 78 | 79 | 80 | def find_consensus_motif_bruteforce(series_list, m) -> CMResult: 81 | result = CMResult(np.inf, -1, -1) 82 | 83 | for series_idx, series in enumerate(series_list): 84 | radii = np.zeros(len(series) - m + 1) 85 | for series2_idx, series2 in enumerate(series_list): 86 | if series_idx == series2_idx: 87 | continue 88 | 89 | calc = AnytimeCalculator(m, series, series2) 90 | calc.add_generator(0, ZNormEuclidean()) 91 | mp_cons = calc.add_consumer([0], MatrixProfileLR()) 92 | calc.calculate_columns() 93 | mp = mp_cons.matrix_profile() 94 | 95 | radii = np.maximum(radii, mp) 96 | 97 | subseq_idx = np.argmin(radii) 98 | subseq_radius = radii[subseq_idx] 99 | if subseq_radius < result.radius: 100 | result = CMResult(subseq_radius, series_idx, subseq_idx) 101 | 102 | return result 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /distancematrix/tests/test_ringbuffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from unittest import TestCase 3 | import numpy.testing as npt 4 | 5 | from distancematrix.ringbuffer import RingBuffer 6 | 7 | 8 | class TestRingBuffer(TestCase): 9 | def test_one_dimensional(self): 10 | buffer = RingBuffer([0, 1, 2, 3, 4]) 11 | npt.assert_equal(buffer.view, np.array([0, 1, 2, 3, 4])) 12 | npt.assert_equal(buffer.max_shape, (5,)) 13 | 14 | self.assertEqual(buffer.push([]), 0) 15 | npt.assert_equal(buffer.view, 
np.array([0, 1, 2, 3, 4])) 16 | self.assertEqual(buffer[0], 0) 17 | 18 | self.assertEqual(buffer.push(5), 1) 19 | npt.assert_equal(buffer.view, np.array([1, 2, 3, 4, 5])) 20 | self.assertEqual(buffer[0], 1) 21 | 22 | self.assertEqual(buffer.push([6]), 1) 23 | self.assertEqual(buffer.push([7]), 1) 24 | npt.assert_equal(buffer.view, np.array([3, 4, 5, 6, 7])) 25 | self.assertEqual(buffer[0], 3) 26 | 27 | self.assertEqual(buffer.push([8, 9, 10]), 3) 28 | npt.assert_equal(buffer.view, np.array([6, 7, 8, 9, 10])) 29 | self.assertEqual(buffer[0], 6) 30 | 31 | self.assertEqual(buffer.push([11, 12, 13, 14]), 4) 32 | npt.assert_equal(buffer.view, np.array([10, 11, 12, 13, 14])) 33 | self.assertEqual(buffer[0], 10) 34 | 35 | self.assertEqual(buffer.push([15, 16, 17, 18, 19]), 5) 36 | npt.assert_equal(buffer.view, np.array([15, 16, 17, 18, 19])) 37 | self.assertEqual(buffer[0], 15) 38 | 39 | self.assertEqual(buffer.push([20, 21, 22, 23, 24, 25]), 6) 40 | npt.assert_equal(buffer.view, np.array([21, 22, 23, 24, 25])) 41 | self.assertEqual(buffer[0], 21) 42 | 43 | def test_multi_dimensional(self): 44 | buffer = RingBuffer([[0, 1, 2, 3, 4], [0, -1, -2, -3, -4]]) 45 | npt.assert_equal(buffer.view, np.array([[0, 1, 2, 3, 4], [0, -1, -2, -3, -4]])) 46 | npt.assert_equal(buffer.max_shape, (2, 5)) 47 | 48 | self.assertEqual(buffer.push([[], []]), 0) 49 | npt.assert_equal(buffer.view, np.array([[0, 1, 2, 3, 4], [0, -1, -2, -3, -4]])) 50 | npt.assert_equal(buffer[:, 0], [0, 0]) 51 | 52 | self.assertEqual(buffer.push([[5], [-5]]), 1) 53 | npt.assert_equal(buffer.view, np.array([[1, 2, 3, 4, 5], [-1, -2, -3, -4, -5]])) 54 | npt.assert_equal(buffer[:, 0], [1, -1]) 55 | 56 | self.assertEqual(buffer.push([[6, 7], [-6, -7]]), 2) 57 | npt.assert_equal(buffer.view, np.array([[3, 4, 5, 6, 7], [-3, -4, -5, -6, -7]])) 58 | npt.assert_equal(buffer[:, 0], [3, -3]) 59 | 60 | self.assertEqual(buffer.push([[8, 9, 10], [-8, -9, -10]]), 3) 61 | npt.assert_equal(buffer.view, np.array([[6, 7, 8, 9, 10], 
[-6, -7, -8, -9, -10]])) 62 | npt.assert_equal(buffer[:, 0], [6, -6]) 63 | 64 | self.assertEqual(buffer.push([[11, 12, 13, 14], [-11, -12, -13, -14]]), 4) 65 | npt.assert_equal(buffer.view, np.array([[10, 11, 12, 13, 14], [-10, -11, -12, -13, -14]])) 66 | npt.assert_equal(buffer[:, 0], [10, -10]) 67 | 68 | self.assertEqual(buffer.push([[15, 16, 17, 18, 19], [-15, -16, -17, -18, -19]]), 5) 69 | npt.assert_equal(buffer.view, np.array([[15, 16, 17, 18, 19], [-15, -16, -17, -18, -19]])) 70 | npt.assert_equal(buffer[:, 0], [15, -15]) 71 | 72 | self.assertEqual(buffer.push([[20, 21, 22, 23, 24, 25], [-20, -21, -22, -23, -24, -25]]), 6) 73 | npt.assert_equal(buffer.view, np.array([[21, 22, 23, 24, 25], [-21, -22, -23, -24, -25]])) 74 | npt.assert_equal(buffer[:, 0], [21, -21]) 75 | 76 | def test_empty_intialization(self): 77 | buffer = RingBuffer(None, shape=(5,), dtype=int) 78 | npt.assert_equal(buffer.max_shape, (5,)) 79 | 80 | npt.assert_equal(buffer.view, np.array([])) 81 | 82 | self.assertEqual(buffer.push([1]), 0) 83 | npt.assert_equal(buffer.view, np.array([1])) 84 | self.assertEqual(buffer[0], 1) 85 | 86 | self.assertEqual(buffer.push([2, 3]), 0) 87 | npt.assert_equal(buffer.view, np.array([1, 2, 3])) 88 | self.assertEqual(buffer[0], 1) 89 | 90 | self.assertEqual(buffer.push([4, 5, 6]), 1) 91 | npt.assert_equal(buffer.view, np.array([2, 3, 4, 5, 6])) 92 | self.assertEqual(buffer[0], 2) 93 | 94 | def test_partial_intialization(self): 95 | buffer = RingBuffer([1, 2], shape=(5,), dtype=int) 96 | npt.assert_equal(buffer.max_shape, (5,)) 97 | 98 | npt.assert_equal(buffer.view, np.array([1, 2])) 99 | self.assertEqual(buffer[0], 1) 100 | 101 | self.assertEqual(buffer.push([3]), 0) 102 | npt.assert_equal(buffer.view, np.array([1, 2, 3])) 103 | self.assertEqual(buffer[0], 1) 104 | 105 | self.assertEqual(buffer.push([4, 5, 6]), 1) 106 | npt.assert_equal(buffer.view, np.array([2, 3, 4, 5, 6])) 107 | self.assertEqual(buffer[0], 2) 108 | 109 | def 
test_oversized_initialization(self): 110 | buffer = RingBuffer([1, 2, 3, 4, 5, 6], shape=(5,), dtype=int) 111 | npt.assert_equal(buffer.max_shape, (5,)) 112 | 113 | npt.assert_equal(buffer.view, np.array([2, 3, 4, 5, 6])) 114 | self.assertEqual(buffer[0], 2) 115 | 116 | self.assertEqual(buffer.push([7]), 1) 117 | npt.assert_equal(buffer.view, np.array([3, 4, 5, 6, 7])) 118 | self.assertEqual(buffer[0], 3) 119 | 120 | self.assertEqual(buffer.push([8, 9, 10]), 3) 121 | npt.assert_equal(buffer.view, np.array([6, 7, 8, 9, 10])) 122 | self.assertEqual(buffer[0], 6) 123 | -------------------------------------------------------------------------------- /distancematrix/tests/test_util.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | import numpy as np 3 | import numpy.testing as npt 4 | 5 | from distancematrix.util import diag_length 6 | from distancematrix.util import diag_indices 7 | from distancematrix.util import diag_indices_of 8 | from distancematrix.util import cut_indices_of 9 | from distancematrix.util import shortest_path_distances 10 | from distancematrix.util import shortest_path 11 | from distancematrix.util import sliding_min 12 | from distancematrix.util import sliding_max 13 | from distancematrix.util import sliding_window_view 14 | 15 | 16 | class TestUtil(TestCase): 17 | def test_diag_length_square_matrix(self): 18 | self.assertEqual(diag_length(5, 5, 0), 5) 19 | self.assertEqual(diag_length(5, 5, 1), 4) 20 | self.assertEqual(diag_length(5, 5, -2), 3) 21 | self.assertEqual(diag_length(5, 5, 4), 1) 22 | self.assertEqual(diag_length(5, 5, 5), 0) 23 | self.assertEqual(diag_length(5, 5, 6), 0) 24 | 25 | def test_diag_length_rect_matrix(self): 26 | self.assertEqual(diag_length(5, 3, 0), 3) 27 | self.assertEqual(diag_length(5, 3, 1), 2) 28 | self.assertEqual(diag_length(5, 3, 2), 1) 29 | self.assertEqual(diag_length(5, 3, 3), 0) 30 | self.assertEqual(diag_length(5, 3, 4), 0) 31 | 32 | 
        self.assertEqual(diag_length(5, 3, -1), 3)
        self.assertEqual(diag_length(5, 3, -2), 3)
        self.assertEqual(diag_length(5, 3, -3), 2)
        self.assertEqual(diag_length(5, 3, -4), 1)
        self.assertEqual(diag_length(5, 3, -5), 0)
        self.assertEqual(diag_length(5, 3, -6), 0)

        self.assertEqual(diag_length(3, 5, 0), 3)
        self.assertEqual(diag_length(3, 5, 1), 3)
        self.assertEqual(diag_length(3, 5, 2), 3)
        self.assertEqual(diag_length(3, 5, 3), 2)
        self.assertEqual(diag_length(3, 5, 4), 1)
        self.assertEqual(diag_length(3, 5, 5), 0)
        self.assertEqual(diag_length(3, 5, 6), 0)

        self.assertEqual(diag_length(3, 5, -1), 2)
        self.assertEqual(diag_length(3, 5, -2), 1)
        self.assertEqual(diag_length(3, 5, -3), 0)
        self.assertEqual(diag_length(3, 5, -4), 0)

    def test_diag_indices_square(self):
        """Indexing a 3x3 matrix with diag_indices selects the expected diagonals."""
        data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        npt.assert_equal(data[diag_indices(3, 3, -3)], [])
        npt.assert_equal(data[diag_indices(3, 3, -2)], [7])
        npt.assert_equal(data[diag_indices(3, 3, -1)], [4, 8])
        npt.assert_equal(data[diag_indices(3, 3, 0)], [1, 5, 9])
        npt.assert_equal(data[diag_indices(3, 3, 1)], [2, 6])
        npt.assert_equal(data[diag_indices(3, 3, 2)], [3])
        npt.assert_equal(data[diag_indices(3, 3, 3)], [])

    def test_diag_indices_rect(self):
        """Indexing a 2x3 matrix with diag_indices selects the expected diagonals."""
        data = np.array([[1, 2, 3], [4, 5, 6]])
        npt.assert_equal(data[diag_indices(2, 3, -2)], [])
        npt.assert_equal(data[diag_indices(2, 3, -1)], [4])
        npt.assert_equal(data[diag_indices(2, 3, 0)], [1, 5])
        npt.assert_equal(data[diag_indices(2, 3, 1)], [2, 6])
        npt.assert_equal(data[diag_indices(2, 3, 2)], [3])
        npt.assert_equal(data[diag_indices(2, 3, 3)], [])

    def test_diag_indices_of_rect(self):
        """diag_indices_of derives the matrix dimensions from the array itself."""
        data = np.array([[1, 2, 3], [4, 5, 6]])
        npt.assert_equal(data[diag_indices_of(data, -2)], [])
        npt.assert_equal(data[diag_indices_of(data, -1)], [4])
        npt.assert_equal(data[diag_indices_of(data, 0)], [1, 5])
        npt.assert_equal(data[diag_indices_of(data, 1)], [2, 6])
        npt.assert_equal(data[diag_indices_of(data, 2)], [3])
        npt.assert_equal(data[diag_indices_of(data, 3)], [])

    def test_cut_indices_of(self):
        """Cuts (anti-diagonals, bottom-left to top-right) of tall and wide matrices."""
        data = np.array([
            [1, 2, 3],
            [4, 5, 6],
            [7, 8, 9],
            [10, 11, 12]
        ])

        npt.assert_equal(data[cut_indices_of(data, 0)], [1])
        npt.assert_equal(data[cut_indices_of(data, 1)], [4, 2])
        npt.assert_equal(data[cut_indices_of(data, 2)], [7, 5, 3])
        npt.assert_equal(data[cut_indices_of(data, 3)], [10, 8, 6])
        npt.assert_equal(data[cut_indices_of(data, 4)], [11, 9])
        npt.assert_equal(data[cut_indices_of(data, 5)], [12])

        npt.assert_equal(data[cut_indices_of(data, 6)], [])

        data = np.array([
            [0, 1, 2, 3, 4],
            [5, 6, 7, 8, 9]
        ])

        npt.assert_equal(data[cut_indices_of(data, 0)], [0])
        npt.assert_equal(data[cut_indices_of(data, 1)], [5, 1])
        npt.assert_equal(data[cut_indices_of(data, 2)], [6, 2])
        npt.assert_equal(data[cut_indices_of(data, 3)], [7, 3])
        npt.assert_equal(data[cut_indices_of(data, 4)], [8, 4])
        npt.assert_equal(data[cut_indices_of(data, 5)], [9])

    def test_shortest_path_distances(self):
        """Cumulative cheapest-path cost from (0, 0), also checked on sub-matrices."""
        data = np.array([
            [1, 2, 1, 0, 3],
            [1, 3, 0, 1, 1],
            [0, 1, 1, 4, 0],
            [2, 5, 5, 2, 2],
            [0, 1, 2, 3, 9]
        ], dtype=float)

        expected = np.array([
            [1, 3, 4, 4, 7],
            [2, 4, 3, 4, 5],
            [2, 3, 4, 7, 4],
            [4, 7, 8, 6, 6],
            [4, 5, 7, 9, 15]
        ], dtype=float)

        result = shortest_path_distances(data)
        npt.assert_equal(result, expected)

        result = shortest_path_distances(data[:3, :])
        npt.assert_equal(result, expected[:3, :])

        result = shortest_path_distances(data[:, :3])
        npt.assert_equal(result, expected[:, :3])

    def test_shortest_path(self):
        """The cheapest path from top-left to bottom-right, as a list of indices."""
        data = np.array([
            [1, 2, 1, 0, 3],
            [1, 3, 3, 1, 1],
            [4, 3, 8, 4, 0],
            [2, 2, 5, 2, 5],
            [0, 1, 1, 3, 2],
            [0, 1, 1, 5, 9]
        ], dtype=float)

        result = shortest_path(data)
        npt.assert_equal(result, [[0, 0], [1, 0], [2, 1], [3, 1], [4, 2], [4, 3], [5, 4]])

    def test_sliding_min(self):
        """Sliding minimum over window 3 for increasing, decreasing and mixed data."""
        data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
        npt.assert_equal(
            sliding_min(data, 3),
            [1, 2, 3, 4, 5, 6]
        )

        data = np.array([8, 7, 6, 5, 4, 3, 2, 1])
        npt.assert_equal(
            sliding_min(data, 3),
            [6, 5, 4, 3, 2, 1]
        )

        data = np.array([8, 3, 4, 0, 6, 1, 1, 1, 2, 7, 6, 4, 3, 4])
        npt.assert_equal(
            sliding_min(data, 3),
            [3, 0, 0, 0, 1, 1, 1, 1, 2, 4, 3, 3]
        )

    def test_sliding_max(self):
        """Sliding maximum over window 3 for increasing, decreasing and mixed data."""
        data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
        npt.assert_equal(
            sliding_max(data, 3),
            [3, 4, 5, 6, 7, 8]
        )

        data = np.array([8, 7, 6, 5, 4, 3, 2, 1])
        npt.assert_equal(
            sliding_max(data, 3),
            [8, 7, 6, 5, 4, 3]
        )

        data = np.array([8, 3, 4, 0, 6, 1, 1, 1, 2, 7, 6, 4, 3, 4])
        npt.assert_equal(
            sliding_max(data, 3),
            [8, 4, 6, 6, 6, 1, 2, 7, 7, 7, 6, 4]
        )

    def test_sliding_window_view(self):
        """1D and 2D window views, with and without a step size."""
        data = np.array([1, 2, 3, 4, 5, 6, 7, 8])
        npt.assert_equal(
            sliding_window_view(data, [3]),
            [[1, 2, 3], [2, 3, 4], [3, 4, 5], [4, 5, 6], [5, 6, 7], [6, 7, 8]]
        )

        npt.assert_equal(
            sliding_window_view(data, [3], step=[2]),
            [[1, 2, 3], [3, 4, 5], [5, 6, 7]]
        )

        data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        npt.assert_equal(
            sliding_window_view(data, [2, 2]),
            [[[[1, 2], [4, 5]], [[2, 3], [5, 6]]], [[[4, 5], [7, 8]], [[5, 6], [8, 9]]]]
        )

        npt.assert_equal(
            sliding_window_view(data, [1, 3], step=[2, 1]),
            [[[[1, 2, 3]]], [[[7, 8, 9]]]]
        )

-------------------------------------------------------------------------------- /distancematrix/tests/test_valmod.py: --------------------------------------------------------------------------------
import unittest

import numpy as np
from unittest import TestCase
import numpy.testing as npt

from distancematrix.valmod import _find_all_motifs_full_matrix_iteration
from distancematrix.valmod import LowerBoundEntry
from distancematrix.generator.znorm_euclidean import ZNormEuclidean


class TestValmod(TestCase):
    def _test_find_all_motifs_full_matrix_iteration(self, data, m, lb_list_size):
        """Compare the optimized full-matrix iteration against the bruteforce version below."""
        dist_gen = ZNormEuclidean(0.).prepare(m, data)

        calc_lb_lists, calc_motif_idxs = _find_all_motifs_full_matrix_iteration(dist_gen, lb_list_size, int(np.ceil(m / 2)))
        bf_lb_lists, bf_motif_idxs = bruteforce_full_matrix_iteration(data, m, lb_list_size)

        # Best motif indices must agree (order of the pair does not matter).
        npt.assert_equal(set(bf_motif_idxs), set(calc_motif_idxs))

        npt.assert_equal(len(bf_lb_lists), len(calc_lb_lists))
        for iteration, (bf_lb_list, calc_lb_list) in enumerate(zip(bf_lb_lists, calc_lb_lists)):
            # Ensure lower bounds match
            bf_lower_bounds = [e.lower_bound_base for e in bf_lb_list]
            calc_lower_bounds = [e.lower_bound_base for e in calc_lb_list]

            npt.assert_allclose(bf_lower_bounds, calc_lower_bounds, err_msg="Mismatch for iteration " + str(iteration))

            if len(bf_lower_bounds) == 0:
                continue

            # Since multiple entries may have the same lower bound for different dot products: sort again
            bf_lb_list.sort(key=lambda e: (e.lower_bound_base, e.dot_prod, e.q_index))
            calc_lb_list.sort(key=lambda e: (e.lower_bound_base, e.dot_prod, e.q_index))
            lists_match_upto = bf_lower_bounds.index(bf_lower_bounds[-1])

            # Entries before the last (possibly tied) lower bound must match exactly.
            npt.assert_allclose(
                [e.dot_prod for e in bf_lb_list[:lists_match_upto]],
                [e.dot_prod for e in calc_lb_list[:lists_match_upto]])

            npt.assert_equal(
                [(e.q_index, e.s_index) for e in bf_lb_list[:lists_match_upto]],
                [(e.q_index, e.s_index) for e in calc_lb_list[:lists_match_upto]])

            # Remaining (tied) entries may differ in order, but their dot products must be valid.
            for entry in calc_lb_list[lists_match_upto:]:
                subseq_1 = data[entry.q_index: entry.q_index + m ]
                subseq_2 = data[entry.s_index: entry.s_index + m]
                npt.assert_almost_equal(np.sum(subseq_1 * subseq_2), entry.dot_prod)

    def test_find_all_motifs_full_matrix_iteration_normal_data(self):
        # Random data, 20 points
        data = np.array(
            [-1.61, -0.43, -0.43, 0.82, 0.42, 1.58, -0.46, 1.41, 1.31,
             -0.13, -0.05, 0.59, 1.76, -0.43, -0.14, -0.14, 1.07, 1.1, 0.84, -1.49])

        self._test_find_all_motifs_full_matrix_iteration(data, 4, 1)
        self._test_find_all_motifs_full_matrix_iteration(data, 4, 5)

        # Due to the large subseq length, some lower bound arrays will be empty
        self._test_find_all_motifs_full_matrix_iteration(data, 10, 5)

    # Because the division by zero can result in inf or -inf, results for the lower bound are not deterministic,
    # which is a pain to test. Behavior should be correct though.
    @unittest.skip("VALMOD: Flat signals have undefined lower bounds.")
    def test_find_all_motifs_full_matrix_iteration_data_with_flats(self):
        # Random data, 20 points, with flat signals
        data = np.array(
            [-1.61, -0.43, -0.43, -0.43, -0.43, 1.58, -0.46, 1.41, 1.31,
             -0.13, -0.05, 0.59, 1.76, -0.43, 0.84, 0.84, 0.84, 0.84, 0.84, -1.49])

        self._test_find_all_motifs_full_matrix_iteration(data, 4, 1)
        self._test_find_all_motifs_full_matrix_iteration(data, 4, 5)

        # Due to the large subseq length, some lower bound arrays will be empty
        self._test_find_all_motifs_full_matrix_iteration(data, 10, 5)


def bruteforce_full_matrix_iteration(series, subseq_length, lb_list_size):
    """
    Brute force implementation of _find_all_motifs_full_matrix_iteration

    :param series: 1D series
    :param subseq_length: subsequence length to use
    :param lb_list_size: max size of lower bound lists
    :return: tuple of: list of all lb_lists per column, indices of the best motif for the entire distance matrix
    """
    num_subseq = series.shape[0] - subseq_length + 1
    triv_match_buffer = int(np.ceil(subseq_length / 2))

    means = np.array([np.mean(series[i: i + subseq_length]) for i in range(num_subseq)])
    stds = np.array([np.std(series[i: i + subseq_length]) for i in range(num_subseq)])

    # Finding the best motif
    motif_dist2 = np.inf
    motif_idxs = None

    # Lower bounds
    lb_lists = []

    for s_i in range(num_subseq):
        subseq_1 = series[s_i: s_i + subseq_length]

        lb_list = []

        for q_i in range(num_subseq):
            # Avoid trivial match
            if abs(s_i - q_i) <= triv_match_buffer:
                continue

            subseq_2 = series[q_i: q_i + subseq_length]
            dot_prod = np.sum(subseq_1 * subseq_2)

            # Calculate z-normalised distance (squared)
            if stds[s_i] == 0 and stds[q_i] == 0:
                z_dist2 = 0
            elif stds[s_i] ==
0 or stds[q_i] == 0:
                # Exactly one flat subsequence: maximal z-normalised distance (squared).
                z_dist2 = np.square(subseq_length)
            else:
                z_dist2 = 2 * (subseq_length - (dot_prod - subseq_length * means[s_i] * means[q_i]) /
                               (stds[s_i] * stds[q_i]))

            if z_dist2 < motif_dist2:
                motif_dist2 = z_dist2
                motif_idxs = (s_i, q_i)

            # Calculate lower bound
            if stds[s_i] != 0:
                std_q = stds[q_i]
                lower_bound_q = np.clip(
                    (dot_prod / subseq_length - means[s_i] * means[q_i]) / (stds[s_i] * std_q), 0, 1)
                lower_bound = np.sqrt(subseq_length * (1 - np.square(lower_bound_q))) * stds[s_i]

                lb_list.append(LowerBoundEntry(q_i, s_i, lower_bound, dot_prod))

        # Trim lower bound lists
        lb_lists.append(sorted(lb_list, key=lambda e: e.lower_bound_base)[:lb_list_size])

    return lb_lists, motif_idxs
-------------------------------------------------------------------------------- /distancematrix/util.py: --------------------------------------------------------------------------------
import numpy as np
import pandas as pd


def diag_length(h, w, diagonal=0):
    """
    Returns the number of elements on the specified diagonal of a matrix with dimensions (h, w).

    :param h: int, height of the matrix
    :param w: int, width of the matrix
    :param diagonal: int, diagonal index of the matrix
    :return: a positive integer, zero if diagonal fall completely outside the matrix
    """
    if diagonal >= 0:
        return max(min(h, w - diagonal), 0)
    else:
        return max(min(w, h + diagonal), 0)


def diag_indices(h, w, diagonal=0):
    """
    Returns the indices of the elements on the specified diagonal of a matrix with dimensions (h, w).

    :param h: int, height of the matrix
    :param w: int, width of the matrix
    :param diagonal: int, diagonal index of the matrix
    :return: a tuple of ranges, serving as indices of the elements
    """
    dl = diag_length(h, w, diagonal)

    if diagonal >= 0:
        return range(0, dl), range(diagonal, diagonal + dl)
    else:
        return range(-diagonal, -diagonal + dl), range(0, dl)


def diag_indices_of(array, diagonal=0):
    """
    Returns the indices of the elements on the specified diagonal of the given matrix.

    :param array: 2D array
    :param diagonal: int, diagonal index of the matrix
    :return: a tuple of ranges, serving as indices of the elements
    """
    if array.ndim != 2:
        raise RuntimeError("array should be 2D")

    return diag_indices(array.shape[0], array.shape[1], diagonal)


def cut_indices_of(array, cut):
    """
    Calculates the indices of the elements on the given cut for the given matrix.
    Where a diagonal runs from top left to bottom right, a cut runs from bottom left to top right.

    :param array: 2D array
    :param cut: index of the cut (cut 0 is the single element of the top left)
    :return: the indices to retrieve the cut
    """
    if array.ndim != 2:
        raise RuntimeError("array should be 2D")

    h, w = array.shape

    # Cuts completely outside the matrix yield empty index ranges.
    if cut < 0 or cut >= w + h - 1:
        return range(0, 0), range(0, 0)

    cut_length = cut + 1 - max(0, cut - h + 1) - max(0, cut - w + 1)

    # Rows are traversed bottom-up, so the cut runs from bottom left to top right.
    if cut < h:
        return range(cut, cut - cut_length, -1), range(0, cut_length)
    else:
        return range(h-1, h-cut_length-1, -1), range(cut - h + 1, cut - h + 1 + cut_length)


def shortest_path_distances(cost_array):
    """
    Creates a new array of the same shape, where each entry contains the lowest sum of elements on the path
    from (0, 0) to that entry. Steps in the path can go horizontal, vertical and diagonal.

    :param cost_array: 2D array containing only positives
    :return: a new array
    """
    if cost_array.ndim != 2:
        raise RuntimeError("array should be 2D")

    dist = np.empty_like(cost_array, dtype=float)

    # Borders can only come from previous step
    dist[0, :] = np.cumsum(cost_array[0, :])
    dist[:, 0] = np.cumsum(cost_array[:, 0])

    # This operation could be vectorised by calculating one cut at a time, but the index juggling becomes quite
    # complex for rectangular arrays.
    # NOTE(review): the loop variables are named c/r but c iterates rows (shape[0]) and r columns (shape[1]);
    # the computation is correct, only the names are swapped.
    for c in range(1, dist.shape[0]):
        for r in range(1, dist.shape[1]):
            dist[c, r] = min(dist[c-1, r], dist[c, r-1], dist[c-1, r-1]) + cost_array[c, r]

    return dist


def shortest_path(cost_array):
    """
    Finds the shortest (= least summed cost) path from the top left of the array to the bottom right.

    :param cost_array: 2D array containing only positives
    :return: array of indices, starting from the top left (index: [0, 0])
    """
    if cost_array.ndim != 2:
        raise RuntimeError("array should be 2D")

    row = cost_array.shape[0] - 1
    col = cost_array.shape[1] - 1

    walk_dist_matrix = shortest_path_distances(cost_array)

    # Walk back from the bottom right corner, always taking the cheapest predecessor.
    path = [(row, col)]
    while row != 0 or col != 0:
        best_cost = np.inf
        if row != 0 and col != 0:
            delta_step = (-1, -1)
            best_cost = walk_dist_matrix[row - 1, col - 1]
        if row != 0 and walk_dist_matrix[row - 1, col] < best_cost:
            delta_step = (-1, 0)
            best_cost = walk_dist_matrix[row - 1, col]
        if col != 0 and walk_dist_matrix[row, col -1] < best_cost:
            delta_step = (0, -1)

        row += delta_step[0]
        col += delta_step[1]
        path.append((row, col))

    return path[::-1]  # TODO: other indices order


def sliding_min(array, window_size):
    """Return the minimum of each sliding window of length window_size over array."""
    #result = np.empty(array.shape[0] - window_size + 1, array.dtype)
    #d = collections.deque()
# d is always sorted 139 | # 140 | #for i in range(array.shape[0]): 141 | # while len(d) > 0 and d[-1][0] >= array[i]: 142 | # d.pop() 143 | # d.append((array[i], i)) 144 | # 145 | # if d[0][1] <= i - window_size: 146 | # d.popleft() 147 | # 148 | # if i >= window_size - 1: 149 | # result[i - window_size + 1] = d[0][0] 150 | # 151 | #return result 152 | 153 | # Pandas has implemented this in native code, speedup of about 10 times 154 | return pd.Series(array).rolling(window_size).min().values[window_size - 1:] 155 | 156 | 157 | def sliding_max(array, window_size): 158 | return pd.Series(array).rolling(window_size).max().values[window_size - 1:] 159 | 160 | 161 | def sliding_window_view(x, shape, step=None, subok=False, writeable=False): 162 | """ 163 | Create sliding window views of the N dimensions array with the given window 164 | shape. Window slides across each dimension of `x` and provides subsets of `x` 165 | at any window position. 166 | 167 | ``sliding_window_view`` create sliding window views of the N dimensions array 168 | with the given window shape and its implementation based on ``as_strided``. 169 | Please note that if writeable set to False, the return is views, not copies 170 | of array. In this case, write operations could be unpredictable, so the return 171 | views is readonly. Bear in mind, return copies (writeable=True), could possibly 172 | take memory multiple amount of origin array, due to overlapping windows. 173 | 174 | For some cases, there may be more efficient approaches 175 | 176 | :param x: ndarray 177 | Array to create sliding window views. 178 | :param shape: sequence of int 179 | The shape of the window. Must have same length as number of input array dimensions. 180 | :param step: sequence of int, optional 181 | The steps of window shifts for each dimension on input array at a time. 182 | If given, must have same length as number of input array dimensions. 183 | Defaults to 1 on all dimensions. 
184 | :param subok: bool, optional 185 | If True, then sub-classes will be passed-through, otherwise the returned 186 | array will be forced to be a base-class array (default). 187 | :param writeable: bool, optional 188 | If set to False, the returned array will always be readonly view. 189 | Otherwise it will return writable copies(see Notes). 190 | :return: ndarray 191 | Sliding window views (or copies) of `x`. view.shape = (x.shape - shape) // step + 1 192 | """ 193 | 194 | # MIT License 195 | # Copyright (c) 2018 Fanjin Zeng 196 | # This work is licensed under the terms of the MIT license, see . 197 | # https://gist.github.com/Fnjn/b061b28c05b5b0e768c60964d2cafa8d 198 | 199 | # first convert input to array, possibly keeping subclass 200 | x = np.array(x, copy=False, subok=subok) 201 | 202 | try: 203 | shape = np.array(shape, int) 204 | except: 205 | raise TypeError('`shape` must be a sequence of integer') 206 | else: 207 | if shape.ndim > 1: 208 | raise ValueError('`shape` must be one-dimensional sequence of integer') 209 | if len(x.shape) != len(shape): 210 | raise ValueError("`shape` length doesn't match with input array dimensions") 211 | if np.any(shape <= 0): 212 | raise ValueError('`shape` cannot contain non-positive value') 213 | 214 | if step is None: 215 | step = np.ones(len(x.shape), np.intp) 216 | else: 217 | try: 218 | step = np.array(step, np.intp) 219 | except: 220 | raise TypeError('`step` must be a sequence of integer') 221 | else: 222 | if step.ndim > 1: 223 | raise ValueError('`step` must be one-dimensional sequence of integer') 224 | if len(x.shape) != len(step): 225 | raise ValueError("`step` length doesn't match with input array dimensions") 226 | if np.any(step <= 0): 227 | raise ValueError('`step` cannot contain non-positive value') 228 | 229 | o = (np.array(x.shape) - shape) // step + 1 # output shape 230 | if np.any(o <= 0): 231 | raise ValueError('window shape cannot larger than input array shape') 232 | 233 | strides = x.strides 234 | 
import numpy as np
from distancematrix.generator.znorm_euclidean import ZNormEuclidean
import time


def find_variable_length_motifs(series, min_motif_length, max_motif_length, cache_size=3, noise_std=0.):
    """
    Finds the top motif for each subsequence length in the given range. The top motif is defined as the
    subsequence (for a given length) for which the z-normalized euclidean distance is minimal, excluding any
    trivial matches.

    This method implements the VALMOD algorithm described in "Matrix Profile X: VALMOD - Scalable Discovery of
    Variable-Length Motifs in Data Series" by M. Linardi et al.

    :param series: one dimensional time series
    :param min_motif_length: minimum motif length
    :param max_motif_length: maximum motif length (inclusive)
    :param cache_size: number of entries kept in memory per subsequence (can only affect performance, default should
        be okay)
    :param noise_std: standard deviation of noise on the signal, used for correcting the z-normalized euclidean distance
    :return: a list of tuples of length (max_motif_length - min_motif_length + 1), containing the indices of the
        motif and its match
    :raises RuntimeError: if series is not 1D or any numeric parameter is out of range
    """

    if series.ndim != 1:
        raise RuntimeError("Series should be 1D")
    if min_motif_length < 2 or not np.isfinite(min_motif_length):
        raise RuntimeError("Invalid min_motif_length: " + str(min_motif_length))
    if max_motif_length < min_motif_length or not np.isfinite(max_motif_length):
        # Fixed: message previously reported min_motif_length for a max_motif_length error.
        raise RuntimeError("Invalid max_motif_length: " + str(max_motif_length))
    if cache_size < 0:
        # Fixed: message previously said "Invalid p" and reported min_motif_length.
        raise RuntimeError("Invalid cache_size: " + str(cache_size))

    # Stores for each motif length a tuple of the indices of the motif
    motifs_found = []

    dist_generator = ZNormEuclidean(noise_std=noise_std).prepare(min_motif_length, series)

    # Full distance matrix calculation for first motif length
    lb_lists, best_motif_idxs = _find_all_motifs_full_matrix_iteration(dist_generator, cache_size,
                                                                       int(np.ceil(min_motif_length / 2)))
    motifs_found.append(best_motif_idxs)

    # For all following motif lengths: try exploiting the lower bound to avoid calculations
    for m in range(min_motif_length + 1, max_motif_length + 1):
        # Note: might be possible to simply update the existing generator?
        dist_generator = ZNormEuclidean(noise_std=noise_std).prepare(m, series)

        num_subseq = len(series) - m + 1
        trivial_match_buffer = int(np.ceil(min_motif_length / 2))

        best_candidate_motif_distance = np.inf
        best_candidate_motif_idxs = None
        invalid_subseq_idxs = []  # Indices of subsequences for which lower bound pruning did not work
        invalid_subseq_lbs = []  # Lower bound for the match on subsequences where pruning did not work

        for i in range(num_subseq):
            subseq_lb_list = lb_lists[i]

            best_match_entry = None
            best_match_distance = np.inf
            subseq_lower_bound = -1

            for entry in subseq_lb_list:
                # As motif length grows, some lowerbound entries may have become trivial matches
                if abs(entry.q_index - i) <= trivial_match_buffer:
                    continue

                # Or they may no longer contain valid indices
                if entry.q_index >= num_subseq or entry.s_index >= num_subseq:
                    continue

                # Incrementally extend the cached dot product by one sample for the new length m.
                entry.dot_prod += series[entry.q_index + m - 1] * series[entry.s_index + m - 1]

                # Calculate actual distance for these indices
                dist = dist_generator.calc_single(entry.q_index, entry.s_index, dot_prod=entry.dot_prod)
                if dist < best_match_distance:
                    best_match_distance = dist
                    best_match_entry = entry

                # Calculate lower bound using last (highest) non-trivial entry
                # (all previous entries should have lower bound)
                subseq_lower_bound = max(subseq_lower_bound, entry.lower_bound_base / dist_generator.std_s[i])

            # if minimum of actual distances < largest lower bound
            if best_match_distance < subseq_lower_bound:
                # best match for this subseq found
                if best_match_distance < best_candidate_motif_distance:
                    best_candidate_motif_distance = best_match_distance
                    best_candidate_motif_idxs = (best_match_entry.q_index, best_match_entry.s_index)
            else:
                # best match may be outside the lowerbound entries, but we have a lower bound for its distance
                invalid_subseq_idxs.append(i)
                invalid_subseq_lbs.append(subseq_lower_bound)

        # If the best candidate motif has a lower distance than all lower bounds, we have the motif.
        # Fixed: when pruning succeeded for every subsequence, invalid_subseq_lbs is empty and
        # np.min([]) would raise; in that case the candidate is the motif by definition.
        if best_candidate_motif_idxs is not None and (
                not invalid_subseq_lbs or best_candidate_motif_distance <= np.min(invalid_subseq_lbs)):
            motifs_found.append(best_candidate_motif_idxs)
            continue

        # if not, we need to calculate all those whose lower bound was lower than the candidate motif to be sure
        # NOTE(review): for num_subseq >= 3, num_subseq * log(num_subseq) exceeds num_subseq, so this
        # branch can effectively never trigger — possibly num_subseq / np.log(num_subseq) was intended;
        # left unchanged as it only affects performance, not correctness. TODO confirm against the paper.
        if len(invalid_subseq_idxs) > num_subseq * np.log(num_subseq):
            # If too many columns have to be recalculated, recalculate the entire matrix and update the lb_lists.
            # A clear boundary for when this should happen isn't available,
            # different strategies might affect performance (but not correctness)
            lb_lists, best_candidate_motif_idxs = _find_all_motifs_full_matrix_iteration(
                dist_generator, cache_size, trivial_match_buffer)
        else:
            # Recalculate all columns that might have a better match
            for invalid_idx, lower_bound in zip(invalid_subseq_idxs, invalid_subseq_lbs):
                if lower_bound < best_candidate_motif_distance:
                    distances = dist_generator.calc_column(invalid_idx)
                    trivial_match_start = max(0, invalid_idx - trivial_match_buffer)
                    trivial_match_end = invalid_idx + trivial_match_buffer + 1
                    distances[trivial_match_start: trivial_match_end] = np.inf
                    best_match_distance = np.min(distances)

                    if best_match_distance < best_candidate_motif_distance:
                        best_candidate_motif_distance = best_match_distance
                        best_candidate_motif_idxs = (np.argmin(distances), invalid_idx)

        # We now have the best motif for sure
        motifs_found.append(best_candidate_motif_idxs)

    return motifs_found


def _find_all_motifs_full_matrix_iteration(dist_generator, lb_list_size, trivial_match_buffer):
    """
    Calculates the entire distance matrix using the provided distance generator.
    For each column, lower bounds are calculated (as described in the VALMOD paper) and the lb_list_size best entries
    are stored (ordered by ascending distance).

    :param dist_generator: z-normalized distance generator
    :param lb_list_size: max number of lower bound entries to store
    :param trivial_match_buffer: trivial match buffer, the lb_list will not contain any entries that fall inside
        this buffer
    :return: tuple of: list of all lb_lists per column, indices of the best motif for the entire distance matrix
    """
    num_subseq = dist_generator.mu_s.view.shape[0]
    subseq_length = dist_generator.m

    lb_lists = []
    best_motif_dist = np.inf  # Fixed: np.Inf alias was removed in NumPy 2.0 (np.inf used elsewhere in this file)
    best_motif_idxs = None

    for column_idx in range(num_subseq):
        distances = dist_generator.calc_column(column_idx)

        # Find best match, while avoiding trivial matches
        trivial_match_start = max(0, column_idx - trivial_match_buffer)
        trivial_match_end = column_idx + trivial_match_buffer + 1
        distances[trivial_match_start: trivial_match_end] = np.inf

        best_dist = np.min(distances)
        if best_dist < best_motif_dist:
            best_motif_dist = best_dist
            best_motif_idxs = (np.argmin(distances), column_idx)

        # Determine lower boundaries
        dotprod = dist_generator.prev_calc_column_dot_prod
        mu = dist_generator.mu_s.view
        std = dist_generator.std_s.view

        if std[column_idx] == 0:
            # In case one of the stds is zero, there is no defined formula for a lower bound (not found yet at least).
            # So we simply return no lower bounds, so this column will always be calculated.
            lb_list = []
            # We can get away with only checking std[column_idx] and not every entry of std (in the else clause):
            # if a lower bound is underestimated, it can only result in unneeded calculation, which is ok
            # if a lower bound is overestimated, a motif for a stable signal may go undetected, but since the entire
            # column will be calculated, it will be found this way.
        else:
            lower_bound_q = np.clip((dotprod / subseq_length - mu * mu[column_idx]) / (std * std[column_idx]), 0, 1)
            lower_bound_base = np.sqrt(subseq_length * (1 - np.square(lower_bound_q))) * std[column_idx]
            lower_bound_base[trivial_match_start: trivial_match_end] = np.inf

            closest_indices = np.argsort(lower_bound_base)[:lb_list_size]

            # Cover corner case where there may not be enough non-trivial matches to fill the lb_list.
            # Fixed: guard against closest_indices being empty (lb_list_size == 0 is allowed by the
            # public validation), which previously raised IndexError on closest_indices[-1].
            if len(closest_indices) > 0 and lower_bound_base[closest_indices[-1]] == np.inf:
                first_inf_idx = np.searchsorted(lower_bound_base[closest_indices], np.inf)
                closest_indices = closest_indices[:first_inf_idx]

            lb_list = []
            for i in range(len(closest_indices)):
                lb_list.append(LowerBoundEntry(closest_indices[i], column_idx, lower_bound_base[closest_indices[i]],
                                               dotprod[closest_indices[i]]))
        lb_lists.append(lb_list)

    return lb_lists, best_motif_idxs


class LowerBoundEntry:
    """
    Cache entry pairing two subsequence indices with their running dot product and the
    length-independent part of the VALMOD lower bound, so distances for longer motif
    lengths can be updated incrementally instead of recomputed.
    """

    def __init__(self, q_index, s_index, lower_bound_base, dot_prod):
        self.q_index = q_index  # index of the matching (query) subsequence
        self.s_index = s_index  # index of the subsequence (column) this entry belongs to
        self.lower_bound_base = lower_bound_base  # lower bound numerator; divide by std of the column to get the bound
        self.dot_prod = dot_prod  # dot product of the two subsequences, extended in place as the motif length grows
1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | 14 | import os 15 | import sys 16 | sys.path.insert(0, os.path.abspath('..')) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = 'Series Distance Matrix' 22 | copyright = '2021, Dieter De Paepe' 23 | author = 'Dieter De Paepe' 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. 
They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | 'sphinx.ext.githubpages', # Create a .no_jekyll file in output 33 | 'autoapi.extension', # Automatically generate an API overview 34 | 'nbsphinx', # Convert jupyter notebooks 35 | 'myst_parser' # Accept markdown files 36 | ] 37 | autoapi_type = 'python' 38 | autoapi_dirs = ['../distancematrix'] 39 | autoapi_options = [ 'members', 'undoc-members', 'show-inheritance', 'show-module-summary', 'imported-members', ] 40 | autoapi_ignore = ['*test*'] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This pattern also affects html_static_path and html_extra_path. 48 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | # The theme to use for HTML and HTML Help pages. See the documentation for 54 | # a list of builtin themes. 55 | # 56 | # html_theme = 'alabaster' 57 | html_theme = 'sphinx_rtd_theme' # https://sphinx-themes.org/sample-sites/sphinx-rtd-theme/ 58 | 59 | # Add any paths that contain custom static files (such as style sheets) here, 60 | # relative to this directory. They are copied after the builtin static files, 61 | # so a file named "default.css" will overwrite the builtin "default.css". 
62 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /docs/doc_environment.yml: -------------------------------------------------------------------------------- 1 | 2 | # Create this anaconda environment using the following command: 3 | # conda env create -f doc_environment.yml 4 | 5 | name: doc_env 6 | 7 | channels: 8 | - conda-forge 9 | - defaults 10 | 11 | dependencies: 12 | - python>=3.6 13 | - sphinx-autoapi 14 | - myst-parser 15 | - sphinx_rtd_theme 16 | - nbsphinx # Enable notebook conversion 17 | - ipython # Includes ipython lexer for converting notebooks 18 | 19 | -------------------------------------------------------------------------------- /docs/examples.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | :glob: 8 | 9 | Example_matrix_profile.ipynb -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | 2 | Welcome to Series Distance Matrix's documentation! 3 | ================================================== 4 | 5 | 6 | .. 
toctree:: 7 | :hidden: 8 | :maxdepth: 2 9 | :caption: Contents: 10 | 11 | examples.rst 12 | install.md 13 | 14 | 15 | 16 | Indices and tables 17 | ================== 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | -------------------------------------------------------------------------------- /docs/install.md: -------------------------------------------------------------------------------- 1 | # Installing 2 | 3 | Using pip: 4 | ```bash 5 | pip install seriesdistancematrix 6 | ``` 7 | 8 | Alternatively, clone this repository and run: 9 | ```bash 10 | python setup.py clean build install 11 | ``` 12 | 13 | For local development (this allows you to edit code without having to reinstall the library): 14 | ```bash 15 | python setup.py develop 16 | ``` -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools import find_packages 3 | 4 | def readme(): 5 | with open("README.md") as readme: 6 | return readme.read() 7 | 8 | setup(name='seriesdistancematrix', 9 | version='0.3.1', # Also update distancematrix/__init__.py! 10 | description=( 11 | 'Flexible time series analysis library' 12 | 'implementing Matrix Profile related functionality.' 13 | ), 14 | long_description_content_type="text/markdown", 15 | long_description=readme(), 16 | keywords=[ 17 | 'time series', 18 | 'matrix profile', 19 | 'contextual matrix profile', 20 | 'radius profile', 21 | 'series distance matrix', 22 | 'motif', 23 | 'discord' 24 | ], 25 | url='https://github.com/predict-idlab/seriesdistancematrix/', 26 | project_urls={ 27 | 'Documentation': 'https://predict-idlab.github.io/seriesdistancematrix/', 28 | 'Source': 'https://github.com/predict-idlab/seriesdistancematrix/' 29 | }, 30 | author='Dieter De Paepe', 31 | author_email='dieter.depaepe@gmail.com', 32 | license='MIT', 33 | packages=find_packages(exclude=["distancematrix.tests*"]), 34 | classifiers=( 35 | 'License :: OSI Approved :: MIT License', 36 | 'Intended Audience :: Science/Research', 37 | 'Intended Audience :: Developers', 38 | 'Topic :: Software Development', 39 | 'Topic :: Scientific/Engineering', 40 | 'Programming Language :: Python', 41 | 'Programming Language :: Python :: 3', 42 | 'Operating System :: OS Independent' 43 | ), 44 | install_requires=['numpy', 'scipy', 'pandas'] 45 | ) 46 | 
-------------------------------------------------------------------------------- /test_environment.yml: -------------------------------------------------------------------------------- 1 | name: test_env 2 | 3 | dependencies: 4 | - python>=3.6 5 | - nose 6 | - numpy 7 | - scipy 8 | - pandas --------------------------------------------------------------------------------