├── tests ├── __init__.py ├── data │ ├── test_interval.py │ └── test_helpers_data.py ├── optimization │ └── test_gaussian.py └── models │ ├── test_helpers_models.py │ └── test_models_builder.py ├── eventdetector_ts ├── data │ ├── __init__.py │ ├── interval.py │ └── helpers_data.py ├── optimization │ ├── __init__.py │ ├── algorithms.py │ └── event_extraction_pipeline.py ├── prediction │ ├── __init__.py │ ├── utils.py │ └── prediction.py ├── metamodel │ ├── __init__.py │ ├── utils.py │ └── meta_model.py ├── models │ ├── __init__.py │ ├── helpers_models.py │ └── models_trainer.py ├── plotter │ ├── __init__.py │ ├── helpers.py │ └── plotter.py └── __init__.py ├── images ├── op_bs.png ├── op_ccf.png ├── delta_t_bs.png ├── losses_bs.png ├── losses_ccf.png ├── op_mex_ccf.png ├── delta_t_ccf.png ├── losses_mex_ccf.png ├── inputs_event_detector.png └── logo_eventdetector.svg ├── requirements.txt ├── requirements_dev.txt ├── .gitignore ├── LICENSE ├── pyproject.toml ├── .github └── workflows │ └── unit_tests.yml └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 4 | -------------------------------------------------------------------------------- /eventdetector_ts/data/__init__.py: -------------------------------------------------------------------------------- 1 | VALUE_ERROR = ValueError("Invalid TimeUnit value.") 2 | -------------------------------------------------------------------------------- /images/op_bs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/op_bs.png -------------------------------------------------------------------------------- /images/op_ccf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/op_ccf.png -------------------------------------------------------------------------------- /eventdetector_ts/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | -------------------------------------------------------------------------------- /eventdetector_ts/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | -------------------------------------------------------------------------------- /images/delta_t_bs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/delta_t_bs.png -------------------------------------------------------------------------------- /images/losses_bs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/losses_bs.png -------------------------------------------------------------------------------- /images/losses_ccf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/losses_ccf.png -------------------------------------------------------------------------------- /images/op_mex_ccf.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/op_mex_ccf.png -------------------------------------------------------------------------------- /images/delta_t_ccf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/delta_t_ccf.png -------------------------------------------------------------------------------- /eventdetector_ts/metamodel/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_meta_model = logging.getLogger(__name__) 4 | -------------------------------------------------------------------------------- /images/losses_mex_ccf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/losses_mex_ccf.png -------------------------------------------------------------------------------- /images/inputs_event_detector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/inputs_event_detector.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib 2 | matplotlib 3 | numpy 4 | pandas 5 | python_dateutil 6 | scikit_learn 7 | scipy 8 | seaborn 9 | sympy 10 | colorlog 11 | pydot 12 | pyqt5 13 | tqdm -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | joblib 2 | matplotlib 3 | numpy 4 | pandas 5 | python_dateutil 6 | scikit_learn 7 | scipy 8 | seaborn 9 | sympy 10 | tensorflow 11 | colorlog 12 | pydot 13 | pyqt5 14 | tqdm 15 | -------------------------------------------------------------------------------- /eventdetector_ts/models/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_models = logging.getLogger(__name__) 4 | 5 | ACTIVATION_FUNCTIONS = ["relu", "sigmoid", "tanh", "softmax", "leaky_relu", "elu", "selu", "swish"] 6 | -------------------------------------------------------------------------------- /eventdetector_ts/plotter/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | COLOR_TRUE = "k" # black 5 | COLOR_PREDICTED = "r" # red 6 | STYLE_TRUE = "-" # solid line 7 | STYLE_PREDICTED = "--" # dashed line 8 | FIG_SIZE = (6, 4.5) # width, height in inches 9 | PALETTE = "tab10" # categorical color map 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized files 2 | *.py[cod] 3 | __pycache__/ 4 | *.py[cod]? 
5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | dist/ 11 | build/ 12 | eggs/ 13 | *.egg-info/ 14 | .svn/ 15 | *.swp 16 | *.tar.gz 17 | *.tgz 18 | *.zip 19 | *.rar 20 | 21 | # Development 22 | *.bak 23 | *.tmp 24 | 25 | # IDE specific files 26 | .vscode/ 27 | .idea/ 28 | 29 | # Jupyter Notebook 30 | .ipynb_checkpoints/ 31 | 32 | # Environment 33 | .env 34 | env/ 35 | venv/ 36 | ENV/ 37 | env.bak/ 38 | venv.bak/ 39 | 40 | # Compiled Python modules 41 | *.pyd 42 | 43 | # Coverage 44 | .coverage 45 | .coverage.* 46 | htmlcov/ 47 | 48 | # Type checking 49 | .mypy_cache/ 50 | .dmypy.json 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # Ignore .pkl file 56 | *.pkl 57 | 58 | 59 | -------------------------------------------------------------------------------- /eventdetector_ts/optimization/algorithms.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | 5 | 6 | def convolve_with_gaussian_kernel(signal: np.ndarray, sigma: Union[int, float], m: int) -> np.ndarray: 7 | """ 8 | Convolve a signal with a Gaussian kernel. 9 | 10 | Args: 11 | signal (np.ndarray): The input signal to convolve. 12 | sigma (Union[int, float]): The standard deviation of the Gaussian kernel. 13 | m (int): The radius of the kernel. 14 | 15 | Returns: 16 | np.ndarray: The convolved signal. 17 | 18 | """ 19 | 20 | # Create the Gaussian kernel 21 | kernel = (1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp(-(np.arange(-m, m + 1) ** 2) / (2 * sigma ** 2)) 22 | kernel /= np.sum(kernel) # Normalize the kernel 23 | 24 | # Perform the convolution 25 | convolved_signal = np.convolve(signal, kernel, mode='same') 26 | 27 | return convolved_signal 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [year] [fullname] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
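A minimal usage sketch for convolve_with_gaussian_kernel from eventdetector_ts/optimization/algorithms.py above. The signal values and the choices sigma=1, m=2 are illustrative placeholders, not values used anywhere in the package:

import numpy as np

from eventdetector_ts.optimization.algorithms import convolve_with_gaussian_kernel

# A short, noisy step-like signal (made-up values).
signal = np.array([0.1, 0.2, 0.15, 0.9, 1.0, 0.95, 0.2, 0.1])

# Smooth it with a Gaussian kernel of standard deviation sigma=1 and radius m=2,
# i.e. a normalized kernel of length 2 * m + 1 = 5.
smoothed = convolve_with_gaussian_kernel(signal, sigma=1, m=2)

print(smoothed.shape)  # (8,) same length as the input, because np.convolve uses mode='same'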
-------------------------------------------------------------------------------- /tests/data/test_interval.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datetime import datetime, timedelta 3 | 4 | from eventdetector_ts.data.interval import Interval 5 | 6 | 7 | class TestInterval(unittest.TestCase): 8 | def setUp(self): 9 | self.interval1 = Interval(datetime(2010, 7, 21, 18, 25), datetime(2010, 7, 21, 18, 28)) 10 | self.interval2 = Interval(datetime(2010, 7, 21, 18, 24, 30), datetime(2010, 7, 21, 18, 27, 30)) 11 | self.interval3 = Interval(datetime(2010, 7, 21, 18, 26, 30), datetime(2010, 7, 21, 18, 29, 30)) 12 | 13 | def test_overlap(self): 14 | self.assertEqual(self.interval1.overlap(self.interval2), timedelta(seconds=150)) 15 | self.assertEqual(self.interval1.overlap(self.interval3), timedelta(seconds=90)) 16 | self.assertEqual(self.interval2.overlap(self.interval3), timedelta(seconds=60)) 17 | 18 | def test_overlapping_parameter(self): 19 | self.assertEqual(round(self.interval1.overlapping_parameter(self.interval2), 3), 0.714) 20 | self.assertEqual(round(self.interval1.overlapping_parameter(self.interval3), 3), 0.333) 21 | self.assertEqual(round(self.interval2.overlapping_parameter(self.interval3), 3), 0.200) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /tests/optimization/test_gaussian.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from eventdetector_ts.optimization.algorithms import convolve_with_gaussian_kernel 6 | 7 | 8 | def convolution_with_gaussian(signal, sigma, m): 9 | signal_size = len(signal) 10 | 11 | output = [] 12 | for n in range(signal_size): 13 | temp = 0 14 | sum_kernel = 0 15 | for i in range(-m, m + 1): 16 | g_i = (1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp(-(i ** 2) / (2. 
* sigma ** 2)) 17 | if 0 <= (n - i) < signal_size: 18 | temp += g_i * signal[n - i] 19 | sum_kernel += g_i 20 | 21 | output.append(temp / sum_kernel) 22 | return output 23 | 24 | 25 | class TestGaussianFilter(unittest.TestCase): 26 | def test_gaussian_filter(self): 27 | signal = np.array([1.0, 2, 3, 4.0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]) 28 | sigma = 1 29 | m = 2 30 | 31 | convolved_signal = convolve_with_gaussian_kernel(signal=signal, sigma=sigma, m=m) 32 | convolved_signal_expected = convolution_with_gaussian(signal=signal, sigma=sigma, m=m) 33 | 34 | # Check if the outputs are equal 35 | np.testing.assert_allclose(convolved_signal_expected, convolved_signal, atol=1e-8) 36 | 37 | 38 | if __name__ == '__main__': 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /tests/models/test_helpers_models.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from eventdetector_ts.models.helpers_models import CustomEarlyStopping 7 | 8 | 9 | class TestHelpers(unittest.TestCase): 10 | def setUp(self): 11 | pass 12 | 13 | class TestCustomEarlyStopping(tf.test.TestCase): 14 | def test_on_epoch_end(self): 15 | # Create a custom early stopping callback 16 | early_stopping = CustomEarlyStopping(ratio=2.0, patience=3, verbose=0) 17 | 18 | # Set up test data 19 | x_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 20 | y_train = np.array([0, 1, 1, 0]) 21 | x_val = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 22 | y_val = np.array([0, 1, 1, 0]) 23 | 24 | # Define a simple model 25 | model = tf.keras.models.Sequential([ 26 | tf.keras.layers.Dense(2, activation='sigmoid', input_shape=(2,)), 27 | tf.keras.layers.Dense(1, activation='sigmoid') 28 | ]) 29 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 30 | 31 | # Train the model with the custom early stopping callback 32 | model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val), callbacks=[early_stopping]) 33 | 34 | # Check that training was stopped early 35 | self.assertLess(early_stopping.stopped_epoch, 10) 36 | 37 | 38 | if __name__ == '__main__': 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /eventdetector_ts/prediction/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from typing import List 4 | 5 | import numpy as np 6 | import seaborn as sns 7 | from matplotlib import pyplot as plt 8 | 9 | from eventdetector_ts.plotter import COLOR_PREDICTED, COLOR_TRUE 10 | 11 | 12 | def plot_prediction(predicted_op: np.ndarray, filtered_predicted_op: np.ndarray) -> None: 13 | """ 14 | Plot the original and filtered predicted Op 15 | Args: 16 | predicted_op (np.ndarray): Predicted Op 17 | filtered_predicted_op (np.ndarray): Filtered predicted Op 18 | 19 | Returns: 20 | None 21 | """ 22 | sns.set(style="ticks", palette="Set2") 23 | plt.figure(figsize=(8, 6)) # Set the figure size 24 | 25 | # Plot the true and predicted values using Seaborn 26 | n = len(predicted_op) 27 | sns.lineplot(x=np.arange(n), y=predicted_op, color=COLOR_TRUE, label='Predicted Op') 28 | sns.lineplot(x=np.arange(n), y=filtered_predicted_op, color=COLOR_PREDICTED, label='Filtered Predicted Op') 29 | 30 | # Add labels and title to the plot 31 | plt.xlabel('Partitions') 32 | plt.ylabel('Op') 33 | plt.title('Predicted Op') 34 | # Add legend 35 | 
plt.legend() 36 | # Show 37 | plt.show() 38 | 39 | 40 | def write_events_to_csv(events: List, name: str) -> None: 41 | path = os.path.join(f"{name}.csv") 42 | with open(path, 'w', encoding='UTF8', newline='') as f: 43 | writer = csv.writer(f, delimiter=' ') 44 | for (start_time, end_time) in events: 45 | writer.writerow([start_time, end_time]) 46 | -------------------------------------------------------------------------------- /eventdetector_ts/plotter/helpers.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import matplotlib.dates as mdates 4 | from matplotlib.patches import Rectangle 5 | 6 | from eventdetector_ts import TimeUnit 7 | from eventdetector_ts.data.helpers_data import convert_time_to_datetime, get_timedelta 8 | 9 | 10 | def event_to_rectangle(event, width_events_s: float, time_unit: TimeUnit, color, height=1, style="solid"): 11 | """ 12 | Function to convert an event to a rectangle object for visualization. 13 | 14 | Args: 15 | event (datetime or other): The event timestamp or object. 16 | width_events_s (float): The width of events in the unit of time for the dataset. 17 | time_unit (TimeUnit): The time unit of the partition size. 18 | color (str): The color of the rectangle. 19 | height (int): The height of the rectangle. 20 | style (str): The line style of the rectangle. 21 | 22 | Returns: 23 | Rectangle: The rectangle object representing the event. 24 | 25 | """ 26 | time = event 27 | if not isinstance(event, datetime): 28 | time = convert_time_to_datetime(event, to_timestamp=False) 29 | w_s_timedelta = get_timedelta(float(width_events_s) / 2, time_unit) 30 | start_time = time - w_s_timedelta 31 | end_time = time + w_s_timedelta 32 | 33 | start_rect = mdates.date2num(start_time) 34 | end_rect = mdates.date2num(end_time) 35 | 36 | width_rect = end_rect - start_rect 37 | rect = Rectangle((start_rect, 0), width_rect, height, edgecolor=color, linestyle=style, 38 | facecolor='none', linewidth=1) 39 | 40 | return rect 41 | -------------------------------------------------------------------------------- /tests/models/test_models_builder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tensorflow as tf 4 | from sympy.testing import pytest 5 | 6 | from eventdetector_ts import RNN_ENCODER_DECODER, FFN, CNN, RNN_BIDIRECTIONAL, CONV_LSTM1D, LSTM, SELF_ATTENTION 7 | from eventdetector_ts.models.models_builder import ModelBuilder, ModelCreator 8 | 9 | 10 | class TestModelsBuilder(unittest.TestCase): 11 | def setUp(self): 12 | # create a model builder with an input layer 13 | self.inputs = tf.keras.layers.Input(shape=(10,)) 14 | self.model_builder = ModelBuilder(self.inputs) 15 | self.inputs_rnn = tf.keras.Input(shape=(45, 5), name="Input") 16 | 17 | def test_check_input_shape(self): 18 | # create a layer with compatible input shape and call __check_input_shape 19 | layer1 = tf.keras.layers.Dense(5) 20 | output1 = self.model_builder._ModelBuilder__check_input_shape(layer1) 21 | self.assertEqual(output1.shape, tf.TensorShape([None, 10])) 22 | 23 | def test_add_layer(self): 24 | layer1 = tf.keras.layers.Dense(5) 25 | self.model_builder._ModelBuilder__add_layer(layer1) 26 | self.assertEqual(self.model_builder.outputs.shape, tf.TensorShape([None, 5])) 27 | 28 | layer2 = tf.keras.layers.Conv2D(32, kernel_size=3) 29 | with pytest.raises(ValueError): 30 | self.model_builder._ModelBuilder__add_layer(layer2) 31 | 32 | def test_create_models(self): 
33 | model_creator = ModelCreator( 34 | [(RNN_ENCODER_DECODER, 1), (FFN, 2), (CNN, 2), (RNN_BIDIRECTIONAL, 1), (CONV_LSTM1D, 1), (LSTM, 3), 35 | (SELF_ATTENTION, 3)], 36 | hyperparams_rnn=(1, 2, 45, 46, "tanh"), 37 | hyperparams_cnn=(64, 65, 3, 4, 1, 1, "relu"), 38 | hyperparams_ffn=(1, 2, 64, 128, "sigmoid"), save_models_as_dot_format=False, root_dir=None, dropout=0.3, 39 | last_act_func="sigmoid", hyperparams_transformer=(256, 4, 1, True, "relu")) 40 | 41 | model_creator.create_models(inputs=self.inputs_rnn) 42 | 43 | for key, value in model_creator.created_models.items(): 44 | keras_model: tf.keras.Model = value 45 | self.assertEqual(keras_model.layers[-1].output_shape, (None, 1)) 46 | 47 | 48 | if __name__ == '__main__': 49 | unittest.main() 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 120 3 | 4 | [tool.coverage.run] 5 | omit = [ 6 | "eventdetector_ts/plotter/*", 7 | "eventdetector_ts/prediction/*", 8 | "eventdetector_ts/metamodel/*", 9 | "eventdetector_ts/optimization/event_extraction_pipeline.py", 10 | "eventdetector_ts/models/models_trainer.py" 11 | ] 12 | source = ["eventdetector_ts"] 13 | 14 | [build-system] 15 | requires = ["flit_core>=3.4"] 16 | build-backend = "flit_core.buildapi" 17 | 18 | [project] 19 | name = "eventdetector_ts" 20 | version = "1.1.0" 21 | description = "EventDetector introduces a universal event detection method for multivariate time series. Unlike traditional deep-learning methods, it's regression-based, requiring only reference events. The robust stacked ensemble, from Feed-Forward Neural Networks to Transformers, ensures accuracy by mitigating biases. The package supports practical implementation, excelling in detecting events with precision, validated across diverse domains." 
22 | keywords = [ 23 | "Universal Event Detection", 24 | "Multivariate Time Series", 25 | "Regression-based", 26 | "Stacked Ensemble Learning", 27 | "Deep Learning Models", 28 | "Feed-Forward Neural Networks", 29 | "Transformers", 30 | "Event Detection Package", 31 | "Rare Events", 32 | "Imbalanced Datasets", 33 | "Anomaly Detection", 34 | "Change Point Detection", 35 | "Fraud Detection", 36 | "Empirical Validations" 37 | ] 38 | authors = [ 39 | { name = "Menouar Azib", email = "menouar.azib@akkodis.com" } 40 | ] 41 | 42 | maintainers = [ 43 | { name = "Menouar Azib", email = "menouar.azib@akkodis.com" } 44 | ] 45 | requires-python = ">=3.9" 46 | readme = "README.md" 47 | license = { file = "LICENSE" } 48 | classifiers = ["License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Operating System :: OS Independent"] 49 | 50 | dependencies = [ 51 | "joblib", 52 | "matplotlib", 53 | "numpy", 54 | "pandas", 55 | "python_dateutil", 56 | "scikit_learn", 57 | "scipy", 58 | "seaborn", 59 | "sympy", 60 | "colorlog", 61 | "pydot", 62 | "pyqt5", 63 | "tqdm" 64 | ] 65 | 66 | [project.urls] 67 | "Homepage" = "https://github.com/menouarazib/eventdetector" 68 | "Bug Tracker" = "https://github.com/menouarazib/eventdetector/issues" -------------------------------------------------------------------------------- /.github/workflows/unit_tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests and Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - dev 8 | paths-ignore: 9 | - 'README.md' 10 | - 'pyproject.toml' 11 | 12 | jobs: 13 | build: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ ubuntu-latest, windows-latest, macos-latest ] 18 | python-version: [ "3.9", "3.10", "3.11" ] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Update pip and setuptools 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install --upgrade setuptools 30 | 31 | - name: Install dependencies 32 | run: | 33 | pip install --no-cache-dir ruff pytest coverage 34 | pip install --no-cache-dir -r requirements_dev.txt 35 | - name: Lint with ruff 36 | run: | 37 | # stop the build if there are Python syntax errors or undefined names 38 | ruff --output-format=github --select=E9,F63,F7,F82 --target-version=py37 . 39 | # default set of ruff rules with GitHub Annotations 40 | ruff --output-format=github --target-version=py37 . 
41 | continue-on-error: true 42 | - name: List files in workspace 43 | run: | 44 | ls "${{ github.workspace }}" 45 | - name: Run unit tests with coverage 46 | env: # Add the env section with GITHUB_TOKEN 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 48 | run: | 49 | coverage run -m pytest tests/ 50 | coverage xml -o coverage.xml 51 | - name: Upload coverage report (only for ubuntu-latest and python 3.10) 52 | if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' }} 53 | uses: actions/upload-artifact@v3 54 | with: 55 | name: coverage-report 56 | path: coverage.xml 57 | - name: Run Coveralls (only for ubuntu-latest and python 3.10) 58 | if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' }} 59 | env: 60 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 61 | uses: coverallsapp/github-action@v2 62 | -------------------------------------------------------------------------------- /eventdetector_ts/data/interval.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | 4 | class Interval: 5 | """ 6 | Represents a time interval between two datetime objects. This class is used to model an event or partition in 7 | time-series. 8 | """ 9 | 10 | def __init__(self, start_time: datetime, end_time: datetime): 11 | """ 12 | Constructs an interval for a given start and end time. 13 | 14 | Args: 15 | start_time (datetime): The starting time of the interval. 16 | end_time (datetime): The ending time of the interval. 17 | """ 18 | self.start_time = start_time 19 | self.end_time = end_time 20 | self.duration = self.end_time - self.start_time 21 | 22 | def __str__(self) -> str: 23 | """ 24 | Returns a string representation of the interval in the format "start_time ---> end_time". 25 | 26 | Returns: 27 | str: A string representation of the interval. 28 | """ 29 | return "{} ---> {}".format(self.start_time, self.end_time) 30 | 31 | def __repr__(self) -> str: 32 | """ 33 | Returns a string representation of the interval in the format "start_time ---> end_time". 34 | 35 | Returns: 36 | str: A string representation of the interval. 37 | """ 38 | return "{} ---> {}".format(self.start_time, self.end_time) 39 | 40 | def overlap(self, other: 'Interval') -> timedelta: 41 | """ 42 | Computes the overlapping time (ot) between this interval and another interval. 43 | 44 | Args: 45 | other (Interval): Another interval to compare with. 46 | 47 | Returns: 48 | timedelta: The overlapping time between this interval and the other interval as a timedelta object. 49 | """ 50 | overlap_start_time = max(self.start_time, other.start_time) 51 | overlap_end_time = min(self.end_time, other.end_time) 52 | overlap_duration = max(timedelta(0), overlap_end_time - overlap_start_time) 53 | return overlap_duration 54 | 55 | def overlapping_parameter(self, other: 'Interval') -> float: 56 | """ 57 | Computes the overlapping parameter between this interval and another interval. 58 | 59 | Args: 60 | other (Interval): Another interval to compare with. 61 | 62 | Returns: 63 | float: A floating number between 0.0 and 1.0 representing the degree of overlap between the two intervals. 
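        Example (values mirrored from tests/data/test_interval.py, whose expected result is 0.714):

            i1 = Interval(datetime(2010, 7, 21, 18, 25), datetime(2010, 7, 21, 18, 28))
            i2 = Interval(datetime(2010, 7, 21, 18, 24, 30), datetime(2010, 7, 21, 18, 27, 30))
            i1.overlapping_parameter(i2)  # 150 s of overlap / 210 s of union, roughly 0.714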
64 | """ 65 | if other is None: 66 | return 0.0 67 | overlap_duration = self.overlap(other) 68 | total_duration = self.duration + other.duration - overlap_duration 69 | return overlap_duration / total_duration 70 | -------------------------------------------------------------------------------- /eventdetector_ts/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from logging import config 4 | from typing import Dict, Optional 5 | from urllib.request import urlretrieve 6 | 7 | import pandas as pd 8 | from tqdm import tqdm 9 | 10 | TIME_LABEL = "time" 11 | MIDDLE_EVENT_LABEL = "event" 12 | 13 | LSTM = "LSTM" 14 | GRU = "GRU" 15 | CNN = "CNN" 16 | RNN_BIDIRECTIONAL = "RNN_BIDIRECTIONAL" 17 | CONV_LSTM1D = "CONV_LSTM_1D" 18 | RNN_ENCODER_DECODER = "RNN_ENCODER_DECODER" 19 | CNN_RNN = "CNN_RNN" 20 | SELF_ATTENTION = "SELF_ATTENTION" 21 | TRANSFORMER = "TRANSFORMER" 22 | FFN = "FFN" 23 | 24 | FILL_NAN_ZEROS = 'zeros' 25 | FILL_NAN_FFILL = 'ffill' 26 | FILL_NAN_BFILL = 'bfill' 27 | FILL_NAN_MEDIAN = 'median' 28 | 29 | TYPE_TRAINING_AVERAGE = 'average' 30 | TYPE_TRAINING_FFN = 'ffn' 31 | META_MODEL_NETWORK = "meta_model_ffn" 32 | META_MODEL_SCALER = "meta_model_scaler" 33 | 34 | # Define constants for scaler types 35 | MIN_MAX_SCALER = "MinMaxScaler" 36 | STANDARD_SCALER = "StandardScaler" 37 | ROBUST_SCALER = "RobustScaler" 38 | 39 | SCALERS_DIR = "scalers" 40 | MODELS_DIR = "models" 41 | OUTPUT_DIR = "output" 42 | CONFIG_FILE = ".config.json" 43 | # Store some important values for prediction 44 | config_dict: Dict = {} 45 | 46 | 47 | class TimeUnit(Enum): 48 | """ 49 | An enumeration of different time units. 50 | 51 | Attributes: 52 | SECOND: The time unit is in seconds. 53 | MILLISECOND: The time unit is in milliseconds. 54 | MICROSECOND: The time unit is in microseconds. 55 | MINUTE: The time unit is in minutes. 56 | HOUR: The time unit is in hours. 57 | DAY: The time unit is in days. 58 | YEAR: The time unit is in years. 59 | """ 60 | SECOND = "second" 61 | MILLISECOND = "millisecond" 62 | MICROSECOND = "microsecond" 63 | MINUTE = "minute" 64 | HOUR = "hour" 65 | DAY = "day" 66 | YEAR = "year" 67 | 68 | @classmethod 69 | def _missing_(cls, value): 70 | return cls.SECOND 71 | 72 | def __str__(self): 73 | return self.value 74 | 75 | 76 | LOGGING_CONFIG = { 77 | "version": 1, 78 | "disable_existing_loggers": False, 79 | "formatters": { 80 | "colored": { 81 | "()": "colorlog.ColoredFormatter", 82 | "format": "%(asctime)s %(log_color)s[%(levelname)s] %(name)s: %(message)s", 83 | "datefmt": "%Y-%m-%d %H:%M:%S", 84 | "log_colors": { 85 | "DEBUG": "cyan", 86 | "INFO": "white", 87 | "WARNING": "yellow", 88 | "ERROR": "red", 89 | "CRITICAL": "red,bg_white", 90 | }, 91 | }, 92 | }, 93 | "handlers": { 94 | "console": { 95 | "class": "logging.StreamHandler", 96 | "level": "DEBUG", 97 | "formatter": "colored", 98 | "stream": "ext://sys.stdout", 99 | }, 100 | }, 101 | "loggers": { 102 | "": { 103 | "handlers": ["console"], 104 | "level": "INFO", 105 | "propagate": False, 106 | }, 107 | }, 108 | } 109 | 110 | config.dictConfig(LOGGING_CONFIG) 111 | 112 | 113 | def my_hook(t): 114 | """ 115 | Wraps tqdm instance. Don't forget to close() or __exit__() 116 | the tqdm instance once you're done with it (easiest using `with` syntax). 
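    A minimal sketch of the intended usage (it mirrors the call made in load_dataset below;
    `url` and `file_path` are assumed to be defined by the caller):

        with tqdm(unit='B', unit_scale=True, miniters=1, desc="Downloading") as t:
            urlretrieve(url, filename=file_path, reporthook=my_hook(t), data=None)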
117 | 118 | Example 119 | ------- 120 | 121 | 122 | 123 | """ 124 | last_b = [0] 125 | 126 | def inner(b=1, bsize=1, t_size=None): 127 | """ 128 | b : int, optional 129 | Number of blocks just transferred [default: 1]. 130 | bsize : int, optional 131 | Size of each block (in tqdm units) [default: 1]. 132 | t_size : int, optional 133 | Total size (in tqdm units). If [default: None] remains unchanged. 134 | """ 135 | if t_size is not None: 136 | t.total = t_size 137 | t.update((b - last_b[0]) * bsize) 138 | last_b[0] = b 139 | 140 | return inner 141 | 142 | 143 | def load_dataset(file_path: str, name: str, url=None, index_col: Optional[int] = 0) -> pd.DataFrame: 144 | """ 145 | Load a dataset from a file. If the file is not found, it will be downloaded from the given URL. 146 | 147 | Args: 148 | name: Name of the file to load 149 | index_col: the same value as pandas index_col 150 | file_path (str): The path to the dataset file. 151 | url (str): The URL from which to download the dataset (optional). 152 | 153 | Returns: 154 | pandas.DataFrame: The loaded dataset. 155 | """ 156 | 157 | file_extension = os.path.splitext(file_path)[1].lower() 158 | 159 | if not os.path.isfile(file_path) and url: 160 | # Dataset file isn't found, download it 161 | with tqdm(unit='B', unit_scale=True, leave=True, miniters=1, 162 | desc=f"Downloading {name}") as t: # all optional kwargs 163 | urlretrieve(url, filename=file_path, 164 | reporthook=my_hook(t), data=None) 165 | 166 | if file_extension == ".csv": 167 | # Read CSV file 168 | dataset = pd.read_csv(file_path, index_col=index_col) 169 | elif file_extension == ".pkl": 170 | # Read Pickle file 171 | dataset = pd.read_pickle(file_path) 172 | else: 173 | raise ValueError(f"Unsupported file format: {file_extension}") 174 | 175 | # Return the loaded dataset 176 | return dataset 177 | 178 | 179 | def load_martian_bow_shock(): 180 | """ 181 | Load the Martian bow shock dataset and events, for more information check this link: http://amda.cdpp.eu/ 182 | 183 | Returns: 184 | A dataset and events as pd.DataFrame 185 | 186 | """ 187 | url_dataset = "https://archive.org/download/martian_bow_shock_dataset/martian_bow_shock_dataset.pkl" 188 | url_events = "https://archive.org/download/martian_bow_shock_events/martian_bow_shock_events.csv" 189 | data_set = load_dataset(file_path="martian_bow_shock_dataset.pkl", name="Martian Bow Shock data set", 190 | url=url_dataset) 191 | events = load_dataset(file_path="martian_bow_shock_events.csv", name="Martian Bow Shock events", index_col=None, 192 | url=url_events) 193 | 194 | return data_set, events 195 | 196 | 197 | def load_credit_card_fraud(): 198 | """ 199 | Load the credit card fraud dataset and events, for more information check this link: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud 200 | 201 | Returns: 202 | A dataset and events as pd.DataFrame 203 | 204 | """ 205 | url_dataset = "https://archive.org/download/credit_card_fraud_dataset/credit_card_fraud_dataset.csv" 206 | url_events = "https://archive.org/download/credit_card_fraud_events/credit_card_fraud_events.csv" 207 | 208 | data_set = load_dataset(file_path="credit_card_fraud_dataset.csv", name="Credit Card Fraud data set", 209 | url=url_dataset) 210 | events = load_dataset(file_path="credit_card_fraud_events.csv", name="Credit Card Fraud events", index_col=None, 211 | url=url_events) 212 | 213 | return data_set, events 214 | -------------------------------------------------------------------------------- /eventdetector_ts/models/helpers_models.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from sklearn.model_selection import KFold 6 | 7 | 8 | class CustomEarlyStopping(tf.keras.callbacks.Callback): 9 | """ 10 | Create a custom early stopping callback that stops training when the ratio of current training loss to current 11 | validation loss is less than a specified ratio for a number of consecutive epochs. 12 | 13 | Args: 14 | ratio (float): Ratio to compare current train loss and current val loss against. 15 | patience (int): Number of epochs to wait before stopping training. 16 | verbose (int, optional): Verbosity level. 17 | 18 | Attributes: 19 | stopped_epoch (int or None): Last epoch index where training was stopped. 20 | best (float or None): Best validation loss observed so far. 21 | best_epoch (int or None): Index of the epoch where the best validation loss was observed. 22 | ratio (float): Ratio to compare current train loss and current val loss against. 23 | patience (int): Number of epochs to wait before stopping training. 24 | verbose (int): Verbosity level. 25 | wait (int): Number of epochs since the last time the ratio was greater than self.ratio. 26 | monitor_op (function): Comparison operator for the ratio. 27 | best_weights (np.ndarray or None): Model weights at the epoch with the best validation loss. 28 | """ 29 | 30 | def __init__(self, ratio: float, patience: int, verbose: int = 1): 31 | super().__init__() 32 | self.stopped_epoch = None 33 | self.best = None 34 | self.best_epoch = None 35 | self.ratio = ratio 36 | self.patience = patience 37 | self.verbose = verbose 38 | self.wait = 0 39 | self.monitor_op = np.greater 40 | self.best_weights = None 41 | 42 | def on_train_begin(self, logs=None): 43 | """ 44 | Initialize instance attributes. 45 | """ 46 | self.wait = 0 47 | self.best_weights = None 48 | self.stopped_epoch = 0 49 | self.best_epoch = 0 50 | self.best = np.inf 51 | 52 | def on_epoch_end(self, epoch, logs=None): 53 | """ 54 | Update the best validation loss and check whether to stop training. 55 | """ 56 | if logs is not None: 57 | if self.best_weights is None: 58 | self.best_weights = self.model.get_weights() 59 | 60 | current_val = logs.get('val_loss') # Current validation loss 61 | current_train = logs.get('loss') # Current training loss 62 | if current_val is None: 63 | logging.warning( 64 | "Early stopping conditioned on metric `%s` " 65 | "which is not available. Available metrics are: %s", 66 | 'val_loss', 67 | ",".join(list(logs.keys())), 68 | ) 69 | 70 | # Update the best validation loss and weights 71 | if self.monitor_op(self.best, current_val): 72 | self.best = current_val 73 | self.best_weights = self.model.get_weights() 74 | self.best_epoch = epoch 75 | 76 | # If the ratio of current training loss to current validation loss is greater than the specified ratio. 77 | if self.monitor_op(np.divide(current_train, current_val), self.ratio): 78 | self.wait = 0 79 | else: 80 | # Only check after the first epoch. 81 | if self.wait >= self.patience and epoch > 0: 82 | self.stopped_epoch = epoch 83 | self.model.stop_training = True 84 | if self.verbose > 0: 85 | tf.print( 86 | "Restoring model weights from " 87 | "the end of the best epoch: " 88 | f"{self.best_epoch + 1}." 89 | ) 90 | self.model.set_weights(self.best_weights) 91 | self.wait += 1 92 | 93 | def on_train_end(self, logs=None): 94 | """ 95 | Print a message indicating that training was stopped early. 
96 | """ 97 | if logs is not None: 98 | if self.stopped_epoch > 0 and self.verbose > 0: 99 | tf.print( 100 | f"Epoch {self.stopped_epoch + 1}: early stopping. " 101 | "Restoring model weights from " 102 | "the end of the best epoch: " 103 | f"{self.best_epoch + 1}. " 104 | "Best validation loss: " 105 | f"{self.best}." 106 | ) 107 | 108 | 109 | class SelfAttention(tf.keras.layers.Layer): 110 | """ 111 | Self-Attention layer for Neural Networks 112 | """ 113 | 114 | def __init__(self, units: int, **kwargs) -> None: 115 | super().__init__() 116 | self.last_attention_weights = None 117 | # Instantiate a multi-head attention layer with key dimensionality of units 118 | # and a single head 119 | self.mha = tf.keras.layers.MultiHeadAttention(key_dim=units, num_heads=1, **kwargs) 120 | # Instantiate a normalization layer 121 | self.layer_norm = tf.keras.layers.LayerNormalization() 122 | # Instantiate an addition layer 123 | self.add = tf.keras.layers.Add() 124 | 125 | def call(self, query: tf.Tensor) -> tf.Tensor: 126 | """ 127 | Apply a self-attention mechanism on the input query and return the output. 128 | 129 | Args: 130 | query: input tensor to the layer. 131 | 132 | Return: 133 | output tensor of the layer. 134 | """ 135 | # Apply multi-head attention on a query 136 | attn_output, attn_scores = self.mha( 137 | query=query, 138 | key=query, 139 | value=query, 140 | return_attention_scores=True) 141 | 142 | # Store the attention scores in last_attention_weights for inspection 143 | self.last_attention_weights = attn_scores 144 | 145 | # Add the attention output to the query and normalize it 146 | x = self.add([query, attn_output]) 147 | x = self.layer_norm(x) 148 | 149 | return x 150 | 151 | 152 | def custom_cross_val_score(model: tf.keras.Model, x: np.ndarray, y: np.ndarray, cv: KFold, epochs: int, batch_size: int, 153 | callbacks: list) -> np.ndarray: 154 | """ 155 | A function to perform custom cross-validation for a Keras model. 156 | 157 | Args: 158 | model: A Keras model. 159 | x: The input data. 160 | y: The target data. 161 | cv: A KFold cross-validation object. 162 | epochs: The number of epochs for training. 163 | batch_size: The batch size for training. 164 | callbacks: A list of Keras callbacks. 165 | 166 | Returns: 167 | The mean of the validation loss across all folds. 
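    Example (illustrative sketch only; it assumes `model` is an already compiled Keras model
    and `x`, `y` are NumPy arrays of matching length, none of which are defined here):

        from sklearn.model_selection import KFold

        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        mean_val_loss = custom_cross_val_score(model, x, y, cv=cv, epochs=10,
                                               batch_size=32, callbacks=[])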
168 | """ 169 | scores = [] 170 | for train_index, val_index in cv.split(x): 171 | train_x, train_y = x[train_index], y[train_index] 172 | val_x, val_y = x[val_index], y[val_index] 173 | history = model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, callbacks=callbacks, 174 | validation_data=(val_x, val_y), verbose=0) 175 | scores.append(np.min(history.history['val_loss'])) 176 | return np.mean(scores) 177 | -------------------------------------------------------------------------------- /eventdetector_ts/prediction/prediction.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Dict, List, Tuple, Any 4 | 5 | import joblib 6 | import numpy as np 7 | import pandas as pd 8 | import tensorflow as tf 9 | 10 | from eventdetector_ts import CONFIG_FILE, SCALERS_DIR, TYPE_TRAINING_FFN, TimeUnit, MODELS_DIR, META_MODEL_NETWORK, \ 11 | META_MODEL_SCALER 12 | from eventdetector_ts.data.helpers_data import convert_dataframe_to_overlapping_partitions, get_timedelta 13 | from eventdetector_ts.optimization.algorithms import convolve_with_gaussian_kernel 14 | from eventdetector_ts.optimization.event_extraction_pipeline import get_peaks, compute_op_as_mid_times 15 | from eventdetector_ts.prediction import logger 16 | 17 | 18 | def load_model_with_fallback(model_path: str) -> tf.keras.Model: 19 | """ 20 | Load a Keras model with fallback mechanisms for different formats. 21 | 22 | Args: 23 | model_path (str): Path to the model file 24 | 25 | Returns: 26 | tf.keras.Model: Loaded model 27 | """ 28 | try: 29 | # Try loading with Keras 3 format first 30 | return tf.keras.models.load_model(model_path) 31 | except ValueError as e: 32 | if "File format not supported" in str(e): 33 | # Try loading as SavedModel format 34 | try: 35 | saved_model = tf.saved_model.load(model_path) 36 | # Convert SavedModel to Keras model 37 | class SavedModelWrapper(tf.keras.Model): 38 | def __init__(self, saved_model): 39 | super().__init__() 40 | self.saved_model = saved_model 41 | 42 | def call(self, inputs): 43 | return self.saved_model(inputs) 44 | 45 | return SavedModelWrapper(saved_model) 46 | except Exception: 47 | # If all else fails, try H5 format 48 | h5_path = model_path.replace('.keras', '.h5') 49 | if os.path.exists(h5_path): 50 | return tf.keras.models.load_model(h5_path) 51 | else: 52 | raise e 53 | else: 54 | raise e 55 | 56 | 57 | def load_config_file(path: str) -> Dict: 58 | """ 59 | Load config file of the meta-model. 60 | 61 | Args: 62 | path (str): Where the config file is stored 63 | 64 | Returns: 65 | Data as a Dict which contains all configuration information 66 | """ 67 | config_file_path = os.path.join(path, CONFIG_FILE) 68 | if not os.path.exists(config_file_path): 69 | msg: str = f"The config file {CONFIG_FILE} does not exist in this path: {config_file_path}" 70 | logger.critical(msg) 71 | raise ValueError(msg) 72 | 73 | with open(config_file_path, 'r') as f: 74 | config_: Dict = json.load(f) 75 | return config_ 76 | 77 | 78 | def load_models(model_keys: List[str], output_dir: str) -> List[tf.keras.Model]: 79 | """ 80 | Loads the trained models. 
81 | Args: 82 | model_keys (List[str]): List of model's name 83 | output_dir (str): The parent directory where the trained models are stored 84 | 85 | Returns: 86 | List of keras models 87 | """ 88 | models: List[tf.keras.Model] = [] 89 | for key in model_keys: 90 | path = os.path.join(output_dir, MODELS_DIR) 91 | # Add .keras extension if not already present 92 | if not key.endswith('.keras'): 93 | key = f"{key}.keras" 94 | path = os.path.join(path, key) 95 | models.append(load_model_with_fallback(path)) 96 | return models 97 | 98 | 99 | def apply_scaling(x: np.ndarray, config_data: Dict) -> np.ndarray: 100 | """ 101 | Scaling input data according to the stored scalers. 102 | Args: 103 | x (np.ndarray): Input data to be scaled 104 | config_data (Dict): Configuration Data 105 | 106 | Returns: 107 | Scaled data. 108 | """ 109 | n_time_steps = x.shape[1] 110 | output_dir: str = config_data.get("output_dir") 111 | scalers_dir = os.path.join(output_dir, SCALERS_DIR) 112 | try: 113 | for i in range(n_time_steps): 114 | scaler_i_path = os.path.join(scalers_dir, f'scaler_{i}.joblib') 115 | # Print progress 116 | print("\rLoading and applying scalers...{}/{}".format(i + 1, n_time_steps), end="") 117 | # Load the scaler from disk 118 | print(scaler_i_path) 119 | scaler = joblib.load(scaler_i_path) 120 | x[:, i, :] = scaler.transform(x[:, i, :]) 121 | except ValueError as e: 122 | logger.critical(e) 123 | raise e 124 | 125 | logger.info("Convert data to float32 for consistency...") 126 | x = np.asarray(x).astype('float32') 127 | return x 128 | 129 | 130 | def load_meta_model(output_dir: str) -> Tuple[tf.keras.Model, Any]: 131 | """ 132 | Load the metamodel network and the scaler. 133 | Args: 134 | output_dir (str): The parent directory where the trained models are stored 135 | 136 | Returns: 137 | tf.keras.Model, StanderScaler 138 | """ 139 | path = os.path.join(output_dir, MODELS_DIR) 140 | # Add .keras extension if not already present 141 | meta_model_name = META_MODEL_NETWORK 142 | if not meta_model_name.endswith('.keras'): 143 | meta_model_name = f"{meta_model_name}.keras" 144 | path = os.path.join(path, meta_model_name) 145 | model = load_model_with_fallback(path) 146 | 147 | scalers_dir = os.path.join(output_dir, SCALERS_DIR) 148 | scaler_path = os.path.join(scalers_dir, f'{META_MODEL_SCALER}.joblib') 149 | scaler = joblib.load(scaler_path) 150 | 151 | return model, scaler 152 | 153 | 154 | def predict(dataset: pd.DataFrame, path: str) -> Tuple[List, np.ndarray, np.ndarray]: 155 | """ 156 | Generates output predictions for the input dataset 157 | Args: 158 | dataset (pd.DataFrame): The input dataset. 159 | path (str): The path to the created folder by the MetaModel. 160 | 161 | Returns: 162 | Tuple[List, np.ndarray, np.ndarray]: Predicted events, predicted Op and filtered predicted Op 163 | """ 164 | 165 | if path is None or not isinstance(path, str) or len(path) == 0: 166 | msg: str = f"The provided path {path} is not valid." 
167 | logger.critical(msg) 168 | raise ValueError(msg) 169 | 170 | config_data: Dict = load_config_file(path=path) 171 | config_data['output_dir'] = path 172 | logger.info(f"Config dict: {config_data}") 173 | logger.info("Converting the dataset to overlapping partitions.") 174 | dataset_as_overlapping_partitions: np.ndarray = convert_dataframe_to_overlapping_partitions(dataset, 175 | width=config_data.get( 176 | "width"), 177 | step=config_data.get( 178 | "step"), 179 | fill_method=config_data.get( 180 | 'fill_nan')) 181 | # Remove the column containing the timestamps from the overlapping partitions 182 | x: np.ndarray = np.delete(dataset_as_overlapping_partitions, -1, axis=2) 183 | logger.info(f"The shape of the input data: {x.shape}") 184 | x = apply_scaling(x=x, config_data=config_data) 185 | model_keys: List[str] = config_data.get('models') 186 | logger.info(f"Loading models: {model_keys}") 187 | models: List[tf.keras.Model] = load_models(model_keys=model_keys, output_dir=config_data.get('output_dir')) 188 | batch_size: int = config_data.get("batch_size") 189 | predictions = [] 190 | logger.info("Making prediction from the trained models") 191 | for model in models: 192 | # Make predictions using each model 193 | predicted_y: np.ndarray = model.predict(x, batch_size=batch_size) 194 | predicted_y = predicted_y.flatten() 195 | predictions.append(predicted_y) 196 | 197 | type_training: str = config_data.get('type_training') 198 | # Convert a list of 1D NumPy arrays to 2D NumPy array 199 | predictions = np.stack(predictions, axis=1) 200 | if type_training == TYPE_TRAINING_FFN: 201 | logger.info("Loading the MetaModel and its Scaler") 202 | model, scaler = load_meta_model(output_dir=config_data.get('output_dir')) 203 | predictions = scaler.transform(predictions) 204 | logger.info("Make a final prediction using the network of the MetaModel") 205 | predicted_op = model.predict(predictions, batch_size=batch_size) 206 | predicted_op = predicted_op.flatten() 207 | else: 208 | logger.info("Make a final prediction by averaging") 209 | predicted_op = np.mean(predictions, axis=1) 210 | 211 | sigma, m, h = config_data.get('best_combination') 212 | logger.info(f"Applying Gaussian Filter with sigma = {sigma} and m = {m}") 213 | filtered_predicted_op = convolve_with_gaussian_kernel(predicted_op, sigma=sigma, m=m) 214 | logger.info("Computing filtered predictions as a function of the mid-times of the overlapping partitions") 215 | t, filtered_predicted_op = compute_op_as_mid_times(overlapping_partitions=dataset_as_overlapping_partitions, 216 | op_g=filtered_predicted_op) 217 | logger.info(f"Computing peaks with h = {h:.2f}") 218 | s_peaks = get_peaks(h=h, t=t, op_g=filtered_predicted_op) 219 | predicted_events = [] 220 | time_unit: TimeUnit = TimeUnit.__call__(config_data.get('time_unit')) 221 | radius = get_timedelta(config_data.get("width_events_s") / 2.0, time_unit) 222 | logger.info(f"Generating a predicted events with radius = {radius}, predicted op and a filtered predicted op") 223 | for i in range(len(s_peaks)): 224 | predicted_event = s_peaks[i] 225 | start_time = predicted_event - radius 226 | end_time = predicted_event + radius 227 | predicted_events.append((start_time.isoformat(), end_time.isoformat())) 228 | return predicted_events, predicted_op, filtered_predicted_op 229 | -------------------------------------------------------------------------------- /tests/data/test_helpers_data.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 
from datetime import datetime, timedelta 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pandas.core.dtypes.common import is_datetime64_any_dtype 7 | from sympy.testing import pytest 8 | 9 | from eventdetector_ts import TimeUnit 10 | from eventdetector_ts.data.helpers_data import overlapping_partitions, compute_middle_event, \ 11 | num_columns, convert_dataframe_to_overlapping_partitions, get_timedelta, get_total_units, check_time_unit, \ 12 | convert_dataset_index_to_datetime, convert_seconds_to_time_unit 13 | 14 | 15 | def test_overlapping_partitions(): 16 | data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) 17 | expected_output = np.array([[[1, 2, 3], [4, 5, 6]], [[4, 5, 6], [7, 8, 9]], [[7, 8, 9], [10, 11, 12]]]) 18 | assert np.array_equal(overlapping_partitions(data, width=2, step=1), expected_output) 19 | 20 | 21 | class TestHelpers(unittest.TestCase): 22 | 23 | def setUp(self): 24 | self.n: int = 100 25 | 26 | def test_overlapping_partitions(self): 27 | # Test case 1: 1D input 28 | data1 = np.array([1, 2, 3, 4, 5]) 29 | result1 = overlapping_partitions(data1, width=3, step=1) 30 | expected1 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]]) 31 | self.assertTrue(np.array_equal(result1, expected1)) 32 | 33 | # Test case 2: partition width greater than the size of the input data 34 | data2 = np.array([1, 2, 3, 4, 5]) 35 | with pytest.raises(ValueError): 36 | overlapping_partitions(data2, width=6, step=1) 37 | 38 | # Test case 3: 2D input 39 | data3 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 40 | result3 = overlapping_partitions(data3, width=2, step=1) 41 | expected3 = np.array([[[1, 2, 3], [4, 5, 6]], [[4, 5, 6], [7, 8, 9]]]) 42 | assert np.array_equal(result3, expected3) 43 | 44 | # Test case 4: 2D input 45 | data4 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 46 | result4 = overlapping_partitions(data4, width=2, step=2) 47 | expected4 = np.array([[[1, 2, 3], [4, 5, 6]]]) 48 | assert np.array_equal(result4, expected4) 49 | 50 | def test_convert_dataframe_to_overlapping_partitions(self): 51 | # Create a sample DataFrame with datetime index and real-valued features 52 | 53 | data = np.random.rand(self.n, 3) 54 | index = pd.date_range(start='2022-01-01', periods=self.n, freq='D') 55 | df = pd.DataFrame(data=data, columns=['feat1', 'feat2', 'feat3'], index=index) 56 | 57 | # Test overlapping partition generation with default settings 58 | sw = convert_dataframe_to_overlapping_partitions(df, width=2, step=1) 59 | expected_shape = (self.n - 1, 2, 4) # Number of partitions, partition width, number of features+time 60 | self.assertEqual(sw.shape, expected_shape) 61 | 62 | # Test overlapping partition generation with custom settings 63 | sw = convert_dataframe_to_overlapping_partitions(df, width=14, step=7, fill_method='ffill') 64 | expected_shape = (13, 14, 4) # Number of partitions, partition width, number of features+time 65 | self.assertEqual(sw.shape, expected_shape) 66 | 67 | def test_compute_middle_event(self): 68 | # Test case 1: List of events with 2 columns 69 | events_list = [['2022-01-01', '2022-01-02'], ['2022-01-03', '2022-01-05']] 70 | expected_output = pd.DataFrame({'event': [datetime(2022, 1, 1, 12, 0), datetime(2022, 1, 4)]}) 71 | # call function to get actual output 72 | actual_output = compute_middle_event(events_list) 73 | 74 | # compare expected and actual outputs 75 | pd.testing.assert_frame_equal(expected_output, actual_output) 76 | 77 | # Test case 2: List of events with 1 column 78 | events_list = [['2022-01-01'], ['2022-01-03']] 79 | 
expected_output = pd.DataFrame({"event": [datetime(2022, 1, 1), datetime(2022, 1, 3)]}) 80 | # call function to get actual output 81 | actual_output = compute_middle_event(events_list) 82 | 83 | # compare expected and actual outputs 84 | pd.testing.assert_frame_equal(expected_output, actual_output) 85 | # Test case 3: Pandas DataFrame with 2 columns 86 | events_df = pd.DataFrame({'Starting Date': ['2022-01-01', '2022-01-03'], 87 | 'Ending Date': ['2022-01-02', '2022-01-05']}) 88 | expected_output = pd.DataFrame({"event": [datetime(2022, 1, 1, 12, 0), datetime(2022, 1, 4)]}) 89 | # call function to get actual output 90 | actual_output = compute_middle_event(events_df) 91 | 92 | # compare expected and actual outputs 93 | pd.testing.assert_frame_equal(expected_output, actual_output) 94 | 95 | # Test case 4: Pandas DataFrame with 1 column 96 | expected_output = pd.DataFrame({"event": [datetime(2022, 1, 1), datetime(2022, 1, 3)]}) 97 | # call function to get actual output 98 | actual_output = compute_middle_event(events_list) 99 | 100 | # compare expected and actual outputs 101 | pd.testing.assert_frame_equal(expected_output, actual_output) 102 | 103 | # Test case 5: Empty list of events 104 | events_list = [] 105 | with pytest.raises(ValueError): 106 | compute_middle_event(events_list) 107 | 108 | # Test case 6: Empty DataFrame of events 109 | events_df = pd.DataFrame() 110 | with pytest.raises(ValueError): 111 | compute_middle_event(events_df) 112 | 113 | # Test case 7: Invalid input format for events 114 | events_list = [[1, 2], [3, 4, 5]] 115 | with pytest.raises(ValueError): 116 | compute_middle_event(events_list) 117 | 118 | def test_empty_list(self): 119 | self.assertEqual(num_columns([]), 0) 120 | 121 | def test_single_column_list(self): 122 | self.assertEqual(num_columns([1, 2, 3]), 1) 123 | 124 | def test_multi_column_list(self): 125 | self.assertEqual(num_columns([[1, 2], [3, 4], [5, 6]]), 2) 126 | 127 | def test_mixed_list(self): 128 | self.assertEqual(num_columns([[1, 2], 3, 4]), 2) 129 | 130 | def test_microsecond(self): 131 | result = get_timedelta(100, TimeUnit.MICROSECOND) 132 | self.assertEqual(result, timedelta(microseconds=100)) 133 | 134 | def test_millisecond(self): 135 | result = get_timedelta(500, TimeUnit.MILLISECOND) 136 | self.assertEqual(result, timedelta(milliseconds=500)) 137 | 138 | def test_second(self): 139 | result = get_timedelta(60, TimeUnit.SECOND) 140 | self.assertEqual(result, timedelta(seconds=60)) 141 | 142 | def test_minute(self): 143 | result = get_timedelta(30, TimeUnit.MINUTE) 144 | self.assertEqual(result, timedelta(minutes=30)) 145 | 146 | def test_hour(self): 147 | result = get_timedelta(2, TimeUnit.HOUR) 148 | self.assertEqual(result, timedelta(hours=2)) 149 | 150 | def test_day(self): 151 | result = get_timedelta(5, TimeUnit.DAY) 152 | self.assertEqual(result, timedelta(days=5)) 153 | 154 | def test_year(self): 155 | result = get_timedelta(2, TimeUnit.YEAR) 156 | self.assertEqual(result, timedelta(days=2 * 365)) 157 | 158 | def test_invalid_unit(self): 159 | with self.assertRaises(ValueError): 160 | get_timedelta(10, "null") 161 | 162 | def test_microsecond_(self): 163 | td = timedelta(microseconds=123456789) 164 | self.assertEqual(get_total_units(td, TimeUnit.MICROSECOND), 123456789) 165 | 166 | def test_millisecond_(self): 167 | td = timedelta(milliseconds=123456) 168 | self.assertEqual(get_total_units(td, TimeUnit.MILLISECOND), 123456) 169 | 170 | def test_second_(self): 171 | td = timedelta(seconds=123) 172 | 
self.assertEqual(get_total_units(td, TimeUnit.SECOND), 123) 173 | 174 | def test_minute_(self): 175 | td = timedelta(minutes=2) 176 | self.assertEqual(get_total_units(td, TimeUnit.MINUTE), 2) 177 | 178 | def test_hour_(self): 179 | td = timedelta(hours=1) 180 | self.assertEqual(get_total_units(td, TimeUnit.HOUR), 1) 181 | 182 | def test_day_(self): 183 | td = timedelta(days=3) 184 | self.assertEqual(get_total_units(td, TimeUnit.DAY), 3) 185 | 186 | def test_year_(self): 187 | td = timedelta(days=365.25) 188 | self.assertAlmostEqual(get_total_units(td, TimeUnit.YEAR), 1.0, places=2) 189 | 190 | def test_invalid_unit_(self): 191 | td = timedelta(seconds=123) 192 | with self.assertRaises(ValueError): 193 | get_total_units(td, "invalid_unit") 194 | 195 | def test_year__(self): 196 | diff = timedelta(days=365) 197 | expected_result = (1, TimeUnit.YEAR) 198 | self.assertEqual(check_time_unit(diff), expected_result) 199 | 200 | def test_day__(self): 201 | diff = timedelta(days=2) 202 | expected_result = (2, TimeUnit.DAY) 203 | self.assertEqual(check_time_unit(diff), expected_result) 204 | 205 | def test_hour__(self): 206 | diff = timedelta(hours=1) 207 | expected_result = (1, TimeUnit.HOUR) 208 | self.assertEqual(check_time_unit(diff), expected_result) 209 | 210 | def test_minute__(self): 211 | diff = timedelta(minutes=2) 212 | expected_result = (2, TimeUnit.MINUTE) 213 | self.assertEqual(check_time_unit(diff), expected_result) 214 | 215 | def test_second__(self): 216 | diff = timedelta(seconds=30) 217 | expected_result = (30, TimeUnit.SECOND) 218 | self.assertEqual(check_time_unit(diff), expected_result) 219 | 220 | def test_millisecond__(self): 221 | diff = timedelta(milliseconds=500) 222 | expected_result = (500, TimeUnit.MILLISECOND) 223 | self.assertEqual(check_time_unit(diff), expected_result) 224 | 225 | def test_microsecond__(self): 226 | diff = timedelta(microseconds=200) 227 | expected_result = (200, TimeUnit.MICROSECOND) 228 | self.assertEqual(check_time_unit(diff), expected_result) 229 | 230 | def test_invalid_time(self): 231 | diff = timedelta(microseconds=0) 232 | with self.assertRaises(ValueError): 233 | check_time_unit(diff) 234 | 235 | def test_convert_datetime_index(self): 236 | # Create a DataFrame with a datetime index 237 | data = {'value': [1, 2, 3, 4, 5]} 238 | index = pd.date_range(start='2023-01-01', periods=5) 239 | dataset = pd.DataFrame(data, index=index) 240 | 241 | # Call the function to convert the index to datetime 242 | convert_dataset_index_to_datetime(dataset) 243 | 244 | # Check if the index is in datetime format 245 | self.assertTrue(is_datetime64_any_dtype(dataset.index)) 246 | 247 | def test_non_datetime_index(self): 248 | # Create a DataFrame with a non-datetime index 249 | data = {'value': [1, 2, 3, 4, 5]} 250 | index = ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'] 251 | dataset = pd.DataFrame(data, index=index) 252 | 253 | # Call the function to convert the index to datetime 254 | convert_dataset_index_to_datetime(dataset) 255 | 256 | # Check if the index is converted to datetime format 257 | self.assertTrue(is_datetime64_any_dtype(dataset.index)) 258 | 259 | def test_conversion(self): 260 | self.assertEqual(convert_seconds_to_time_unit(1, TimeUnit.SECOND), 1) 261 | self.assertEqual(convert_seconds_to_time_unit(60, TimeUnit.MINUTE), 1) 262 | self.assertEqual(convert_seconds_to_time_unit(3600, TimeUnit.HOUR), 1) 263 | 264 | 265 | if __name__ == '__main__': 266 | unittest.main() 267 | 
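The expected shapes asserted above in test_convert_dataframe_to_overlapping_partitions follow the usual sliding-window count. A small sketch of that arithmetic (the formula below is inferred from the test assertions, not taken from the package's documentation):

def expected_partition_count(n_rows: int, width: int, step: int) -> int:
    # Number of overlapping partitions of `width` rows taken every `step` rows.
    return 1 + (n_rows - width) // step

# Matches the assertions in the test: 100 rows, width=2, step=1 -> 99 partitions,
# and 100 rows, width=14, step=7 -> 13 partitions; the last dimension (4) is features + time.
print(expected_partition_count(100, 2, 1))   # 99
print(expected_partition_count(100, 14, 7))  # 13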
-------------------------------------------------------------------------------- /eventdetector_ts/optimization/event_extraction_pipeline.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from math import ceil 3 | from typing import Tuple, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy.signal import find_peaks 8 | 9 | from eventdetector_ts import MIDDLE_EVENT_LABEL, TimeUnit, config_dict 10 | from eventdetector_ts.data.helpers_data import get_timedelta, get_total_units 11 | from eventdetector_ts.optimization import logger 12 | from eventdetector_ts.optimization.algorithms import convolve_with_gaussian_kernel 13 | 14 | 15 | class OptimizationData: 16 | """ 17 | OptimizationData class represents the data used for the event extraction pipeline. 18 | 19 | Attributes: 20 | - time_unit (TimeUnit): Unit of time used in the dataset. 21 | - true_events (pd.DataFrame): DataFrame to store true events. 22 | - predicted_op (np.ndarray): Array to store predicted outcomes. 23 | - delta (Union[int, float]): The maximum time tolerance used to determine the correspondence between a predicted 24 | event and its actual counterpart in the true events. 25 | - s_h (float): A step parameter for the peak height threshold h. 26 | - s_s (int): Step size in time unit for overlapping the partition. 27 | - w_s (int): Size in time unit of the overlapping partition. 28 | - t_max (float): The maximum total time related to sigma. 29 | - output_dir (str): The parent directory. 30 | - big_sigma (int): Value calculated based on t_max, w_s, and s_s. 31 | - overlapping_partitions (np.ndarray): Array to store overlapping partitions. 32 | 33 | """ 34 | 35 | def __init__(self, t_max: float, w_s: int, s_s: int, 36 | s_h: float, 37 | delta: Union[int, float], 38 | output_dir: str, time_unit: TimeUnit): 39 | """ 40 | Initializes the OptimizationData object. 41 | 42 | Args: 43 | t_max (float): The maximum total time related to sigma. 44 | w_s (int): Size in time unit of the overlapping partition. 45 | s_s (int): Step size in time unit for overlapping the partition. 46 | s_h (float): A step parameter for the peak height threshold h. 47 | delta (Union[int, float]): The maximum time tolerance used to determine the correspondence between a predicted 48 | event and its actual counterpart in the true events. 49 | output_dir (str): The parent directory. 50 | time_unit (TimeUnit): Unit of time used in the dataset.
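Example (illustrative only; the numeric values below are arbitrary):

    data = OptimizationData(t_max=90.0, w_s=50, s_s=10, s_h=0.05,
                            delta=15, output_dir="my_run",
                            time_unit=TimeUnit.SECOND)
    # big_sigma is then derived as 1 + ceil((t_max - w_s) / s_s) = 1 + ceil(4.0) = 5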
51 | """ 52 | self.time_unit = time_unit 53 | self.true_events: pd.DataFrame = pd.DataFrame() 54 | self.predicted_op: np.ndarray = np.empty(shape=(0,)) 55 | self.delta = delta 56 | self.s_h = s_h 57 | self.s_s = s_s 58 | self.w_s = w_s 59 | self.t_max = t_max 60 | self.output_dir = output_dir 61 | self.big_sigma = 1 + ceil((self.t_max - self.w_s) / self.s_s) 62 | self.overlapping_partitions: np.ndarray = np.empty(shape=(0,)) 63 | 64 | def set_true_events(self, true_events: pd.DataFrame) -> None: 65 | self.true_events = true_events 66 | 67 | def set_overlapping_partitions(self, overlapping_partitions: np.ndarray): 68 | self.overlapping_partitions = overlapping_partitions 69 | 70 | def set_predicted_op(self, predicted_op: np.ndarray): 71 | self.predicted_op = predicted_op 72 | overlapping_partitions_test = self.overlapping_partitions[-len(predicted_op):] 73 | self.overlapping_partitions = overlapping_partitions_test 74 | first_partition_test_data = self.overlapping_partitions[0] 75 | last_partition_test_data = self.overlapping_partitions[-1] 76 | start_date_test_data = first_partition_test_data[0][-1].to_pydatetime() 77 | end_date_test_data = last_partition_test_data[0][-1].to_pydatetime() 78 | logger.info( 79 | f"Starting and ending dates of test data are respectively {start_date_test_data} --> {end_date_test_data}") 80 | 81 | true_events_test = self.true_events[(self.true_events[MIDDLE_EVENT_LABEL] >= start_date_test_data) & ( 82 | self.true_events[MIDDLE_EVENT_LABEL] <= end_date_test_data)] 83 | self.true_events = true_events_test 84 | 85 | 86 | def get_peaks(h: float, t: np.ndarray, op_g: np.ndarray) -> np.ndarray: 87 | """ 88 | Compute peaks for given mid_times of partitions, op values, and threshold h. 89 | Args: 90 | h (float): Threshold for peaks. 91 | t (np.ndarray): mid_times of partitions 92 | op_g (np.ndarray): op values 93 | 94 | Returns: 95 | np.ndarray: Peaks. 96 | """ 97 | peaks, _ = find_peaks(op_g, height=np.array([h, 1.0])) 98 | return t[peaks] 99 | 100 | 101 | def compute_op_as_mid_times(overlapping_partitions: np.ndarray, op_g: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 102 | """ 103 | Compute op as a function of mid-times of partitions instead of partition's index. 104 | Args: 105 | overlapping_partitions (np.ndarray): overalapping partitions 106 | op_g (np.ndarray): Op array 107 | 108 | Returns: 109 | Tuple[np.ndarray, np.ndarray]: mid-times of partitions, op as a function of mid-times of partitions 110 | """ 111 | t = [] 112 | op_g_ = [] 113 | for n in range(len(op_g)): 114 | w_n = overlapping_partitions[n] 115 | b_n = w_n[0][-1].to_pydatetime() 116 | e_n = w_n[-1][-1].to_pydatetime() 117 | c_n = b_n + (e_n - b_n) / 2 118 | t.append(c_n) 119 | op_g_.append(op_g[n]) 120 | t, op_g_ = np.array(t), np.array(op_g_) 121 | return t, op_g_ 122 | 123 | 124 | class OptimizationCalculator: 125 | def __init__(self, optimization_data: OptimizationData): 126 | self.optimization_data = optimization_data 127 | 128 | def apply_gaussian_filter(self, sigma: int, m: int) -> np.ndarray: 129 | return convolve_with_gaussian_kernel(self.optimization_data.predicted_op, sigma, m=m) 130 | 131 | def __compute_op_as_mid_times(self, op_g: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 132 | return compute_op_as_mid_times(self.optimization_data.overlapping_partitions, op_g) 133 | 134 | def __util_method(self, s_peaks: np.ndarray, delta_with_time_unit: timedelta) -> Tuple[int, int, int, list]: 135 | """ 136 | Useful method for compute_f1score method. 
137 | Args: 138 | s_peaks (np.ndarray): peaks of op. 139 | delta_with_time_unit (timedelta): delta as number in unit time. 140 | 141 | Returns: 142 | tp, fp, fn, delta_t 143 | """ 144 | e_t = self.optimization_data.true_events.copy() 145 | 146 | fp: int = 0 147 | tp: int = 0 148 | delta_t: list = [] 149 | for m_p in s_peaks: 150 | signed_delta = delta_with_time_unit 151 | t_t = None 152 | for i, t_e in enumerate(e_t[MIDDLE_EVENT_LABEL]): 153 | m_t = t_e 154 | diff = m_p - m_t 155 | 156 | if abs(diff) <= delta_with_time_unit: 157 | if t_t is None or abs(m_p - t_t) > abs(diff): 158 | t_t = m_t 159 | signed_delta = diff 160 | 161 | if t_t is not None: 162 | tp += 1 163 | e_t = e_t.drop(e_t[e_t[MIDDLE_EVENT_LABEL] == t_t].index) 164 | diff = get_total_units(timedelta_=signed_delta, unit=self.optimization_data.time_unit) 165 | 166 | delta_t.append(diff) 167 | else: 168 | fp += 1 169 | fn: int = len(e_t) 170 | return tp, fp, fn, delta_t 171 | 172 | def compute_f1score(self, sigma: int, m: int, h: float): 173 | delta_with_time_unit = get_timedelta(self.optimization_data.delta, self.optimization_data.time_unit) 174 | op_g: np.ndarray = self.apply_gaussian_filter(sigma=sigma, m=m) 175 | t, op_g = self.__compute_op_as_mid_times(op_g=op_g) 176 | s_peaks = get_peaks(h=h, t=t, op_g=op_g) 177 | tp, fp, fn, delta_t = self.__util_method(s_peaks=s_peaks, delta_with_time_unit=delta_with_time_unit) 178 | 179 | if tp + fp == 0 or tp + fn == 0: 180 | return 0.0, 0.0, 0.0, [], [] 181 | 182 | precision = tp / (tp + fp) 183 | recall = tp / (tp + fn) 184 | if precision + recall == 0: 185 | return 0.0, 0.0, 0.0, [], [] 186 | return (2.0 * precision * recall) / (precision + recall), precision, recall, s_peaks.tolist(), delta_t 187 | 188 | def evaluate_combination(self, combination): 189 | sigma, m, h = combination 190 | f1_score, precision, recall, peaks, delta_t = self.compute_f1score(sigma, m, h) 191 | formatted_combination = ', '.join(f'{item:.4f}' for item in combination) 192 | if f1_score > 0: 193 | logger.info( 194 | f"Evaluated Combination [sigma, m, h] : [{formatted_combination}] => [F1 Score: {f1_score:.4f}, " 195 | f"Precision: {precision:.4f}, Recall: {recall:.4f}]") 196 | return f1_score, precision, recall, peaks, delta_t 197 | 198 | 199 | class EventOptimization: 200 | """ 201 | After obtaining the predicted op values from the metamodel, they are then processed 202 | through an optimization algorithm to extract the predicted events. This involves applying 203 | a Gaussian filter to smooth out the predictions and identifying peaks in the resulting signal 204 | that correspond to the mid-times of the predicted events, which are then compared to the 205 | actual events in the test set. The performance of the algorithm is evaluated by computing 206 | metrics such as F1-Score, which combines precision and recall using their harmonic means. 207 | Maximizing the F1-Score is the preferred metric for evaluating models since it requires 208 | simultaneously maximizing precision and recall. 
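Example (illustrative sketch; assumes `data` is an OptimizationData instance already populated
with the overlapping partitions, the predicted op values and the true events):

    optimizer = EventOptimization(optimization_data=data)
    peaks, delta_t = optimizer.max_f1score()
    # `peaks` holds the mid-times of the detected events for the best (sigma, m, h)
    # combination; `delta_t` holds the signed offsets (in the dataset's time unit)
    # between each detected event and its matched true event.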
209 | """ 210 | 211 | def __init__(self, optimization_data: OptimizationData): 212 | self.optimization_data = optimization_data 213 | self.optimization_calculator: OptimizationCalculator = OptimizationCalculator(self.optimization_data) 214 | self.results = () 215 | 216 | def max_f1score(self) -> tuple[list, list]: 217 | """ 218 | The optimization process aims to maximize the F1-Score metric by fine-tuning several parameters, 219 | including the filter size (2m + 1) and standard deviation (σ) of the Gaussian filter, 220 | and the peak height threshold h. 221 | 222 | Returns: 223 | list of peaks, delta_t 224 | """ 225 | sigma_range = range(1, self.optimization_data.big_sigma + 1) 226 | h_values = np.arange(0, 1, self.optimization_data.s_h) 227 | # Create a list of all combinations to evaluate 228 | combinations = [(sigma, m, h) for sigma in sigma_range for m in [sigma, 2 * sigma, 3 * sigma] for 229 | h in h_values] 230 | 231 | try: 232 | # Evaluate combinations sequentially 233 | results = [self.optimization_calculator.evaluate_combination(combination) for combination in 234 | combinations] 235 | except ValueError as e: 236 | logger.error(e) 237 | exit(0) 238 | 239 | # Find the combination with the maximum F1 score 240 | best_combination_index = np.argmax(list(map(lambda metrics: metrics[0], results))) 241 | best_combination = combinations[best_combination_index] 242 | config_dict["best_combination"] = best_combination 243 | self.results = results[best_combination_index] 244 | max_f1_score, precision, recall, peaks, delta_t = self.results 245 | 246 | formatted_combination = ', '.join(f'{item:.4f}' for item in best_combination) 247 | logger.warning( 248 | f"Best Combination [sigma, m, h] : [{formatted_combination}] => " 249 | f"[Max F1 Score: {max_f1_score:.4f} => Precision:{precision:.4f}, Recall:{recall:.4f}]") 250 | return peaks, delta_t 251 | -------------------------------------------------------------------------------- /eventdetector_ts/plotter/plotter.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from typing import Dict 4 | 5 | import matplotlib.dates as mdates 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | from matplotlib.patches import Patch 11 | 12 | from eventdetector_ts import OUTPUT_DIR, TimeUnit, MIDDLE_EVENT_LABEL 13 | from eventdetector_ts.data.helpers_data import get_timedelta 14 | from eventdetector_ts.plotter import logger, COLOR_TRUE, COLOR_PREDICTED, STYLE_PREDICTED, STYLE_TRUE, FIG_SIZE, PALETTE 15 | from eventdetector_ts.plotter.helpers import event_to_rectangle 16 | 17 | 18 | class Plotter: 19 | """ 20 | The Plotter class is responsible for generating and saving plots of the predicted and true op, events, delta_t,... 21 | It provides a convenient way to visualize and compare the performance of a 22 | predictive model against the actual observed values. 23 | """ 24 | 25 | def __init__(self, root_dir: str, time_unit: TimeUnit, width_events_s: float) -> None: 26 | """ 27 | Initialize the Plotter object. 28 | 29 | Args: 30 | root_dir (str): The root directory for saving the plots. 31 | time_unit (TimeUnit): The unit time of the dataset. 32 | width_events_s (float): The width of events in the unit of time for the dataset. 
33 | """ 34 | 35 | self.val_losses = {} 36 | self.train_losses = {} 37 | self.val_loss_meta_model: list = [] 38 | self.train_loss_meta_model: list = [] 39 | self.width_events_s = width_events_s 40 | self.time_unit = time_unit 41 | # Whether to display the plots or not. Defaults to False. 42 | self.show = True 43 | self.root_dir = root_dir 44 | self.predicted_y: np.ndarray = np.empty(shape=(0,)) 45 | self.test_y: np.ndarray = np.empty(shape=(0,)) 46 | self.predicted_events: list = [] 47 | self.true_events: pd.DataFrame = pd.DataFrame() 48 | self.delta_t: list = [] 49 | self.working_dir = os.path.join(root_dir, OUTPUT_DIR) 50 | os.makedirs(self.working_dir) 51 | 52 | def set_show(self, show: bool) -> None: 53 | """ 54 | Set show value 55 | Args: 56 | show (bool): Value to set for 'self.show' 57 | 58 | Returns: 59 | None 60 | """ 61 | self.show = show 62 | 63 | def set_data_op(self, test_y: np.ndarray, predicted_y: np.ndarray) -> None: 64 | """ 65 | Set test_y and predicted_y 66 | Args: 67 | test_y: The true op values 68 | predicted_y: The predicted op values 69 | 70 | Returns: 71 | None 72 | """ 73 | self.test_y = test_y 74 | self.predicted_y = predicted_y 75 | 76 | def set_data_events(self, predicted_events: list, true_events: pd.DataFrame) -> None: 77 | """ 78 | Set true and predicted events 79 | Args: 80 | predicted_events (list): List of predicted events computed by the optimization process 81 | true_events (pd.DataFrame): DataFrame of true events 82 | 83 | Returns: 84 | None 85 | """ 86 | self.predicted_events = predicted_events 87 | self.true_events = true_events 88 | 89 | def set_delta_t(self, delta_t: list) -> None: 90 | """ 91 | Set delta_t 92 | Args: 93 | delta_t (list): Each item of this list contains the accepted delta in time unit between 94 | true event its correspondent in the list of predicted events 95 | 96 | Returns: 97 | None 98 | """ 99 | self.delta_t = delta_t 100 | 101 | def set_losses(self, train_losses: Dict[str, list], val_losses: Dict[str, list], 102 | train_loss_meta_model: list, val_loss_meta_model: list) -> None: 103 | """ 104 | Set losses of all trained models. 105 | Args: 106 | train_losses (Dict[str, list]): train losses. 107 | val_losses (Dict[str, list]): val losses. 108 | train_loss_meta_model (list): train loss for the metamodel. 109 | val_loss_meta_model (list): val loss for the metamodel. 110 | Returns: 111 | None 112 | """ 113 | self.train_losses = train_losses 114 | self.val_losses = val_losses 115 | self.train_loss_meta_model = train_loss_meta_model 116 | self.val_loss_meta_model = val_loss_meta_model 117 | 118 | def plot_prediction(self) -> None: 119 | """ 120 | Plot the true and the predicted op and save it. 
121 | 122 | Returns: 123 | None 124 | """ 125 | 126 | logger.info("Plotting and saving the figure displaying the true and the predicted op") 127 | # Create the plot using Seaborn 128 | # Set the ggplot style 129 | sns.set(style="ticks", palette=PALETTE) 130 | plt.figure(figsize=FIG_SIZE) # Set the figure size 131 | # Plot the true and predicted values using Seaborn 132 | n = len(self.test_y) 133 | sns.lineplot(x=np.arange(n), y=self.test_y, color=COLOR_TRUE, label='True Op') 134 | sns.lineplot(x=np.arange(n), y=self.predicted_y, color=COLOR_PREDICTED, label='Predicted Op') 135 | # Add labels and title to the plot 136 | plt.xlabel('Windows') 137 | plt.ylabel('Op') 138 | plt.title('True Op vs Predicted Op') 139 | # Add legend 140 | plt.legend() 141 | # Save the plot to a file 142 | path = os.path.join(self.working_dir, "op.png") 143 | plt.savefig(path, dpi=300) 144 | # Show the plot 145 | if self.show: 146 | plt.show() 147 | self.__save_op() 148 | 149 | def plot_predicted_events(self) -> None: 150 | """ 151 | Plot the true and the predicted events and save it. 152 | 153 | Returns: 154 | None 155 | """ 156 | 157 | logger.info("Plotting and saving the figure displaying the true events and the predicted events") 158 | fig, ax = plt.subplots(figsize=FIG_SIZE) 159 | sns.set(style="ticks", palette=PALETTE) 160 | 161 | for i, predicted_event in enumerate(self.predicted_events): 162 | rect1 = event_to_rectangle(event=predicted_event, width_events_s=self.width_events_s, 163 | time_unit=self.time_unit, 164 | color=COLOR_PREDICTED, 165 | style=STYLE_PREDICTED) 166 | ax.add_patch(rect1) 167 | 168 | for _, test_date in self.true_events[MIDDLE_EVENT_LABEL].items(): 169 | rect1 = event_to_rectangle(event=test_date, width_events_s=self.width_events_s, time_unit=self.time_unit, 170 | color=COLOR_TRUE, 171 | style=STYLE_TRUE) 172 | ax.add_patch(rect1) 173 | 174 | locator = mdates.AutoDateLocator(minticks=3) 175 | formatter = mdates.AutoDateFormatter(locator) 176 | ax.xaxis.set_major_locator(locator) 177 | ax.xaxis.set_major_formatter(formatter) 178 | 179 | start_time = self.true_events[MIDDLE_EVENT_LABEL].iloc[0] 180 | end_time = self.true_events[MIDDLE_EVENT_LABEL].iloc[-1] 181 | ax.set_xlim([start_time, end_time]) 182 | ax.set_ylim([0.0, 1.01]) 183 | 184 | predicted_patch = Patch(color=COLOR_PREDICTED, label='Predicted Events') 185 | true_patch = Patch(color=COLOR_TRUE, label='True Events') 186 | ax.legend(handles=[predicted_patch, true_patch], edgecolor="black") 187 | 188 | # Save the plot to a file 189 | path = os.path.join(self.working_dir, "events.png") 190 | plt.savefig(path, dpi=300) 191 | # Show the plot 192 | if self.show: 193 | plt.show() 194 | self.__save_events() 195 | 196 | def plot_delta_t(self, bins=30) -> None: 197 | """ 198 | Plots a histogram for delta t. 199 | 200 | Args: 201 | bins (int): The number of bins in the histogram. Default is 10. 
202 | 203 | Returns: 204 | None 205 | """ 206 | sns.set(style="ticks", palette=PALETTE) 207 | plt.figure(figsize=FIG_SIZE) 208 | 209 | sns.histplot(self.delta_t, bins=bins, binrange=(-self.width_events_s, self.width_events_s)) 210 | 211 | plt.xlabel(f'delta ({self.time_unit})') 212 | plt.ylabel('Number of events') 213 | 214 | std = np.std(self.delta_t) 215 | mu = np.mean(self.delta_t) 216 | 217 | plt.title(f'Histogram std = {std:.2f}, mu = {mu:.2f}') 218 | # Save the plot to a file 219 | path = os.path.join(self.working_dir, "delta_t.png") 220 | plt.savefig(path, dpi=300) 221 | # Show the plot 222 | if self.show: 223 | plt.show() 224 | 225 | def plot_losses(self): 226 | """ 227 | Plot losses for all trained models. 228 | Returns: 229 | None 230 | """ 231 | meta_model_was_used: bool = len(self.val_loss_meta_model) > 0 232 | 233 | sns.set(style="ticks", palette=PALETTE) 234 | if meta_model_was_used: 235 | fig, (ax1, ax2) = plt.subplots(1, 2) 236 | fig.set_size_inches((11, 8.5), forward=False) 237 | else: 238 | fig, ax1 = plt.subplots(figsize=FIG_SIZE) 239 | y_label = 'Loss' 240 | x_label = 'Epochs' 241 | colors = sns.color_palette(PALETTE, len(self.val_losses)) 242 | lifestyle_val = '--' 243 | lifestyle_train = '-' 244 | for i, (model_name, val_loss) in enumerate(self.val_losses.items()): 245 | epochs = range(1, len(val_loss) + 1) 246 | train_loss = self.train_losses[model_name] 247 | ax1.plot(epochs, train_loss, linestyle=lifestyle_train, color=colors[i], 248 | label='Training Loss - {}'.format(model_name)) 249 | ax1.plot(epochs, val_loss, linestyle=lifestyle_val, color=colors[i], 250 | label='Validation Loss - {}'.format(model_name)) 251 | ax1.set_ylabel(y_label) 252 | ax1.set_xlabel(x_label) 253 | ax1.legend() 254 | 255 | if len(self.val_loss_meta_model) > 0: 256 | epochs_meta = range(1, len(self.val_loss_meta_model) + 1) 257 | ax2.plot(epochs_meta, self.train_loss_meta_model, linestyle=lifestyle_train, color='b', 258 | label='Training Loss - Meta Model') 259 | ax2.plot(epochs_meta, self.val_loss_meta_model, linestyle=lifestyle_val, color='g', 260 | label='Validation Loss - Meta Model') 261 | ax2.set_ylabel(y_label) 262 | ax2.set_xlabel(x_label) 263 | ax2.legend() 264 | 265 | fig.suptitle('Training and Validation Losses') 266 | plt.tight_layout() 267 | # Save the plot to a file 268 | path = os.path.join(self.working_dir, "losses.png") 269 | plt.savefig(path, dpi=300) 270 | # Show the plot 271 | if self.show: 272 | plt.show() 273 | 274 | def __save_events(self) -> None: 275 | """ 276 | Save predicted events/true events to csv files. 
277 | 278 | Returns: 279 | None 280 | """ 281 | path = os.path.join(self.working_dir, "predicted_events.csv") 282 | radius = get_timedelta(float(self.width_events_s) / 2.0, self.time_unit) 283 | with open(path, 'w', encoding='UTF8', newline='') as f: 284 | writer = csv.writer(f, delimiter=' ') 285 | for i in range(len(self.predicted_events)): 286 | predicted_event = self.predicted_events[i] 287 | start_time = predicted_event - radius 288 | end_time = predicted_event + radius 289 | 290 | start_time = start_time.replace(microsecond=0) 291 | end_time = end_time.replace(microsecond=0) 292 | 293 | writer.writerow([start_time.isoformat(), end_time.isoformat()]) 294 | 295 | path = os.path.join(self.working_dir, "true_events.csv") 296 | with open(path, 'w', encoding='UTF8', newline='') as f: 297 | writer = csv.writer(f, delimiter=' ') 298 | for _, test_date in enumerate(self.true_events[MIDDLE_EVENT_LABEL]): 299 | start_time = test_date - radius 300 | end_time = test_date + radius 301 | 302 | start_time = start_time.replace(microsecond=0) 303 | end_time = end_time.replace(microsecond=0) 304 | 305 | writer.writerow([start_time.isoformat(), end_time.isoformat()]) 306 | 307 | def __save_op(self) -> None: 308 | """ 309 | Save predicted/true Op into csv file. 310 | 311 | Returns: 312 | None 313 | """ 314 | df = pd.DataFrame({'True-Op': self.test_y, 'Predicted-Op': self.predicted_y}) 315 | path = os.path.join(self.working_dir, "op.csv") 316 | df.to_csv(path, index=True, sep=" ") 317 | -------------------------------------------------------------------------------- /eventdetector_ts/models/models_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Tuple 3 | 4 | import joblib 5 | import numpy as np 6 | import tensorflow as tf 7 | from numpy import ndarray 8 | from sklearn.model_selection import KFold, train_test_split 9 | from sklearn.preprocessing import StandardScaler 10 | 11 | from eventdetector_ts import MODELS_DIR, META_MODEL_NETWORK, config_dict, TYPE_TRAINING_FFN, SCALERS_DIR, \ 12 | META_MODEL_SCALER 13 | from eventdetector_ts.metamodel.utils import DataSplitter 14 | from eventdetector_ts.models import logger_models 15 | from eventdetector_ts.models.helpers_models import CustomEarlyStopping, custom_cross_val_score 16 | from eventdetector_ts.models.models_builder import ModelBuilder 17 | 18 | 19 | class ModelTrainer: 20 | """ 21 | A class used to train and evaluate machine learning models. 22 | 23 | Attributes: 24 | data_splitter (DataSplitter): An object of the DataSplitter class, which is used to split the data 25 | into train and test sets. 26 | epochs (int): The number of epochs to train the models. 27 | batch_size (int): The batch size to use during training. 28 | pa (int): The patience value to use for the EarlyStopping callback. 29 | t_r (float): The ratio value to use for the CustomEarlyStopping callback. 30 | use_kfold (bool): Whether to use K-Fold cross-validation or not. 31 | val_size (float): The size of the validation set to use during training. 32 | epsilon (float): A small constant used to control the size of set which contains the top models 33 | with the lowest MSE values. 34 | save_models_as_dot_format (bool): Whether to save the models as a dot format file. 35 | The default value is False. If set to True, then you should have graphviz software 36 | to be installed on your machine. 37 | train_losses (Dict[str, np.ndarray]): A dictionary containing the training losses for each model. 
38 | val_losses (Dict[str, np.ndarray]): A dictionary containing the validation losses for each model. 39 | val_loss_meta_model (np.ndarray): val loss for the meta_model. 40 | train_loss_meta_model (np.ndarray): train loss for the meta_model 41 | """ 42 | 43 | def __init__(self, data_splitter: DataSplitter, epochs: int, 44 | batch_size: int, pa: int, t_r: float, 45 | use_kfold: bool, val_size: float, epsilon: float, save_models_as_dot_format: bool) -> None: 46 | """ 47 | Initialize the ModelTrainer object. 48 | 49 | Args: 50 | data_splitter (DataSplitter): An object of the DataSplitter class, which is used to split the data 51 | into train and test sets. 52 | epochs (int): The number of epochs to train the models. 53 | batch_size (int): The batch size to use during training. 54 | pa (int): The patience value to use for the EarlyStopping callback. 55 | t_r (float): The ratio value to use for the CustomEarlyStopping callback. 56 | use_kfold (bool): Whether to use K-Fold cross-validation or not. 57 | val_size (float): The size of the validation set to use during training. 58 | epsilon (float): A small constant used to control the size of set which contains the top models 59 | with the lowest MSE values. 60 | save_models_as_dot_format (bool): Whether to save the models as a dot format file. 61 | The default value is False. If set to True, then you should have graphviz software 62 | to be installed on your machine. 63 | """ 64 | 65 | self.val_loss_meta_model: list = [] 66 | self.train_loss_meta_model: list = [] 67 | self.save_models_as_dot_format = save_models_as_dot_format 68 | self.best_models: Dict[str, tf.keras.Model] = {} 69 | self.train_losses: Dict[str, list] = {} 70 | self.val_losses: Dict[str, list] = {} 71 | self.data_splitter: DataSplitter = data_splitter 72 | self.epochs: int = epochs 73 | self.batch_size: int = batch_size 74 | self.pa = pa 75 | self.t_r = t_r 76 | self.use_kfold = use_kfold 77 | self.val_size = val_size 78 | self.epsilon = epsilon 79 | 80 | def fitting_models(self, created_models: Dict[str, tf.keras.Model]) -> None: 81 | """ 82 | Fits the created models to the training data and saves the training and validation losses. 83 | 84 | Args: 85 | created_models: A dictionary containing the created models with their names as keys 86 | and the models as values. 
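Example (illustrative; the dictionary keys are hypothetical model names produced by the
model-building step, and `trainer` is a ModelTrainer instance):

    created_models = {"LSTM_0": lstm_model, "GRU_0": gru_model}
    trainer.fitting_models(created_models)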
87 | 88 | Returns: 89 | None 90 | """ 91 | # Define early stopping based on validation loss 92 | early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=self.pa * 2) 93 | # Define custom early stopping based on a ratio and patience 94 | custom_early_stopping = CustomEarlyStopping(ratio=self.t_r, patience=self.pa, verbose=1) 95 | # Loop through each model in the created models dictionary 96 | for model_name, model in created_models.items(): 97 | # If using k-fold cross-validation 98 | if self.use_kfold: 99 | logger_models.info("Performing a KFold cross-validation") 100 | # Calculate cross validation score using custom function 101 | score: np.ndarray = custom_cross_val_score(model=model, x=self.data_splitter.train_x, 102 | y=self.data_splitter.train_y, 103 | cv=KFold(n_splits=5, shuffle=False), epochs=self.epochs, 104 | batch_size=self.batch_size, 105 | callbacks=[early_stopping, custom_early_stopping]) 106 | # Print cross validation score for the current model 107 | logger_models.info(f"The cross validation score for {model_name} is {score}") 108 | # Split training data into training and validation sets 109 | train_x, val_x, train_y, val_y = train_test_split(self.data_splitter.train_x, self.data_splitter.train_y, 110 | test_size=self.val_size, 111 | shuffle=False) 112 | # Print message indicating fitting of a current model 113 | logger_models.info(f"Summary of {model_name}...") 114 | logger_models.info(model.summary()) 115 | logger_models.info(f"Fitting of {model_name}...") 116 | # Fit the model using training data and validate using validation data 117 | history = model.fit(train_x, train_y, epochs=self.epochs, 118 | batch_size=self.batch_size, verbose=1, 119 | validation_data=(val_x, val_y), 120 | callbacks=[early_stopping, custom_early_stopping]) 121 | # Save training and validation errors for the current model 122 | self.train_losses[model_name] = history.history['loss'] 123 | self.val_losses[model_name] = history.history['val_loss'] 124 | 125 | losses_test_data: Dict[str, tf.keras.Model] = {} 126 | min_loss = np.inf 127 | for model_name, model in created_models.items(): 128 | logger_models.info(f"Evaluating model {model_name} on test data") 129 | loss = model.evaluate(self.data_splitter.test_x, self.data_splitter.test_y, batch_size=self.batch_size) 130 | logger_models.info(f"The loss value of model {model_name} on test data is {loss:.4f}") 131 | losses_test_data[model_name] = loss 132 | if min_loss > loss: 133 | min_loss = loss 134 | 135 | logger_models.info(f"Selecting best models based on the min MSE {min_loss:.4f} and epsilon {self.epsilon}:") 136 | for model_name, loss_ in losses_test_data.items(): 137 | if loss_ <= (min_loss + self.epsilon): 138 | self.best_models[model_name] = created_models[model_name] 139 | logger_models.info(f"Best models selected: {self.best_models.keys()}") 140 | 141 | config_dict["models"] = list(self.best_models.keys()) 142 | 143 | def save_best_models(self, output_dir: str) -> None: 144 | """ 145 | Save the best models to the specified output directory. 146 | 147 | Args: 148 | output_dir (str): The directory to save the best models. 
149 | 150 | Returns: 151 | None 152 | """ 153 | 154 | for model_name, model in self.best_models.items(): 155 | # Print the name of the current model being saved 156 | logger_models.info(f"Current model to be saved on the disk is {model_name}") 157 | model_name_with_ext = f"{model_name}.keras" 158 | model_path = os.path.join(output_dir, MODELS_DIR, model_name_with_ext) 159 | 160 | # Save in Keras 3 compatible format 161 | model.save(model_path, save_format='keras') 162 | 163 | logger_models.info("Models saved successfully.") 164 | 165 | def train_meta_model(self, type_training: str, hyperparams_mm_network: Tuple[int, int], output_dir: str) \ 166 | -> tuple[ndarray, float, ndarray]: 167 | """ 168 | Trains the metamodel using the best models predictions as features. 169 | 170 | Args: 171 | type_training: The type of training to use, either "ffn" or "mean". 172 | hyperparams_mm_network: A tuple containing the hyperparameters the MetaModel network. 173 | output_dir: The directory to save the trained models to. 174 | 175 | Returns: 176 | A tuple containing the final prediction and the loss. 177 | """ 178 | predictions = [] 179 | for model_name, model in self.best_models.items(): 180 | # Make predictions for the test set using each model 181 | predicted_y: np.ndarray = model.predict(self.data_splitter.test_x, batch_size=self.batch_size) 182 | predicted_y = predicted_y.flatten() 183 | predictions.append(predicted_y) 184 | 185 | # Convert a list of 1D NumPy arrays to 2D NumPy array 186 | x = np.stack(predictions, axis=1) 187 | 188 | if type_training == TYPE_TRAINING_FFN: 189 | logger_models.info("Train the MetaModel using a FFN to produce a final prediction") 190 | # Split the data into training and test sets 191 | train_x, test_x, train_y, test_y = train_test_split(x, self.data_splitter.test_y, 192 | test_size=self.data_splitter.test_size, 193 | shuffle=False) 194 | scaler = StandardScaler() 195 | train_x = scaler.fit_transform(train_x) 196 | test_x = scaler.transform(test_x) 197 | scalers_dir = os.path.join(output_dir, SCALERS_DIR) 198 | scaler_path = os.path.join(scalers_dir, f"{META_MODEL_SCALER}.joblib") 199 | joblib.dump(scaler, scaler_path) 200 | # Build the FFN model 201 | inputs = tf.keras.Input(shape=(train_x.shape[1],), name="input") 202 | layers, units = hyperparams_mm_network 203 | model_builder: ModelBuilder = ModelBuilder(inputs=inputs) 204 | 205 | for _ in range(layers): 206 | units_j = units 207 | model_builder.add_dense_layer(units=units_j) 208 | model_builder.add_dense_layer(units=1, dropout=None) 209 | keras_model = model_builder.build(name=META_MODEL_NETWORK, root_dir=output_dir, 210 | save_models_as_dot_format=self.save_models_as_dot_format) 211 | # Train the model 212 | logger_models.info("Fitting the MetaModel network...") 213 | history = keras_model.fit(train_x, train_y, epochs=self.epochs, batch_size=self.batch_size, verbose=1, 214 | validation_data=(test_x, test_y)) 215 | 216 | path = os.path.join(output_dir, MODELS_DIR) 217 | model_path = os.path.join(path, META_MODEL_NETWORK) 218 | keras_model.save(model_path, save_format='keras') 219 | logger_models.info("MetaModel network saved successfully.") 220 | self.train_loss_meta_model = history.history['loss'] 221 | self.val_loss_meta_model = history.history['val_loss'] 222 | 223 | # final_prediction: np.ndarray = keras_model.predict(self.data_splitter.test_x, batch_size=self.batch_size) 224 | final_prediction: np.ndarray = keras_model.predict(test_x, batch_size=self.batch_size) 225 | final_prediction = 
final_prediction.flatten() 226 | return final_prediction, tf.keras.losses.mse(final_prediction, test_y), test_y 227 | else: 228 | # Compute the average prediction 229 | logger_models.info("Compute the average of predictions to produce a final prediction") 230 | final_prediction = np.mean(x, axis=1) 231 | return final_prediction, tf.keras.losses.mse(final_prediction, 232 | self.data_splitter.test_y), self.data_splitter.test_y 233 | -------------------------------------------------------------------------------- /images/logo_eventdetector.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /eventdetector_ts/metamodel/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from typing import Dict, Tuple 4 | 5 | import joblib 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler 10 | 11 | from eventdetector_ts import MIN_MAX_SCALER, ROBUST_SCALER, SCALERS_DIR, FILL_NAN_ZEROS, FILL_NAN_FFILL, \ 12 | FILL_NAN_BFILL, FILL_NAN_MEDIAN, RNN_BIDIRECTIONAL, CONV_LSTM1D, RNN_ENCODER_DECODER, FFN, CNN_RNN, \ 13 | GRU, CNN, SELF_ATTENTION, LSTM, TYPE_TRAINING_AVERAGE, TYPE_TRAINING_FFN, STANDARD_SCALER, TRANSFORMER 14 | from eventdetector_ts.data.helpers_data import InvalidArgumentError 15 | 16 | 17 | class DataSplitter: 18 | """ 19 | A class for splitting and scaling data into training, test sets and applying scalers to each 20 | time step in the data. 21 | """ 22 | 23 | def __init__(self, test_size: float, scaler_type: str): 24 | """ 25 | Initialize the DataSplitter object. 26 | 27 | Args: 28 | test_size: The fraction of data to use for testing. 29 | scaler_type: The type of scaler to use. 30 | """ 31 | 32 | self.train_x: np.ndarray = np.empty(shape=(0,)) 33 | self.test_x: np.ndarray = np.empty(shape=(0,)) 34 | self.train_y: np.ndarray = np.empty(shape=(0,)) 35 | self.test_y: np.ndarray = np.empty(shape=(0,)) 36 | self.scalers: Dict[int, StandardScaler | MinMaxScaler | ROBUST_SCALER] = {} 37 | self.test_size: float = test_size 38 | self.scaler_type: str = scaler_type 39 | 40 | def split_data_and_apply_scaler(self, x: np.ndarray, y: np.ndarray) -> None: 41 | """ 42 | Split the data into training, validation, and test sets and apply the specified scaler to each time step. 43 | 44 | Args: 45 | x: The input data with shape (n_samples, n_time_steps, n_features). 46 | y: The target data with shape (n_samples,). 47 | 48 | Returns: 49 | A tuple containing the training, validation, and test sets as numpy arrays and a dictionary of scalers. 50 | """ 51 | assert x.ndim == 3, "x must be a 3D array." 52 | assert y.ndim == 1, "y must be a 1D array." 53 | assert x.shape[0] == y.shape[0], "x and y must have the same number of samples." 
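# Note: the split below is chronological (shuffle=False); the scalers are fitted on the
# training partitions only and merely applied (transform) to the test partitions, which
# avoids leaking test-set statistics into the scaling step.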
54 | 55 | # Split the data into training and test sets 56 | self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(x, y, test_size=self.test_size, 57 | shuffle=False) 58 | 59 | n_time_steps = x.shape[1] 60 | 61 | self.scalers = {} 62 | # Apply scaler to each time step 63 | for i in range(n_time_steps): 64 | scaler = StandardScaler() 65 | if self.scaler_type == MIN_MAX_SCALER: 66 | scaler = MinMaxScaler() 67 | elif self.scaler_type == ROBUST_SCALER: 68 | scaler = RobustScaler() 69 | self.scalers[i] = scaler 70 | self.train_x[:, i, :] = self.scalers[i].fit_transform(self.train_x[:, i, :]) 71 | self.test_x[:, i, :] = self.scalers[i].transform(self.test_x[:, i, :]) 72 | 73 | def save_scalers(self, output_dir: str) -> None: 74 | """ 75 | Saves the scalers to disk. 76 | 77 | Args: 78 | output_dir: the directory where the scalers should be saved 79 | 80 | Returns: 81 | None 82 | """ 83 | # Create the directory if it doesn't exist 84 | scalers_dir = os.path.join(output_dir, SCALERS_DIR) 85 | if not os.path.exists(scalers_dir): 86 | os.makedirs(scalers_dir) 87 | 88 | # Save each scaler to disk 89 | n_time_steps: int = self.test_x.shape[1] 90 | for i in range(n_time_steps): 91 | # Generate the path to save the scaler to 92 | scaler_i_path = os.path.join(scalers_dir, f'scaler_{i}.joblib') 93 | # Print progress 94 | print("\rSaving scaling...{}/{}".format(i + 1, n_time_steps), end="") 95 | # Save the scaler to disk 96 | joblib.dump(self.scalers[i], scaler_i_path) 97 | print() 98 | 99 | 100 | def validate_required_args(meta_model) -> None: 101 | """ 102 | Validate the required arguments of the MetaModel. 103 | 104 | Args: 105 | meta_model (MetaModel): A MetaModel instance. 106 | 107 | Returns: 108 | None 109 | 110 | Raises: 111 | ValueError: If any of the arguments are invalid. 
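Example of arguments that pass these checks (illustrative only):

    meta_model.dataset      # a non-empty pd.DataFrame with len(dataset) >= meta_model.width
    meta_model.events = [['2022-01-01', '2022-01-02'], ['2022-01-05', '2022-01-06']]
    meta_model.output_dir = "run_01"    # must match ^\w+$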
112 | """ 113 | 114 | __validate_required_args(meta_model=meta_model) 115 | 116 | if meta_model.dataset is None or meta_model.dataset.empty: 117 | raise InvalidArgumentError("dataset cannot be None or empty.") 118 | elif not isinstance(meta_model.dataset, pd.DataFrame): 119 | raise InvalidArgumentError("dataset should be a Pandas DataFrame.") 120 | 121 | if len(meta_model.dataset) < meta_model.width: 122 | raise InvalidArgumentError("Dataset length is smaller than the given partition width.") 123 | 124 | if meta_model.events is None or (isinstance(meta_model.events, pd.DataFrame) and meta_model.events.empty) or \ 125 | (isinstance(meta_model.events, list) and len(meta_model.events) == 0): 126 | raise InvalidArgumentError("Events is empty or None.") 127 | elif not isinstance(meta_model.events, (list, pd.DataFrame)): 128 | raise InvalidArgumentError("Events should be a list or a Pandas DataFrame.") 129 | 130 | if not re.match(r"^\w+$", meta_model.output_dir): 131 | raise InvalidArgumentError( 132 | "Output directory name can only contain alphanumeric characters and underscores.") 133 | 134 | 135 | def __validate_required_args(meta_model) -> None: 136 | if not isinstance(meta_model.step, int) or meta_model.step <= 0: 137 | raise InvalidArgumentError("step should be a positive integer.") 138 | 139 | if not isinstance(meta_model.width, int) or meta_model.width <= meta_model.step: 140 | raise InvalidArgumentError(f"width should be greater than step = {meta_model.step}.") 141 | 142 | if meta_model.width_events is not None and not isinstance(meta_model.width_events, 143 | (int, float)) and meta_model.width_events <= 0: 144 | raise InvalidArgumentError("width_events should be either a positive integer or positive float.") 145 | 146 | 147 | def validate_args(meta_model) -> None: 148 | """ 149 | Validate the arguments of the MetaModel. 150 | 151 | Args: 152 | meta_model (MetaModel): A MetaModel instance. 153 | 154 | Returns: 155 | None 156 | 157 | Raises: 158 | ValueError: If any of the arguments are invalid. 159 | """ 160 | 161 | validate_args_1(meta_model) 162 | validate_args_2(meta_model) 163 | validate_args_3(meta_model) 164 | validate_args_4(meta_model) 165 | validate_args_5(meta_model) 166 | 167 | if len(meta_model.hyperparams_mm_network) != 3: 168 | raise ValueError("hyperparams_mm_network must be a tuple of length 3") 169 | 170 | if not all(isinstance(val, int) for val in meta_model.hyperparams_mm_network[:-1]): 171 | raise ValueError("hyperparams_mm_network values must be integers except the last which is" 172 | " the activation function (str)") 173 | 174 | if not isinstance(meta_model.save_models_as_dot_format, bool): 175 | raise InvalidArgumentError("Invalid save_models_as_dot_format parameter: must be a boolean.") 176 | 177 | if meta_model.dropout is None or not 0 <= meta_model.dropout < 1 or not isinstance(meta_model.dropout, float): 178 | raise InvalidArgumentError("Invalid dropout parameter: must be a float between 0 and 1.0.") 179 | 180 | 181 | def validate_args_1(meta_model) -> None: 182 | """ 183 | Validate the arguments of the MetaModel. 184 | 185 | Args: 186 | meta_model (MetaModel): A MetaModel instance. 187 | 188 | Returns: 189 | None 190 | 191 | Raises: 192 | ValueError: If any of the arguments are invalid. 193 | """ 194 | 195 | if meta_model.fill_nan not in [FILL_NAN_ZEROS, FILL_NAN_FFILL, FILL_NAN_BFILL, FILL_NAN_MEDIAN]: 196 | raise InvalidArgumentError( 197 | f"Invalid method for filling NaN values. 
Supported methods are" 198 | f" {FILL_NAN_ZEROS}, {FILL_NAN_FFILL}, {FILL_NAN_BFILL}, and {FILL_NAN_MEDIAN}.") 199 | 200 | if not isinstance(meta_model.epochs, int) or meta_model.epochs <= 0: 201 | raise InvalidArgumentError("epochs should be a positive integer.") 202 | 203 | if not isinstance(meta_model.batch_size, int) or meta_model.batch_size <= 0: 204 | raise InvalidArgumentError("batch_size should be a positive integer.") 205 | 206 | if not isinstance(meta_model.t_max, float) and not isinstance(meta_model.t_max, int): 207 | raise InvalidArgumentError("t_max should be float/int.") 208 | 209 | if meta_model.t_max <= meta_model.w_s: 210 | raise InvalidArgumentError(f"t_max should be greater than w_s {meta_model.w_s}.") 211 | 212 | if not isinstance(meta_model.delta, (int, float)) or meta_model.delta <= 0: 213 | raise InvalidArgumentError("delta should be either a positive integer or positive float.") 214 | 215 | if not (0 < meta_model.s_h < 1): 216 | raise InvalidArgumentError("s_h should be a float between 0 and 1 exclusive.") 217 | 218 | if not isinstance(meta_model.epsilon, float) or not (0 < meta_model.epsilon <= 1): 219 | raise InvalidArgumentError("epsilon should be a positive number between 0 and 1.") 220 | 221 | if not isinstance(meta_model.pa, int) or meta_model.pa <= 0: 222 | raise InvalidArgumentError("pa should be a positive integer.") 223 | 224 | 225 | def validate_args_2(meta_model) -> None: 226 | """ 227 | Validate the arguments of the MetaModel. 228 | 229 | Args: 230 | meta_model (MetaModel): A MetaModel instance. 231 | 232 | Returns: 233 | None 234 | 235 | Raises: 236 | ValueError: If any of the arguments are invalid. 237 | """ 238 | 239 | if not isinstance(meta_model.t_r, float) or not (0 < meta_model.t_r <= 1): 240 | raise InvalidArgumentError("t_r should be a positive number between 0 and 1.") 241 | 242 | if meta_model.time_window is not None and ( 243 | not isinstance(meta_model.time_window, (int, float)) or meta_model.time_window <= 0): 244 | raise InvalidArgumentError("time_window should be either a positive integer or positive float.") 245 | 246 | if not all(isinstance(model, (str, tuple)) and 247 | (isinstance(model, str) or (isinstance(model, tuple) and len(model) == 2 and isinstance(model[0], 248 | str) and 249 | isinstance(model[1], int))) 250 | for model in meta_model.models): 251 | raise InvalidArgumentError( 252 | "Invalid format for models. It should be a list of strings or tuples of (string, integer).") 253 | 254 | 255 | def validate_model_type(model): 256 | if model not in [LSTM, GRU, CNN, RNN_BIDIRECTIONAL, CONV_LSTM1D, RNN_ENCODER_DECODER, CNN_RNN, 257 | SELF_ATTENTION, FFN, TRANSFORMER]: 258 | raise InvalidArgumentError( 259 | f"Invalid model type {model}. Supported models are {LSTM}, {GRU}, {CNN}, {RNN_BIDIRECTIONAL}," 260 | f" {CONV_LSTM1D}, {RNN_ENCODER_DECODER}, {CNN_RNN}, {SELF_ATTENTION}, {TRANSFORMER}, and {FFN}.") 261 | 262 | 263 | def validate_model_instances(model_instances): 264 | if not isinstance(model_instances, int) or model_instances <= 0: 265 | raise InvalidArgumentError("Number of model instances should be a positive integer.") 266 | 267 | 268 | def validate_args_3(meta_model) -> None: 269 | """ 270 | Validate the arguments of the MetaModel. 271 | 272 | Args: 273 | meta_model (MetaModel): A MetaModel instance. 274 | 275 | Returns: 276 | None 277 | 278 | Raises: 279 | ValueError: If any of the arguments are invalid. 
280 | """ 281 | 282 | for model in meta_model.models: 283 | if isinstance(model, str): 284 | validate_model_type(model) 285 | elif isinstance(model, tuple) and len(model) == 2: 286 | model_type, model_instances = model 287 | validate_model_type(model_type) 288 | validate_model_instances(model_instances) 289 | else: 290 | raise InvalidArgumentError(f"Invalid model specification {model}.") 291 | 292 | 293 | def validate_args_4(meta_model) -> None: 294 | """ 295 | Validate the arguments of the MetaModel. 296 | 297 | Args: 298 | meta_model (MetaModel): A MetaModel instance. 299 | 300 | Returns: 301 | None 302 | 303 | Raises: 304 | ValueError: If any of the arguments are invalid. 305 | """ 306 | 307 | if meta_model.type_training not in [TYPE_TRAINING_AVERAGE, TYPE_TRAINING_FFN]: 308 | raise InvalidArgumentError( 309 | f"Invalid type of training technique. Supported techniques are " 310 | f"{TYPE_TRAINING_AVERAGE} and {TYPE_TRAINING_FFN}.") 311 | 312 | if meta_model.scaler not in [MIN_MAX_SCALER, STANDARD_SCALER, ROBUST_SCALER]: 313 | raise InvalidArgumentError( 314 | f"Invalid type of scaler technique. Supported techniques are {MIN_MAX_SCALER}," 315 | f" {STANDARD_SCALER} and {ROBUST_SCALER}.") 316 | 317 | if not isinstance(meta_model.use_kfold, bool): 318 | raise InvalidArgumentError("Invalid use_kfold parameter: must be a boolean.") 319 | 320 | if not 0 < meta_model.test_size < 1 or not isinstance(meta_model.test_size, float): 321 | raise InvalidArgumentError("Invalid test_size parameter: must be a float between 0 and 1.") 322 | 323 | if not 0 < meta_model.val_size < 1 or not isinstance(meta_model.val_size, float): 324 | raise InvalidArgumentError("Invalid val_size parameter: must be a float between 0 and 1.") 325 | 326 | if len(meta_model.hyperparams_transformer) != 5: 327 | raise ValueError("hyperparams_transformer must be a tuple of length 5") 328 | 329 | 330 | def validate_args_5(meta_model) -> None: 331 | """ 332 | Validate the arguments of the MetaModel. 333 | 334 | Args: 335 | meta_model (MetaModel): A MetaModel instance. 336 | 337 | Returns: 338 | None 339 | 340 | Raises: 341 | ValueError: If any of the arguments are invalid. 
342 | """ 343 | 344 | param1, param2, param3, param4, param5 = meta_model.hyperparams_transformer 345 | if not (all(isinstance(p, int) for p in [param1, param2, param3]) and isinstance(param4, bool) and isinstance( 346 | param5, 347 | str)): 348 | raise ValueError("hyperparams_transformer values must be Tuple[int, int, int, bool, str]") 349 | 350 | 351 | def validate_ffn(meta_model) -> Tuple: 352 | hyperparams_ffn = meta_model.hyperparams_ffn 353 | l_ffn = len(hyperparams_ffn) 354 | print(l_ffn) 355 | 356 | if not 2 < l_ffn < 6: 357 | raise ValueError("hyperparams_ffn must be a tuple of length 3, 4 or 5") 358 | 359 | if l_ffn == 3: 360 | max_layers, min_neurons, max_neurons = hyperparams_ffn 361 | return 1, max_layers, min_neurons, max_neurons, "sigmoid" 362 | 363 | if l_ffn == 4: 364 | if isinstance(hyperparams_ffn[-1], str): 365 | max_layers, min_neurons, max_neurons, activation = hyperparams_ffn 366 | return 1, max_layers, min_neurons, max_neurons, activation 367 | else: 368 | min_layers, max_layers, min_neurons, max_neurons = hyperparams_ffn 369 | return min_layers, max_layers, min_neurons, max_neurons, "sigmoid" 370 | 371 | return hyperparams_ffn 372 | 373 | 374 | def validate_cnn(meta_model) -> Tuple: 375 | hyperparams_cnn = meta_model.hyperparams_cnn 376 | l_cnn = len(hyperparams_cnn) 377 | 378 | if not 4 < l_cnn < 8: 379 | raise ValueError("hyperparams_cnn must be a tuple of length between 5 and 7.") 380 | 381 | if l_cnn == 5: 382 | min_f, max_f, min_k, max_k, max_layers = hyperparams_cnn 383 | return min_f, max_f, min_k, max_k, 1, max_layers, "relu" 384 | 385 | if l_cnn == 6: 386 | if isinstance(hyperparams_cnn[-1], str): 387 | min_f, max_f, min_k, max_k, max_layers, activation = hyperparams_cnn 388 | return min_f, max_f, min_k, max_k, 1, max_layers, activation 389 | else: 390 | min_f, max_f, min_k, max_k, min_layers, max_layers = hyperparams_cnn 391 | return min_f, max_f, min_k, max_k, min_layers, max_layers, "relu" 392 | 393 | return hyperparams_cnn 394 | 395 | 396 | def validate_rnn(meta_model) -> Tuple: 397 | hyperparams_rnn = meta_model.hyperparams_rnn 398 | l_rnn = len(hyperparams_rnn) 399 | 400 | if not 2 < l_rnn < 6: 401 | raise ValueError("hyperparams_rnn must be a tuple of length 3, 4 or 5") 402 | 403 | if l_rnn == 3: 404 | max_layers, min_neurons, max_neurons = hyperparams_rnn 405 | return 1, max_layers, min_neurons, max_neurons, "tanh" 406 | 407 | if l_rnn == 4: 408 | if isinstance(hyperparams_rnn[-1], str): 409 | max_layers, min_neurons, max_neurons, activation = hyperparams_rnn 410 | return 1, max_layers, min_neurons, max_neurons, activation 411 | else: 412 | min_layers, max_layers, min_neurons, max_neurons = hyperparams_rnn 413 | return min_layers, max_layers, min_neurons, max_neurons, "tanh" 414 | 415 | return hyperparams_rnn 416 | -------------------------------------------------------------------------------- /eventdetector_ts/data/helpers_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta 3 | from functools import reduce 4 | from typing import Optional, Union, Tuple, Dict 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from dateutil.parser import parser 9 | # noinspection PyUnresolvedReferences 10 | from numpy.lib.stride_tricks import as_strided 11 | from pandas.core.dtypes.common import is_datetime64_any_dtype 12 | 13 | from eventdetector_ts import TIME_LABEL, FILL_NAN_ZEROS, FILL_NAN_FFILL, FILL_NAN_BFILL, FILL_NAN_MEDIAN, \ 14 | MIDDLE_EVENT_LABEL, 
TimeUnit 15 | from eventdetector_ts.data import VALUE_ERROR 16 | from eventdetector_ts.data.interval import Interval 17 | 18 | 19 | def overlapping_partitions(data: np.ndarray, width: int, step: int = 1): 20 | """ 21 | Splits an input numpy array into a set of overlapping partitions. 22 | 23 | Args: 24 | data: Input numpy array to be split into overlapping partitions 25 | width: Width of each overlapping partition 26 | step: The step size between successive partitions (default=1) 27 | 28 | Returns: 29 | Numpy array of shape (nb_partitions, width, data.ndim), containing the created overlapping partitions. 30 | """ 31 | if width > data.shape[0]: 32 | raise ValueError("Partition size cannot be greater than the size of the input data") 33 | if step > width: 34 | raise ValueError("Step size cannot be greater than partition size") 35 | 36 | # Compute the parameters for creating the overlapping partitions 37 | np_partitions = (data.shape[0] - width) // step + 1 38 | shape = (np_partitions, width) + data.shape[1:] 39 | strides = (step * data.strides[0],) + data.strides 40 | 41 | # Use as_strided to create the overlapping partitions 42 | partitioned_array = as_strided(data, shape=shape, strides=strides) 43 | 44 | return partitioned_array 45 | 46 | 47 | def convert_dataframe_to_overlapping_partitions( 48 | dataframe: pd.DataFrame, 49 | width: int, 50 | step: int, 51 | fill_method: Optional[str] = None 52 | ) -> np.ndarray: 53 | """ 54 | Converts a given DataFrame to overlapping partitions. 55 | 56 | Args: 57 | dataframe: Input DataFrame of features 58 | width: Width of each overlapping partition 59 | step: The step size between successive partitions 60 | fill_method: The method to use for filling NaNs. Supported methods are 'zeros', 'ffill', 'bfill', and 'median'. 61 | If None, NaNs are left as-is. (default=None) 62 | 63 | Returns: 64 | Numpy array of shape (np_partitions, width, nb_features), containing the created overlapping partitions. 65 | """ 66 | 67 | dataframe = dataframe.copy() 68 | dataframe.index = pd.to_datetime(dataframe.index) 69 | dataframe.loc[:, TIME_LABEL] = dataframe.index.to_pydatetime() 70 | 71 | if fill_method == FILL_NAN_ZEROS: 72 | dataframe = dataframe.fillna(0) 73 | elif fill_method == FILL_NAN_FFILL: 74 | dataframe = dataframe.ffill() 75 | elif fill_method == FILL_NAN_BFILL: 76 | dataframe = dataframe.bfill() 77 | elif fill_method == FILL_NAN_MEDIAN: 78 | dataframe = dataframe.fillna(dataframe.median()) 79 | elif fill_method is not None: 80 | raise ValueError(f"Unsupported fill method: {fill_method}") 81 | 82 | sw = overlapping_partitions(dataframe.to_numpy(), width=width, step=step) 83 | return sw 84 | 85 | 86 | class InvalidArgumentError(ValueError): 87 | """Raised when an invalid argument is passed to a function or method.""" 88 | 89 | def __init__(self, message): 90 | """ 91 | Initialize a new InvalidArgumentError with the specified error message. 92 | 93 | Args: 94 | message (str): The error message to display. 95 | """ 96 | super().__init__(message) 97 | 98 | 99 | def convert_time_to_datetime(date: Union[str, pd.Timestamp, float, int], to_timestamp: bool = True) -> \ 100 | Union[float, datetime]: 101 | """ 102 | Converts a date string, pandas Timestamp, or numeric timestamp to a Python datetime or Unix timestamp. 103 | 104 | Args: 105 | date: The input date as a string, pandas Timestamp, or numeric timestamp. 106 | to_timestamp: If True (default), return the date as a Unix timestamp (float), otherwise as a Python datetime. 
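Example (illustrative):
    convert_time_to_datetime("2022-01-01")                       # -> 1640995200.0
    convert_time_to_datetime("2022-01-01", to_timestamp=False)   # -> datetime(2022, 1, 1, 0, 0)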
107 | 108 | Returns: 109 | The input date as a Unix timestamp or Python datetime object. 110 | """ 111 | 112 | if isinstance(date, pd.Timestamp): 113 | dt = date.to_pydatetime() 114 | elif isinstance(date, (float, int)): 115 | dt = datetime.fromtimestamp(date) 116 | elif isinstance(date, str): 117 | dt = parser.parse(date, ignoretz=True) 118 | else: 119 | raise ValueError(f"Invalid date format {date}. Supported formats are str, pd.Timestamp, float, and int.") 120 | 121 | if to_timestamp: 122 | return (dt - datetime(1970, 1, 1)).total_seconds() 123 | return dt 124 | 125 | 126 | def num_columns(lst: list) -> int: 127 | """ 128 | Returns the number of columns in a list. 129 | 130 | Args: 131 | lst (list): The list to check. 132 | 133 | Returns: 134 | int: The number of columns in the list. 135 | """ 136 | 137 | if not lst: 138 | # if the list is empty return 0 139 | return 0 140 | elif isinstance(lst[0], list): 141 | # if the first element of the list is a list, return the length of the first list 142 | return len(lst[0]) 143 | else: 144 | # otherwise return 1, because the list has only one column 145 | return 1 146 | 147 | 148 | def compute_middle_event(events: Union[list, pd.DataFrame]) -> pd.DataFrame: 149 | """ 150 | Computes the middle date of events and returns it as a DataFrame. 151 | 152 | Args: 153 | events (Union[list, pd.DataFrame]): A list or pandas DataFrame containing the starting and ending 154 | dates of events. 155 | 156 | Returns: 157 | pd.DataFrame: A pandas DataFrame with a single column containing the middle dates of events. 158 | """ 159 | column1 = "Starting Date" 160 | column2 = "Ending Date" 161 | is2d = True 162 | 163 | if isinstance(events, list): 164 | nb_columns = num_columns(events) 165 | if nb_columns == 2: 166 | df = pd.DataFrame(events, columns=[column1, column2]) 167 | elif nb_columns == 1: 168 | df = pd.DataFrame(events, columns=[column1]) 169 | 170 | is2d = False 171 | else: 172 | raise ValueError( 173 | f"The list of events is not compatible. The number of columns {nb_columns} should not exceed 2.") 174 | elif isinstance(events, pd.DataFrame): 175 | df = events 176 | columns = events.columns 177 | if len(columns) == 2: 178 | df = df.rename(columns={columns[0]: column1, columns[1]: column2}) 179 | elif len(columns) == 1: 180 | is2d = False 181 | df = df.rename(columns={columns[0]: column1}) 182 | else: 183 | raise ValueError("The dataframe of events in not compatible, columns should not exceed 2") 184 | else: 185 | raise ValueError("The events argument must be a list or pandas DataFrame.") 186 | 187 | df[column1] = pd.to_datetime(df[column1]) 188 | if is2d: 189 | df[column2] = pd.to_datetime(df[column2]) 190 | 191 | if is2d: 192 | df[column1] = df[column1].apply(lambda x: convert_time_to_datetime(x) / 2) 193 | df[column2] = df[column2].apply(lambda x: convert_time_to_datetime(x) / 2) 194 | df[MIDDLE_EVENT_LABEL] = df[column1] + df[column2] 195 | else: 196 | df[MIDDLE_EVENT_LABEL] = df[column1].apply(lambda x: convert_time_to_datetime(x)) 197 | 198 | df[MIDDLE_EVENT_LABEL] = df[MIDDLE_EVENT_LABEL].apply(lambda x: datetime.utcfromtimestamp(x)) 199 | df = df[[MIDDLE_EVENT_LABEL]] 200 | df = df.sort_values(by=MIDDLE_EVENT_LABEL) 201 | return df 202 | 203 | 204 | def remove_close_events(events_df: pd.DataFrame, delta_unit_time: int, unit: TimeUnit, 205 | remove_overlapping_events: bool) -> pd.DataFrame: 206 | """ 207 | Removes events from a DataFrame that occur too close together. 
208 | 209 | Args: 210 | unit: The time unit 211 | events_df: A pandas DataFrame containing events with a column named 'middle_event'. 212 | delta_unit_time: A integer representing the minimum time in unit time between events. 213 | remove_overlapping_events: A flag to indicate if we remove the overlapping events or not. 214 | 215 | Returns: 216 | A pandas DataFrame with close events removed. 217 | """ 218 | 219 | # Convert delta to timedelta 220 | delta = get_timedelta(delta_unit_time, unit) 221 | 222 | # List to hold indices of events to delete 223 | events_to_delete = [] 224 | 225 | # Loop through all events 226 | for i in range(len(events_df)): 227 | # Get middle time of the current event 228 | mid_time = events_df.iloc[i][MIDDLE_EVENT_LABEL] 229 | 230 | # Skip current event if it's already marked for deletion 231 | if i in events_to_delete: 232 | continue 233 | 234 | # Loop through all remaining events 235 | for j in range(i + 1, len(events_df)): 236 | # Get middle time of the next event 237 | mid_time1 = events_df.iloc[j][MIDDLE_EVENT_LABEL] 238 | 239 | # If the next event is too close to the current event, mark it for deletion 240 | if (mid_time1 - mid_time) <= delta: 241 | events_to_delete.append(j) 242 | else: 243 | break 244 | 245 | # Drop events that were marked for deletion 246 | if remove_overlapping_events: 247 | return events_df.drop(events_df.index[events_to_delete]) 248 | return events_df 249 | 250 | 251 | def convert_events_to_intervals(events_df: pd.DataFrame, width_events_s: float, unit: TimeUnit) \ 252 | -> list[Interval]: 253 | """ 254 | Convert events from a pandas DataFrame to intervals. 255 | 256 | Args: 257 | events_df (pd.DataFrame): DataFrame containing the events' data. 258 | width_events_s (float): The width of events in the unit of time for the dataset. 259 | unit: The unit time 260 | 261 | Returns: 262 | list[Interval]: A list of intervals. 263 | 264 | """ 265 | # Create an empty list to store the intervals 266 | events_intervals = [] 267 | 268 | # Loop over the events in the DataFrame 269 | for i in range(len(events_df)): 270 | # Get the middle event time 271 | mid_time = events_df.iloc[i][MIDDLE_EVENT_LABEL] 272 | 273 | width_events_s_float = float(width_events_s) 274 | # Compute the radius of the interval based on the event size 275 | radius = get_timedelta(delta_unit_time=width_events_s_float / 2, unit=unit) 276 | 277 | # Create an interval with the middle event time at the center 278 | interval = Interval(mid_time - radius, mid_time + radius) 279 | 280 | # Add the interval to the list of intervals 281 | events_intervals.append(interval) 282 | 283 | # Return the list of intervals 284 | return events_intervals 285 | 286 | 287 | def get_union_times_events(events_df: pd.DataFrame, time_window: int, unit_time: TimeUnit) -> pd.DatetimeIndex: 288 | """ 289 | Given a DataFrame of events and a time partition size in unit time, computes a DatetimeIndex of all times during 290 | which at least one event was taking place. 291 | 292 | Args: 293 | events_df (pd.DataFrame): A DataFrame containing at least a MIDDLE_EVENT_LABEL column with the datetime 294 | of each event. 295 | time_window (int): The size of the time window to consider before and after each event. 296 | unit_time (TimeUnit): The unit time 297 | 298 | Returns: 299 | pd.DatetimeIndex: A DatetimeIndex of all times during which at least one event was taking place. 
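    Example (a sketch only; assumes `events_df` already holds a MIDDLE_EVENT_LABEL column of event mid-times):
        >>> union_times = get_union_times_events(events_df, time_window=2, unit_time=TimeUnit.HOUR)
        >>> # `union_times` marks the merged [event - 2 hours, event + 2 hours] ranges around the events;
        >>> # it is consumed downstream by get_dataset_within_events_times.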
300 | """ 301 | 302 | times_during_events = [] 303 | previous_range = None 304 | for i, event_time in enumerate(events_df[MIDDLE_EVENT_LABEL]): 305 | start_time = event_time - get_timedelta(time_window, unit=unit_time) 306 | end_time = event_time + get_timedelta(time_window, unit=unit_time) 307 | # Generate a list of dates between start_time and end_time with a frequency of exactly (end_time - start_time). 308 | # This ensures that the last date is exactly equal to end_time (useful when we generate overlapping ranges). 309 | dates_between = pd.date_range(start=start_time, end=end_time, freq=end_time - start_time) 310 | 311 | if previous_range is None: 312 | times_during_events.append(dates_between) 313 | previous_range = dates_between 314 | else: 315 | # Check if the current range overlaps with the previous one. 316 | ranges_overlap = max(previous_range[0], previous_range[-1]) < min(dates_between[0], dates_between[-1]) 317 | if not ranges_overlap: 318 | # If the ranges don't overlap, then we need to merge the previous and current ranges. 319 | merged_range = pd.date_range(start=previous_range[0], end=dates_between[-1], 320 | freq=dates_between[-1] - previous_range[0]) 321 | # Replace the last range we added to the list with the merged range. 322 | times_during_events[-1] = merged_range 323 | previous_range = merged_range 324 | else: 325 | previous_range = dates_between 326 | times_during_events.append(dates_between) 327 | 328 | # Use the reduce function to combine all the overlapping ranges we generated. 329 | union_ranges = reduce(lambda x, y: x.union(y), times_during_events) 330 | # Remove any timezone information from the resulting DatetimeIndex, if present. 331 | union_ranges = union_ranges.tz_localize(None) 332 | return union_ranges 333 | 334 | 335 | def get_dataset_within_events_times(data_set: pd.DataFrame, events_times: pd.DatetimeIndex) -> pd.DataFrame: 336 | """ 337 | Extracts the data from the given dataset that falls within the specified event times. 338 | 339 | Args: 340 | data_set: A pandas DataFrame containing the data to extract. 341 | events_times: A pandas DatetimeIndex containing the times of events. 342 | 343 | Returns: 344 | A pandas DataFrame containing the data within the specified event times. 345 | """ 346 | 347 | dataset_within_events_times = [] 348 | 349 | # Iterate through the event times by pairs 350 | for i in range(0, len(events_times) - 1, 2): 351 | partition_start_time = events_times[i] 352 | partition_end_time = events_times[i + 1] 353 | 354 | # Extract the data within the event time 355 | data_within_event_time = data_set.loc[partition_start_time: partition_end_time] 356 | 357 | dataset_within_events_times.append(data_within_event_time) 358 | 359 | # Concatenate all the data extracted from events times 360 | return pd.concat(dataset_within_events_times) 361 | 362 | 363 | def op(dataset_as_overlapping_partitions: np.ndarray, events_as_intervals: list[Interval]) -> \ 364 | tuple[np.ndarray, np.ndarray]: 365 | """ 366 | Calculates the "op" value for each overlapping partition in the dataset, based on the overlapping parameter 367 | between the partition and a set of events. 368 | 369 | Args: dataset_as_overlapping_partitions: A numpy ndarray containing the overlapping partitions for the dataset, 370 | where each overlapping partition is a 2D numpy ndarray containing the data points for the partition and their 371 | timestamps. events_as_intervals: A list of Interval objects representing the events in the dataset. 
372 | 373 | Returns: 374 | A tuple containing two values: 375 | - A numpy ndarray containing the overlapping partitions for the dataset, with the timestamp column removed. 376 | - A numpy ndarray of floating-point values representing the "op" value 377 | for each overlapping partition in the dataset. 378 | """ 379 | 380 | # The index of the first event that hasn't been checked yet 381 | starting_event_index = 0 382 | 383 | # List to store the calculated op values for each overlapping partition 384 | op_values = [] 385 | 386 | # Iterate through each overlapping partition in the dataset 387 | for partition in dataset_as_overlapping_partitions: 388 | # Get the start and end times of the current overlapping partition 389 | partition_start_time = partition[0][-1].to_pydatetime() 390 | partition_end_time = partition[-1][-1].to_pydatetime() 391 | 392 | # Create an Interval object to represent the current overlapping partition 393 | partition_interval = Interval(partition_start_time, partition_end_time) 394 | 395 | # Initialize the op value for the current overlapping partition to 0 396 | current_op_value = 0 397 | 398 | # Iterate through each event that hasn't been checked yet 399 | for event_index in range(starting_event_index, len(events_as_intervals)): 400 | # Get the Interval object for the current event 401 | current_event_interval = events_as_intervals[event_index] 402 | 403 | # If the start time of the current partition is greater than or equal to the end time of the current event, 404 | # we can skip this event since it doesn't overlap with the current partition 405 | if partition_interval.start_time >= current_event_interval.end_time: 406 | starting_event_index = event_index + 1 407 | continue 408 | 409 | # Calculate the overlapping parameter between the current partition and the current event 410 | overlapping_parameter = partition_interval.overlapping_parameter(current_event_interval) 411 | 412 | # If the overlapping parameter is 0, there is no overlap between the current partition and the current event 413 | if overlapping_parameter == 0: 414 | break 415 | 416 | # Update the op value for the current partition if the overlapping parameter is greater than the current op 417 | # value 418 | if overlapping_parameter > current_op_value: 419 | current_op_value = overlapping_parameter 420 | 421 | # Add the op value for the current partition to the list of op values 422 | op_values.append(current_op_value) 423 | 424 | # Remove the column containing the timestamps from the overlapping partitions 425 | dataset_as_overlapping_partitions = np.delete(dataset_as_overlapping_partitions, -1, axis=2) 426 | 427 | # Return the updated overlapping partitions and the op values 428 | return dataset_as_overlapping_partitions, np.array(op_values) 429 | 430 | 431 | def get_timedelta(delta_unit_time: Union[int, float], unit: TimeUnit) -> timedelta: 432 | """ 433 | Returns a timedelta object with the specified delta_unit_time in the specified TimeUnit. 434 | 435 | Args: 436 | delta_unit_time: The delta unit time value. 437 | unit: The TimeUnit enum value representing the unit of time. 438 | 439 | Returns: 440 | A timedelta object with the specified delta_unit_time in the specified TimeUnit. 
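    Example (illustrative values only, not taken from the original documentation):
        >>> get_timedelta(5, TimeUnit.MINUTE)    # -> timedelta(minutes=5)
        >>> get_timedelta(1.5, TimeUnit.SECOND)  # -> timedelta(seconds=1.5)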
441 | """ 442 | if unit == TimeUnit.MICROSECOND: 443 | return timedelta(microseconds=delta_unit_time) 444 | elif unit == TimeUnit.MILLISECOND: 445 | return timedelta(milliseconds=delta_unit_time) 446 | elif unit == TimeUnit.SECOND: 447 | return timedelta(seconds=delta_unit_time) 448 | elif unit == TimeUnit.MINUTE: 449 | return timedelta(minutes=delta_unit_time) 450 | elif unit == TimeUnit.HOUR: 451 | return timedelta(hours=delta_unit_time) 452 | elif unit == TimeUnit.DAY: 453 | return timedelta(days=delta_unit_time) 454 | elif unit == TimeUnit.YEAR: 455 | return timedelta(days=delta_unit_time * 365) 456 | else: 457 | raise VALUE_ERROR 458 | 459 | 460 | def get_total_units(timedelta_: timedelta, unit: Union[TimeUnit, object]) -> float: 461 | if unit == TimeUnit.MICROSECOND: 462 | return timedelta_.total_seconds() * 1e6 463 | elif unit == TimeUnit.MILLISECOND: 464 | return timedelta_.total_seconds() * 1e3 465 | elif unit == TimeUnit.SECOND: 466 | return timedelta_.total_seconds() 467 | elif unit == TimeUnit.MINUTE: 468 | return timedelta_.total_seconds() / 60 469 | elif unit == TimeUnit.HOUR: 470 | return timedelta_.total_seconds() / 3600 471 | elif unit == TimeUnit.DAY: 472 | return timedelta_.total_seconds() / (3600 * 24) 473 | elif unit == TimeUnit.YEAR: 474 | return timedelta_.total_seconds() / (3600 * 24 * 365.25) 475 | else: 476 | raise VALUE_ERROR 477 | 478 | 479 | def check_time_unit(diff: timedelta) -> Tuple[int, TimeUnit]: 480 | """ 481 | Method to determine the unit of time of the dataset. 482 | 483 | Args: 484 | diff (timedelta): The time difference to be checked. 485 | 486 | Returns: 487 | Tuple[int, TimeUnit]: A tuple with the time value and its unit. 488 | """ 489 | 490 | if diff.total_seconds() >= 31536000: # 1 year in seconds 491 | years = int(diff.total_seconds() / 31536000) 492 | t_s = years 493 | time_unit = TimeUnit.YEAR 494 | elif diff.total_seconds() >= 86400: # 1 day in seconds 495 | days = int(diff.total_seconds() / 86400) 496 | t_s = days 497 | time_unit = TimeUnit.DAY 498 | elif diff.total_seconds() >= 3600: # 1 hour in seconds 499 | hours = int(diff.total_seconds() / 3600) 500 | t_s = hours 501 | time_unit = TimeUnit.HOUR 502 | elif diff.total_seconds() >= 60: # 1 minute in seconds 503 | minutes = int(diff.total_seconds() / 60) 504 | t_s = minutes 505 | time_unit = TimeUnit.MINUTE 506 | elif diff.total_seconds() >= 1: 507 | t_s = int(diff.total_seconds()) 508 | time_unit = TimeUnit.SECOND 509 | elif diff.total_seconds() * 1000 >= 1: 510 | t_s = int(diff.total_seconds() * 1000) 511 | time_unit = TimeUnit.MILLISECOND 512 | elif diff.total_seconds() * 1000000 >= 1: 513 | t_s = int(diff.total_seconds() * 1000000) 514 | time_unit = TimeUnit.MICROSECOND 515 | else: 516 | raise ValueError("Could not determine the unit of time of the dataset") 517 | 518 | return t_s, time_unit 519 | 520 | 521 | def convert_seconds_to_time_unit(value: Union[float, int], unit: TimeUnit) -> Union[float, int]: 522 | """ 523 | Converts a given value from seconds to a specified time unit. 524 | 525 | Args: 526 | value (Union[float, int]): The value in seconds that needs to be converted. 527 | unit (TimeUnit): The target time unit for the conversion. 528 | 529 | Returns: 530 | Union[float, int]: The converted value in the target time unit. 531 | 532 | Raises: 533 | ValueError: If an invalid TimeUnit is provided. 
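    Example (a minimal illustration; the numbers are arbitrary):
        >>> convert_seconds_to_time_unit(7200, TimeUnit.HOUR)         # 7200 s expressed in hours -> 2.0
        >>> convert_seconds_to_time_unit(0.25, TimeUnit.MILLISECOND)  # 0.25 s expressed in ms -> 250.0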
534 | """ 535 | conversion_factors = { 536 | TimeUnit.MICROSECOND: 1e6, 537 | TimeUnit.MILLISECOND: 1e3, 538 | TimeUnit.SECOND: 1, 539 | TimeUnit.MINUTE: 1 / 60, 540 | TimeUnit.HOUR: 1 / 3600, 541 | TimeUnit.DAY: 1 / (3600 * 24), 542 | TimeUnit.YEAR: 1 / (3600 * 24 * 365.25) 543 | } 544 | 545 | if unit in conversion_factors: 546 | return value * conversion_factors[unit] 547 | 548 | raise VALUE_ERROR 549 | 550 | 551 | def save_dict_to_json(path: str, data: Dict): 552 | """ 553 | Save a dictionary into a json file 554 | Args: 555 | path (str): the path where to store the json file 556 | data (Dict): the dictionary 557 | 558 | Returns: 559 | 560 | """ 561 | with open(path, 'w') as f: 562 | json.dump(data, f) 563 | 564 | 565 | def convert_dataset_index_to_datetime(dataset: pd.DataFrame) -> None: 566 | """ 567 | Check if the index of the DataFrame dataset is already in the datetime format. If the index is not in datetime 568 | format, dataset.index = pd.to_datetime(dataset.index) statement is executed to convert it. 569 | 570 | Args: 571 | dataset (pd.DataFrame): A dataset as pandas DataFrame 572 | 573 | Returns: 574 | None 575 | """ 576 | if not is_datetime64_any_dtype(dataset.index): 577 | dataset.index = pd.to_datetime(dataset.index) 578 | -------------------------------------------------------------------------------- /eventdetector_ts/metamodel/meta_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | import shutil 4 | from typing import Union, Dict, Optional 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import tensorflow as tf 9 | 10 | from eventdetector_ts import FFN, FILL_NAN_ZEROS, TYPE_TRAINING_AVERAGE, STANDARD_SCALER, \ 11 | config_dict, CONFIG_FILE 12 | from eventdetector_ts.data.helpers_data import compute_middle_event, remove_close_events, \ 13 | convert_events_to_intervals, get_union_times_events, get_dataset_within_events_times, \ 14 | convert_dataframe_to_overlapping_partitions, op, check_time_unit, save_dict_to_json, \ 15 | convert_dataset_index_to_datetime, convert_seconds_to_time_unit 16 | from eventdetector_ts.metamodel import logger_meta_model 17 | from eventdetector_ts.metamodel.utils import DataSplitter, validate_args, validate_required_args, validate_ffn, \ 18 | validate_cnn, validate_rnn 19 | from eventdetector_ts.models.models_builder import ModelCreator 20 | from eventdetector_ts.models.models_trainer import ModelTrainer 21 | from eventdetector_ts.optimization.event_extraction_pipeline import OptimizationData, EventOptimization 22 | from eventdetector_ts.plotter.plotter import Plotter 23 | 24 | 25 | class MetaModel: 26 | def __init__( 27 | self, 28 | output_dir: str, 29 | dataset: pd.DataFrame, 30 | events: Union[list, pd.DataFrame], 31 | width: int, 32 | step: int = 1, 33 | width_events: Optional[Union[int, float]] = None, 34 | **kwargs 35 | ): 36 | """ 37 | Initializes a new instance of the MetaModel class. 38 | 39 | Args: 40 | output_dir (str): The name or path of the directory where all outputs will be saved. 41 | If output_dir is a folder name, the full path in the current directory will be created. 42 | dataset (pd.DataFrame): The input dataset as a Pandas DataFrame. 43 | events (Union[list, pd.DataFrame]): The input events as either a list or a Pandas DataFrame. 44 | width (int): Number of consecutive time steps in each partition (window) when creating overlapping 45 | partitions (sliding windows). 46 | step (int = 1): Number of time steps to advance the sliding window. 
Defaults to 1. 47 | width_events (Union[int, float] = None): The width of each event. 48 | If it's an integer, it represents the number of time steps that constitute an event. 49 | If it's a float, it represents the duration in seconds of each event. 50 | If not provided (None), it defaults to the value of (width - 1). 51 | kwargs (Dict): Optional keyword arguments for additional parameters. 52 | - t_max (float): The maximum total time is linked to the `sigma` variable of the Gaussian filter. 53 | This time should be expressed in the same unit of time (seconds, minutes, etc.) as used in the 54 | dataset. The unit of time for the dataset is determined by its time sampling. In other words, 55 | the `sigma` variable should align with the timescale used in your time series data. 56 | The default value is calculated as (3 x (width-1) x time_sampling) / 2. 57 | - delta (Union[int, float]): The maximum time tolerance used to determine the correspondence 58 | between a predicted event and its actual counterpart in the true events. If it's an integer, it 59 | represents the number of time steps. If it's a float, it represents the duration in seconds. 60 | The default value is width_events x time_sampling. 61 | - s_h (float): A step parameter for adjusting the peak height threshold `h` during the peak detection 62 | process. The default value is 0.05. 63 | - epsilon (float): A small constant used to control the size of the set which contains the top models 64 | with the lowest MSE values. The default value is 0.0002. 65 | - pa (int): The patience for the early stopping algorithm. The default value is 5. 66 | - t_r (float): The ratio threshold for the early stopping algorithm. 67 | The default value is 0.97. 68 | - time_window (Union[int, float] = None): This parameter controls the amount of data within the dataset 69 | that is used for the training process. If it's an integer, it represents a specific number of time steps. 70 | If it's a float, it represents a duration in seconds. By default, it is set to None, which means all 71 | available data will be used. However, if a value is provided, the dataset will include a specific 72 | interval of data surrounding each reference event. This interval includes data from both sides of 73 | each event, with a duration equal to the specified `time_window`. Setting a `time_window` in some 74 | situations can offer several advantages, such as accelerating the training process and enhancing 75 | the neural networks' understanding of rare events. 76 | - models (List[Union[str, Tuple[str, int]]]): Determines the type of deep learning models to use. 77 | If a tuple is passed, it specifies both the model type and the number of instances to run. 78 | The default value is [(FFN, 2)]. 79 | - hyperparams_ffn (Tuple[int, int, int, int, str]): Specify for the FFN the minimum and the maximum 80 | number of layers, the minimum and the maximum number of neurons per layer, and the activation 81 | function. The default value is (1, 3, 64, 256, "sigmoid"). The list of available activation 82 | functions is ["relu","sigmoid","tanh","softmax","leaky_relu","elu","selu","swish"]. 83 | If you pass `None`, no activation is applied (i.e. "linear" activation: `a(x) = x`). 84 | - hyperparams_cnn (Tuple[int, int, int, int, int, int, str]): Specify for the CNN the minimum and maximum number 85 | of filters, the minimum and maximum kernel size, the minimum and the maximum number of pooling 86 | layers, and the activation function. The default value is (16, 64, 3, 8, 1, 2, "relu").
87 | - hyperparams_transformer (Tuple[int, int, int, bool, str]): Specify for the Transformer the key dimension, 88 | the number of heads, the number of encoder blocks, a flag to indicate the use of the original 89 | architecture, and the activation function. The default value is (256, 8, 10, True, "relu"). 90 | - hyperparams_rnn (Tuple[int, int, int, int, str]): Specify for the RNN the minimum and the maximum number 91 | of RNN layers, the minimum and the maximum number of hidden units, and the activation function. 92 | The default value is (1, 2, 16, 128, "tanh"). 93 | - hyperparams_mm_network (Tuple[int, int, str]): Specify for the MetaModel network the number 94 | of layers, the number of neurons per layer, and the activation function. 95 | The default value is (1, 32, "sigmoid"). 96 | - epochs (int): The number of epochs to train different models. The default value is 256. 97 | - batch_size (int): The number of samples per gradient update. 98 | The default value is 32. 99 | - fill_nan (str): Specifies the method to use for filling NaN values in the dataset. 100 | Supported methods are 'zeros', 'ffill', 'bfill', and 'median'. 101 | The default is 'zeros'. 102 | - type_training (str): Specifies the type of training technique to use for the MetaModel. 103 | Supported techniques are 'average' and 'ffn'. 104 | The default is 'average'. 105 | - scaler (str): The type of scaler to use for preprocessing the data. 106 | Possible values are "MinMaxScaler", "StandardScaler", and "RobustScaler". 107 | The default is "StandardScaler". 108 | - use_kfold (bool): Whether to use the k-fold cross-validation technique or not. 109 | The default value is False. 110 | - test_size (float): The proportion of the dataset to include in the test split. 111 | Should be a value between 0 and 1. Default is 0.2. 112 | - val_size (float): The proportion of the training set to use for validation. 113 | Should be a value between 0 and 1. Default is 0.2. 114 | - save_models_as_dot_format (bool = False): Whether to save the models as a dot format file. 115 | The default value is False. If set to True, the graphviz software 116 | must be installed on your machine. 117 | - remove_overlapping_events (bool = True): Whether to remove the overlapping events or not. 118 | The default value is True. 119 | - dropout (float = 0.3): The dropout rate, which determines the fraction of input units to drop during 120 | training. 121 | - last_act_func (str = "sigmoid"): Activation function for the final layer of each model. Defaults to 122 | "sigmoid". If set to `None`, no activation will be applied (i.e., "linear" activation: `a(x) = x`).
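        Example (an illustrative sketch only; `dataset`, `events` and the chosen values are
        placeholders, not library defaults):
            >>> from eventdetector_ts import FFN
            >>> meta_model = MetaModel(output_dir="my_run", dataset=dataset, events=events,
            ...                        width=45, step=1, fill_nan='ffill', scaler="MinMaxScaler",
            ...                        models=[(FFN, 1)])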
123 | 124 | """ 125 | self.step = step 126 | self.width = width 127 | self.events = events 128 | self.dataset = dataset 129 | self.output_dir = output_dir 130 | self.width_events = width_events 131 | validate_required_args(self) 132 | self.kwargs: Dict = kwargs 133 | self.y = np.empty(shape=(0,)) 134 | self.x = np.empty(shape=(0,)) 135 | self.__compute_and_set_time_sampling() 136 | self.__set_defaults() 137 | validate_args(self) 138 | 139 | if self.save_models_as_dot_format: 140 | logger_meta_model.warning("save_models_as_dot_format is set to true, " 141 | "you should have graphviz software to be installed on your machine.") 142 | self.__create_output_dir() 143 | # Create a `ModelCreator` object with the provided models and hyperparameters 144 | self.model_creator: ModelCreator = ModelCreator(models=self.models, hyperparams_ffn=self.hyperparams_ffn, 145 | hyperparams_cnn=self.hyperparams_cnn, 146 | hyperparams_rnn=self.hyperparams_rnn, 147 | hyperparams_transformer=self.hyperparams_transformer, 148 | last_act_func=self.last_act_func, dropout=self.dropout, 149 | save_models_as_dot_format=self.save_models_as_dot_format, 150 | root_dir=self.output_dir) 151 | # Create a `DataSplitter` object with the provided test_size and scaler_type 152 | self.data_splitter: DataSplitter = DataSplitter(test_size=self.test_size, scaler_type=self.scaler) 153 | # Create a `ModelTrainer` object with the provided data_splitter, epochs, 154 | # batch_size, pa, t_r, use_kfold, val_size, epsilon and save_models_as_dot_format. 155 | self.model_trainer: ModelTrainer = ModelTrainer(data_splitter=self.data_splitter, epochs=self.epochs, 156 | batch_size=self.batch_size, pa=self.pa, t_r=self.t_r, 157 | use_kfold=self.use_kfold, 158 | val_size=self.val_size, epsilon=self.epsilon, 159 | save_models_as_dot_format=self.save_models_as_dot_format) 160 | # class represents the data used for the event extraction pipeline. 161 | self.optimization_data: OptimizationData = OptimizationData(t_max=self.t_max, w_s=self.w_s, s_s=self.s_s, 162 | s_h=self.s_h, delta=self.delta, 163 | output_dir=self.output_dir, 164 | time_unit=self.time_unit) 165 | 166 | self.event_optimization: EventOptimization = EventOptimization(optimization_data=self.optimization_data) 167 | # The Plotter class is responsible for generating and saving plots. 168 | self.plotter: Plotter = Plotter(root_dir=self.output_dir, time_unit=self.time_unit, 169 | width_events_s=self.width_events_s) 170 | 171 | def __create_output_dir(self) -> None: 172 | """ 173 | Check if output_dir is already a complete path, if output_dir is a folder name, 174 | create the full path in the current directory. 
175 | 176 | Returns: 177 | None 178 | """ 179 | 180 | # Check if output_dir is already a complete path 181 | if os.path.isabs(self.output_dir): 182 | if not os.path.exists(self.output_dir): 183 | logger_meta_model.critical(f"{self.output_dir} does not exist") 184 | raise ValueError(f"{self.output_dir} does not exist") 185 | 186 | # If output_dir is a folder name, create the full path in the current directory 187 | else: 188 | # Get the absolute path of the current directory 189 | current_directory = os.path.abspath(".") 190 | self.output_dir = os.path.join(current_directory, self.output_dir) 191 | if os.path.exists(self.output_dir): 192 | logger_meta_model.warning(f"The working directory '{self.output_dir}' already exists and will be deleted") 193 | shutil.rmtree(self.output_dir) 194 | logger_meta_model.info(f"Creating the working directory at: '{self.output_dir}'") 195 | os.makedirs(self.output_dir) 196 | 197 | config_dict['output_dir'] = self.output_dir 198 | 199 | def __set_defaults_bis(self) -> None: 200 | """ 201 | Sets default values for any missing keyword arguments in self.kwargs. 202 | 203 | Returns: 204 | None 205 | """ 206 | if self.width_events is None: 207 | self.width_events = self.width 208 | self.t_max = self.kwargs.get('t_max', (3.0 * self.w_s) / 2) # the minimum should be equal to w_s 209 | 210 | if self.kwargs.get('delta') is None: 211 | self.delta = self.width_events_s 212 | else: 213 | if isinstance(self.kwargs.get('delta'), float): 214 | self.delta = convert_seconds_to_time_unit(value=self.kwargs.get('delta'), unit=self.time_unit) 215 | else: 216 | self.delta = self.kwargs.get('delta') * self.t_s 217 | 218 | self.s_h = self.kwargs.get('s_h', 0.05) 219 | self.epsilon = self.kwargs.get('epsilon', 0.0002) 220 | self.pa = self.kwargs.get('pa', 5) 221 | self.t_r = self.kwargs.get('t_r', 0.97) 222 | 223 | def __set_defaults(self) -> None: 224 | """ 225 | Sets default values for any missing keyword arguments in self.kwargs.
226 | 227 | Returns: 228 | None 229 | """ 230 | self.__set_defaults_bis() 231 | 232 | if self.kwargs.get('time_window') is None: 233 | self.time_window = None 234 | else: 235 | if isinstance(self.kwargs.get('time_window'), float): 236 | self.time_window = convert_seconds_to_time_unit(value=self.kwargs.get('time_window'), 237 | unit=self.time_unit) 238 | else: 239 | self.time_window = self.kwargs.get('time_window') * self.t_s 240 | 241 | self.models = self.kwargs.get('models', [(FFN, 2)]) 242 | for i, model in enumerate(self.models): 243 | if isinstance(model, str): 244 | self.models[i] = (model, 1) 245 | elif isinstance(model, tuple) and len(model) == 1: 246 | self.models[i] = (model[0], 1) 247 | 248 | self.hyperparams_ffn = self.kwargs.get('hyperparams_ffn', (1, 3, 64, 256, "sigmoid")) 249 | self.hyperparams_ffn = validate_ffn(self) 250 | self.hyperparams_cnn = self.kwargs.get('hyperparams_cnn', (16, 64, 3, 8, 1, 2, "relu")) 251 | self.hyperparams_cnn = validate_cnn(self) 252 | self.hyperparams_rnn = self.kwargs.get('hyperparams_rnn', (1, 2, 16, 128, "tanh")) 253 | self.hyperparams_rnn = validate_rnn(self) 254 | self.hyperparams_transformer = self.kwargs.get("hyperparams_transformer", (256, 4, 1, True, "relu")) 255 | self.hyperparams_mm_network = self.kwargs.get('hyperparams_mm_network', (1, 32, "sigmoid")) 256 | self.epochs = self.kwargs.get('epochs', 256) 257 | self.batch_size = self.kwargs.get('batch_size', 32) 258 | self.fill_nan = self.kwargs.get('fill_nan', FILL_NAN_ZEROS) 259 | self.type_training = self.kwargs.get('type_training', TYPE_TRAINING_AVERAGE) 260 | self.scaler = self.kwargs.get('scaler', STANDARD_SCALER) 261 | self.use_kfold = self.kwargs.get('use_kfold', False) 262 | self.test_size = self.kwargs.get('test_size', 0.2) 263 | self.val_size = self.kwargs.get('val_size', 0.2) 264 | 265 | self.save_models_as_dot_format = self.kwargs.get('save_models_as_dot_format', False) 266 | self.remove_overlapping_events = self.kwargs.get("remove_overlapping_events", True) 267 | self.last_act_func = self.kwargs.get("last_act_func", "sigmoid") 268 | self.dropout = self.kwargs.get("dropout", 0.3) 269 | 270 | log_dict = { 271 | 'width_events_s': self.width_events_s, 272 | 't_max': self.t_max, 273 | 'delta': self.delta, 274 | 's_h': self.s_h, 275 | 'epsilon': self.epsilon, 276 | 'pa': self.pa, 277 | 't_r': self.t_r, 278 | 'time_window': self.time_window, 279 | 'models': self.models, 280 | 'hyperparams_ffn': self.hyperparams_ffn, 281 | 'hyperparams_cnn': self.hyperparams_cnn, 282 | 'hyperparams_rnn': self.hyperparams_rnn, 283 | 'hyperparams_transformer': self.hyperparams_transformer, 284 | 'hyperparams_mm_network': self.hyperparams_mm_network, 285 | 'epochs': self.epochs, 286 | 'batch_size': self.batch_size, 287 | 'fill_nan': self.fill_nan, 288 | 'type_training': self.type_training, 289 | 'scaler': self.scaler, 290 | 'use_kfold': self.use_kfold, 291 | 'test_size': self.test_size, 292 | 'val_size': self.val_size, 293 | 'save_models_as_dot_format': self.save_models_as_dot_format, 294 | "remove_overlapping_events": self.remove_overlapping_events, 295 | "last_act_func": self.last_act_func, 296 | "dropout": self.dropout 297 | } 298 | 299 | log_message = pprint.pformat(log_dict, indent=4) 300 | logger_meta_model.info(log_message) 301 | 302 | config_dict.update({'width': self.width, 'step': self.step, 'batch_size': self.batch_size, 303 | 'type_training': self.type_training, 'fill_nan': self.fill_nan}) 304 | 305 | def __compute_and_set_time_sampling(self) -> None: 306 | """ 307 | Compute the time 
sampling of the dataset by calculating the time difference between the first two index values. 308 | Then set the corresponding parameters: t_s, w_s, and s_s. 309 | 310 | Returns: 311 | None 312 | 313 | Raises: 314 | TypeError: If the index of the dataset is not in datetime format. 315 | """ 316 | try: 317 | logger_meta_model.info("checks if the index of the dataset is already in the datetime format.") 318 | convert_dataset_index_to_datetime(self.dataset) 319 | # Get the first two index values of the dataset 320 | a = self.dataset.index[0] 321 | b = self.dataset.index[1] 322 | # Calculate the time difference between the first two index values 323 | diff = b - a 324 | # Check the units of the time difference 325 | logger_meta_model.info("Computing the time sampling and time unit of the dataset") 326 | self.t_s, self.time_unit = check_time_unit(diff=diff) 327 | logger_meta_model.warning(f"The time sampling t_s is {self.t_s} {self.time_unit}s") 328 | self.w_s = self.t_s * (self.width - 1) 329 | self.s_s = self.t_s * self.step 330 | 331 | if self.width_events is None: 332 | self.width_events_s = self.w_s 333 | else: 334 | self.width_events_s = self.t_s * self.width_events 335 | 336 | if isinstance(self.width_events, float): 337 | self.width_events_s = convert_seconds_to_time_unit(value=self.width_events, unit=self.time_unit) 338 | 339 | config_dict['w_s'] = self.w_s 340 | config_dict['width_events_s'] = self.width_events_s 341 | config_dict['time_unit'] = self.time_unit.value 342 | except AttributeError: 343 | logger_meta_model.critical("The dataset is not compatible with the datetime format") 344 | raise TypeError("The index should be in datetime format.") 345 | 346 | def prepare_data_and_computing_op(self) -> None: 347 | """ 348 | Prepare the events and dataset for computing op. 349 | This method will compute the middle event of the given events, remove any close events based on the self.w_s, 350 | and convert the remaining events to intervals. If a time partition is specified, it will get the union of 351 | event times and extract the corresponding portion of the dataset. 352 | 353 | The dataset will then be converted to overlapping partitions using the specified width and step size, 354 | and the $op$ (overlapping parameter) values will be computed for each partition based on the given intervals. 355 | 356 | Finally, the learning data (overlapping partitions and corresponding $op$ values) will be stored in 357 | the instance variables x and y. 
358 | 359 | Returns: 360 | None 361 | """ 362 | 363 | logger_meta_model.info("Computes the middle date of events...") 364 | 365 | self.events = compute_middle_event(self.events) 366 | 367 | logger_meta_model.info("Removes events that occur too close together...") 368 | temp: int = len(self.events) 369 | self.events = remove_close_events(self.events, self.width_events_s, self.time_unit, 370 | self.remove_overlapping_events) 371 | 372 | logger_meta_model.warning(f"A total of {temp - len(self.events)}/{temp} events were removed due to overlapping") 373 | logger_meta_model.info("Convert events to intervals...") 374 | intervals = convert_events_to_intervals(self.events, self.width_events_s, self.time_unit) 375 | 376 | if self.time_window is not None: 377 | logger_meta_model.warning(f"time_window is provided = {self.time_window} {self.time_unit}s") 378 | events_times = get_union_times_events(self.events, self.time_window, self.time_unit) 379 | self.dataset = get_dataset_within_events_times(self.dataset, events_times) 380 | 381 | logger_meta_model.info("Computing overlapping partitions...") 382 | overlapping_partitions = convert_dataframe_to_overlapping_partitions(self.dataset, width=self.width, 383 | step=self.step, 384 | fill_method=self.fill_nan) 385 | 386 | logger_meta_model.info("Computing op...") 387 | self.x, self.y = op(dataset_as_overlapping_partitions=overlapping_partitions, events_as_intervals=intervals) 388 | 389 | # Convert x and y arrays to float32 for consistency 390 | self.x = np.asarray(self.x).astype('float32') 391 | self.y = np.asarray(self.y).astype('float32') 392 | 393 | self.optimization_data.set_overlapping_partitions(overlapping_partitions) 394 | self.optimization_data.set_true_events(self.events) 395 | 396 | def build_stacking_learning(self) -> None: 397 | """ 398 | Builds a stacking learning pipeline using the provided models and hyperparameters. 
399 | 400 | Returns: 401 | None 402 | """ 403 | 404 | # Get the number of time steps and features from the x data 405 | n_time_steps, n_features = self.x.shape[1], self.x.shape[2] 406 | config_dict['n_time_steps'] = n_time_steps 407 | inputs = tf.keras.Input(shape=(n_time_steps, n_features), name="input") 408 | # Call the `create_models` method to create the models 409 | logger_meta_model.info(f"Create the following models: {list(map(lambda x: x[0], self.models))}") 410 | self.model_creator.create_models(inputs=inputs) 411 | logger_meta_model.info("Split the data into training, validation, and test sets and apply " 412 | "the specified scaler to each time step...") 413 | self.data_splitter.split_data_and_apply_scaler(x=self.x, y=self.y) 414 | logger_meta_model.info("Saves the scalers to disk...") 415 | self.data_splitter.save_scalers(output_dir=self.output_dir) 416 | logger_meta_model.info("Fits the created models to the training data...") 417 | self.model_trainer.fitting_models(self.model_creator.created_models) 418 | logger_meta_model.info("Saving the best models...") 419 | self.model_trainer.save_best_models(output_dir=self.output_dir) 420 | predicted_y, loss, test_y = self.model_trainer.train_meta_model(type_training=self.type_training, 421 | hyperparams_mm_network 422 | =self.hyperparams_mm_network, 423 | output_dir=self.output_dir) 424 | self.optimization_data.set_predicted_op(predicted_op=predicted_y) 425 | logger_meta_model.info(f"The loss of the MetaModel is {loss:.4f}") 426 | self.plotter.set_data_op(test_y=test_y, predicted_y=predicted_y) 427 | self.plotter.set_losses(train_losses=self.model_trainer.train_losses, 428 | val_losses=self.model_trainer.val_losses, train_loss_meta_model= 429 | self.model_trainer.train_loss_meta_model, 430 | val_loss_meta_model=self.model_trainer.val_loss_meta_model) 431 | 432 | def event_extraction_optimization(self) -> None: 433 | """ 434 | Run the Event Extraction Optimization process. 435 | 436 | Returns: 437 | None 438 | """ 439 | 440 | predicted_events, delta_t = self.event_optimization.max_f1score() 441 | path = os.path.join(self.output_dir, CONFIG_FILE) 442 | logger_meta_model.info(f"Saving config file into {path}") 443 | save_dict_to_json(path=path, data=config_dict) 444 | self.plotter.set_data_events(predicted_events=predicted_events, true_events=self.optimization_data.true_events) 445 | self.plotter.set_delta_t(delta_t=delta_t) 446 | 447 | def plot_save(self, show_plots: bool = True) -> None: 448 | """ 449 | Plot the results: losses, true/predicted op, true/predicted events, deltat_t. 450 | 451 | Args: 452 | show_plots (bool): whether to show the plots or not. 453 | 454 | Returns: 455 | None 456 | """ 457 | self.plotter.set_show(show=show_plots) 458 | self.plotter.plot_losses() 459 | self.plotter.plot_prediction() 460 | self.plotter.plot_predicted_events() 461 | self.plotter.plot_delta_t(bins=10) 462 | 463 | def fit(self) -> None: 464 | """ 465 | Run prepare_data_and_computing_op, build_stacking_learning, event_extraction_optimization, and plot_save 466 | 467 | Returns: 468 | None 469 | """ 470 | self.prepare_data_and_computing_op() 471 | self.build_stacking_learning() 472 | self.event_extraction_optimization() 473 | self.plot_save() 474 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
31 | 
40 | ```bash
41 | pip install eventdetector-ts
42 | ```
43 | ### Manual installation
44 | To get started using **Event Detector**, simply follow the instructions below to install the required packages and
45 | dependencies.
46 | #### Clone the repository:
47 | ```bash
48 | git clone https://github.com/menouarazib/eventdetector.git
49 | cd eventdetector
50 | ```
51 | 
52 | #### Create a virtual environment:
53 | ```bash
54 | python -m venv env
55 | source env/bin/activate  # for Linux/MacOS
56 | env\Scripts\activate.bat  # for Windows
57 | ```
58 |
59 | #### Install the required packages:
60 | ```bash
61 | pip install -r requirements.txt
62 | ```
63 | ## Quickstart Examples
64 |
65 | ### Data Format
66 |
67 | Input time series data:
68 |
69 | ```
70 | # pandas DataFrame with datetime index
71 |                      feature1  feature2
72 | 2020-12-04T00:00:00     1.234       5.0
73 | 2020-12-04T00:00:01     1.456       5.1
74 | 2020-12-04T00:00:02     1.789       5.5
75 | 2020-12-04T00:00:03     2.123       5.8
76 | ```
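A DataFrame in this shape can be built directly with pandas. The snippet below is only an illustrative sketch — the column names, the 1-second sampling and the random values are assumptions, not requirements; the library only expects a DataFrame whose index is (or can be converted to) datetime.

```python
import numpy as np
import pandas as pd

# Any regular datetime index works; 1-second sampling is just an example.
index = pd.date_range(start="2020-12-04", periods=600, freq="s")
dataset = pd.DataFrame(
    {
        "feature1": np.random.rand(600),  # replace with your real measurements
        "feature2": np.random.rand(600),
    },
    index=index,
)
```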
77 |
78 | Annotated events data format:
79 |
80 | ```
81 | # pandas DataFrame with start/end columns
82 |                  start                  end
83 | 0  2020-12-04T00:01:02  2020-12-04T00:01:42
84 | 1  2020-12-04T00:29:45  2020-12-04T00:30:20
85 | 2  2020-12-04T00:30:55  2020-12-04T00:31:28
86 | ```
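The events can be supplied either as a DataFrame like the one above or as a plain list of start/end pairs (a single column of event dates is also accepted). A minimal sketch with made-up timestamps:

```python
import pandas as pd

# Option 1: DataFrame with two date columns (start and end of each annotated event)
events = pd.DataFrame(
    {
        "start": ["2020-12-04T00:01:02", "2020-12-04T00:29:45"],
        "end": ["2020-12-04T00:01:42", "2020-12-04T00:30:20"],
    }
)

# Option 2: a list of [start, end] pairs works as well
events_as_list = [
    ["2020-12-04T00:01:02", "2020-12-04T00:01:42"],
    ["2020-12-04T00:29:45", "2020-12-04T00:30:20"],
]
```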
87 |
88 | ### Code Implementations:
89 | - Credit Card Frauds:
90 | ```python
91 | from eventdetector_ts import load_credit_card_fraud, FFN
92 | from eventdetector_ts.metamodel.meta_model import MetaModel
93 |
94 | dataset, events = load_credit_card_fraud()
95 |
96 | meta_model = MetaModel(dataset=dataset, events=events, width=2, step=1,
97 | output_dir='credit_card_fraud', batch_size=3200, s_h=0.01, models=[(FFN, 1)],
98 | hyperparams_ffn=(1, 1, 20, 20, "sigmoid"))
99 |
100 | meta_model.fit()
101 |
102 | ```
103 | - Martian Bow Shock:
104 | ```python
105 | from eventdetector_ts import load_martian_bow_shock, FFN
106 | from eventdetector_ts.metamodel.meta_model import MetaModel
107 |
108 | dataset, events = load_martian_bow_shock()
109 |
110 | meta_model = MetaModel(output_dir="mex_bow_shocks", dataset=dataset, events=events, width=76, step=1,
111 | time_window=5400.0, batch_size=3000, models=[(FFN, 1)],
112 | hyperparams_ffn=(1, 1, 20, 20, "sigmoid"))
113 |
114 | meta_model.fit()
115 |
116 | ```
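Both examples above use bundled loaders. To run the same pipeline on your own recordings, any pandas DataFrame with a datetime index plus a table of start/end event dates will do. A hedged sketch (the file names, column names and the chosen `width` are illustrative assumptions only):

```python
import pandas as pd
from eventdetector_ts.metamodel.meta_model import MetaModel

# Your time series: one row per timestamp, indexed by datetime
dataset = pd.read_csv("my_timeseries.csv", index_col=0, parse_dates=True)

# Your annotations: one row per event with its start and end dates
events = pd.read_csv("my_events.csv", parse_dates=["start", "end"])

meta_model = MetaModel(output_dir="my_events_run", dataset=dataset, events=events,
                       width=45, step=1)
meta_model.fit()
```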
117 |
118 | ### Performance Evaluation and Outputs
119 |
120 | #### Comparison of Our Method with Deep Learning Methods
121 |
122 | ##### Credit Card Frauds
123 |
124 | | Method | Number of Parameters | Precision | Recall | F1-Score |
125 | |---------------------|----------------------|-----------|--------|----------|
126 | | CNN [[1]](#1) | 119,457 | 0.89 | 0.68 | 0.77 |
127 | | FFN+SMOTE [[2]](#2) | 5,561 | 0.79 | 0.81 | 0.80 |
128 | | FFN+SMOTE [[3]](#3) | N/A | 0.82 | 0.79 | 0.81 |
129 | | Ours | 1,201 | 0.98 | 0.74 | 0.85 |
130 |
131 | ##### Bow Shock Crossings
132 |
133 | | Method | Number of Parameters | Precision | Recall | F1-Score |
134 | |--------------------|----------------------|-----------|---------------|---------------|
135 | | ResNet18 [[4]](#4) | 29,886,979 | 0.99 | [0.83 , 0.88] | [0.91 , 0.94] |
136 | | Ours | 6,121 | 0.95 | 0.96 | 0.95 |
137 |
138 | #### Training and Validation Losses
139 |
140 | The figures below showcase the training and validation losses of the FFNs on the Bow Shock Crossings and Credit Card Frauds datasets.
141 | The low losses observed in both cases indicate that the metamodel has successfully learned the underlying patterns,
142 | which is consistent with the good metrics reported above.
143 |
144 | *(Figures: training and validation loss curves for the Bow Shock Crossings and Credit Card Frauds datasets.)*