├── tests ├── __init__.py ├── data │ ├── test_interval.py │ └── test_helpers_data.py ├── optimization │ └── test_gaussian.py └── models │ ├── test_helpers_models.py │ └── test_models_builder.py ├── eventdetector_ts ├── data │ ├── __init__.py │ ├── interval.py │ └── helpers_data.py ├── optimization │ ├── __init__.py │ ├── algorithms.py │ └── event_extraction_pipeline.py ├── prediction │ ├── __init__.py │ ├── utils.py │ └── prediction.py ├── metamodel │ ├── __init__.py │ ├── utils.py │ └── meta_model.py ├── models │ ├── __init__.py │ ├── helpers_models.py │ └── models_trainer.py ├── plotter │ ├── __init__.py │ ├── helpers.py │ └── plotter.py └── __init__.py ├── images ├── op_bs.png ├── op_ccf.png ├── delta_t_bs.png ├── losses_bs.png ├── losses_ccf.png ├── op_mex_ccf.png ├── delta_t_ccf.png ├── losses_mex_ccf.png ├── inputs_event_detector.png └── logo_eventdetector.svg ├── requirements.txt ├── requirements_dev.txt ├── .gitignore ├── LICENSE ├── pyproject.toml ├── .github └── workflows │ └── unit_tests.yml └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 4 | -------------------------------------------------------------------------------- /eventdetector_ts/data/__init__.py: -------------------------------------------------------------------------------- 1 | VALUE_ERROR = ValueError("Invalid TimeUnit value.") 2 | -------------------------------------------------------------------------------- /images/op_bs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/op_bs.png -------------------------------------------------------------------------------- /images/op_ccf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/op_ccf.png -------------------------------------------------------------------------------- /eventdetector_ts/optimization/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | -------------------------------------------------------------------------------- /eventdetector_ts/prediction/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | -------------------------------------------------------------------------------- /images/delta_t_bs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/delta_t_bs.png -------------------------------------------------------------------------------- /images/losses_bs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/losses_bs.png -------------------------------------------------------------------------------- /images/losses_ccf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/losses_ccf.png -------------------------------------------------------------------------------- /images/op_mex_ccf.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/op_mex_ccf.png -------------------------------------------------------------------------------- /images/delta_t_ccf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/delta_t_ccf.png -------------------------------------------------------------------------------- /eventdetector_ts/metamodel/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_meta_model = logging.getLogger(__name__) 4 | -------------------------------------------------------------------------------- /images/losses_mex_ccf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/losses_mex_ccf.png -------------------------------------------------------------------------------- /images/inputs_event_detector.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CodeSignal/hire_eventdetector/main/images/inputs_event_detector.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | joblib 2 | matplotlib 3 | numpy 4 | pandas 5 | python_dateutil 6 | scikit_learn 7 | scipy 8 | seaborn 9 | sympy 10 | colorlog 11 | pydot 12 | pyqt5 13 | tqdm -------------------------------------------------------------------------------- /requirements_dev.txt: -------------------------------------------------------------------------------- 1 | joblib 2 | matplotlib 3 | numpy 4 | pandas 5 | python_dateutil 6 | scikit_learn 7 | scipy 8 | seaborn 9 | sympy 10 | tensorflow 11 | colorlog 12 | pydot 13 | pyqt5 14 | tqdm 15 | -------------------------------------------------------------------------------- /eventdetector_ts/models/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger_models = logging.getLogger(__name__) 4 | 5 | ACTIVATION_FUNCTIONS = ["relu", "sigmoid", "tanh", "softmax", "leaky_relu", "elu", "selu", "swish"] 6 | -------------------------------------------------------------------------------- /eventdetector_ts/plotter/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | logger = logging.getLogger(__name__) 4 | COLOR_TRUE = "k" # black 5 | COLOR_PREDICTED = "r" # red 6 | STYLE_TRUE = "-" # solid line 7 | STYLE_PREDICTED = "--" # dashed line 8 | FIG_SIZE = (6, 4.5) # width, height in inches 9 | PALETTE = "tab10" # categorical color map 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized files 2 | *.py[cod] 3 | __pycache__/ 4 | *.py[cod]? 
5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | dist/ 11 | build/ 12 | eggs/ 13 | *.egg-info/ 14 | .svn/ 15 | *.swp 16 | *.tar.gz 17 | *.tgz 18 | *.zip 19 | *.rar 20 | 21 | # Development 22 | *.bak 23 | *.tmp 24 | 25 | # IDE specific files 26 | .vscode/ 27 | .idea/ 28 | 29 | # Jupyter Notebook 30 | .ipynb_checkpoints/ 31 | 32 | # Environment 33 | .env 34 | env/ 35 | venv/ 36 | ENV/ 37 | env.bak/ 38 | venv.bak/ 39 | 40 | # Compiled Python modules 41 | *.pyd 42 | 43 | # Coverage 44 | .coverage 45 | .coverage.* 46 | htmlcov/ 47 | 48 | # Type checking 49 | .mypy_cache/ 50 | .dmypy.json 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | # Ignore .pkl file 56 | *.pkl 57 | 58 | 59 | -------------------------------------------------------------------------------- /eventdetector_ts/optimization/algorithms.py: -------------------------------------------------------------------------------- 1 | from typing import Union 2 | 3 | import numpy as np 4 | 5 | 6 | def convolve_with_gaussian_kernel(signal: np.ndarray, sigma: Union[int, float], m: int) -> np.ndarray: 7 | """ 8 | Convolve a signal with a Gaussian kernel. 9 | 10 | Args: 11 | signal (np.ndarray): The input signal to convolve. 12 | sigma (Union[int, float]): The standard deviation of the Gaussian kernel. 13 | m (int): The radius of the kernel. 14 | 15 | Returns: 16 | np.ndarray: The convolved signal. 17 | 18 | """ 19 | 20 | # Create the Gaussian kernel 21 | kernel = (1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp(-(np.arange(-m, m + 1) ** 2) / (2 * sigma ** 2)) 22 | kernel /= np.sum(kernel) # Normalize the kernel 23 | 24 | # Perform the convolution 25 | convolved_signal = np.convolve(signal, kernel, mode='same') 26 | 27 | return convolved_signal 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [year] [fullname] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
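A minimal usage sketch for convolve_with_gaussian_kernel from eventdetector_ts/optimization/algorithms.py above. The signal values and the choices sigma=1, m=2 are illustrative placeholders, not values used anywhere in the package:

import numpy as np

from eventdetector_ts.optimization.algorithms import convolve_with_gaussian_kernel

# A short, noisy step-like signal (made-up values).
signal = np.array([0.1, 0.2, 0.15, 0.9, 1.0, 0.95, 0.2, 0.1])

# Smooth it with a Gaussian kernel of standard deviation sigma=1 and radius m=2,
# i.e. a normalized kernel of length 2 * m + 1 = 5.
smoothed = convolve_with_gaussian_kernel(signal, sigma=1, m=2)

print(smoothed.shape)  # (8,) same length as the input, because np.convolve uses mode='same'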
-------------------------------------------------------------------------------- /tests/data/test_interval.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datetime import datetime, timedelta 3 | 4 | from eventdetector_ts.data.interval import Interval 5 | 6 | 7 | class TestInterval(unittest.TestCase): 8 | def setUp(self): 9 | self.interval1 = Interval(datetime(2010, 7, 21, 18, 25), datetime(2010, 7, 21, 18, 28)) 10 | self.interval2 = Interval(datetime(2010, 7, 21, 18, 24, 30), datetime(2010, 7, 21, 18, 27, 30)) 11 | self.interval3 = Interval(datetime(2010, 7, 21, 18, 26, 30), datetime(2010, 7, 21, 18, 29, 30)) 12 | 13 | def test_overlap(self): 14 | self.assertEqual(self.interval1.overlap(self.interval2), timedelta(seconds=150)) 15 | self.assertEqual(self.interval1.overlap(self.interval3), timedelta(seconds=90)) 16 | self.assertEqual(self.interval2.overlap(self.interval3), timedelta(seconds=60)) 17 | 18 | def test_overlapping_parameter(self): 19 | self.assertEqual(round(self.interval1.overlapping_parameter(self.interval2), 3), 0.714) 20 | self.assertEqual(round(self.interval1.overlapping_parameter(self.interval3), 3), 0.333) 21 | self.assertEqual(round(self.interval2.overlapping_parameter(self.interval3), 3), 0.200) 22 | 23 | 24 | if __name__ == '__main__': 25 | unittest.main() 26 | -------------------------------------------------------------------------------- /tests/optimization/test_gaussian.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | 5 | from eventdetector_ts.optimization.algorithms import convolve_with_gaussian_kernel 6 | 7 | 8 | def convolution_with_gaussian(signal, sigma, m): 9 | signal_size = len(signal) 10 | 11 | output = [] 12 | for n in range(signal_size): 13 | temp = 0 14 | sum_kernel = 0 15 | for i in range(-m, m + 1): 16 | g_i = (1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp(-(i ** 2) / (2. 
* sigma ** 2)) 17 | if 0 <= (n - i) < signal_size: 18 | temp += g_i * signal[n - i] 19 | sum_kernel += g_i 20 | 21 | output.append(temp / sum_kernel) 22 | return output 23 | 24 | 25 | class TestGaussianFilter(unittest.TestCase): 26 | def test_gaussian_filter(self): 27 | signal = np.array([1.0, 2, 3, 4.0, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]) 28 | sigma = 1 29 | m = 2 30 | 31 | convolved_signal = convolve_with_gaussian_kernel(signal=signal, sigma=sigma, m=m) 32 | convolved_signal_expected = convolution_with_gaussian(signal=signal, sigma=sigma, m=m) 33 | 34 | # Check if the outputs are equal 35 | np.testing.assert_allclose(convolved_signal_expected, convolved_signal, atol=1e-8) 36 | 37 | 38 | if __name__ == '__main__': 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /tests/models/test_helpers_models.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | from eventdetector_ts.models.helpers_models import CustomEarlyStopping 7 | 8 | 9 | class TestHelpers(unittest.TestCase): 10 | def setUp(self): 11 | pass 12 | 13 | class TestCustomEarlyStopping(tf.test.TestCase): 14 | def test_on_epoch_end(self): 15 | # Create a custom early stopping callback 16 | early_stopping = CustomEarlyStopping(ratio=2.0, patience=3, verbose=0) 17 | 18 | # Set up test data 19 | x_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 20 | y_train = np.array([0, 1, 1, 0]) 21 | x_val = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]) 22 | y_val = np.array([0, 1, 1, 0]) 23 | 24 | # Define a simple model 25 | model = tf.keras.models.Sequential([ 26 | tf.keras.layers.Dense(2, activation='sigmoid', input_shape=(2,)), 27 | tf.keras.layers.Dense(1, activation='sigmoid') 28 | ]) 29 | model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy']) 30 | 31 | # Train the model with the custom early stopping callback 32 | model.fit(x_train, y_train, epochs=10, validation_data=(x_val, y_val), callbacks=[early_stopping]) 33 | 34 | # Check that training was stopped early 35 | self.assertLess(early_stopping.stopped_epoch, 10) 36 | 37 | 38 | if __name__ == '__main__': 39 | unittest.main() 40 | -------------------------------------------------------------------------------- /eventdetector_ts/prediction/utils.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from typing import List 4 | 5 | import numpy as np 6 | import seaborn as sns 7 | from matplotlib import pyplot as plt 8 | 9 | from eventdetector_ts.plotter import COLOR_PREDICTED, COLOR_TRUE 10 | 11 | 12 | def plot_prediction(predicted_op: np.ndarray, filtered_predicted_op: np.ndarray) -> None: 13 | """ 14 | Plot the original and filtered predicted Op 15 | Args: 16 | predicted_op (np.ndarray): Predicted Op 17 | filtered_predicted_op (np.ndarray): Filtered predicted Op 18 | 19 | Returns: 20 | None 21 | """ 22 | sns.set(style="ticks", palette="Set2") 23 | plt.figure(figsize=(8, 6)) # Set the figure size 24 | 25 | # Plot the true and predicted values using Seaborn 26 | n = len(predicted_op) 27 | sns.lineplot(x=np.arange(n), y=predicted_op, color=COLOR_TRUE, label='Predicted Op') 28 | sns.lineplot(x=np.arange(n), y=filtered_predicted_op, color=COLOR_PREDICTED, label='Filtered Predicted Op') 29 | 30 | # Add labels and title to the plot 31 | plt.xlabel('Partitions') 32 | plt.ylabel('Op') 33 | plt.title('Predicted Op') 34 | # Add legend 35 | 
plt.legend() 36 | # Show 37 | plt.show() 38 | 39 | 40 | def write_events_to_csv(events: List, name: str) -> None: 41 | path = os.path.join(f"{name}.csv") 42 | with open(path, 'w', encoding='UTF8', newline='') as f: 43 | writer = csv.writer(f, delimiter=' ') 44 | for (start_time, end_time) in events: 45 | writer.writerow([start_time, end_time]) 46 | -------------------------------------------------------------------------------- /eventdetector_ts/plotter/helpers.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | import matplotlib.dates as mdates 4 | from matplotlib.patches import Rectangle 5 | 6 | from eventdetector_ts import TimeUnit 7 | from eventdetector_ts.data.helpers_data import convert_time_to_datetime, get_timedelta 8 | 9 | 10 | def event_to_rectangle(event, width_events_s: float, time_unit: TimeUnit, color, height=1, style="solid"): 11 | """ 12 | Function to convert an event to a rectangle object for visualization. 13 | 14 | Args: 15 | event (datetime or other): The event timestamp or object. 16 | width_events_s (float): The width of events in the unit of time for the dataset. 17 | time_unit (TimeUnit): The time unit of the partition size. 18 | color (str): The color of the rectangle. 19 | height (int): The height of the rectangle. 20 | style (str): The line style of the rectangle. 21 | 22 | Returns: 23 | Rectangle: The rectangle object representing the event. 24 | 25 | """ 26 | time = event 27 | if not isinstance(event, datetime): 28 | time = convert_time_to_datetime(event, to_timestamp=False) 29 | w_s_timedelta = get_timedelta(float(width_events_s) / 2, time_unit) 30 | start_time = time - w_s_timedelta 31 | end_time = time + w_s_timedelta 32 | 33 | start_rect = mdates.date2num(start_time) 34 | end_rect = mdates.date2num(end_time) 35 | 36 | width_rect = end_rect - start_rect 37 | rect = Rectangle((start_rect, 0), width_rect, height, edgecolor=color, linestyle=style, 38 | facecolor='none', linewidth=1) 39 | 40 | return rect 41 | -------------------------------------------------------------------------------- /tests/models/test_models_builder.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import tensorflow as tf 4 | from sympy.testing import pytest 5 | 6 | from eventdetector_ts import RNN_ENCODER_DECODER, FFN, CNN, RNN_BIDIRECTIONAL, CONV_LSTM1D, LSTM, SELF_ATTENTION 7 | from eventdetector_ts.models.models_builder import ModelBuilder, ModelCreator 8 | 9 | 10 | class TestModelsBuilder(unittest.TestCase): 11 | def setUp(self): 12 | # create a model builder with an input layer 13 | self.inputs = tf.keras.layers.Input(shape=(10,)) 14 | self.model_builder = ModelBuilder(self.inputs) 15 | self.inputs_rnn = tf.keras.Input(shape=(45, 5), name="Input") 16 | 17 | def test_check_input_shape(self): 18 | # create a layer with compatible input shape and call __check_input_shape 19 | layer1 = tf.keras.layers.Dense(5) 20 | output1 = self.model_builder._ModelBuilder__check_input_shape(layer1) 21 | self.assertEqual(output1.shape, tf.TensorShape([None, 10])) 22 | 23 | def test_add_layer(self): 24 | layer1 = tf.keras.layers.Dense(5) 25 | self.model_builder._ModelBuilder__add_layer(layer1) 26 | self.assertEqual(self.model_builder.outputs.shape, tf.TensorShape([None, 5])) 27 | 28 | layer2 = tf.keras.layers.Conv2D(32, kernel_size=3) 29 | with pytest.raises(ValueError): 30 | self.model_builder._ModelBuilder__add_layer(layer2) 31 | 32 | def test_create_models(self): 
33 | model_creator = ModelCreator( 34 | [(RNN_ENCODER_DECODER, 1), (FFN, 2), (CNN, 2), (RNN_BIDIRECTIONAL, 1), (CONV_LSTM1D, 1), (LSTM, 3), 35 | (SELF_ATTENTION, 3)], 36 | hyperparams_rnn=(1, 2, 45, 46, "tanh"), 37 | hyperparams_cnn=(64, 65, 3, 4, 1, 1, "relu"), 38 | hyperparams_ffn=(1, 2, 64, 128, "sigmoid"), save_models_as_dot_format=False, root_dir=None, dropout=0.3, 39 | last_act_func="sigmoid", hyperparams_transformer=(256, 4, 1, True, "relu")) 40 | 41 | model_creator.create_models(inputs=self.inputs_rnn) 42 | 43 | for key, value in model_creator.created_models.items(): 44 | keras_model: tf.keras.Model = value 45 | self.assertEqual(keras_model.layers[-1].output_shape, (None, 1)) 46 | 47 | 48 | if __name__ == '__main__': 49 | unittest.main() 50 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.ruff] 2 | line-length = 120 3 | 4 | [tool.coverage.run] 5 | omit = [ 6 | "eventdetector_ts/plotter/*", 7 | "eventdetector_ts/prediction/*", 8 | "eventdetector_ts/metamodel/*", 9 | "eventdetector_ts/optimization/event_extraction_pipeline.py", 10 | "eventdetector_ts/models/models_trainer.py" 11 | ] 12 | source = ["eventdetector_ts"] 13 | 14 | [build-system] 15 | requires = ["flit_core>=3.4"] 16 | build-backend = "flit_core.buildapi" 17 | 18 | [project] 19 | name = "eventdetector_ts" 20 | version = "1.1.0" 21 | description = "EventDetector introduces a universal event detection method for multivariate time series. Unlike traditional deep-learning methods, it's regression-based, requiring only reference events. The robust stacked ensemble, from Feed-Forward Neural Networks to Transformers, ensures accuracy by mitigating biases. The package supports practical implementation, excelling in detecting events with precision, validated across diverse domains." 
22 | keywords = [ 23 | "Universal Event Detection", 24 | "Multivariate Time Series", 25 | "Regression-based", 26 | "Stacked Ensemble Learning", 27 | "Deep Learning Models", 28 | "Feed-Forward Neural Networks", 29 | "Transformers", 30 | "Event Detection Package", 31 | "Rare Events", 32 | "Imbalanced Datasets", 33 | "Anomaly Detection", 34 | "Change Point Detection", 35 | "Fraud Detection", 36 | "Empirical Validations" 37 | ] 38 | authors = [ 39 | { name = "Menouar Azib", email = "menouar.azib@akkodis.com" } 40 | ] 41 | 42 | maintainers = [ 43 | { name = "Menouar Azib", email = "menouar.azib@akkodis.com" } 44 | ] 45 | requires-python = ">=3.9" 46 | readme = "README.md" 47 | license = { file = "LICENSE" } 48 | classifiers = ["License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", "Operating System :: OS Independent"] 49 | 50 | dependencies = [ 51 | "joblib", 52 | "matplotlib", 53 | "numpy", 54 | "pandas", 55 | "python_dateutil", 56 | "scikit_learn", 57 | "scipy", 58 | "seaborn", 59 | "sympy", 60 | "colorlog", 61 | "pydot", 62 | "pyqt5", 63 | "tqdm" 64 | ] 65 | 66 | [project.urls] 67 | "Homepage" = "https://github.com/menouarazib/eventdetector" 68 | "Bug Tracker" = "https://github.com/menouarazib/eventdetector/issues" -------------------------------------------------------------------------------- /.github/workflows/unit_tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests and Lint 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | - dev 8 | paths-ignore: 9 | - 'README.md' 10 | - 'pyproject.toml' 11 | 12 | jobs: 13 | build: 14 | runs-on: ${{ matrix.os }} 15 | strategy: 16 | matrix: 17 | os: [ ubuntu-latest, windows-latest, macos-latest ] 18 | python-version: [ "3.9", "3.10", "3.11" ] 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Update pip and setuptools 27 | run: | 28 | python -m pip install --upgrade pip 29 | python -m pip install --upgrade setuptools 30 | 31 | - name: Install dependencies 32 | run: | 33 | pip install --no-cache-dir ruff pytest coverage 34 | pip install --no-cache-dir -r requirements_dev.txt 35 | - name: Lint with ruff 36 | run: | 37 | # stop the build if there are Python syntax errors or undefined names 38 | ruff --output-format=github --select=E9,F63,F7,F82 --target-version=py37 . 39 | # default set of ruff rules with GitHub Annotations 40 | ruff --output-format=github --target-version=py37 . 
41 | continue-on-error: true 42 | - name: List files in workspace 43 | run: | 44 | ls "${{ github.workspace }}" 45 | - name: Run unit tests with coverage 46 | env: # Add the env section with GITHUB_TOKEN 47 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 48 | run: | 49 | coverage run -m pytest tests/ 50 | coverage xml -o coverage.xml 51 | - name: Upload coverage report (only for ubuntu-latest and python 3.10) 52 | if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' }} 53 | uses: actions/upload-artifact@v3 54 | with: 55 | name: coverage-report 56 | path: coverage.xml 57 | - name: Run Coveralls (only for ubuntu-latest and python 3.10) 58 | if: ${{ matrix.os == 'ubuntu-latest' && matrix.python-version == '3.10' }} 59 | env: 60 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 61 | uses: coverallsapp/github-action@v2 62 | -------------------------------------------------------------------------------- /eventdetector_ts/data/interval.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | 3 | 4 | class Interval: 5 | """ 6 | Represents a time interval between two datetime objects. This class is used to model an event or partition in 7 | time-series. 8 | """ 9 | 10 | def __init__(self, start_time: datetime, end_time: datetime): 11 | """ 12 | Constructs an interval for a given start and end time. 13 | 14 | Args: 15 | start_time (datetime): The starting time of the interval. 16 | end_time (datetime): The ending time of the interval. 17 | """ 18 | self.start_time = start_time 19 | self.end_time = end_time 20 | self.duration = self.end_time - self.start_time 21 | 22 | def __str__(self) -> str: 23 | """ 24 | Returns a string representation of the interval in the format "start_time ---> end_time". 25 | 26 | Returns: 27 | str: A string representation of the interval. 28 | """ 29 | return "{} ---> {}".format(self.start_time, self.end_time) 30 | 31 | def __repr__(self) -> str: 32 | """ 33 | Returns a string representation of the interval in the format "start_time ---> end_time". 34 | 35 | Returns: 36 | str: A string representation of the interval. 37 | """ 38 | return "{} ---> {}".format(self.start_time, self.end_time) 39 | 40 | def overlap(self, other: 'Interval') -> timedelta: 41 | """ 42 | Computes the overlapping time (ot) between this interval and another interval. 43 | 44 | Args: 45 | other (Interval): Another interval to compare with. 46 | 47 | Returns: 48 | timedelta: The overlapping time between this interval and the other interval as a timedelta object. 49 | """ 50 | overlap_start_time = max(self.start_time, other.start_time) 51 | overlap_end_time = min(self.end_time, other.end_time) 52 | overlap_duration = max(timedelta(0), overlap_end_time - overlap_start_time) 53 | return overlap_duration 54 | 55 | def overlapping_parameter(self, other: 'Interval') -> float: 56 | """ 57 | Computes the overlapping parameter between this interval and another interval. 58 | 59 | Args: 60 | other (Interval): Another interval to compare with. 61 | 62 | Returns: 63 | float: A floating number between 0.0 and 1.0 representing the degree of overlap between the two intervals. 
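        Example (values mirrored from tests/data/test_interval.py, whose expected result is 0.714):

            i1 = Interval(datetime(2010, 7, 21, 18, 25), datetime(2010, 7, 21, 18, 28))
            i2 = Interval(datetime(2010, 7, 21, 18, 24, 30), datetime(2010, 7, 21, 18, 27, 30))
            i1.overlapping_parameter(i2)  # 150 s of overlap / 210 s of union, roughly 0.714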
64 | """ 65 | if other is None: 66 | return 0.0 67 | overlap_duration = self.overlap(other) 68 | total_duration = self.duration + other.duration - overlap_duration 69 | return overlap_duration / total_duration 70 | -------------------------------------------------------------------------------- /eventdetector_ts/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | from enum import Enum 3 | from logging import config 4 | from typing import Dict, Optional 5 | from urllib.request import urlretrieve 6 | 7 | import pandas as pd 8 | from tqdm import tqdm 9 | 10 | TIME_LABEL = "time" 11 | MIDDLE_EVENT_LABEL = "event" 12 | 13 | LSTM = "LSTM" 14 | GRU = "GRU" 15 | CNN = "CNN" 16 | RNN_BIDIRECTIONAL = "RNN_BIDIRECTIONAL" 17 | CONV_LSTM1D = "CONV_LSTM_1D" 18 | RNN_ENCODER_DECODER = "RNN_ENCODER_DECODER" 19 | CNN_RNN = "CNN_RNN" 20 | SELF_ATTENTION = "SELF_ATTENTION" 21 | TRANSFORMER = "TRANSFORMER" 22 | FFN = "FFN" 23 | 24 | FILL_NAN_ZEROS = 'zeros' 25 | FILL_NAN_FFILL = 'ffill' 26 | FILL_NAN_BFILL = 'bfill' 27 | FILL_NAN_MEDIAN = 'median' 28 | 29 | TYPE_TRAINING_AVERAGE = 'average' 30 | TYPE_TRAINING_FFN = 'ffn' 31 | META_MODEL_NETWORK = "meta_model_ffn" 32 | META_MODEL_SCALER = "meta_model_scaler" 33 | 34 | # Define constants for scaler types 35 | MIN_MAX_SCALER = "MinMaxScaler" 36 | STANDARD_SCALER = "StandardScaler" 37 | ROBUST_SCALER = "RobustScaler" 38 | 39 | SCALERS_DIR = "scalers" 40 | MODELS_DIR = "models" 41 | OUTPUT_DIR = "output" 42 | CONFIG_FILE = ".config.json" 43 | # Store some important values for prediction 44 | config_dict: Dict = {} 45 | 46 | 47 | class TimeUnit(Enum): 48 | """ 49 | An enumeration of different time units. 50 | 51 | Attributes: 52 | SECOND: The time unit is in seconds. 53 | MILLISECOND: The time unit is in milliseconds. 54 | MICROSECOND: The time unit is in microseconds. 55 | MINUTE: The time unit is in minutes. 56 | HOUR: The time unit is in hours. 57 | DAY: The time unit is in days. 58 | YEAR: The time unit is in years. 59 | """ 60 | SECOND = "second" 61 | MILLISECOND = "millisecond" 62 | MICROSECOND = "microsecond" 63 | MINUTE = "minute" 64 | HOUR = "hour" 65 | DAY = "day" 66 | YEAR = "year" 67 | 68 | @classmethod 69 | def _missing_(cls, value): 70 | return cls.SECOND 71 | 72 | def __str__(self): 73 | return self.value 74 | 75 | 76 | LOGGING_CONFIG = { 77 | "version": 1, 78 | "disable_existing_loggers": False, 79 | "formatters": { 80 | "colored": { 81 | "()": "colorlog.ColoredFormatter", 82 | "format": "%(asctime)s %(log_color)s[%(levelname)s] %(name)s: %(message)s", 83 | "datefmt": "%Y-%m-%d %H:%M:%S", 84 | "log_colors": { 85 | "DEBUG": "cyan", 86 | "INFO": "white", 87 | "WARNING": "yellow", 88 | "ERROR": "red", 89 | "CRITICAL": "red,bg_white", 90 | }, 91 | }, 92 | }, 93 | "handlers": { 94 | "console": { 95 | "class": "logging.StreamHandler", 96 | "level": "DEBUG", 97 | "formatter": "colored", 98 | "stream": "ext://sys.stdout", 99 | }, 100 | }, 101 | "loggers": { 102 | "": { 103 | "handlers": ["console"], 104 | "level": "INFO", 105 | "propagate": False, 106 | }, 107 | }, 108 | } 109 | 110 | config.dictConfig(LOGGING_CONFIG) 111 | 112 | 113 | def my_hook(t): 114 | """ 115 | Wraps tqdm instance. Don't forget to close() or __exit__() 116 | the tqdm instance once you're done with it (easiest using `with` syntax). 
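    A minimal sketch of the intended usage (it mirrors the call made in load_dataset below;
    `url` and `file_path` are assumed to be defined by the caller):

        with tqdm(unit='B', unit_scale=True, miniters=1, desc="Downloading") as t:
            urlretrieve(url, filename=file_path, reporthook=my_hook(t), data=None)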
117 | 118 | Example 119 | ------- 120 | 121 | 122 | 123 | """ 124 | last_b = [0] 125 | 126 | def inner(b=1, bsize=1, t_size=None): 127 | """ 128 | b : int, optional 129 | Number of blocks just transferred [default: 1]. 130 | bsize : int, optional 131 | Size of each block (in tqdm units) [default: 1]. 132 | t_size : int, optional 133 | Total size (in tqdm units). If [default: None] remains unchanged. 134 | """ 135 | if t_size is not None: 136 | t.total = t_size 137 | t.update((b - last_b[0]) * bsize) 138 | last_b[0] = b 139 | 140 | return inner 141 | 142 | 143 | def load_dataset(file_path: str, name: str, url=None, index_col: Optional[int] = 0) -> pd.DataFrame: 144 | """ 145 | Load a dataset from a file. If the file is not found, it will be downloaded from the given URL. 146 | 147 | Args: 148 | name: Name of the file to load 149 | index_col: the same value as pandas index_col 150 | file_path (str): The path to the dataset file. 151 | url (str): The URL from which to download the dataset (optional). 152 | 153 | Returns: 154 | pandas.DataFrame: The loaded dataset. 155 | """ 156 | 157 | file_extension = os.path.splitext(file_path)[1].lower() 158 | 159 | if not os.path.isfile(file_path) and url: 160 | # Dataset file isn't found, download it 161 | with tqdm(unit='B', unit_scale=True, leave=True, miniters=1, 162 | desc=f"Downloading {name}") as t: # all optional kwargs 163 | urlretrieve(url, filename=file_path, 164 | reporthook=my_hook(t), data=None) 165 | 166 | if file_extension == ".csv": 167 | # Read CSV file 168 | dataset = pd.read_csv(file_path, index_col=index_col) 169 | elif file_extension == ".pkl": 170 | # Read Pickle file 171 | dataset = pd.read_pickle(file_path) 172 | else: 173 | raise ValueError(f"Unsupported file format: {file_extension}") 174 | 175 | # Return the loaded dataset 176 | return dataset 177 | 178 | 179 | def load_martian_bow_shock(): 180 | """ 181 | Load the Martian bow shock dataset and events, for more information check this link: http://amda.cdpp.eu/ 182 | 183 | Returns: 184 | A dataset and events as pd.DataFrame 185 | 186 | """ 187 | url_dataset = "https://archive.org/download/martian_bow_shock_dataset/martian_bow_shock_dataset.pkl" 188 | url_events = "https://archive.org/download/martian_bow_shock_events/martian_bow_shock_events.csv" 189 | data_set = load_dataset(file_path="martian_bow_shock_dataset.pkl", name="Martian Bow Shock data set", 190 | url=url_dataset) 191 | events = load_dataset(file_path="martian_bow_shock_events.csv", name="Martian Bow Shock events", index_col=None, 192 | url=url_events) 193 | 194 | return data_set, events 195 | 196 | 197 | def load_credit_card_fraud(): 198 | """ 199 | Load the credit card fraud dataset and events, for more information check this link: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud 200 | 201 | Returns: 202 | A dataset and events as pd.DataFrame 203 | 204 | """ 205 | url_dataset = "https://archive.org/download/credit_card_fraud_dataset/credit_card_fraud_dataset.csv" 206 | url_events = "https://archive.org/download/credit_card_fraud_events/credit_card_fraud_events.csv" 207 | 208 | data_set = load_dataset(file_path="credit_card_fraud_dataset.csv", name="Credit Card Fraud data set", 209 | url=url_dataset) 210 | events = load_dataset(file_path="credit_card_fraud_events.csv", name="Credit Card Fraud events", index_col=None, 211 | url=url_events) 212 | 213 | return data_set, events 214 | -------------------------------------------------------------------------------- /eventdetector_ts/models/helpers_models.py: 
-------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | from sklearn.model_selection import KFold 6 | 7 | 8 | class CustomEarlyStopping(tf.keras.callbacks.Callback): 9 | """ 10 | Create a custom early stopping callback that stops training when the ratio of current training loss to current 11 | validation loss is less than a specified ratio for a number of consecutive epochs. 12 | 13 | Args: 14 | ratio (float): Ratio to compare current train loss and current val loss against. 15 | patience (int): Number of epochs to wait before stopping training. 16 | verbose (int, optional): Verbosity level. 17 | 18 | Attributes: 19 | stopped_epoch (int or None): Last epoch index where training was stopped. 20 | best (float or None): Best validation loss observed so far. 21 | best_epoch (int or None): Index of the epoch where the best validation loss was observed. 22 | ratio (float): Ratio to compare current train loss and current val loss against. 23 | patience (int): Number of epochs to wait before stopping training. 24 | verbose (int): Verbosity level. 25 | wait (int): Number of epochs since the last time the ratio was greater than self.ratio. 26 | monitor_op (function): Comparison operator for the ratio. 27 | best_weights (np.ndarray or None): Model weights at the epoch with the best validation loss. 28 | """ 29 | 30 | def __init__(self, ratio: float, patience: int, verbose: int = 1): 31 | super().__init__() 32 | self.stopped_epoch = None 33 | self.best = None 34 | self.best_epoch = None 35 | self.ratio = ratio 36 | self.patience = patience 37 | self.verbose = verbose 38 | self.wait = 0 39 | self.monitor_op = np.greater 40 | self.best_weights = None 41 | 42 | def on_train_begin(self, logs=None): 43 | """ 44 | Initialize instance attributes. 45 | """ 46 | self.wait = 0 47 | self.best_weights = None 48 | self.stopped_epoch = 0 49 | self.best_epoch = 0 50 | self.best = np.inf 51 | 52 | def on_epoch_end(self, epoch, logs=None): 53 | """ 54 | Update the best validation loss and check whether to stop training. 55 | """ 56 | if logs is not None: 57 | if self.best_weights is None: 58 | self.best_weights = self.model.get_weights() 59 | 60 | current_val = logs.get('val_loss') # Current validation loss 61 | current_train = logs.get('loss') # Current training loss 62 | if current_val is None: 63 | logging.warning( 64 | "Early stopping conditioned on metric `%s` " 65 | "which is not available. Available metrics are: %s", 66 | 'val_loss', 67 | ",".join(list(logs.keys())), 68 | ) 69 | 70 | # Update the best validation loss and weights 71 | if self.monitor_op(self.best, current_val): 72 | self.best = current_val 73 | self.best_weights = self.model.get_weights() 74 | self.best_epoch = epoch 75 | 76 | # If the ratio of current training loss to current validation loss is greater than the specified ratio. 77 | if self.monitor_op(np.divide(current_train, current_val), self.ratio): 78 | self.wait = 0 79 | else: 80 | # Only check after the first epoch. 81 | if self.wait >= self.patience and epoch > 0: 82 | self.stopped_epoch = epoch 83 | self.model.stop_training = True 84 | if self.verbose > 0: 85 | tf.print( 86 | "Restoring model weights from " 87 | "the end of the best epoch: " 88 | f"{self.best_epoch + 1}." 89 | ) 90 | self.model.set_weights(self.best_weights) 91 | self.wait += 1 92 | 93 | def on_train_end(self, logs=None): 94 | """ 95 | Print a message indicating that training was stopped early. 
96 | """ 97 | if logs is not None: 98 | if self.stopped_epoch > 0 and self.verbose > 0: 99 | tf.print( 100 | f"Epoch {self.stopped_epoch + 1}: early stopping. " 101 | "Restoring model weights from " 102 | "the end of the best epoch: " 103 | f"{self.best_epoch + 1}. " 104 | "Best validation loss: " 105 | f"{self.best}." 106 | ) 107 | 108 | 109 | class SelfAttention(tf.keras.layers.Layer): 110 | """ 111 | Self-Attention layer for Neural Networks 112 | """ 113 | 114 | def __init__(self, units: int, **kwargs) -> None: 115 | super().__init__() 116 | self.last_attention_weights = None 117 | # Instantiate a multi-head attention layer with key dimensionality of units 118 | # and a single head 119 | self.mha = tf.keras.layers.MultiHeadAttention(key_dim=units, num_heads=1, **kwargs) 120 | # Instantiate a normalization layer 121 | self.layer_norm = tf.keras.layers.LayerNormalization() 122 | # Instantiate an addition layer 123 | self.add = tf.keras.layers.Add() 124 | 125 | def call(self, query: tf.Tensor) -> tf.Tensor: 126 | """ 127 | Apply a self-attention mechanism on the input query and return the output. 128 | 129 | Args: 130 | query: input tensor to the layer. 131 | 132 | Return: 133 | output tensor of the layer. 134 | """ 135 | # Apply multi-head attention on a query 136 | attn_output, attn_scores = self.mha( 137 | query=query, 138 | key=query, 139 | value=query, 140 | return_attention_scores=True) 141 | 142 | # Store the attention scores in last_attention_weights for inspection 143 | self.last_attention_weights = attn_scores 144 | 145 | # Add the attention output to the query and normalize it 146 | x = self.add([query, attn_output]) 147 | x = self.layer_norm(x) 148 | 149 | return x 150 | 151 | 152 | def custom_cross_val_score(model: tf.keras.Model, x: np.ndarray, y: np.ndarray, cv: KFold, epochs: int, batch_size: int, 153 | callbacks: list) -> np.ndarray: 154 | """ 155 | A function to perform custom cross-validation for a Keras model. 156 | 157 | Args: 158 | model: A Keras model. 159 | x: The input data. 160 | y: The target data. 161 | cv: A KFold cross-validation object. 162 | epochs: The number of epochs for training. 163 | batch_size: The batch size for training. 164 | callbacks: A list of Keras callbacks. 165 | 166 | Returns: 167 | The mean of the validation loss across all folds. 
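    Example (illustrative sketch only; it assumes `model` is an already compiled Keras model
    and `x`, `y` are NumPy arrays of matching length, none of which are defined here):

        from sklearn.model_selection import KFold

        cv = KFold(n_splits=5, shuffle=True, random_state=42)
        mean_val_loss = custom_cross_val_score(model, x, y, cv=cv, epochs=10,
                                               batch_size=32, callbacks=[])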
168 | """ 169 | scores = [] 170 | for train_index, val_index in cv.split(x): 171 | train_x, train_y = x[train_index], y[train_index] 172 | val_x, val_y = x[val_index], y[val_index] 173 | history = model.fit(train_x, train_y, epochs=epochs, batch_size=batch_size, callbacks=callbacks, 174 | validation_data=(val_x, val_y), verbose=0) 175 | scores.append(np.min(history.history['val_loss'])) 176 | return np.mean(scores) 177 | -------------------------------------------------------------------------------- /eventdetector_ts/prediction/prediction.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from typing import Dict, List, Tuple, Any 4 | 5 | import joblib 6 | import numpy as np 7 | import pandas as pd 8 | import tensorflow as tf 9 | 10 | from eventdetector_ts import CONFIG_FILE, SCALERS_DIR, TYPE_TRAINING_FFN, TimeUnit, MODELS_DIR, META_MODEL_NETWORK, \ 11 | META_MODEL_SCALER 12 | from eventdetector_ts.data.helpers_data import convert_dataframe_to_overlapping_partitions, get_timedelta 13 | from eventdetector_ts.optimization.algorithms import convolve_with_gaussian_kernel 14 | from eventdetector_ts.optimization.event_extraction_pipeline import get_peaks, compute_op_as_mid_times 15 | from eventdetector_ts.prediction import logger 16 | 17 | 18 | def load_model_with_fallback(model_path: str) -> tf.keras.Model: 19 | """ 20 | Load a Keras model with fallback mechanisms for different formats. 21 | 22 | Args: 23 | model_path (str): Path to the model file 24 | 25 | Returns: 26 | tf.keras.Model: Loaded model 27 | """ 28 | try: 29 | # Try loading with Keras 3 format first 30 | return tf.keras.models.load_model(model_path) 31 | except ValueError as e: 32 | if "File format not supported" in str(e): 33 | # Try loading as SavedModel format 34 | try: 35 | saved_model = tf.saved_model.load(model_path) 36 | # Convert SavedModel to Keras model 37 | class SavedModelWrapper(tf.keras.Model): 38 | def __init__(self, saved_model): 39 | super().__init__() 40 | self.saved_model = saved_model 41 | 42 | def call(self, inputs): 43 | return self.saved_model(inputs) 44 | 45 | return SavedModelWrapper(saved_model) 46 | except Exception: 47 | # If all else fails, try H5 format 48 | h5_path = model_path.replace('.keras', '.h5') 49 | if os.path.exists(h5_path): 50 | return tf.keras.models.load_model(h5_path) 51 | else: 52 | raise e 53 | else: 54 | raise e 55 | 56 | 57 | def load_config_file(path: str) -> Dict: 58 | """ 59 | Load config file of the meta-model. 60 | 61 | Args: 62 | path (str): Where the config file is stored 63 | 64 | Returns: 65 | Data as a Dict which contains all configuration information 66 | """ 67 | config_file_path = os.path.join(path, CONFIG_FILE) 68 | if not os.path.exists(config_file_path): 69 | msg: str = f"The config file {CONFIG_FILE} does not exist in this path: {config_file_path}" 70 | logger.critical(msg) 71 | raise ValueError(msg) 72 | 73 | with open(config_file_path, 'r') as f: 74 | config_: Dict = json.load(f) 75 | return config_ 76 | 77 | 78 | def load_models(model_keys: List[str], output_dir: str) -> List[tf.keras.Model]: 79 | """ 80 | Loads the trained models. 
81 | Args: 82 | model_keys (List[str]): List of model's name 83 | output_dir (str): The parent directory where the trained models are stored 84 | 85 | Returns: 86 | List of keras models 87 | """ 88 | models: List[tf.keras.Model] = [] 89 | for key in model_keys: 90 | path = os.path.join(output_dir, MODELS_DIR) 91 | # Add .keras extension if not already present 92 | if not key.endswith('.keras'): 93 | key = f"{key}.keras" 94 | path = os.path.join(path, key) 95 | models.append(load_model_with_fallback(path)) 96 | return models 97 | 98 | 99 | def apply_scaling(x: np.ndarray, config_data: Dict) -> np.ndarray: 100 | """ 101 | Scaling input data according to the stored scalers. 102 | Args: 103 | x (np.ndarray): Input data to be scaled 104 | config_data (Dict): Configuration Data 105 | 106 | Returns: 107 | Scaled data. 108 | """ 109 | n_time_steps = x.shape[1] 110 | output_dir: str = config_data.get("output_dir") 111 | scalers_dir = os.path.join(output_dir, SCALERS_DIR) 112 | try: 113 | for i in range(n_time_steps): 114 | scaler_i_path = os.path.join(scalers_dir, f'scaler_{i}.joblib') 115 | # Print progress 116 | print("\rLoading and applying scalers...{}/{}".format(i + 1, n_time_steps), end="") 117 | # Load the scaler from disk 118 | print(scaler_i_path) 119 | scaler = joblib.load(scaler_i_path) 120 | x[:, i, :] = scaler.transform(x[:, i, :]) 121 | except ValueError as e: 122 | logger.critical(e) 123 | raise e 124 | 125 | logger.info("Convert data to float32 for consistency...") 126 | x = np.asarray(x).astype('float32') 127 | return x 128 | 129 | 130 | def load_meta_model(output_dir: str) -> Tuple[tf.keras.Model, Any]: 131 | """ 132 | Load the metamodel network and the scaler. 133 | Args: 134 | output_dir (str): The parent directory where the trained models are stored 135 | 136 | Returns: 137 | tf.keras.Model, StanderScaler 138 | """ 139 | path = os.path.join(output_dir, MODELS_DIR) 140 | # Add .keras extension if not already present 141 | meta_model_name = META_MODEL_NETWORK 142 | if not meta_model_name.endswith('.keras'): 143 | meta_model_name = f"{meta_model_name}.keras" 144 | path = os.path.join(path, meta_model_name) 145 | model = load_model_with_fallback(path) 146 | 147 | scalers_dir = os.path.join(output_dir, SCALERS_DIR) 148 | scaler_path = os.path.join(scalers_dir, f'{META_MODEL_SCALER}.joblib') 149 | scaler = joblib.load(scaler_path) 150 | 151 | return model, scaler 152 | 153 | 154 | def predict(dataset: pd.DataFrame, path: str) -> Tuple[List, np.ndarray, np.ndarray]: 155 | """ 156 | Generates output predictions for the input dataset 157 | Args: 158 | dataset (pd.DataFrame): The input dataset. 159 | path (str): The path to the created folder by the MetaModel. 160 | 161 | Returns: 162 | Tuple[List, np.ndarray, np.ndarray]: Predicted events, predicted Op and filtered predicted Op 163 | """ 164 | 165 | if path is None or not isinstance(path, str) or len(path) == 0: 166 | msg: str = f"The provided path {path} is not valid." 
167 | logger.critical(msg) 168 | raise ValueError(msg) 169 | 170 | config_data: Dict = load_config_file(path=path) 171 | config_data['output_dir'] = path 172 | logger.info(f"Config dict: {config_data}") 173 | logger.info("Converting the dataset to overlapping partitions.") 174 | dataset_as_overlapping_partitions: np.ndarray = convert_dataframe_to_overlapping_partitions(dataset, 175 | width=config_data.get( 176 | "width"), 177 | step=config_data.get( 178 | "step"), 179 | fill_method=config_data.get( 180 | 'fill_nan')) 181 | # Remove the column containing the timestamps from the overlapping partitions 182 | x: np.ndarray = np.delete(dataset_as_overlapping_partitions, -1, axis=2) 183 | logger.info(f"The shape of the input data: {x.shape}") 184 | x = apply_scaling(x=x, config_data=config_data) 185 | model_keys: List[str] = config_data.get('models') 186 | logger.info(f"Loading models: {model_keys}") 187 | models: List[tf.keras.Model] = load_models(model_keys=model_keys, output_dir=config_data.get('output_dir')) 188 | batch_size: int = config_data.get("batch_size") 189 | predictions = [] 190 | logger.info("Making prediction from the trained models") 191 | for model in models: 192 | # Make predictions using each model 193 | predicted_y: np.ndarray = model.predict(x, batch_size=batch_size) 194 | predicted_y = predicted_y.flatten() 195 | predictions.append(predicted_y) 196 | 197 | type_training: str = config_data.get('type_training') 198 | # Convert a list of 1D NumPy arrays to 2D NumPy array 199 | predictions = np.stack(predictions, axis=1) 200 | if type_training == TYPE_TRAINING_FFN: 201 | logger.info("Loading the MetaModel and its Scaler") 202 | model, scaler = load_meta_model(output_dir=config_data.get('output_dir')) 203 | predictions = scaler.transform(predictions) 204 | logger.info("Make a final prediction using the network of the MetaModel") 205 | predicted_op = model.predict(predictions, batch_size=batch_size) 206 | predicted_op = predicted_op.flatten() 207 | else: 208 | logger.info("Make a final prediction by averaging") 209 | predicted_op = np.mean(predictions, axis=1) 210 | 211 | sigma, m, h = config_data.get('best_combination') 212 | logger.info(f"Applying Gaussian Filter with sigma = {sigma} and m = {m}") 213 | filtered_predicted_op = convolve_with_gaussian_kernel(predicted_op, sigma=sigma, m=m) 214 | logger.info("Computing filtered predictions as a function of the mid-times of the overlapping partitions") 215 | t, filtered_predicted_op = compute_op_as_mid_times(overlapping_partitions=dataset_as_overlapping_partitions, 216 | op_g=filtered_predicted_op) 217 | logger.info(f"Computing peaks with h = {h:.2f}") 218 | s_peaks = get_peaks(h=h, t=t, op_g=filtered_predicted_op) 219 | predicted_events = [] 220 | time_unit: TimeUnit = TimeUnit.__call__(config_data.get('time_unit')) 221 | radius = get_timedelta(config_data.get("width_events_s") / 2.0, time_unit) 222 | logger.info(f"Generating a predicted events with radius = {radius}, predicted op and a filtered predicted op") 223 | for i in range(len(s_peaks)): 224 | predicted_event = s_peaks[i] 225 | start_time = predicted_event - radius 226 | end_time = predicted_event + radius 227 | predicted_events.append((start_time.isoformat(), end_time.isoformat())) 228 | return predicted_events, predicted_op, filtered_predicted_op 229 | -------------------------------------------------------------------------------- /tests/data/test_helpers_data.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 
from datetime import datetime, timedelta 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pandas.core.dtypes.common import is_datetime64_any_dtype 7 | from sympy.testing import pytest 8 | 9 | from eventdetector_ts import TimeUnit 10 | from eventdetector_ts.data.helpers_data import overlapping_partitions, compute_middle_event, \ 11 | num_columns, convert_dataframe_to_overlapping_partitions, get_timedelta, get_total_units, check_time_unit, \ 12 | convert_dataset_index_to_datetime, convert_seconds_to_time_unit 13 | 14 | 15 | def test_overlapping_partitions(): 16 | data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]) 17 | expected_output = np.array([[[1, 2, 3], [4, 5, 6]], [[4, 5, 6], [7, 8, 9]], [[7, 8, 9], [10, 11, 12]]]) 18 | assert np.array_equal(overlapping_partitions(data, width=2, step=1), expected_output) 19 | 20 | 21 | class TestHelpers(unittest.TestCase): 22 | 23 | def setUp(self): 24 | self.n: int = 100 25 | 26 | def test_overlapping_partitions(self): 27 | # Test case 1: 1D input 28 | data1 = np.array([1, 2, 3, 4, 5]) 29 | result1 = overlapping_partitions(data1, width=3, step=1) 30 | expected1 = np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]]) 31 | self.assertTrue(np.array_equal(result1, expected1)) 32 | 33 | # Test case 2: partition width greater than the size of the input data 34 | data2 = np.array([1, 2, 3, 4, 5]) 35 | with pytest.raises(ValueError): 36 | overlapping_partitions(data2, width=6, step=1) 37 | 38 | # Test case 3: 2D input 39 | data3 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 40 | result3 = overlapping_partitions(data3, width=2, step=1) 41 | expected3 = np.array([[[1, 2, 3], [4, 5, 6]], [[4, 5, 6], [7, 8, 9]]]) 42 | assert np.array_equal(result3, expected3) 43 | 44 | # Test case 4: 2D input 45 | data4 = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) 46 | result4 = overlapping_partitions(data4, width=2, step=2) 47 | expected4 = np.array([[[1, 2, 3], [4, 5, 6]]]) 48 | assert np.array_equal(result4, expected4) 49 | 50 | def test_convert_dataframe_to_overlapping_partitions(self): 51 | # Create a sample DataFrame with datetime index and real-valued features 52 | 53 | data = np.random.rand(self.n, 3) 54 | index = pd.date_range(start='2022-01-01', periods=self.n, freq='D') 55 | df = pd.DataFrame(data=data, columns=['feat1', 'feat2', 'feat3'], index=index) 56 | 57 | # Test overlapping partition generation with default settings 58 | sw = convert_dataframe_to_overlapping_partitions(df, width=2, step=1) 59 | expected_shape = (self.n - 1, 2, 4) # Number of partitions, partition width, number of features+time 60 | self.assertEqual(sw.shape, expected_shape) 61 | 62 | # Test overlapping partition generation with custom settings 63 | sw = convert_dataframe_to_overlapping_partitions(df, width=14, step=7, fill_method='ffill') 64 | expected_shape = (13, 14, 4) # Number of partitions, partition width, number of features+time 65 | self.assertEqual(sw.shape, expected_shape) 66 | 67 | def test_compute_middle_event(self): 68 | # Test case 1: List of events with 2 columns 69 | events_list = [['2022-01-01', '2022-01-02'], ['2022-01-03', '2022-01-05']] 70 | expected_output = pd.DataFrame({'event': [datetime(2022, 1, 1, 12, 0), datetime(2022, 1, 4)]}) 71 | # call function to get actual output 72 | actual_output = compute_middle_event(events_list) 73 | 74 | # compare expected and actual outputs 75 | pd.testing.assert_frame_equal(expected_output, actual_output) 76 | 77 | # Test case 2: List of events with 1 column 78 | events_list = [['2022-01-01'], ['2022-01-03']] 79 | 
expected_output = pd.DataFrame({"event": [datetime(2022, 1, 1), datetime(2022, 1, 3)]}) 80 | # call function to get actual output 81 | actual_output = compute_middle_event(events_list) 82 | 83 | # compare expected and actual outputs 84 | pd.testing.assert_frame_equal(expected_output, actual_output) 85 | # Test case 3: Pandas DataFrame with 2 columns 86 | events_df = pd.DataFrame({'Starting Date': ['2022-01-01', '2022-01-03'], 87 | 'Ending Date': ['2022-01-02', '2022-01-05']}) 88 | expected_output = pd.DataFrame({"event": [datetime(2022, 1, 1, 12, 0), datetime(2022, 1, 4)]}) 89 | # call function to get actual output 90 | actual_output = compute_middle_event(events_df) 91 | 92 | # compare expected and actual outputs 93 | pd.testing.assert_frame_equal(expected_output, actual_output) 94 | 95 | # Test case 4: Pandas DataFrame with 1 column 96 | expected_output = pd.DataFrame({"event": [datetime(2022, 1, 1), datetime(2022, 1, 3)]}) 97 | # call function to get actual output 98 | actual_output = compute_middle_event(events_list) 99 | 100 | # compare expected and actual outputs 101 | pd.testing.assert_frame_equal(expected_output, actual_output) 102 | 103 | # Test case 5: Empty list of events 104 | events_list = [] 105 | with pytest.raises(ValueError): 106 | compute_middle_event(events_list) 107 | 108 | # Test case 6: Empty DataFrame of events 109 | events_df = pd.DataFrame() 110 | with pytest.raises(ValueError): 111 | compute_middle_event(events_df) 112 | 113 | # Test case 7: Invalid input format for events 114 | events_list = [[1, 2], [3, 4, 5]] 115 | with pytest.raises(ValueError): 116 | compute_middle_event(events_list) 117 | 118 | def test_empty_list(self): 119 | self.assertEqual(num_columns([]), 0) 120 | 121 | def test_single_column_list(self): 122 | self.assertEqual(num_columns([1, 2, 3]), 1) 123 | 124 | def test_multi_column_list(self): 125 | self.assertEqual(num_columns([[1, 2], [3, 4], [5, 6]]), 2) 126 | 127 | def test_mixed_list(self): 128 | self.assertEqual(num_columns([[1, 2], 3, 4]), 2) 129 | 130 | def test_microsecond(self): 131 | result = get_timedelta(100, TimeUnit.MICROSECOND) 132 | self.assertEqual(result, timedelta(microseconds=100)) 133 | 134 | def test_millisecond(self): 135 | result = get_timedelta(500, TimeUnit.MILLISECOND) 136 | self.assertEqual(result, timedelta(milliseconds=500)) 137 | 138 | def test_second(self): 139 | result = get_timedelta(60, TimeUnit.SECOND) 140 | self.assertEqual(result, timedelta(seconds=60)) 141 | 142 | def test_minute(self): 143 | result = get_timedelta(30, TimeUnit.MINUTE) 144 | self.assertEqual(result, timedelta(minutes=30)) 145 | 146 | def test_hour(self): 147 | result = get_timedelta(2, TimeUnit.HOUR) 148 | self.assertEqual(result, timedelta(hours=2)) 149 | 150 | def test_day(self): 151 | result = get_timedelta(5, TimeUnit.DAY) 152 | self.assertEqual(result, timedelta(days=5)) 153 | 154 | def test_year(self): 155 | result = get_timedelta(2, TimeUnit.YEAR) 156 | self.assertEqual(result, timedelta(days=2 * 365)) 157 | 158 | def test_invalid_unit(self): 159 | with self.assertRaises(ValueError): 160 | get_timedelta(10, "null") 161 | 162 | def test_microsecond_(self): 163 | td = timedelta(microseconds=123456789) 164 | self.assertEqual(get_total_units(td, TimeUnit.MICROSECOND), 123456789) 165 | 166 | def test_millisecond_(self): 167 | td = timedelta(milliseconds=123456) 168 | self.assertEqual(get_total_units(td, TimeUnit.MILLISECOND), 123456) 169 | 170 | def test_second_(self): 171 | td = timedelta(seconds=123) 172 | 
self.assertEqual(get_total_units(td, TimeUnit.SECOND), 123) 173 | 174 | def test_minute_(self): 175 | td = timedelta(minutes=2) 176 | self.assertEqual(get_total_units(td, TimeUnit.MINUTE), 2) 177 | 178 | def test_hour_(self): 179 | td = timedelta(hours=1) 180 | self.assertEqual(get_total_units(td, TimeUnit.HOUR), 1) 181 | 182 | def test_day_(self): 183 | td = timedelta(days=3) 184 | self.assertEqual(get_total_units(td, TimeUnit.DAY), 3) 185 | 186 | def test_year_(self): 187 | td = timedelta(days=365.25) 188 | self.assertAlmostEqual(get_total_units(td, TimeUnit.YEAR), 1.0, places=2) 189 | 190 | def test_invalid_unit_(self): 191 | td = timedelta(seconds=123) 192 | with self.assertRaises(ValueError): 193 | get_total_units(td, "invalid_unit") 194 | 195 | def test_year__(self): 196 | diff = timedelta(days=365) 197 | expected_result = (1, TimeUnit.YEAR) 198 | self.assertEqual(check_time_unit(diff), expected_result) 199 | 200 | def test_day__(self): 201 | diff = timedelta(days=2) 202 | expected_result = (2, TimeUnit.DAY) 203 | self.assertEqual(check_time_unit(diff), expected_result) 204 | 205 | def test_hour__(self): 206 | diff = timedelta(hours=1) 207 | expected_result = (1, TimeUnit.HOUR) 208 | self.assertEqual(check_time_unit(diff), expected_result) 209 | 210 | def test_minute__(self): 211 | diff = timedelta(minutes=2) 212 | expected_result = (2, TimeUnit.MINUTE) 213 | self.assertEqual(check_time_unit(diff), expected_result) 214 | 215 | def test_second__(self): 216 | diff = timedelta(seconds=30) 217 | expected_result = (30, TimeUnit.SECOND) 218 | self.assertEqual(check_time_unit(diff), expected_result) 219 | 220 | def test_millisecond__(self): 221 | diff = timedelta(milliseconds=500) 222 | expected_result = (500, TimeUnit.MILLISECOND) 223 | self.assertEqual(check_time_unit(diff), expected_result) 224 | 225 | def test_microsecond__(self): 226 | diff = timedelta(microseconds=200) 227 | expected_result = (200, TimeUnit.MICROSECOND) 228 | self.assertEqual(check_time_unit(diff), expected_result) 229 | 230 | def test_invalid_time(self): 231 | diff = timedelta(microseconds=0) 232 | with self.assertRaises(ValueError): 233 | check_time_unit(diff) 234 | 235 | def test_convert_datetime_index(self): 236 | # Create a DataFrame with a datetime index 237 | data = {'value': [1, 2, 3, 4, 5]} 238 | index = pd.date_range(start='2023-01-01', periods=5) 239 | dataset = pd.DataFrame(data, index=index) 240 | 241 | # Call the function to convert the index to datetime 242 | convert_dataset_index_to_datetime(dataset) 243 | 244 | # Check if the index is in datetime format 245 | self.assertTrue(is_datetime64_any_dtype(dataset.index)) 246 | 247 | def test_non_datetime_index(self): 248 | # Create a DataFrame with a non-datetime index 249 | data = {'value': [1, 2, 3, 4, 5]} 250 | index = ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'] 251 | dataset = pd.DataFrame(data, index=index) 252 | 253 | # Call the function to convert the index to datetime 254 | convert_dataset_index_to_datetime(dataset) 255 | 256 | # Check if the index is converted to datetime format 257 | self.assertTrue(is_datetime64_any_dtype(dataset.index)) 258 | 259 | def test_conversion(self): 260 | self.assertEqual(convert_seconds_to_time_unit(1, TimeUnit.SECOND), 1) 261 | self.assertEqual(convert_seconds_to_time_unit(60, TimeUnit.MINUTE), 1) 262 | self.assertEqual(convert_seconds_to_time_unit(3600, TimeUnit.HOUR), 1) 263 | 264 | 265 | if __name__ == '__main__': 266 | unittest.main() 267 | 
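The expected shapes asserted above in test_convert_dataframe_to_overlapping_partitions follow the usual sliding-window count. A small sketch of that arithmetic (the formula below is inferred from the test assertions, not taken from the package's documentation):

def expected_partition_count(n_rows: int, width: int, step: int) -> int:
    # Number of overlapping partitions of `width` rows taken every `step` rows.
    return 1 + (n_rows - width) // step

# Matches the assertions in the test: 100 rows, width=2, step=1 -> 99 partitions,
# and 100 rows, width=14, step=7 -> 13 partitions; the last dimension (4) is features + time.
print(expected_partition_count(100, 2, 1))   # 99
print(expected_partition_count(100, 14, 7))  # 13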
-------------------------------------------------------------------------------- /eventdetector_ts/optimization/event_extraction_pipeline.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from math import ceil 3 | from typing import Tuple, Union 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from scipy.signal import find_peaks 8 | 9 | from eventdetector_ts import MIDDLE_EVENT_LABEL, TimeUnit, config_dict 10 | from eventdetector_ts.data.helpers_data import get_timedelta, get_total_units 11 | from eventdetector_ts.optimization import logger 12 | from eventdetector_ts.optimization.algorithms import convolve_with_gaussian_kernel 13 | 14 | 15 | class OptimizationData: 16 | """ 17 | OptimizationData class represents the data used for the event extraction pipeline. 18 | 19 | Attributes: 20 | - time_unit (TimeUnit): Unit of time used in the dataset. 21 | - true_events (pd.DataFrame): DataFrame to store true events. 22 | - predicted_op (np.ndarray): Array to store predicted outcomes. 23 | - delta (Union[int, float]): The maximum time tolerance used to determine the correspondence between a predicted 24 | event and its actual counterpart in the true events. 25 | - s_h (float): A step parameter for the peak height threshold h. 26 | - s_s (int): Step size in time unit for overlapping the partition. 27 | - w_s (int): Size in time unit of the overlapping partition. 28 | - t_max (float): The maximum total time related to sigma. 29 | - output_dir (str): The parent directory. 30 | - big_sigma (int): Value calculated based on t_max, w_s, and s_s. 31 | - overlapping_partitions (np.ndarray): Array to store overlapping partitions. 32 | 33 | """ 34 | 35 | def __init__(self, t_max: float, w_s: int, s_s: int, 36 | s_h: float, 37 | delta: Union[int, float], 38 | output_dir: str, time_unit: TimeUnit): 39 | """ 40 | Initializes the OptimizationData object. 41 | 42 | Args: 43 | t_max (float): The maximum total time related to sigma. 44 | w_s (int): Size in time unit of the overlapping partition. 45 | s_s (int): Step size in time unit for overlapping the partition. 46 | s_h (float): A step parameter for the peak height threshold h. 47 | delta (Union[int, float]): The maximum time tolerance used to determine the correspondence between a predicted 48 | event and its actual counterpart in the true events. 49 | output_dir (str): The parent directory. 50 | time_unit (TimeUnit): Unit of time used in the dataset.
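Example (illustrative only; the numeric values below are arbitrary):

    data = OptimizationData(t_max=90.0, w_s=50, s_s=10, s_h=0.05,
                            delta=15, output_dir="my_run",
                            time_unit=TimeUnit.SECOND)
    # big_sigma is then derived as 1 + ceil((t_max - w_s) / s_s) = 1 + ceil(4.0) = 5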
51 | """ 52 | self.time_unit = time_unit 53 | self.true_events: pd.DataFrame = pd.DataFrame() 54 | self.predicted_op: np.ndarray = np.empty(shape=(0,)) 55 | self.delta = delta 56 | self.s_h = s_h 57 | self.s_s = s_s 58 | self.w_s = w_s 59 | self.t_max = t_max 60 | self.output_dir = output_dir 61 | self.big_sigma = 1 + ceil((self.t_max - self.w_s) / self.s_s) 62 | self.overlapping_partitions: np.ndarray = np.empty(shape=(0,)) 63 | 64 | def set_true_events(self, true_events: pd.DataFrame) -> None: 65 | self.true_events = true_events 66 | 67 | def set_overlapping_partitions(self, overlapping_partitions: np.ndarray): 68 | self.overlapping_partitions = overlapping_partitions 69 | 70 | def set_predicted_op(self, predicted_op: np.ndarray): 71 | self.predicted_op = predicted_op 72 | overlapping_partitions_test = self.overlapping_partitions[-len(predicted_op):] 73 | self.overlapping_partitions = overlapping_partitions_test 74 | first_partition_test_data = self.overlapping_partitions[0] 75 | last_partition_test_data = self.overlapping_partitions[-1] 76 | start_date_test_data = first_partition_test_data[0][-1].to_pydatetime() 77 | end_date_test_data = last_partition_test_data[0][-1].to_pydatetime() 78 | logger.info( 79 | f"Starting and ending dates of test data are respectively {start_date_test_data} --> {end_date_test_data}") 80 | 81 | true_events_test = self.true_events[(self.true_events[MIDDLE_EVENT_LABEL] >= start_date_test_data) & ( 82 | self.true_events[MIDDLE_EVENT_LABEL] <= end_date_test_data)] 83 | self.true_events = true_events_test 84 | 85 | 86 | def get_peaks(h: float, t: np.ndarray, op_g: np.ndarray) -> np.ndarray: 87 | """ 88 | Compute peaks for given mid_times of partitions, op values, and threshold h. 89 | Args: 90 | h (float): Threshold for peaks. 91 | t (np.ndarray): mid_times of partitions 92 | op_g (np.ndarray): op values 93 | 94 | Returns: 95 | np.ndarray: Peaks. 96 | """ 97 | peaks, _ = find_peaks(op_g, height=np.array([h, 1.0])) 98 | return t[peaks] 99 | 100 | 101 | def compute_op_as_mid_times(overlapping_partitions: np.ndarray, op_g: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 102 | """ 103 | Compute op as a function of mid-times of partitions instead of partition's index. 104 | Args: 105 | overlapping_partitions (np.ndarray): overalapping partitions 106 | op_g (np.ndarray): Op array 107 | 108 | Returns: 109 | Tuple[np.ndarray, np.ndarray]: mid-times of partitions, op as a function of mid-times of partitions 110 | """ 111 | t = [] 112 | op_g_ = [] 113 | for n in range(len(op_g)): 114 | w_n = overlapping_partitions[n] 115 | b_n = w_n[0][-1].to_pydatetime() 116 | e_n = w_n[-1][-1].to_pydatetime() 117 | c_n = b_n + (e_n - b_n) / 2 118 | t.append(c_n) 119 | op_g_.append(op_g[n]) 120 | t, op_g_ = np.array(t), np.array(op_g_) 121 | return t, op_g_ 122 | 123 | 124 | class OptimizationCalculator: 125 | def __init__(self, optimization_data: OptimizationData): 126 | self.optimization_data = optimization_data 127 | 128 | def apply_gaussian_filter(self, sigma: int, m: int) -> np.ndarray: 129 | return convolve_with_gaussian_kernel(self.optimization_data.predicted_op, sigma, m=m) 130 | 131 | def __compute_op_as_mid_times(self, op_g: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: 132 | return compute_op_as_mid_times(self.optimization_data.overlapping_partitions, op_g) 133 | 134 | def __util_method(self, s_peaks: np.ndarray, delta_with_time_unit: timedelta) -> Tuple[int, int, int, list]: 135 | """ 136 | Useful method for compute_f1score method. 
137 | Args: 138 | s_peaks (np.ndarray): peaks of op. 139 | delta_with_time_unit (timedelta): delta as number in unit time. 140 | 141 | Returns: 142 | tp, fp, fn, delta_t 143 | """ 144 | e_t = self.optimization_data.true_events.copy() 145 | 146 | fp: int = 0 147 | tp: int = 0 148 | delta_t: list = [] 149 | for m_p in s_peaks: 150 | signed_delta = delta_with_time_unit 151 | t_t = None 152 | for i, t_e in enumerate(e_t[MIDDLE_EVENT_LABEL]): 153 | m_t = t_e 154 | diff = m_p - m_t 155 | 156 | if abs(diff) <= delta_with_time_unit: 157 | if t_t is None or abs(m_p - t_t) > abs(diff): 158 | t_t = m_t 159 | signed_delta = diff 160 | 161 | if t_t is not None: 162 | tp += 1 163 | e_t = e_t.drop(e_t[e_t[MIDDLE_EVENT_LABEL] == t_t].index) 164 | diff = get_total_units(timedelta_=signed_delta, unit=self.optimization_data.time_unit) 165 | 166 | delta_t.append(diff) 167 | else: 168 | fp += 1 169 | fn: int = len(e_t) 170 | return tp, fp, fn, delta_t 171 | 172 | def compute_f1score(self, sigma: int, m: int, h: float): 173 | delta_with_time_unit = get_timedelta(self.optimization_data.delta, self.optimization_data.time_unit) 174 | op_g: np.ndarray = self.apply_gaussian_filter(sigma=sigma, m=m) 175 | t, op_g = self.__compute_op_as_mid_times(op_g=op_g) 176 | s_peaks = get_peaks(h=h, t=t, op_g=op_g) 177 | tp, fp, fn, delta_t = self.__util_method(s_peaks=s_peaks, delta_with_time_unit=delta_with_time_unit) 178 | 179 | if tp + fp == 0 or tp + fn == 0: 180 | return 0.0, 0.0, 0.0, [], [] 181 | 182 | precision = tp / (tp + fp) 183 | recall = tp / (tp + fn) 184 | if precision + recall == 0: 185 | return 0.0, 0.0, 0.0, [], [] 186 | return (2.0 * precision * recall) / (precision + recall), precision, recall, s_peaks.tolist(), delta_t 187 | 188 | def evaluate_combination(self, combination): 189 | sigma, m, h = combination 190 | f1_score, precision, recall, peaks, delta_t = self.compute_f1score(sigma, m, h) 191 | formatted_combination = ', '.join(f'{item:.4f}' for item in combination) 192 | if f1_score > 0: 193 | logger.info( 194 | f"Evaluated Combination [sigma, m, h] : [{formatted_combination}] => [F1 Score: {f1_score:.4f}, " 195 | f"Precision: {precision:.4f}, Recall: {recall:.4f}]") 196 | return f1_score, precision, recall, peaks, delta_t 197 | 198 | 199 | class EventOptimization: 200 | """ 201 | After obtaining the predicted op values from the metamodel, they are then processed 202 | through an optimization algorithm to extract the predicted events. This involves applying 203 | a Gaussian filter to smooth out the predictions and identifying peaks in the resulting signal 204 | that correspond to the mid-times of the predicted events, which are then compared to the 205 | actual events in the test set. The performance of the algorithm is evaluated by computing 206 | metrics such as F1-Score, which combines precision and recall using their harmonic means. 207 | Maximizing the F1-Score is the preferred metric for evaluating models since it requires 208 | simultaneously maximizing precision and recall. 
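Example (illustrative sketch; assumes `data` is an OptimizationData instance already populated
with the overlapping partitions, the predicted op values and the true events):

    optimizer = EventOptimization(optimization_data=data)
    peaks, delta_t = optimizer.max_f1score()
    # `peaks` holds the mid-times of the detected events for the best (sigma, m, h)
    # combination; `delta_t` holds the signed offsets (in the dataset's time unit)
    # between each detected event and its matched true event.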
209 | """ 210 | 211 | def __init__(self, optimization_data: OptimizationData): 212 | self.optimization_data = optimization_data 213 | self.optimization_calculator: OptimizationCalculator = OptimizationCalculator(self.optimization_data) 214 | self.results = () 215 | 216 | def max_f1score(self) -> tuple[list, list]: 217 | """ 218 | The optimization process aims to maximize the F1-Score metric by fine-tuning several parameters, 219 | including the filter size (2m + 1) and standard deviation (σ) of the Gaussian filter, 220 | and the peak height threshold h. 221 | 222 | Returns: 223 | list of peaks, delta_t 224 | """ 225 | sigma_range = range(1, self.optimization_data.big_sigma + 1) 226 | h_values = np.arange(0, 1, self.optimization_data.s_h) 227 | # Create a list of all combinations to evaluate 228 | combinations = [(sigma, m, h) for sigma in sigma_range for m in [sigma, 2 * sigma, 3 * sigma] for 229 | h in h_values] 230 | 231 | try: 232 | # Evaluate combinations sequentially 233 | results = [self.optimization_calculator.evaluate_combination(combination) for combination in 234 | combinations] 235 | except ValueError as e: 236 | logger.error(e) 237 | exit(0) 238 | 239 | # Find the combination with the maximum F1 score 240 | best_combination_index = np.argmax(list(map(lambda metrics: metrics[0], results))) 241 | best_combination = combinations[best_combination_index] 242 | config_dict["best_combination"] = best_combination 243 | self.results = results[best_combination_index] 244 | max_f1_score, precision, recall, peaks, delta_t = self.results 245 | 246 | formatted_combination = ', '.join(f'{item:.4f}' for item in best_combination) 247 | logger.warning( 248 | f"Best Combination [sigma, m, h] : [{formatted_combination}] => " 249 | f"[Max F1 Score: {max_f1_score:.4f} => Precision:{precision:.4f}, Recall:{recall:.4f}]") 250 | return peaks, delta_t 251 | -------------------------------------------------------------------------------- /eventdetector_ts/plotter/plotter.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | from typing import Dict 4 | 5 | import matplotlib.dates as mdates 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | import seaborn as sns 10 | from matplotlib.patches import Patch 11 | 12 | from eventdetector_ts import OUTPUT_DIR, TimeUnit, MIDDLE_EVENT_LABEL 13 | from eventdetector_ts.data.helpers_data import get_timedelta 14 | from eventdetector_ts.plotter import logger, COLOR_TRUE, COLOR_PREDICTED, STYLE_PREDICTED, STYLE_TRUE, FIG_SIZE, PALETTE 15 | from eventdetector_ts.plotter.helpers import event_to_rectangle 16 | 17 | 18 | class Plotter: 19 | """ 20 | The Plotter class is responsible for generating and saving plots of the predicted and true op, events, delta_t,... 21 | It provides a convenient way to visualize and compare the performance of a 22 | predictive model against the actual observed values. 23 | """ 24 | 25 | def __init__(self, root_dir: str, time_unit: TimeUnit, width_events_s: float) -> None: 26 | """ 27 | Initialize the Plotter object. 28 | 29 | Args: 30 | root_dir (str): The root directory for saving the plots. 31 | time_unit (TimeUnit): The unit time of the dataset. 32 | width_events_s (float): The width of events in the unit of time for the dataset. 
33 | """ 34 | 35 | self.val_losses = {} 36 | self.train_losses = {} 37 | self.val_loss_meta_model: list = [] 38 | self.train_loss_meta_model: list = [] 39 | self.width_events_s = width_events_s 40 | self.time_unit = time_unit 41 | # Whether to display the plots or not. Defaults to False. 42 | self.show = True 43 | self.root_dir = root_dir 44 | self.predicted_y: np.ndarray = np.empty(shape=(0,)) 45 | self.test_y: np.ndarray = np.empty(shape=(0,)) 46 | self.predicted_events: list = [] 47 | self.true_events: pd.DataFrame = pd.DataFrame() 48 | self.delta_t: list = [] 49 | self.working_dir = os.path.join(root_dir, OUTPUT_DIR) 50 | os.makedirs(self.working_dir) 51 | 52 | def set_show(self, show: bool) -> None: 53 | """ 54 | Set show value 55 | Args: 56 | show (bool): Value to set for 'self.show' 57 | 58 | Returns: 59 | None 60 | """ 61 | self.show = show 62 | 63 | def set_data_op(self, test_y: np.ndarray, predicted_y: np.ndarray) -> None: 64 | """ 65 | Set test_y and predicted_y 66 | Args: 67 | test_y: The true op values 68 | predicted_y: The predicted op values 69 | 70 | Returns: 71 | None 72 | """ 73 | self.test_y = test_y 74 | self.predicted_y = predicted_y 75 | 76 | def set_data_events(self, predicted_events: list, true_events: pd.DataFrame) -> None: 77 | """ 78 | Set true and predicted events 79 | Args: 80 | predicted_events (list): List of predicted events computed by the optimization process 81 | true_events (pd.DataFrame): DataFrame of true events 82 | 83 | Returns: 84 | None 85 | """ 86 | self.predicted_events = predicted_events 87 | self.true_events = true_events 88 | 89 | def set_delta_t(self, delta_t: list) -> None: 90 | """ 91 | Set delta_t 92 | Args: 93 | delta_t (list): Each item of this list contains the accepted delta in time unit between 94 | true event its correspondent in the list of predicted events 95 | 96 | Returns: 97 | None 98 | """ 99 | self.delta_t = delta_t 100 | 101 | def set_losses(self, train_losses: Dict[str, list], val_losses: Dict[str, list], 102 | train_loss_meta_model: list, val_loss_meta_model: list) -> None: 103 | """ 104 | Set losses of all trained models. 105 | Args: 106 | train_losses (Dict[str, list]): train losses. 107 | val_losses (Dict[str, list]): val losses. 108 | train_loss_meta_model (list): train loss for the metamodel. 109 | val_loss_meta_model (list): val loss for the metamodel. 110 | Returns: 111 | None 112 | """ 113 | self.train_losses = train_losses 114 | self.val_losses = val_losses 115 | self.train_loss_meta_model = train_loss_meta_model 116 | self.val_loss_meta_model = val_loss_meta_model 117 | 118 | def plot_prediction(self) -> None: 119 | """ 120 | Plot the true and the predicted op and save it. 
121 | 122 | Returns: 123 | None 124 | """ 125 | 126 | logger.info("Plotting and saving the figure displaying the true and the predicted op") 127 | # Create the plot using Seaborn 128 | # Set the ggplot style 129 | sns.set(style="ticks", palette=PALETTE) 130 | plt.figure(figsize=FIG_SIZE) # Set the figure size 131 | # Plot the true and predicted values using Seaborn 132 | n = len(self.test_y) 133 | sns.lineplot(x=np.arange(n), y=self.test_y, color=COLOR_TRUE, label='True Op') 134 | sns.lineplot(x=np.arange(n), y=self.predicted_y, color=COLOR_PREDICTED, label='Predicted Op') 135 | # Add labels and title to the plot 136 | plt.xlabel('Windows') 137 | plt.ylabel('Op') 138 | plt.title('True Op vs Predicted Op') 139 | # Add legend 140 | plt.legend() 141 | # Save the plot to a file 142 | path = os.path.join(self.working_dir, "op.png") 143 | plt.savefig(path, dpi=300) 144 | # Show the plot 145 | if self.show: 146 | plt.show() 147 | self.__save_op() 148 | 149 | def plot_predicted_events(self) -> None: 150 | """ 151 | Plot the true and the predicted events and save it. 152 | 153 | Returns: 154 | None 155 | """ 156 | 157 | logger.info("Plotting and saving the figure displaying the true events and the predicted events") 158 | fig, ax = plt.subplots(figsize=FIG_SIZE) 159 | sns.set(style="ticks", palette=PALETTE) 160 | 161 | for i, predicted_event in enumerate(self.predicted_events): 162 | rect1 = event_to_rectangle(event=predicted_event, width_events_s=self.width_events_s, 163 | time_unit=self.time_unit, 164 | color=COLOR_PREDICTED, 165 | style=STYLE_PREDICTED) 166 | ax.add_patch(rect1) 167 | 168 | for _, test_date in self.true_events[MIDDLE_EVENT_LABEL].items(): 169 | rect1 = event_to_rectangle(event=test_date, width_events_s=self.width_events_s, time_unit=self.time_unit, 170 | color=COLOR_TRUE, 171 | style=STYLE_TRUE) 172 | ax.add_patch(rect1) 173 | 174 | locator = mdates.AutoDateLocator(minticks=3) 175 | formatter = mdates.AutoDateFormatter(locator) 176 | ax.xaxis.set_major_locator(locator) 177 | ax.xaxis.set_major_formatter(formatter) 178 | 179 | start_time = self.true_events[MIDDLE_EVENT_LABEL].iloc[0] 180 | end_time = self.true_events[MIDDLE_EVENT_LABEL].iloc[-1] 181 | ax.set_xlim([start_time, end_time]) 182 | ax.set_ylim([0.0, 1.01]) 183 | 184 | predicted_patch = Patch(color=COLOR_PREDICTED, label='Predicted Events') 185 | true_patch = Patch(color=COLOR_TRUE, label='True Events') 186 | ax.legend(handles=[predicted_patch, true_patch], edgecolor="black") 187 | 188 | # Save the plot to a file 189 | path = os.path.join(self.working_dir, "events.png") 190 | plt.savefig(path, dpi=300) 191 | # Show the plot 192 | if self.show: 193 | plt.show() 194 | self.__save_events() 195 | 196 | def plot_delta_t(self, bins=30) -> None: 197 | """ 198 | Plots a histogram for delta t. 199 | 200 | Args: 201 | bins (int): The number of bins in the histogram. Default is 10. 
202 | 203 | Returns: 204 | None 205 | """ 206 | sns.set(style="ticks", palette=PALETTE) 207 | plt.figure(figsize=FIG_SIZE) 208 | 209 | sns.histplot(self.delta_t, bins=bins, binrange=(-self.width_events_s, self.width_events_s)) 210 | 211 | plt.xlabel(f'delta ({self.time_unit})') 212 | plt.ylabel('Number of events') 213 | 214 | std = np.std(self.delta_t) 215 | mu = np.mean(self.delta_t) 216 | 217 | plt.title(f'Histogram std = {std:.2f}, mu = {mu:.2f}') 218 | # Save the plot to a file 219 | path = os.path.join(self.working_dir, "delta_t.png") 220 | plt.savefig(path, dpi=300) 221 | # Show the plot 222 | if self.show: 223 | plt.show() 224 | 225 | def plot_losses(self): 226 | """ 227 | Plot losses for all trained models. 228 | Returns: 229 | None 230 | """ 231 | meta_model_was_used: bool = len(self.val_loss_meta_model) > 0 232 | 233 | sns.set(style="ticks", palette=PALETTE) 234 | if meta_model_was_used: 235 | fig, (ax1, ax2) = plt.subplots(1, 2) 236 | fig.set_size_inches((11, 8.5), forward=False) 237 | else: 238 | fig, ax1 = plt.subplots(figsize=FIG_SIZE) 239 | y_label = 'Loss' 240 | x_label = 'Epochs' 241 | colors = sns.color_palette(PALETTE, len(self.val_losses)) 242 | lifestyle_val = '--' 243 | lifestyle_train = '-' 244 | for i, (model_name, val_loss) in enumerate(self.val_losses.items()): 245 | epochs = range(1, len(val_loss) + 1) 246 | train_loss = self.train_losses[model_name] 247 | ax1.plot(epochs, train_loss, linestyle=lifestyle_train, color=colors[i], 248 | label='Training Loss - {}'.format(model_name)) 249 | ax1.plot(epochs, val_loss, linestyle=lifestyle_val, color=colors[i], 250 | label='Validation Loss - {}'.format(model_name)) 251 | ax1.set_ylabel(y_label) 252 | ax1.set_xlabel(x_label) 253 | ax1.legend() 254 | 255 | if len(self.val_loss_meta_model) > 0: 256 | epochs_meta = range(1, len(self.val_loss_meta_model) + 1) 257 | ax2.plot(epochs_meta, self.train_loss_meta_model, linestyle=lifestyle_train, color='b', 258 | label='Training Loss - Meta Model') 259 | ax2.plot(epochs_meta, self.val_loss_meta_model, linestyle=lifestyle_val, color='g', 260 | label='Validation Loss - Meta Model') 261 | ax2.set_ylabel(y_label) 262 | ax2.set_xlabel(x_label) 263 | ax2.legend() 264 | 265 | fig.suptitle('Training and Validation Losses') 266 | plt.tight_layout() 267 | # Save the plot to a file 268 | path = os.path.join(self.working_dir, "losses.png") 269 | plt.savefig(path, dpi=300) 270 | # Show the plot 271 | if self.show: 272 | plt.show() 273 | 274 | def __save_events(self) -> None: 275 | """ 276 | Save predicted events/true events to csv files. 
277 | 278 | Returns: 279 | None 280 | """ 281 | path = os.path.join(self.working_dir, "predicted_events.csv") 282 | radius = get_timedelta(float(self.width_events_s) / 2.0, self.time_unit) 283 | with open(path, 'w', encoding='UTF8', newline='') as f: 284 | writer = csv.writer(f, delimiter=' ') 285 | for i in range(len(self.predicted_events)): 286 | predicted_event = self.predicted_events[i] 287 | start_time = predicted_event - radius 288 | end_time = predicted_event + radius 289 | 290 | start_time = start_time.replace(microsecond=0) 291 | end_time = end_time.replace(microsecond=0) 292 | 293 | writer.writerow([start_time.isoformat(), end_time.isoformat()]) 294 | 295 | path = os.path.join(self.working_dir, "true_events.csv") 296 | with open(path, 'w', encoding='UTF8', newline='') as f: 297 | writer = csv.writer(f, delimiter=' ') 298 | for _, test_date in enumerate(self.true_events[MIDDLE_EVENT_LABEL]): 299 | start_time = test_date - radius 300 | end_time = test_date + radius 301 | 302 | start_time = start_time.replace(microsecond=0) 303 | end_time = end_time.replace(microsecond=0) 304 | 305 | writer.writerow([start_time.isoformat(), end_time.isoformat()]) 306 | 307 | def __save_op(self) -> None: 308 | """ 309 | Save predicted/true Op into csv file. 310 | 311 | Returns: 312 | None 313 | """ 314 | df = pd.DataFrame({'True-Op': self.test_y, 'Predicted-Op': self.predicted_y}) 315 | path = os.path.join(self.working_dir, "op.csv") 316 | df.to_csv(path, index=True, sep=" ") 317 | -------------------------------------------------------------------------------- /eventdetector_ts/models/models_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Dict, Tuple 3 | 4 | import joblib 5 | import numpy as np 6 | import tensorflow as tf 7 | from numpy import ndarray 8 | from sklearn.model_selection import KFold, train_test_split 9 | from sklearn.preprocessing import StandardScaler 10 | 11 | from eventdetector_ts import MODELS_DIR, META_MODEL_NETWORK, config_dict, TYPE_TRAINING_FFN, SCALERS_DIR, \ 12 | META_MODEL_SCALER 13 | from eventdetector_ts.metamodel.utils import DataSplitter 14 | from eventdetector_ts.models import logger_models 15 | from eventdetector_ts.models.helpers_models import CustomEarlyStopping, custom_cross_val_score 16 | from eventdetector_ts.models.models_builder import ModelBuilder 17 | 18 | 19 | class ModelTrainer: 20 | """ 21 | A class used to train and evaluate machine learning models. 22 | 23 | Attributes: 24 | data_splitter (DataSplitter): An object of the DataSplitter class, which is used to split the data 25 | into train and test sets. 26 | epochs (int): The number of epochs to train the models. 27 | batch_size (int): The batch size to use during training. 28 | pa (int): The patience value to use for the EarlyStopping callback. 29 | t_r (float): The ratio value to use for the CustomEarlyStopping callback. 30 | use_kfold (bool): Whether to use K-Fold cross-validation or not. 31 | val_size (float): The size of the validation set to use during training. 32 | epsilon (float): A small constant used to control the size of set which contains the top models 33 | with the lowest MSE values. 34 | save_models_as_dot_format (bool): Whether to save the models as a dot format file. 35 | The default value is False. If set to True, then you should have graphviz software 36 | to be installed on your machine. 37 | train_losses (Dict[str, np.ndarray]): A dictionary containing the training losses for each model. 
38 | val_losses (Dict[str, np.ndarray]): A dictionary containing the validation losses for each model. 39 | val_loss_meta_model (np.ndarray): val loss for the meta_model. 40 | train_loss_meta_model (np.ndarray): train loss for the meta_model 41 | """ 42 | 43 | def __init__(self, data_splitter: DataSplitter, epochs: int, 44 | batch_size: int, pa: int, t_r: float, 45 | use_kfold: bool, val_size: float, epsilon: float, save_models_as_dot_format: bool) -> None: 46 | """ 47 | Initialize the ModelTrainer object. 48 | 49 | Args: 50 | data_splitter (DataSplitter): An object of the DataSplitter class, which is used to split the data 51 | into train and test sets. 52 | epochs (int): The number of epochs to train the models. 53 | batch_size (int): The batch size to use during training. 54 | pa (int): The patience value to use for the EarlyStopping callback. 55 | t_r (float): The ratio value to use for the CustomEarlyStopping callback. 56 | use_kfold (bool): Whether to use K-Fold cross-validation or not. 57 | val_size (float): The size of the validation set to use during training. 58 | epsilon (float): A small constant used to control the size of set which contains the top models 59 | with the lowest MSE values. 60 | save_models_as_dot_format (bool): Whether to save the models as a dot format file. 61 | The default value is False. If set to True, then you should have graphviz software 62 | to be installed on your machine. 63 | """ 64 | 65 | self.val_loss_meta_model: list = [] 66 | self.train_loss_meta_model: list = [] 67 | self.save_models_as_dot_format = save_models_as_dot_format 68 | self.best_models: Dict[str, tf.keras.Model] = {} 69 | self.train_losses: Dict[str, list] = {} 70 | self.val_losses: Dict[str, list] = {} 71 | self.data_splitter: DataSplitter = data_splitter 72 | self.epochs: int = epochs 73 | self.batch_size: int = batch_size 74 | self.pa = pa 75 | self.t_r = t_r 76 | self.use_kfold = use_kfold 77 | self.val_size = val_size 78 | self.epsilon = epsilon 79 | 80 | def fitting_models(self, created_models: Dict[str, tf.keras.Model]) -> None: 81 | """ 82 | Fits the created models to the training data and saves the training and validation losses. 83 | 84 | Args: 85 | created_models: A dictionary containing the created models with their names as keys 86 | and the models as values. 
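Example (illustrative; the dictionary keys are hypothetical model names produced by the
model-building step, and `trainer` is a ModelTrainer instance):

    created_models = {"LSTM_0": lstm_model, "GRU_0": gru_model}
    trainer.fitting_models(created_models)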
87 | 88 | Returns: 89 | None 90 | """ 91 | # Define early stopping based on validation loss 92 | early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=self.pa * 2) 93 | # Define custom early stopping based on a ratio and patience 94 | custom_early_stopping = CustomEarlyStopping(ratio=self.t_r, patience=self.pa, verbose=1) 95 | # Loop through each model in the created models dictionary 96 | for model_name, model in created_models.items(): 97 | # If using k-fold cross-validation 98 | if self.use_kfold: 99 | logger_models.info("Performing a KFold cross-validation") 100 | # Calculate cross validation score using custom function 101 | score: np.ndarray = custom_cross_val_score(model=model, x=self.data_splitter.train_x, 102 | y=self.data_splitter.train_y, 103 | cv=KFold(n_splits=5, shuffle=False), epochs=self.epochs, 104 | batch_size=self.batch_size, 105 | callbacks=[early_stopping, custom_early_stopping]) 106 | # Print cross validation score for the current model 107 | logger_models.info(f"The cross validation score for {model_name} is {score}") 108 | # Split training data into training and validation sets 109 | train_x, val_x, train_y, val_y = train_test_split(self.data_splitter.train_x, self.data_splitter.train_y, 110 | test_size=self.val_size, 111 | shuffle=False) 112 | # Print message indicating fitting of a current model 113 | logger_models.info(f"Summary of {model_name}...") 114 | logger_models.info(model.summary()) 115 | logger_models.info(f"Fitting of {model_name}...") 116 | # Fit the model using training data and validate using validation data 117 | history = model.fit(train_x, train_y, epochs=self.epochs, 118 | batch_size=self.batch_size, verbose=1, 119 | validation_data=(val_x, val_y), 120 | callbacks=[early_stopping, custom_early_stopping]) 121 | # Save training and validation errors for the current model 122 | self.train_losses[model_name] = history.history['loss'] 123 | self.val_losses[model_name] = history.history['val_loss'] 124 | 125 | losses_test_data: Dict[str, tf.keras.Model] = {} 126 | min_loss = np.inf 127 | for model_name, model in created_models.items(): 128 | logger_models.info(f"Evaluating model {model_name} on test data") 129 | loss = model.evaluate(self.data_splitter.test_x, self.data_splitter.test_y, batch_size=self.batch_size) 130 | logger_models.info(f"The loss value of model {model_name} on test data is {loss:.4f}") 131 | losses_test_data[model_name] = loss 132 | if min_loss > loss: 133 | min_loss = loss 134 | 135 | logger_models.info(f"Selecting best models based on the min MSE {min_loss:.4f} and epsilon {self.epsilon}:") 136 | for model_name, loss_ in losses_test_data.items(): 137 | if loss_ <= (min_loss + self.epsilon): 138 | self.best_models[model_name] = created_models[model_name] 139 | logger_models.info(f"Best models selected: {self.best_models.keys()}") 140 | 141 | config_dict["models"] = list(self.best_models.keys()) 142 | 143 | def save_best_models(self, output_dir: str) -> None: 144 | """ 145 | Save the best models to the specified output directory. 146 | 147 | Args: 148 | output_dir (str): The directory to save the best models. 
149 | 150 | Returns: 151 | None 152 | """ 153 | 154 | for model_name, model in self.best_models.items(): 155 | # Print the name of the current model being saved 156 | logger_models.info(f"Current model to be saved on the disk is {model_name}") 157 | model_name_with_ext = f"{model_name}.keras" 158 | model_path = os.path.join(output_dir, MODELS_DIR, model_name_with_ext) 159 | 160 | # Save in Keras 3 compatible format 161 | model.save(model_path, save_format='keras') 162 | 163 | logger_models.info("Models saved successfully.") 164 | 165 | def train_meta_model(self, type_training: str, hyperparams_mm_network: Tuple[int, int], output_dir: str) \ 166 | -> tuple[ndarray, float, ndarray]: 167 | """ 168 | Trains the metamodel using the best models predictions as features. 169 | 170 | Args: 171 | type_training: The type of training to use, either "ffn" or "mean". 172 | hyperparams_mm_network: A tuple containing the hyperparameters the MetaModel network. 173 | output_dir: The directory to save the trained models to. 174 | 175 | Returns: 176 | A tuple containing the final prediction and the loss. 177 | """ 178 | predictions = [] 179 | for model_name, model in self.best_models.items(): 180 | # Make predictions for the test set using each model 181 | predicted_y: np.ndarray = model.predict(self.data_splitter.test_x, batch_size=self.batch_size) 182 | predicted_y = predicted_y.flatten() 183 | predictions.append(predicted_y) 184 | 185 | # Convert a list of 1D NumPy arrays to 2D NumPy array 186 | x = np.stack(predictions, axis=1) 187 | 188 | if type_training == TYPE_TRAINING_FFN: 189 | logger_models.info("Train the MetaModel using a FFN to produce a final prediction") 190 | # Split the data into training and test sets 191 | train_x, test_x, train_y, test_y = train_test_split(x, self.data_splitter.test_y, 192 | test_size=self.data_splitter.test_size, 193 | shuffle=False) 194 | scaler = StandardScaler() 195 | train_x = scaler.fit_transform(train_x) 196 | test_x = scaler.transform(test_x) 197 | scalers_dir = os.path.join(output_dir, SCALERS_DIR) 198 | scaler_path = os.path.join(scalers_dir, f"{META_MODEL_SCALER}.joblib") 199 | joblib.dump(scaler, scaler_path) 200 | # Build the FFN model 201 | inputs = tf.keras.Input(shape=(train_x.shape[1],), name="input") 202 | layers, units = hyperparams_mm_network 203 | model_builder: ModelBuilder = ModelBuilder(inputs=inputs) 204 | 205 | for _ in range(layers): 206 | units_j = units 207 | model_builder.add_dense_layer(units=units_j) 208 | model_builder.add_dense_layer(units=1, dropout=None) 209 | keras_model = model_builder.build(name=META_MODEL_NETWORK, root_dir=output_dir, 210 | save_models_as_dot_format=self.save_models_as_dot_format) 211 | # Train the model 212 | logger_models.info("Fitting the MetaModel network...") 213 | history = keras_model.fit(train_x, train_y, epochs=self.epochs, batch_size=self.batch_size, verbose=1, 214 | validation_data=(test_x, test_y)) 215 | 216 | path = os.path.join(output_dir, MODELS_DIR) 217 | model_path = os.path.join(path, META_MODEL_NETWORK) 218 | keras_model.save(model_path, save_format='keras') 219 | logger_models.info("MetaModel network saved successfully.") 220 | self.train_loss_meta_model = history.history['loss'] 221 | self.val_loss_meta_model = history.history['val_loss'] 222 | 223 | # final_prediction: np.ndarray = keras_model.predict(self.data_splitter.test_x, batch_size=self.batch_size) 224 | final_prediction: np.ndarray = keras_model.predict(test_x, batch_size=self.batch_size) 225 | final_prediction = 
final_prediction.flatten() 226 | return final_prediction, tf.keras.losses.mse(final_prediction, test_y), test_y 227 | else: 228 | # Compute the average prediction 229 | logger_models.info("Compute the average of predictions to produce a final prediction") 230 | final_prediction = np.mean(x, axis=1) 231 | return final_prediction, tf.keras.losses.mse(final_prediction, 232 | self.data_splitter.test_y), self.data_splitter.test_y 233 | -------------------------------------------------------------------------------- /images/logo_eventdetector.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /eventdetector_ts/metamodel/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | from typing import Dict, Tuple 4 | 5 | import joblib 6 | import numpy as np 7 | import pandas as pd 8 | from sklearn.model_selection import train_test_split 9 | from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler 10 | 11 | from eventdetector_ts import MIN_MAX_SCALER, ROBUST_SCALER, SCALERS_DIR, FILL_NAN_ZEROS, FILL_NAN_FFILL, \ 12 | FILL_NAN_BFILL, FILL_NAN_MEDIAN, RNN_BIDIRECTIONAL, CONV_LSTM1D, RNN_ENCODER_DECODER, FFN, CNN_RNN, \ 13 | GRU, CNN, SELF_ATTENTION, LSTM, TYPE_TRAINING_AVERAGE, TYPE_TRAINING_FFN, STANDARD_SCALER, TRANSFORMER 14 | from eventdetector_ts.data.helpers_data import InvalidArgumentError 15 | 16 | 17 | class DataSplitter: 18 | """ 19 | A class for splitting and scaling data into training, test sets and applying scalers to each 20 | time step in the data. 21 | """ 22 | 23 | def __init__(self, test_size: float, scaler_type: str): 24 | """ 25 | Initialize the DataSplitter object. 26 | 27 | Args: 28 | test_size: The fraction of data to use for testing. 29 | scaler_type: The type of scaler to use. 30 | """ 31 | 32 | self.train_x: np.ndarray = np.empty(shape=(0,)) 33 | self.test_x: np.ndarray = np.empty(shape=(0,)) 34 | self.train_y: np.ndarray = np.empty(shape=(0,)) 35 | self.test_y: np.ndarray = np.empty(shape=(0,)) 36 | self.scalers: Dict[int, StandardScaler | MinMaxScaler | ROBUST_SCALER] = {} 37 | self.test_size: float = test_size 38 | self.scaler_type: str = scaler_type 39 | 40 | def split_data_and_apply_scaler(self, x: np.ndarray, y: np.ndarray) -> None: 41 | """ 42 | Split the data into training, validation, and test sets and apply the specified scaler to each time step. 43 | 44 | Args: 45 | x: The input data with shape (n_samples, n_time_steps, n_features). 46 | y: The target data with shape (n_samples,). 47 | 48 | Returns: 49 | A tuple containing the training, validation, and test sets as numpy arrays and a dictionary of scalers. 50 | """ 51 | assert x.ndim == 3, "x must be a 3D array." 52 | assert y.ndim == 1, "y must be a 1D array." 53 | assert x.shape[0] == y.shape[0], "x and y must have the same number of samples." 
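# Note: the split below is chronological (shuffle=False); the scalers are fitted on the
# training partitions only and merely applied (transform) to the test partitions, which
# avoids leaking test-set statistics into the scaling step.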
54 | 55 | # Split the data into training and test sets 56 | self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(x, y, test_size=self.test_size, 57 | shuffle=False) 58 | 59 | n_time_steps = x.shape[1] 60 | 61 | self.scalers = {} 62 | # Apply scaler to each time step 63 | for i in range(n_time_steps): 64 | scaler = StandardScaler() 65 | if self.scaler_type == MIN_MAX_SCALER: 66 | scaler = MinMaxScaler() 67 | elif self.scaler_type == ROBUST_SCALER: 68 | scaler = RobustScaler() 69 | self.scalers[i] = scaler 70 | self.train_x[:, i, :] = self.scalers[i].fit_transform(self.train_x[:, i, :]) 71 | self.test_x[:, i, :] = self.scalers[i].transform(self.test_x[:, i, :]) 72 | 73 | def save_scalers(self, output_dir: str) -> None: 74 | """ 75 | Saves the scalers to disk. 76 | 77 | Args: 78 | output_dir: the directory where the scalers should be saved 79 | 80 | Returns: 81 | None 82 | """ 83 | # Create the directory if it doesn't exist 84 | scalers_dir = os.path.join(output_dir, SCALERS_DIR) 85 | if not os.path.exists(scalers_dir): 86 | os.makedirs(scalers_dir) 87 | 88 | # Save each scaler to disk 89 | n_time_steps: int = self.test_x.shape[1] 90 | for i in range(n_time_steps): 91 | # Generate the path to save the scaler to 92 | scaler_i_path = os.path.join(scalers_dir, f'scaler_{i}.joblib') 93 | # Print progress 94 | print("\rSaving scaling...{}/{}".format(i + 1, n_time_steps), end="") 95 | # Save the scaler to disk 96 | joblib.dump(self.scalers[i], scaler_i_path) 97 | print() 98 | 99 | 100 | def validate_required_args(meta_model) -> None: 101 | """ 102 | Validate the required arguments of the MetaModel. 103 | 104 | Args: 105 | meta_model (MetaModel): A MetaModel instance. 106 | 107 | Returns: 108 | None 109 | 110 | Raises: 111 | ValueError: If any of the arguments are invalid. 
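Example of arguments that pass these checks (illustrative only):

    meta_model.dataset      # a non-empty pd.DataFrame with len(dataset) >= meta_model.width
    meta_model.events = [['2022-01-01', '2022-01-02'], ['2022-01-05', '2022-01-06']]
    meta_model.output_dir = "run_01"    # must match ^\w+$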
112 | """ 113 | 114 | __validate_required_args(meta_model=meta_model) 115 | 116 | if meta_model.dataset is None or meta_model.dataset.empty: 117 | raise InvalidArgumentError("dataset cannot be None or empty.") 118 | elif not isinstance(meta_model.dataset, pd.DataFrame): 119 | raise InvalidArgumentError("dataset should be a Pandas DataFrame.") 120 | 121 | if len(meta_model.dataset) < meta_model.width: 122 | raise InvalidArgumentError("Dataset length is smaller than the given partition width.") 123 | 124 | if meta_model.events is None or (isinstance(meta_model.events, pd.DataFrame) and meta_model.events.empty) or \ 125 | (isinstance(meta_model.events, list) and len(meta_model.events) == 0): 126 | raise InvalidArgumentError("Events is empty or None.") 127 | elif not isinstance(meta_model.events, (list, pd.DataFrame)): 128 | raise InvalidArgumentError("Events should be a list or a Pandas DataFrame.") 129 | 130 | if not re.match(r"^\w+$", meta_model.output_dir): 131 | raise InvalidArgumentError( 132 | "Output directory name can only contain alphanumeric characters and underscores.") 133 | 134 | 135 | def __validate_required_args(meta_model) -> None: 136 | if not isinstance(meta_model.step, int) or meta_model.step <= 0: 137 | raise InvalidArgumentError("step should be a positive integer.") 138 | 139 | if not isinstance(meta_model.width, int) or meta_model.width <= meta_model.step: 140 | raise InvalidArgumentError(f"width should be greater than step = {meta_model.step}.") 141 | 142 | if meta_model.width_events is not None and not isinstance(meta_model.width_events, 143 | (int, float)) and meta_model.width_events <= 0: 144 | raise InvalidArgumentError("width_events should be either a positive integer or positive float.") 145 | 146 | 147 | def validate_args(meta_model) -> None: 148 | """ 149 | Validate the arguments of the MetaModel. 150 | 151 | Args: 152 | meta_model (MetaModel): A MetaModel instance. 153 | 154 | Returns: 155 | None 156 | 157 | Raises: 158 | ValueError: If any of the arguments are invalid. 159 | """ 160 | 161 | validate_args_1(meta_model) 162 | validate_args_2(meta_model) 163 | validate_args_3(meta_model) 164 | validate_args_4(meta_model) 165 | validate_args_5(meta_model) 166 | 167 | if len(meta_model.hyperparams_mm_network) != 3: 168 | raise ValueError("hyperparams_mm_network must be a tuple of length 3") 169 | 170 | if not all(isinstance(val, int) for val in meta_model.hyperparams_mm_network[:-1]): 171 | raise ValueError("hyperparams_mm_network values must be integers except the last which is" 172 | " the activation function (str)") 173 | 174 | if not isinstance(meta_model.save_models_as_dot_format, bool): 175 | raise InvalidArgumentError("Invalid save_models_as_dot_format parameter: must be a boolean.") 176 | 177 | if meta_model.dropout is None or not 0 <= meta_model.dropout < 1 or not isinstance(meta_model.dropout, float): 178 | raise InvalidArgumentError("Invalid dropout parameter: must be a float between 0 and 1.0.") 179 | 180 | 181 | def validate_args_1(meta_model) -> None: 182 | """ 183 | Validate the arguments of the MetaModel. 184 | 185 | Args: 186 | meta_model (MetaModel): A MetaModel instance. 187 | 188 | Returns: 189 | None 190 | 191 | Raises: 192 | ValueError: If any of the arguments are invalid. 193 | """ 194 | 195 | if meta_model.fill_nan not in [FILL_NAN_ZEROS, FILL_NAN_FFILL, FILL_NAN_BFILL, FILL_NAN_MEDIAN]: 196 | raise InvalidArgumentError( 197 | f"Invalid method for filling NaN values. 
Supported methods are" 198 | f" {FILL_NAN_ZEROS}, {FILL_NAN_FFILL}, {FILL_NAN_BFILL}, and {FILL_NAN_MEDIAN}.") 199 | 200 | if not isinstance(meta_model.epochs, int) or meta_model.epochs <= 0: 201 | raise InvalidArgumentError("epochs should be a positive integer.") 202 | 203 | if not isinstance(meta_model.batch_size, int) or meta_model.batch_size <= 0: 204 | raise InvalidArgumentError("batch_size should be a positive integer.") 205 | 206 | if not isinstance(meta_model.t_max, float) and not isinstance(meta_model.t_max, int): 207 | raise InvalidArgumentError("t_max should be float/int.") 208 | 209 | if meta_model.t_max <= meta_model.w_s: 210 | raise InvalidArgumentError(f"t_max should be greater than w_s {meta_model.w_s}.") 211 | 212 | if not isinstance(meta_model.delta, (int, float)) or meta_model.delta <= 0: 213 | raise InvalidArgumentError("delta should be either a positive integer or positive float.") 214 | 215 | if not (0 < meta_model.s_h < 1): 216 | raise InvalidArgumentError("s_h should be a float between 0 and 1 exclusive.") 217 | 218 | if not isinstance(meta_model.epsilon, float) or not (0 < meta_model.epsilon <= 1): 219 | raise InvalidArgumentError("epsilon should be a positive number between 0 and 1.") 220 | 221 | if not isinstance(meta_model.pa, int) or meta_model.pa <= 0: 222 | raise InvalidArgumentError("pa should be a positive integer.") 223 | 224 | 225 | def validate_args_2(meta_model) -> None: 226 | """ 227 | Validate the arguments of the MetaModel. 228 | 229 | Args: 230 | meta_model (MetaModel): A MetaModel instance. 231 | 232 | Returns: 233 | None 234 | 235 | Raises: 236 | ValueError: If any of the arguments are invalid. 237 | """ 238 | 239 | if not isinstance(meta_model.t_r, float) or not (0 < meta_model.t_r <= 1): 240 | raise InvalidArgumentError("t_r should be a positive number between 0 and 1.") 241 | 242 | if meta_model.time_window is not None and ( 243 | not isinstance(meta_model.time_window, (int, float)) or meta_model.time_window <= 0): 244 | raise InvalidArgumentError("time_window should be either a positive integer or positive float.") 245 | 246 | if not all(isinstance(model, (str, tuple)) and 247 | (isinstance(model, str) or (isinstance(model, tuple) and len(model) == 2 and isinstance(model[0], 248 | str) and 249 | isinstance(model[1], int))) 250 | for model in meta_model.models): 251 | raise InvalidArgumentError( 252 | "Invalid format for models. It should be a list of strings or tuples of (string, integer).") 253 | 254 | 255 | def validate_model_type(model): 256 | if model not in [LSTM, GRU, CNN, RNN_BIDIRECTIONAL, CONV_LSTM1D, RNN_ENCODER_DECODER, CNN_RNN, 257 | SELF_ATTENTION, FFN, TRANSFORMER]: 258 | raise InvalidArgumentError( 259 | f"Invalid model type {model}. Supported models are {LSTM}, {GRU}, {CNN}, {RNN_BIDIRECTIONAL}," 260 | f" {CONV_LSTM1D}, {RNN_ENCODER_DECODER}, {CNN_RNN}, {SELF_ATTENTION}, {TRANSFORMER}, and {FFN}.") 261 | 262 | 263 | def validate_model_instances(model_instances): 264 | if not isinstance(model_instances, int) or model_instances <= 0: 265 | raise InvalidArgumentError("Number of model instances should be a positive integer.") 266 | 267 | 268 | def validate_args_3(meta_model) -> None: 269 | """ 270 | Validate the arguments of the MetaModel. 271 | 272 | Args: 273 | meta_model (MetaModel): A MetaModel instance. 274 | 275 | Returns: 276 | None 277 | 278 | Raises: 279 | ValueError: If any of the arguments are invalid. 
280 | """ 281 | 282 | for model in meta_model.models: 283 | if isinstance(model, str): 284 | validate_model_type(model) 285 | elif isinstance(model, tuple) and len(model) == 2: 286 | model_type, model_instances = model 287 | validate_model_type(model_type) 288 | validate_model_instances(model_instances) 289 | else: 290 | raise InvalidArgumentError(f"Invalid model specification {model}.") 291 | 292 | 293 | def validate_args_4(meta_model) -> None: 294 | """ 295 | Validate the arguments of the MetaModel. 296 | 297 | Args: 298 | meta_model (MetaModel): A MetaModel instance. 299 | 300 | Returns: 301 | None 302 | 303 | Raises: 304 | ValueError: If any of the arguments are invalid. 305 | """ 306 | 307 | if meta_model.type_training not in [TYPE_TRAINING_AVERAGE, TYPE_TRAINING_FFN]: 308 | raise InvalidArgumentError( 309 | f"Invalid type of training technique. Supported techniques are " 310 | f"{TYPE_TRAINING_AVERAGE} and {TYPE_TRAINING_FFN}.") 311 | 312 | if meta_model.scaler not in [MIN_MAX_SCALER, STANDARD_SCALER, ROBUST_SCALER]: 313 | raise InvalidArgumentError( 314 | f"Invalid type of scaler technique. Supported techniques are {MIN_MAX_SCALER}," 315 | f" {STANDARD_SCALER} and {ROBUST_SCALER}.") 316 | 317 | if not isinstance(meta_model.use_kfold, bool): 318 | raise InvalidArgumentError("Invalid use_kfold parameter: must be a boolean.") 319 | 320 | if not 0 < meta_model.test_size < 1 or not isinstance(meta_model.test_size, float): 321 | raise InvalidArgumentError("Invalid test_size parameter: must be a float between 0 and 1.") 322 | 323 | if not 0 < meta_model.val_size < 1 or not isinstance(meta_model.val_size, float): 324 | raise InvalidArgumentError("Invalid val_size parameter: must be a float between 0 and 1.") 325 | 326 | if len(meta_model.hyperparams_transformer) != 5: 327 | raise ValueError("hyperparams_transformer must be a tuple of length 5") 328 | 329 | 330 | def validate_args_5(meta_model) -> None: 331 | """ 332 | Validate the arguments of the MetaModel. 333 | 334 | Args: 335 | meta_model (MetaModel): A MetaModel instance. 336 | 337 | Returns: 338 | None 339 | 340 | Raises: 341 | ValueError: If any of the arguments are invalid. 
342 | """ 343 | 344 | param1, param2, param3, param4, param5 = meta_model.hyperparams_transformer 345 | if not (all(isinstance(p, int) for p in [param1, param2, param3]) and isinstance(param4, bool) and isinstance( 346 | param5, 347 | str)): 348 | raise ValueError("hyperparams_transformer values must be Tuple[int, int, int, bool, str]") 349 | 350 | 351 | def validate_ffn(meta_model) -> Tuple: 352 | hyperparams_ffn = meta_model.hyperparams_ffn 353 | l_ffn = len(hyperparams_ffn) 354 | print(l_ffn) 355 | 356 | if not 2 < l_ffn < 6: 357 | raise ValueError("hyperparams_ffn must be a tuple of length 3, 4 or 5") 358 | 359 | if l_ffn == 3: 360 | max_layers, min_neurons, max_neurons = hyperparams_ffn 361 | return 1, max_layers, min_neurons, max_neurons, "sigmoid" 362 | 363 | if l_ffn == 4: 364 | if isinstance(hyperparams_ffn[-1], str): 365 | max_layers, min_neurons, max_neurons, activation = hyperparams_ffn 366 | return 1, max_layers, min_neurons, max_neurons, activation 367 | else: 368 | min_layers, max_layers, min_neurons, max_neurons = hyperparams_ffn 369 | return min_layers, max_layers, min_neurons, max_neurons, "sigmoid" 370 | 371 | return hyperparams_ffn 372 | 373 | 374 | def validate_cnn(meta_model) -> Tuple: 375 | hyperparams_cnn = meta_model.hyperparams_cnn 376 | l_cnn = len(hyperparams_cnn) 377 | 378 | if not 4 < l_cnn < 8: 379 | raise ValueError("hyperparams_cnn must be a tuple of length between 5 and 7.") 380 | 381 | if l_cnn == 5: 382 | min_f, max_f, min_k, max_k, max_layers = hyperparams_cnn 383 | return min_f, max_f, min_k, max_k, 1, max_layers, "relu" 384 | 385 | if l_cnn == 6: 386 | if isinstance(hyperparams_cnn[-1], str): 387 | min_f, max_f, min_k, max_k, max_layers, activation = hyperparams_cnn 388 | return min_f, max_f, min_k, max_k, 1, max_layers, activation 389 | else: 390 | min_f, max_f, min_k, max_k, min_layers, max_layers = hyperparams_cnn 391 | return min_f, max_f, min_k, max_k, min_layers, max_layers, "relu" 392 | 393 | return hyperparams_cnn 394 | 395 | 396 | def validate_rnn(meta_model) -> Tuple: 397 | hyperparams_rnn = meta_model.hyperparams_rnn 398 | l_rnn = len(hyperparams_rnn) 399 | 400 | if not 2 < l_rnn < 6: 401 | raise ValueError("hyperparams_rnn must be a tuple of length 3, 4 or 5") 402 | 403 | if l_rnn == 3: 404 | max_layers, min_neurons, max_neurons = hyperparams_rnn 405 | return 1, max_layers, min_neurons, max_neurons, "tanh" 406 | 407 | if l_rnn == 4: 408 | if isinstance(hyperparams_rnn[-1], str): 409 | max_layers, min_neurons, max_neurons, activation = hyperparams_rnn 410 | return 1, max_layers, min_neurons, max_neurons, activation 411 | else: 412 | min_layers, max_layers, min_neurons, max_neurons = hyperparams_rnn 413 | return min_layers, max_layers, min_neurons, max_neurons, "tanh" 414 | 415 | return hyperparams_rnn 416 | -------------------------------------------------------------------------------- /eventdetector_ts/data/helpers_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime, timedelta 3 | from functools import reduce 4 | from typing import Optional, Union, Tuple, Dict 5 | 6 | import numpy as np 7 | import pandas as pd 8 | from dateutil.parser import parser 9 | # noinspection PyUnresolvedReferences 10 | from numpy.lib.stride_tricks import as_strided 11 | from pandas.core.dtypes.common import is_datetime64_any_dtype 12 | 13 | from eventdetector_ts import TIME_LABEL, FILL_NAN_ZEROS, FILL_NAN_FFILL, FILL_NAN_BFILL, FILL_NAN_MEDIAN, \ 14 | MIDDLE_EVENT_LABEL, 
TimeUnit 15 | from eventdetector_ts.data import VALUE_ERROR 16 | from eventdetector_ts.data.interval import Interval 17 | 18 | 19 | def overlapping_partitions(data: np.ndarray, width: int, step: int = 1): 20 | """ 21 | Splits an input numpy array into a set of overlapping partitions. 22 | 23 | Args: 24 | data: Input numpy array to be split into overlapping partitions 25 | width: Width of each overlapping partition 26 | step: The step size between successive partitions (default=1) 27 | 28 | Returns: 29 | Numpy array of shape (nb_partitions, width, data.ndim), containing the created overlapping partitions. 30 | """ 31 | if width > data.shape[0]: 32 | raise ValueError("Partition size cannot be greater than the size of the input data") 33 | if step > width: 34 | raise ValueError("Step size cannot be greater than partition size") 35 | 36 | # Compute the parameters for creating the overlapping partitions 37 | np_partitions = (data.shape[0] - width) // step + 1 38 | shape = (np_partitions, width) + data.shape[1:] 39 | strides = (step * data.strides[0],) + data.strides 40 | 41 | # Use as_strided to create the overlapping partitions 42 | partitioned_array = as_strided(data, shape=shape, strides=strides) 43 | 44 | return partitioned_array 45 | 46 | 47 | def convert_dataframe_to_overlapping_partitions( 48 | dataframe: pd.DataFrame, 49 | width: int, 50 | step: int, 51 | fill_method: Optional[str] = None 52 | ) -> np.ndarray: 53 | """ 54 | Converts a given DataFrame to overlapping partitions. 55 | 56 | Args: 57 | dataframe: Input DataFrame of features 58 | width: Width of each overlapping partition 59 | step: The step size between successive partitions 60 | fill_method: The method to use for filling NaNs. Supported methods are 'zeros', 'ffill', 'bfill', and 'median'. 61 | If None, NaNs are left as-is. (default=None) 62 | 63 | Returns: 64 | Numpy array of shape (np_partitions, width, nb_features), containing the created overlapping partitions. 65 | """ 66 | 67 | dataframe = dataframe.copy() 68 | dataframe.index = pd.to_datetime(dataframe.index) 69 | dataframe.loc[:, TIME_LABEL] = dataframe.index.to_pydatetime() 70 | 71 | if fill_method == FILL_NAN_ZEROS: 72 | dataframe = dataframe.fillna(0) 73 | elif fill_method == FILL_NAN_FFILL: 74 | dataframe = dataframe.ffill() 75 | elif fill_method == FILL_NAN_BFILL: 76 | dataframe = dataframe.bfill() 77 | elif fill_method == FILL_NAN_MEDIAN: 78 | dataframe = dataframe.fillna(dataframe.median()) 79 | elif fill_method is not None: 80 | raise ValueError(f"Unsupported fill method: {fill_method}") 81 | 82 | sw = overlapping_partitions(dataframe.to_numpy(), width=width, step=step) 83 | return sw 84 | 85 | 86 | class InvalidArgumentError(ValueError): 87 | """Raised when an invalid argument is passed to a function or method.""" 88 | 89 | def __init__(self, message): 90 | """ 91 | Initialize a new InvalidArgumentError with the specified error message. 92 | 93 | Args: 94 | message (str): The error message to display. 95 | """ 96 | super().__init__(message) 97 | 98 | 99 | def convert_time_to_datetime(date: Union[str, pd.Timestamp, float, int], to_timestamp: bool = True) -> \ 100 | Union[float, datetime]: 101 | """ 102 | Converts a date string, pandas Timestamp, or numeric timestamp to a Python datetime or Unix timestamp. 103 | 104 | Args: 105 | date: The input date as a string, pandas Timestamp, or numeric timestamp. 106 | to_timestamp: If True (default), return the date as a Unix timestamp (float), otherwise as a Python datetime. 
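Example (illustrative):
    convert_time_to_datetime("2022-01-01")                       # -> 1640995200.0
    convert_time_to_datetime("2022-01-01", to_timestamp=False)   # -> datetime(2022, 1, 1, 0, 0)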
107 | 108 | Returns: 109 | The input date as a Unix timestamp or Python datetime object. 110 | """ 111 | 112 | if isinstance(date, pd.Timestamp): 113 | dt = date.to_pydatetime() 114 | elif isinstance(date, (float, int)): 115 | dt = datetime.fromtimestamp(date) 116 | elif isinstance(date, str): 117 | dt = parser.parse(date, ignoretz=True) 118 | else: 119 | raise ValueError(f"Invalid date format {date}. Supported formats are str, pd.Timestamp, float, and int.") 120 | 121 | if to_timestamp: 122 | return (dt - datetime(1970, 1, 1)).total_seconds() 123 | return dt 124 | 125 | 126 | def num_columns(lst: list) -> int: 127 | """ 128 | Returns the number of columns in a list. 129 | 130 | Args: 131 | lst (list): The list to check. 132 | 133 | Returns: 134 | int: The number of columns in the list. 135 | """ 136 | 137 | if not lst: 138 | # if the list is empty return 0 139 | return 0 140 | elif isinstance(lst[0], list): 141 | # if the first element of the list is a list, return the length of the first list 142 | return len(lst[0]) 143 | else: 144 | # otherwise return 1, because the list has only one column 145 | return 1 146 | 147 | 148 | def compute_middle_event(events: Union[list, pd.DataFrame]) -> pd.DataFrame: 149 | """ 150 | Computes the middle date of events and returns it as a DataFrame. 151 | 152 | Args: 153 | events (Union[list, pd.DataFrame]): A list or pandas DataFrame containing the starting and ending 154 | dates of events. 155 | 156 | Returns: 157 | pd.DataFrame: A pandas DataFrame with a single column containing the middle dates of events. 158 | """ 159 | column1 = "Starting Date" 160 | column2 = "Ending Date" 161 | is2d = True 162 | 163 | if isinstance(events, list): 164 | nb_columns = num_columns(events) 165 | if nb_columns == 2: 166 | df = pd.DataFrame(events, columns=[column1, column2]) 167 | elif nb_columns == 1: 168 | df = pd.DataFrame(events, columns=[column1]) 169 | 170 | is2d = False 171 | else: 172 | raise ValueError( 173 | f"The list of events is not compatible. The number of columns {nb_columns} should not exceed 2.") 174 | elif isinstance(events, pd.DataFrame): 175 | df = events 176 | columns = events.columns 177 | if len(columns) == 2: 178 | df = df.rename(columns={columns[0]: column1, columns[1]: column2}) 179 | elif len(columns) == 1: 180 | is2d = False 181 | df = df.rename(columns={columns[0]: column1}) 182 | else: 183 | raise ValueError("The dataframe of events in not compatible, columns should not exceed 2") 184 | else: 185 | raise ValueError("The events argument must be a list or pandas DataFrame.") 186 | 187 | df[column1] = pd.to_datetime(df[column1]) 188 | if is2d: 189 | df[column2] = pd.to_datetime(df[column2]) 190 | 191 | if is2d: 192 | df[column1] = df[column1].apply(lambda x: convert_time_to_datetime(x) / 2) 193 | df[column2] = df[column2].apply(lambda x: convert_time_to_datetime(x) / 2) 194 | df[MIDDLE_EVENT_LABEL] = df[column1] + df[column2] 195 | else: 196 | df[MIDDLE_EVENT_LABEL] = df[column1].apply(lambda x: convert_time_to_datetime(x)) 197 | 198 | df[MIDDLE_EVENT_LABEL] = df[MIDDLE_EVENT_LABEL].apply(lambda x: datetime.utcfromtimestamp(x)) 199 | df = df[[MIDDLE_EVENT_LABEL]] 200 | df = df.sort_values(by=MIDDLE_EVENT_LABEL) 201 | return df 202 | 203 | 204 | def remove_close_events(events_df: pd.DataFrame, delta_unit_time: int, unit: TimeUnit, 205 | remove_overlapping_events: bool) -> pd.DataFrame: 206 | """ 207 | Removes events from a DataFrame that occur too close together. 
208 | 209 | Args: 210 | unit: The time unit 211 | events_df: A pandas DataFrame containing events with a column named 'middle_event'. 212 | delta_unit_time: A integer representing the minimum time in unit time between events. 213 | remove_overlapping_events: A flag to indicate if we remove the overlapping events or not. 214 | 215 | Returns: 216 | A pandas DataFrame with close events removed. 217 | """ 218 | 219 | # Convert delta to timedelta 220 | delta = get_timedelta(delta_unit_time, unit) 221 | 222 | # List to hold indices of events to delete 223 | events_to_delete = [] 224 | 225 | # Loop through all events 226 | for i in range(len(events_df)): 227 | # Get middle time of the current event 228 | mid_time = events_df.iloc[i][MIDDLE_EVENT_LABEL] 229 | 230 | # Skip current event if it's already marked for deletion 231 | if i in events_to_delete: 232 | continue 233 | 234 | # Loop through all remaining events 235 | for j in range(i + 1, len(events_df)): 236 | # Get middle time of the next event 237 | mid_time1 = events_df.iloc[j][MIDDLE_EVENT_LABEL] 238 | 239 | # If the next event is too close to the current event, mark it for deletion 240 | if (mid_time1 - mid_time) <= delta: 241 | events_to_delete.append(j) 242 | else: 243 | break 244 | 245 | # Drop events that were marked for deletion 246 | if remove_overlapping_events: 247 | return events_df.drop(events_df.index[events_to_delete]) 248 | return events_df 249 | 250 | 251 | def convert_events_to_intervals(events_df: pd.DataFrame, width_events_s: float, unit: TimeUnit) \ 252 | -> list[Interval]: 253 | """ 254 | Convert events from a pandas DataFrame to intervals. 255 | 256 | Args: 257 | events_df (pd.DataFrame): DataFrame containing the events' data. 258 | width_events_s (float): The width of events in the unit of time for the dataset. 259 | unit: The unit time 260 | 261 | Returns: 262 | list[Interval]: A list of intervals. 263 | 264 | """ 265 | # Create an empty list to store the intervals 266 | events_intervals = [] 267 | 268 | # Loop over the events in the DataFrame 269 | for i in range(len(events_df)): 270 | # Get the middle event time 271 | mid_time = events_df.iloc[i][MIDDLE_EVENT_LABEL] 272 | 273 | width_events_s_float = float(width_events_s) 274 | # Compute the radius of the interval based on the event size 275 | radius = get_timedelta(delta_unit_time=width_events_s_float / 2, unit=unit) 276 | 277 | # Create an interval with the middle event time at the center 278 | interval = Interval(mid_time - radius, mid_time + radius) 279 | 280 | # Add the interval to the list of intervals 281 | events_intervals.append(interval) 282 | 283 | # Return the list of intervals 284 | return events_intervals 285 | 286 | 287 | def get_union_times_events(events_df: pd.DataFrame, time_window: int, unit_time: TimeUnit) -> pd.DatetimeIndex: 288 | """ 289 | Given a DataFrame of events and a time partition size in unit time, computes a DatetimeIndex of all times during 290 | which at least one event was taking place. 291 | 292 | Args: 293 | events_df (pd.DataFrame): A DataFrame containing at least a MIDDLE_EVENT_LABEL column with the datetime 294 | of each event. 295 | time_window (int): The size of the time window to consider before and after each event. 296 | unit_time (TimeUnit): The unit time 297 | 298 | Returns: 299 | pd.DatetimeIndex: A DatetimeIndex of all times during which at least one event was taking place. 
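    Example (a sketch only; assumes `events_df` already holds a MIDDLE_EVENT_LABEL column of event mid-times):
        >>> union_times = get_union_times_events(events_df, time_window=2, unit_time=TimeUnit.HOUR)
        >>> # `union_times` marks the merged [event - 2 hours, event + 2 hours] ranges around the events;
        >>> # it is consumed downstream by get_dataset_within_events_times.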
300 | """ 301 | 302 | times_during_events = [] 303 | previous_range = None 304 | for i, event_time in enumerate(events_df[MIDDLE_EVENT_LABEL]): 305 | start_time = event_time - get_timedelta(time_window, unit=unit_time) 306 | end_time = event_time + get_timedelta(time_window, unit=unit_time) 307 | # Generate a list of dates between start_time and end_time with a frequency of exactly (end_time - start_time). 308 | # This ensures that the last date is exactly equal to end_time (useful when we generate overlapping ranges). 309 | dates_between = pd.date_range(start=start_time, end=end_time, freq=end_time - start_time) 310 | 311 | if previous_range is None: 312 | times_during_events.append(dates_between) 313 | previous_range = dates_between 314 | else: 315 | # Check if the current range overlaps with the previous one. 316 | ranges_overlap = max(previous_range[0], previous_range[-1]) < min(dates_between[0], dates_between[-1]) 317 | if not ranges_overlap: 318 | # If the ranges don't overlap, then we need to merge the previous and current ranges. 319 | merged_range = pd.date_range(start=previous_range[0], end=dates_between[-1], 320 | freq=dates_between[-1] - previous_range[0]) 321 | # Replace the last range we added to the list with the merged range. 322 | times_during_events[-1] = merged_range 323 | previous_range = merged_range 324 | else: 325 | previous_range = dates_between 326 | times_during_events.append(dates_between) 327 | 328 | # Use the reduce function to combine all the overlapping ranges we generated. 329 | union_ranges = reduce(lambda x, y: x.union(y), times_during_events) 330 | # Remove any timezone information from the resulting DatetimeIndex, if present. 331 | union_ranges = union_ranges.tz_localize(None) 332 | return union_ranges 333 | 334 | 335 | def get_dataset_within_events_times(data_set: pd.DataFrame, events_times: pd.DatetimeIndex) -> pd.DataFrame: 336 | """ 337 | Extracts the data from the given dataset that falls within the specified event times. 338 | 339 | Args: 340 | data_set: A pandas DataFrame containing the data to extract. 341 | events_times: A pandas DatetimeIndex containing the times of events. 342 | 343 | Returns: 344 | A pandas DataFrame containing the data within the specified event times. 345 | """ 346 | 347 | dataset_within_events_times = [] 348 | 349 | # Iterate through the event times by pairs 350 | for i in range(0, len(events_times) - 1, 2): 351 | partition_start_time = events_times[i] 352 | partition_end_time = events_times[i + 1] 353 | 354 | # Extract the data within the event time 355 | data_within_event_time = data_set.loc[partition_start_time: partition_end_time] 356 | 357 | dataset_within_events_times.append(data_within_event_time) 358 | 359 | # Concatenate all the data extracted from events times 360 | return pd.concat(dataset_within_events_times) 361 | 362 | 363 | def op(dataset_as_overlapping_partitions: np.ndarray, events_as_intervals: list[Interval]) -> \ 364 | tuple[np.ndarray, np.ndarray]: 365 | """ 366 | Calculates the "op" value for each overlapping partition in the dataset, based on the overlapping parameter 367 | between the partition and a set of events. 368 | 369 | Args: dataset_as_overlapping_partitions: A numpy ndarray containing the overlapping partitions for the dataset, 370 | where each overlapping partition is a 2D numpy ndarray containing the data points for the partition and their 371 | timestamps. events_as_intervals: A list of Interval objects representing the events in the dataset. 
372 | 373 | Returns: 374 | A tuple containing two values: 375 | - A numpy ndarray containing the overlapping partitions for the dataset, with the timestamp column removed. 376 | - A numpy ndarray of floating-point values representing the "op" value 377 | for each overlapping partition in the dataset. 378 | """ 379 | 380 | # The index of the first event that hasn't been checked yet 381 | starting_event_index = 0 382 | 383 | # List to store the calculated op values for each overlapping partition 384 | op_values = [] 385 | 386 | # Iterate through each overlapping partition in the dataset 387 | for partition in dataset_as_overlapping_partitions: 388 | # Get the start and end times of the current overlapping partition 389 | partition_start_time = partition[0][-1].to_pydatetime() 390 | partition_end_time = partition[-1][-1].to_pydatetime() 391 | 392 | # Create an Interval object to represent the current overlapping partition 393 | partition_interval = Interval(partition_start_time, partition_end_time) 394 | 395 | # Initialize the op value for the current overlapping partition to 0 396 | current_op_value = 0 397 | 398 | # Iterate through each event that hasn't been checked yet 399 | for event_index in range(starting_event_index, len(events_as_intervals)): 400 | # Get the Interval object for the current event 401 | current_event_interval = events_as_intervals[event_index] 402 | 403 | # If the start time of the current partition is greater than or equal to the end time of the current event, 404 | # we can skip this event since it doesn't overlap with the current partition 405 | if partition_interval.start_time >= current_event_interval.end_time: 406 | starting_event_index = event_index + 1 407 | continue 408 | 409 | # Calculate the overlapping parameter between the current partition and the current event 410 | overlapping_parameter = partition_interval.overlapping_parameter(current_event_interval) 411 | 412 | # If the overlapping parameter is 0, there is no overlap between the current partition and the current event 413 | if overlapping_parameter == 0: 414 | break 415 | 416 | # Update the op value for the current partition if the overlapping parameter is greater than the current op 417 | # value 418 | if overlapping_parameter > current_op_value: 419 | current_op_value = overlapping_parameter 420 | 421 | # Add the op value for the current partition to the list of op values 422 | op_values.append(current_op_value) 423 | 424 | # Remove the column containing the timestamps from the overlapping partitions 425 | dataset_as_overlapping_partitions = np.delete(dataset_as_overlapping_partitions, -1, axis=2) 426 | 427 | # Return the updated overlapping partitions and the op values 428 | return dataset_as_overlapping_partitions, np.array(op_values) 429 | 430 | 431 | def get_timedelta(delta_unit_time: Union[int, float], unit: TimeUnit) -> timedelta: 432 | """ 433 | Returns a timedelta object with the specified delta_unit_time in the specified TimeUnit. 434 | 435 | Args: 436 | delta_unit_time: The delta unit time value. 437 | unit: The TimeUnit enum value representing the unit of time. 438 | 439 | Returns: 440 | A timedelta object with the specified delta_unit_time in the specified TimeUnit. 
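    Example (illustrative values only, not taken from the original documentation):
        >>> get_timedelta(5, TimeUnit.MINUTE)    # -> timedelta(minutes=5)
        >>> get_timedelta(1.5, TimeUnit.SECOND)  # -> timedelta(seconds=1.5)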
441 | """ 442 | if unit == TimeUnit.MICROSECOND: 443 | return timedelta(microseconds=delta_unit_time) 444 | elif unit == TimeUnit.MILLISECOND: 445 | return timedelta(milliseconds=delta_unit_time) 446 | elif unit == TimeUnit.SECOND: 447 | return timedelta(seconds=delta_unit_time) 448 | elif unit == TimeUnit.MINUTE: 449 | return timedelta(minutes=delta_unit_time) 450 | elif unit == TimeUnit.HOUR: 451 | return timedelta(hours=delta_unit_time) 452 | elif unit == TimeUnit.DAY: 453 | return timedelta(days=delta_unit_time) 454 | elif unit == TimeUnit.YEAR: 455 | return timedelta(days=delta_unit_time * 365) 456 | else: 457 | raise VALUE_ERROR 458 | 459 | 460 | def get_total_units(timedelta_: timedelta, unit: Union[TimeUnit, object]) -> float: 461 | if unit == TimeUnit.MICROSECOND: 462 | return timedelta_.total_seconds() * 1e6 463 | elif unit == TimeUnit.MILLISECOND: 464 | return timedelta_.total_seconds() * 1e3 465 | elif unit == TimeUnit.SECOND: 466 | return timedelta_.total_seconds() 467 | elif unit == TimeUnit.MINUTE: 468 | return timedelta_.total_seconds() / 60 469 | elif unit == TimeUnit.HOUR: 470 | return timedelta_.total_seconds() / 3600 471 | elif unit == TimeUnit.DAY: 472 | return timedelta_.total_seconds() / (3600 * 24) 473 | elif unit == TimeUnit.YEAR: 474 | return timedelta_.total_seconds() / (3600 * 24 * 365.25) 475 | else: 476 | raise VALUE_ERROR 477 | 478 | 479 | def check_time_unit(diff: timedelta) -> Tuple[int, TimeUnit]: 480 | """ 481 | Method to determine the unit of time of the dataset. 482 | 483 | Args: 484 | diff (timedelta): The time difference to be checked. 485 | 486 | Returns: 487 | Tuple[int, TimeUnit]: A tuple with the time value and its unit. 488 | """ 489 | 490 | if diff.total_seconds() >= 31536000: # 1 year in seconds 491 | years = int(diff.total_seconds() / 31536000) 492 | t_s = years 493 | time_unit = TimeUnit.YEAR 494 | elif diff.total_seconds() >= 86400: # 1 day in seconds 495 | days = int(diff.total_seconds() / 86400) 496 | t_s = days 497 | time_unit = TimeUnit.DAY 498 | elif diff.total_seconds() >= 3600: # 1 hour in seconds 499 | hours = int(diff.total_seconds() / 3600) 500 | t_s = hours 501 | time_unit = TimeUnit.HOUR 502 | elif diff.total_seconds() >= 60: # 1 minute in seconds 503 | minutes = int(diff.total_seconds() / 60) 504 | t_s = minutes 505 | time_unit = TimeUnit.MINUTE 506 | elif diff.total_seconds() >= 1: 507 | t_s = int(diff.total_seconds()) 508 | time_unit = TimeUnit.SECOND 509 | elif diff.total_seconds() * 1000 >= 1: 510 | t_s = int(diff.total_seconds() * 1000) 511 | time_unit = TimeUnit.MILLISECOND 512 | elif diff.total_seconds() * 1000000 >= 1: 513 | t_s = int(diff.total_seconds() * 1000000) 514 | time_unit = TimeUnit.MICROSECOND 515 | else: 516 | raise ValueError("Could not determine the unit of time of the dataset") 517 | 518 | return t_s, time_unit 519 | 520 | 521 | def convert_seconds_to_time_unit(value: Union[float, int], unit: TimeUnit) -> Union[float, int]: 522 | """ 523 | Converts a given value from seconds to a specified time unit. 524 | 525 | Args: 526 | value (Union[float, int]): The value in seconds that needs to be converted. 527 | unit (TimeUnit): The target time unit for the conversion. 528 | 529 | Returns: 530 | Union[float, int]: The converted value in the target time unit. 531 | 532 | Raises: 533 | ValueError: If an invalid TimeUnit is provided. 
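    Example (a minimal illustration; the numbers are arbitrary):
        >>> convert_seconds_to_time_unit(7200, TimeUnit.HOUR)         # 7200 s expressed in hours -> 2.0
        >>> convert_seconds_to_time_unit(0.25, TimeUnit.MILLISECOND)  # 0.25 s expressed in ms -> 250.0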
534 | """ 535 | conversion_factors = { 536 | TimeUnit.MICROSECOND: 1e6, 537 | TimeUnit.MILLISECOND: 1e3, 538 | TimeUnit.SECOND: 1, 539 | TimeUnit.MINUTE: 1 / 60, 540 | TimeUnit.HOUR: 1 / 3600, 541 | TimeUnit.DAY: 1 / (3600 * 24), 542 | TimeUnit.YEAR: 1 / (3600 * 24 * 365.25) 543 | } 544 | 545 | if unit in conversion_factors: 546 | return value * conversion_factors[unit] 547 | 548 | raise VALUE_ERROR 549 | 550 | 551 | def save_dict_to_json(path: str, data: Dict): 552 | """ 553 | Save a dictionary into a json file 554 | Args: 555 | path (str): the path where to store the json file 556 | data (Dict): the dictionary 557 | 558 | Returns: 559 | 560 | """ 561 | with open(path, 'w') as f: 562 | json.dump(data, f) 563 | 564 | 565 | def convert_dataset_index_to_datetime(dataset: pd.DataFrame) -> None: 566 | """ 567 | Check if the index of the DataFrame dataset is already in the datetime format. If the index is not in datetime 568 | format, dataset.index = pd.to_datetime(dataset.index) statement is executed to convert it. 569 | 570 | Args: 571 | dataset (pd.DataFrame): A dataset as pandas DataFrame 572 | 573 | Returns: 574 | None 575 | """ 576 | if not is_datetime64_any_dtype(dataset.index): 577 | dataset.index = pd.to_datetime(dataset.index) 578 | -------------------------------------------------------------------------------- /eventdetector_ts/metamodel/meta_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pprint 3 | import shutil 4 | from typing import Union, Dict, Optional 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import tensorflow as tf 9 | 10 | from eventdetector_ts import FFN, FILL_NAN_ZEROS, TYPE_TRAINING_AVERAGE, STANDARD_SCALER, \ 11 | config_dict, CONFIG_FILE 12 | from eventdetector_ts.data.helpers_data import compute_middle_event, remove_close_events, \ 13 | convert_events_to_intervals, get_union_times_events, get_dataset_within_events_times, \ 14 | convert_dataframe_to_overlapping_partitions, op, check_time_unit, save_dict_to_json, \ 15 | convert_dataset_index_to_datetime, convert_seconds_to_time_unit 16 | from eventdetector_ts.metamodel import logger_meta_model 17 | from eventdetector_ts.metamodel.utils import DataSplitter, validate_args, validate_required_args, validate_ffn, \ 18 | validate_cnn, validate_rnn 19 | from eventdetector_ts.models.models_builder import ModelCreator 20 | from eventdetector_ts.models.models_trainer import ModelTrainer 21 | from eventdetector_ts.optimization.event_extraction_pipeline import OptimizationData, EventOptimization 22 | from eventdetector_ts.plotter.plotter import Plotter 23 | 24 | 25 | class MetaModel: 26 | def __init__( 27 | self, 28 | output_dir: str, 29 | dataset: pd.DataFrame, 30 | events: Union[list, pd.DataFrame], 31 | width: int, 32 | step: int = 1, 33 | width_events: Optional[Union[int, float]] = None, 34 | **kwargs 35 | ): 36 | """ 37 | Initializes a new instance of the MetaModel class. 38 | 39 | Args: 40 | output_dir (str): The name or path of the directory where all outputs will be saved. 41 | If output_dir is a folder name, the full path in the current directory will be created. 42 | dataset (pd.DataFrame): The input dataset as a Pandas DataFrame. 43 | events (Union[list, pd.DataFrame]): The input events as either a list or a Pandas DataFrame. 44 | width (int): Number of consecutive time steps in each partition (window) when creating overlapping 45 | partitions (sliding windows). 46 | step (int = 1): Number of time steps to advance the sliding window. 
Defaults to 1. 47 | width_events (Union[int, float] = None): The width of each event. 48 | If it's an integer, it represents the number of time steps that constitute an event. 49 | If it's a float, it represents the duration in seconds of each event. 50 | If not provided (None), it defaults to the value of (width - 1). 51 | kwargs (Dict): Optional keyword arguments for additional parameters. 52 | - t_max (float): The maximum total time is linked to the `sigma` variable of the Gaussian filter. 53 | This time should be expressed in the same unit of time (seconds, minutes, etc.) as used in the 54 | dataset. The unit of time for the dataset is determined by its time sampling. In other words, 55 | the `sigma` variable should align with the timescale used in your time series data. 56 | The default value is calculated as (3 x (width-1) x time_sampling) / 2. 57 | - delta (Union[int, float]): The maximum time tolerance used to determine the correspondence 58 | between a predicted event and its actual counterpart in the true events. If it's an integer, it 59 | represents the number of time steps. If it's a float, it represents the duration in seconds. 60 | The default value is width_events x time_sampling. 61 | - s_h (float): A step parameter for adjusting the peak height threshold `h` during the peak detection 62 | process. The default value is 0.05. 63 | - epsilon (float): A small constant used to control the size of the set which contains the top models 64 | with the lowest MSE values. The default value is 0.0002. 65 | - pa (int): The patience for the early stopping algorithm. The default value is 5. 66 | - t_r (float): The ratio threshold for the early stopping algorithm. 67 | The default value is 0.97. 68 | - time_window (Union[int, float] = None): This parameter controls the amount of data within the dataset 69 | that is used for the training process. If it's an integer, it represents a specific number of time steps. 70 | If it's a float, it represents a duration in seconds. By default, it is set to None, which means all 71 | available data will be used. However, if a value is provided, the dataset will include a specific 72 | interval of data surrounding each reference event. This interval includes data from both sides of 73 | each event, with a duration equal to the specified `time_window`. Setting a `time_window` in some 74 | situations can offer several advantages, such as accelerating the training process and enhancing 75 | the neural networks' understanding of rare events. 76 | - models (List[Union[str, Tuple[str, int]]]): Determines the type of deep learning models to use. 77 | If a tuple is passed, it specifies both the model type and the number of instances to run. 78 | The default value is [(FFN, 2)]. 79 | - hyperparams_ffn (Tuple[int, int, int, int, str]): Specify for the FFN the minimum and the maximum 80 | number of layers, the minimum and the maximum number of neurons per layer, and the activation 81 | function. The default value is (1, 3, 64, 256, "sigmoid"). The list of available activation 82 | functions is ["relu","sigmoid","tanh","softmax","leaky_relu","elu","selu","swish"]. 83 | If you pass `None`, no activation is applied (i.e. "linear" activation: `a(x) = x`). 84 | - hyperparams_cnn (Tuple[int, int, int, int, int, int, str]): Specify for the CNN the minimum and maximum number 85 | of filters, the minimum and maximum kernel size, the minimum and the maximum number of pooling 86 | layers, and the activation function. The default value is (16, 64, 3, 8, 1, 2, "relu").
87 | - hyperparams_transformer (Tuple[int, int, int, bool, str]): Specify for the Transformer the key dimension, 88 | the number of heads, the number of encoder blocks, a flag to indicate the use of the original 89 | architecture, and the activation function. The default value is (256, 8, 10, True, "relu"). 90 | - hyperparams_rnn (Tuple[int, int, int, int, str]): Specify for the RNN the minimum and the maximum number 91 | of RNN layers, the minimum and the maximum number of hidden units, and the activation function. 92 | The default value is (1, 2, 16, 128, "tanh"). 93 | - hyperparams_mm_network (Tuple[int, int, str]): Specify for the MetaModel network the number 94 | of layers, the number of neurons per layer, and the activation function. 95 | The default value is (1, 32, "sigmoid"). 96 | - epochs (int): The number of epochs to train different models. The default value is 256. 97 | - batch_size (int): The number of samples per gradient update. 98 | The default value is 32. 99 | - fill_nan (str): Specifies the method to use for filling NaN values in the dataset. 100 | Supported methods are 'zeros', 'ffill', 'bfill', and 'median'. 101 | The default is 'zeros'. 102 | - type_training (str): Specifies the type of training technique to use for the MetaModel. 103 | Supported techniques are 'average' and 'ffn'. 104 | The default is 'average'. 105 | - scaler (str): The type of scaler to use for preprocessing the data. 106 | Possible values are "MinMaxScaler", "StandardScaler", and "RobustScaler". 107 | The default is "StandardScaler". 108 | - use_kfold (bool): Whether to use the k-fold cross-validation technique or not. 109 | The default value is False. 110 | - test_size (float): The proportion of the dataset to include in the test split. 111 | Should be a value between 0 and 1. Default is 0.2. 112 | - val_size (float): The proportion of the training set to use for validation. 113 | Should be a value between 0 and 1. Default is 0.2. 114 | - save_models_as_dot_format (bool = False): Whether to save the models as a dot format file. 115 | The default value is False. If set to True, the graphviz software 116 | must be installed on your machine. 117 | - remove_overlapping_events (bool = True): Whether to remove the overlapping events or not. 118 | The default value is True. 119 | - dropout (float = 0.3): The dropout rate, which determines the fraction of input units to drop during 120 | training. 121 | - last_act_func (str = "sigmoid"): Activation function for the final layer of each model. Defaults to 122 | "sigmoid". If set to `None`, no activation will be applied (i.e., "linear" activation: `a(x) = x`).
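        Example (an illustrative sketch only; `dataset`, `events` and the chosen values are
        placeholders, not library defaults):
            >>> from eventdetector_ts import FFN
            >>> meta_model = MetaModel(output_dir="my_run", dataset=dataset, events=events,
            ...                        width=45, step=1, fill_nan='ffill', scaler="MinMaxScaler",
            ...                        models=[(FFN, 1)])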
123 | 124 | """ 125 | self.step = step 126 | self.width = width 127 | self.events = events 128 | self.dataset = dataset 129 | self.output_dir = output_dir 130 | self.width_events = width_events 131 | validate_required_args(self) 132 | self.kwargs: Dict = kwargs 133 | self.y = np.empty(shape=(0,)) 134 | self.x = np.empty(shape=(0,)) 135 | self.__compute_and_set_time_sampling() 136 | self.__set_defaults() 137 | validate_args(self) 138 | 139 | if self.save_models_as_dot_format: 140 | logger_meta_model.warning("save_models_as_dot_format is set to true, " 141 | "you should have graphviz software to be installed on your machine.") 142 | self.__create_output_dir() 143 | # Create a `ModelCreator` object with the provided models and hyperparameters 144 | self.model_creator: ModelCreator = ModelCreator(models=self.models, hyperparams_ffn=self.hyperparams_ffn, 145 | hyperparams_cnn=self.hyperparams_cnn, 146 | hyperparams_rnn=self.hyperparams_rnn, 147 | hyperparams_transformer=self.hyperparams_transformer, 148 | last_act_func=self.last_act_func, dropout=self.dropout, 149 | save_models_as_dot_format=self.save_models_as_dot_format, 150 | root_dir=self.output_dir) 151 | # Create a `DataSplitter` object with the provided test_size and scaler_type 152 | self.data_splitter: DataSplitter = DataSplitter(test_size=self.test_size, scaler_type=self.scaler) 153 | # Create a `ModelTrainer` object with the provided data_splitter, epochs, 154 | # batch_size, pa, t_r, use_kfold, val_size, epsilon and save_models_as_dot_format. 155 | self.model_trainer: ModelTrainer = ModelTrainer(data_splitter=self.data_splitter, epochs=self.epochs, 156 | batch_size=self.batch_size, pa=self.pa, t_r=self.t_r, 157 | use_kfold=self.use_kfold, 158 | val_size=self.val_size, epsilon=self.epsilon, 159 | save_models_as_dot_format=self.save_models_as_dot_format) 160 | # class represents the data used for the event extraction pipeline. 161 | self.optimization_data: OptimizationData = OptimizationData(t_max=self.t_max, w_s=self.w_s, s_s=self.s_s, 162 | s_h=self.s_h, delta=self.delta, 163 | output_dir=self.output_dir, 164 | time_unit=self.time_unit) 165 | 166 | self.event_optimization: EventOptimization = EventOptimization(optimization_data=self.optimization_data) 167 | # The Plotter class is responsible for generating and saving plots. 168 | self.plotter: Plotter = Plotter(root_dir=self.output_dir, time_unit=self.time_unit, 169 | width_events_s=self.width_events_s) 170 | 171 | def __create_output_dir(self) -> None: 172 | """ 173 | Check if output_dir is already a complete path, if output_dir is a folder name, 174 | create the full path in the current directory. 
175 | 176 | Returns: 177 | None 178 | """ 179 | 180 | # Check if output_dir is already a complete path 181 | if os.path.isabs(self.output_dir): 182 | if not os.path.exists(self.output_dir): 183 | logger_meta_model.critical(f"{self.output_dir} does not exist") 184 | raise ValueError(f"{self.output_dir} does not exist") 185 | 186 | # If output_dir is a folder name, create the full path in the current directory 187 | else: 188 | # Get the absolute path of the current directory 189 | current_directory = os.path.abspath(".") 190 | self.output_dir = os.path.join(current_directory, self.output_dir) 191 | if os.path.exists(self.output_dir): 192 | logger_meta_model.warning(f"The working directory '{self.output_dir}' already exists and will be deleted") 193 | shutil.rmtree(self.output_dir) 194 | logger_meta_model.info(f"Creating the working directory at: '{self.output_dir}'") 195 | os.makedirs(self.output_dir) 196 | 197 | config_dict['output_dir'] = self.output_dir 198 | 199 | def __set_defaults_bis(self) -> None: 200 | """ 201 | Sets default values for any missing keyword arguments in self.kwargs. 202 | 203 | Returns: 204 | None 205 | """ 206 | if self.width_events is None: 207 | self.width_events = self.width 208 | self.t_max = self.kwargs.get('t_max', (3.0 * self.w_s) / 2) # the minimum should be equal to w_s 209 | 210 | if self.kwargs.get('delta') is None: 211 | self.delta = self.width_events_s 212 | else: 213 | if isinstance(self.kwargs.get('delta'), float): 214 | self.delta = convert_seconds_to_time_unit(value=self.kwargs.get('delta'), unit=self.time_unit) 215 | else: 216 | self.delta = self.kwargs.get('delta') * self.t_s 217 | 218 | self.s_h = self.kwargs.get('s_h', 0.05) 219 | self.epsilon = self.kwargs.get('epsilon', 0.0002) 220 | self.pa = self.kwargs.get('pa', 5) 221 | self.t_r = self.kwargs.get('t_r', 0.97) 222 | 223 | def __set_defaults(self) -> None: 224 | """ 225 | Sets default values for any missing keyword arguments in self.kwargs.
226 | 227 | Returns: 228 | None 229 | """ 230 | self.__set_defaults_bis() 231 | 232 | if self.kwargs.get('time_window') is None: 233 | self.time_window = None 234 | else: 235 | if isinstance(self.kwargs.get('time_window'), float): 236 | self.time_window = convert_seconds_to_time_unit(value=self.kwargs.get('time_window'), 237 | unit=self.time_unit) 238 | else: 239 | self.time_window = self.kwargs.get('time_window') * self.t_s 240 | 241 | self.models = self.kwargs.get('models', [(FFN, 2)]) 242 | for i, model in enumerate(self.models): 243 | if isinstance(model, str): 244 | self.models[i] = (model, 1) 245 | elif isinstance(model, tuple) and len(model) == 1: 246 | self.models[i] = (model[0], 1) 247 | 248 | self.hyperparams_ffn = self.kwargs.get('hyperparams_ffn', (1, 3, 64, 256, "sigmoid")) 249 | self.hyperparams_ffn = validate_ffn(self) 250 | self.hyperparams_cnn = self.kwargs.get('hyperparams_cnn', (16, 64, 3, 8, 1, 2, "relu")) 251 | self.hyperparams_cnn = validate_cnn(self) 252 | self.hyperparams_rnn = self.kwargs.get('hyperparams_rnn', (1, 2, 16, 128, "tanh")) 253 | self.hyperparams_rnn = validate_rnn(self) 254 | self.hyperparams_transformer = self.kwargs.get("hyperparams_transformer", (256, 4, 1, True, "relu")) 255 | self.hyperparams_mm_network = self.kwargs.get('hyperparams_mm_network', (1, 32, "sigmoid")) 256 | self.epochs = self.kwargs.get('epochs', 256) 257 | self.batch_size = self.kwargs.get('batch_size', 32) 258 | self.fill_nan = self.kwargs.get('fill_nan', FILL_NAN_ZEROS) 259 | self.type_training = self.kwargs.get('type_training', TYPE_TRAINING_AVERAGE) 260 | self.scaler = self.kwargs.get('scaler', STANDARD_SCALER) 261 | self.use_kfold = self.kwargs.get('use_kfold', False) 262 | self.test_size = self.kwargs.get('test_size', 0.2) 263 | self.val_size = self.kwargs.get('val_size', 0.2) 264 | 265 | self.save_models_as_dot_format = self.kwargs.get('save_models_as_dot_format', False) 266 | self.remove_overlapping_events = self.kwargs.get("remove_overlapping_events", True) 267 | self.last_act_func = self.kwargs.get("last_act_func", "sigmoid") 268 | self.dropout = self.kwargs.get("dropout", 0.3) 269 | 270 | log_dict = { 271 | 'width_events_s': self.width_events_s, 272 | 't_max': self.t_max, 273 | 'delta': self.delta, 274 | 's_h': self.s_h, 275 | 'epsilon': self.epsilon, 276 | 'pa': self.pa, 277 | 't_r': self.t_r, 278 | 'time_window': self.time_window, 279 | 'models': self.models, 280 | 'hyperparams_ffn': self.hyperparams_ffn, 281 | 'hyperparams_cnn': self.hyperparams_cnn, 282 | 'hyperparams_rnn': self.hyperparams_rnn, 283 | 'hyperparams_transformer': self.hyperparams_transformer, 284 | 'hyperparams_mm_network': self.hyperparams_mm_network, 285 | 'epochs': self.epochs, 286 | 'batch_size': self.batch_size, 287 | 'fill_nan': self.fill_nan, 288 | 'type_training': self.type_training, 289 | 'scaler': self.scaler, 290 | 'use_kfold': self.use_kfold, 291 | 'test_size': self.test_size, 292 | 'val_size': self.val_size, 293 | 'save_models_as_dot_format': self.save_models_as_dot_format, 294 | "remove_overlapping_events": self.remove_overlapping_events, 295 | "last_act_func": self.last_act_func, 296 | "dropout": self.dropout 297 | } 298 | 299 | log_message = pprint.pformat(log_dict, indent=4) 300 | logger_meta_model.info(log_message) 301 | 302 | config_dict.update({'width': self.width, 'step': self.step, 'batch_size': self.batch_size, 303 | 'type_training': self.type_training, 'fill_nan': self.fill_nan}) 304 | 305 | def __compute_and_set_time_sampling(self) -> None: 306 | """ 307 | Compute the time 
sampling of the dataset by calculating the time difference between the first two index values. 308 | Then set the corresponding parameters: t_s, w_s, and s_s. 309 | 310 | Returns: 311 | None 312 | 313 | Raises: 314 | TypeError: If the index of the dataset is not in datetime format. 315 | """ 316 | try: 317 | logger_meta_model.info("checks if the index of the dataset is already in the datetime format.") 318 | convert_dataset_index_to_datetime(self.dataset) 319 | # Get the first two index values of the dataset 320 | a = self.dataset.index[0] 321 | b = self.dataset.index[1] 322 | # Calculate the time difference between the first two index values 323 | diff = b - a 324 | # Check the units of the time difference 325 | logger_meta_model.info("Computing the time sampling and time unit of the dataset") 326 | self.t_s, self.time_unit = check_time_unit(diff=diff) 327 | logger_meta_model.warning(f"The time sampling t_s is {self.t_s} {self.time_unit}s") 328 | self.w_s = self.t_s * (self.width - 1) 329 | self.s_s = self.t_s * self.step 330 | 331 | if self.width_events is None: 332 | self.width_events_s = self.w_s 333 | else: 334 | self.width_events_s = self.t_s * self.width_events 335 | 336 | if isinstance(self.width_events, float): 337 | self.width_events_s = convert_seconds_to_time_unit(value=self.width_events, unit=self.time_unit) 338 | 339 | config_dict['w_s'] = self.w_s 340 | config_dict['width_events_s'] = self.width_events_s 341 | config_dict['time_unit'] = self.time_unit.value 342 | except AttributeError: 343 | logger_meta_model.critical("The dataset is not compatible with the datetime format") 344 | raise TypeError("The index should be in datetime format.") 345 | 346 | def prepare_data_and_computing_op(self) -> None: 347 | """ 348 | Prepare the events and dataset for computing op. 349 | This method will compute the middle event of the given events, remove any close events based on the self.w_s, 350 | and convert the remaining events to intervals. If a time partition is specified, it will get the union of 351 | event times and extract the corresponding portion of the dataset. 352 | 353 | The dataset will then be converted to overlapping partitions using the specified width and step size, 354 | and the $op$ (overlapping parameter) values will be computed for each partition based on the given intervals. 355 | 356 | Finally, the learning data (overlapping partitions and corresponding $op$ values) will be stored in 357 | the instance variables x and y. 
358 | 359 | Returns: 360 | None 361 | """ 362 | 363 | logger_meta_model.info("Computes the middle date of events...") 364 | 365 | self.events = compute_middle_event(self.events) 366 | 367 | logger_meta_model.info("Removes events that occur too close together...") 368 | temp: int = len(self.events) 369 | self.events = remove_close_events(self.events, self.width_events_s, self.time_unit, 370 | self.remove_overlapping_events) 371 | 372 | logger_meta_model.warning(f"A total of {temp - len(self.events)}/{temp} events were removed due to overlapping") 373 | logger_meta_model.info("Convert events to intervals...") 374 | intervals = convert_events_to_intervals(self.events, self.width_events_s, self.time_unit) 375 | 376 | if self.time_window is not None: 377 | logger_meta_model.warning(f"time_window is provided = {self.time_window} {self.time_unit}s") 378 | events_times = get_union_times_events(self.events, self.time_window, self.time_unit) 379 | self.dataset = get_dataset_within_events_times(self.dataset, events_times) 380 | 381 | logger_meta_model.info("Computing overlapping partitions...") 382 | overlapping_partitions = convert_dataframe_to_overlapping_partitions(self.dataset, width=self.width, 383 | step=self.step, 384 | fill_method=self.fill_nan) 385 | 386 | logger_meta_model.info("Computing op...") 387 | self.x, self.y = op(dataset_as_overlapping_partitions=overlapping_partitions, events_as_intervals=intervals) 388 | 389 | # Convert x and y arrays to float32 for consistency 390 | self.x = np.asarray(self.x).astype('float32') 391 | self.y = np.asarray(self.y).astype('float32') 392 | 393 | self.optimization_data.set_overlapping_partitions(overlapping_partitions) 394 | self.optimization_data.set_true_events(self.events) 395 | 396 | def build_stacking_learning(self) -> None: 397 | """ 398 | Builds a stacking learning pipeline using the provided models and hyperparameters. 
399 | 400 | Returns: 401 | None 402 | """ 403 | 404 | # Get the number of time steps and features from the x data 405 | n_time_steps, n_features = self.x.shape[1], self.x.shape[2] 406 | config_dict['n_time_steps'] = n_time_steps 407 | inputs = tf.keras.Input(shape=(n_time_steps, n_features), name="input") 408 | # Call the `create_models` method to create the models 409 | logger_meta_model.info(f"Create the following models: {list(map(lambda x: x[0], self.models))}") 410 | self.model_creator.create_models(inputs=inputs) 411 | logger_meta_model.info("Split the data into training, validation, and test sets and apply " 412 | "the specified scaler to each time step...") 413 | self.data_splitter.split_data_and_apply_scaler(x=self.x, y=self.y) 414 | logger_meta_model.info("Saves the scalers to disk...") 415 | self.data_splitter.save_scalers(output_dir=self.output_dir) 416 | logger_meta_model.info("Fits the created models to the training data...") 417 | self.model_trainer.fitting_models(self.model_creator.created_models) 418 | logger_meta_model.info("Saving the best models...") 419 | self.model_trainer.save_best_models(output_dir=self.output_dir) 420 | predicted_y, loss, test_y = self.model_trainer.train_meta_model(type_training=self.type_training, 421 | hyperparams_mm_network 422 | =self.hyperparams_mm_network, 423 | output_dir=self.output_dir) 424 | self.optimization_data.set_predicted_op(predicted_op=predicted_y) 425 | logger_meta_model.info(f"The loss of the MetaModel is {loss:.4f}") 426 | self.plotter.set_data_op(test_y=test_y, predicted_y=predicted_y) 427 | self.plotter.set_losses(train_losses=self.model_trainer.train_losses, 428 | val_losses=self.model_trainer.val_losses, train_loss_meta_model= 429 | self.model_trainer.train_loss_meta_model, 430 | val_loss_meta_model=self.model_trainer.val_loss_meta_model) 431 | 432 | def event_extraction_optimization(self) -> None: 433 | """ 434 | Run the Event Extraction Optimization process. 435 | 436 | Returns: 437 | None 438 | """ 439 | 440 | predicted_events, delta_t = self.event_optimization.max_f1score() 441 | path = os.path.join(self.output_dir, CONFIG_FILE) 442 | logger_meta_model.info(f"Saving config file into {path}") 443 | save_dict_to_json(path=path, data=config_dict) 444 | self.plotter.set_data_events(predicted_events=predicted_events, true_events=self.optimization_data.true_events) 445 | self.plotter.set_delta_t(delta_t=delta_t) 446 | 447 | def plot_save(self, show_plots: bool = True) -> None: 448 | """ 449 | Plot the results: losses, true/predicted op, true/predicted events, deltat_t. 450 | 451 | Args: 452 | show_plots (bool): whether to show the plots or not. 453 | 454 | Returns: 455 | None 456 | """ 457 | self.plotter.set_show(show=show_plots) 458 | self.plotter.plot_losses() 459 | self.plotter.plot_prediction() 460 | self.plotter.plot_predicted_events() 461 | self.plotter.plot_delta_t(bins=10) 462 | 463 | def fit(self) -> None: 464 | """ 465 | Run prepare_data_and_computing_op, build_stacking_learning, event_extraction_optimization, and plot_save 466 | 467 | Returns: 468 | None 469 | """ 470 | self.prepare_data_and_computing_op() 471 | self.build_stacking_learning() 472 | self.event_extraction_optimization() 473 | self.plot_save() 474 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
31 | 
40 | ```bash
41 | pip install eventdetector-ts
42 | ```
43 | ### Manual installation
44 | To get started using **Event Detector**, simply follow the instructions below to install the required packages and
45 | dependencies.
46 | #### Clone the repository:
47 | ```bash
48 | git clone https://github.com/menouarazib/eventdetector.git
49 | cd eventdetector
50 | ```
51 | 
52 | #### Create a virtual environment:
53 | ```bash
54 | python -m venv env
55 | source env/bin/activate  # for Linux/MacOS
56 | env\Scripts\activate.bat  # for Windows
57 | ```
58 |
59 | #### Install the required packages:
60 | ```bash
61 | pip install -r requirements.txt
62 | ```
63 | ## Quickstart Examples
64 |
65 | ### Data Format
66 |
67 | Input time series data:
68 |
69 | ```
70 | # pandas DataFrame with datetime index
71 |                      feature1  feature2
72 | 2020-12-04T00:00:00     1.234       5.0
73 | 2020-12-04T00:00:01     1.456       5.1
74 | 2020-12-04T00:00:02     1.789       5.5
75 | 2020-12-04T00:00:03     2.123       5.8
76 | ```
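A DataFrame in this shape can be built directly with pandas. The snippet below is only an illustrative sketch — the column names, the 1-second sampling and the random values are assumptions, not requirements; the library only expects a DataFrame whose index is (or can be converted to) datetime.

```python
import numpy as np
import pandas as pd

# Any regular datetime index works; 1-second sampling is just an example.
index = pd.date_range(start="2020-12-04", periods=600, freq="s")
dataset = pd.DataFrame(
    {
        "feature1": np.random.rand(600),  # replace with your real measurements
        "feature2": np.random.rand(600),
    },
    index=index,
)
```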
77 |
78 | Annotated events data format:
79 |
80 | ```
81 | # pandas DataFrame with start/end columns
82 |                  start                  end
83 | 0  2020-12-04T00:01:02  2020-12-04T00:01:42
84 | 1  2020-12-04T00:29:45  2020-12-04T00:30:20
85 | 2  2020-12-04T00:30:55  2020-12-04T00:31:28
86 | ```
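The events can be supplied either as a DataFrame like the one above or as a plain list of start/end pairs (a single column of event dates is also accepted). A minimal sketch with made-up timestamps:

```python
import pandas as pd

# Option 1: DataFrame with two date columns (start and end of each annotated event)
events = pd.DataFrame(
    {
        "start": ["2020-12-04T00:01:02", "2020-12-04T00:29:45"],
        "end": ["2020-12-04T00:01:42", "2020-12-04T00:30:20"],
    }
)

# Option 2: a list of [start, end] pairs works as well
events_as_list = [
    ["2020-12-04T00:01:02", "2020-12-04T00:01:42"],
    ["2020-12-04T00:29:45", "2020-12-04T00:30:20"],
]
```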
87 |
88 | ### Code Implementations:
89 | - Credit Card Frauds:
90 | ```python
91 | from eventdetector_ts import load_credit_card_fraud, FFN
92 | from eventdetector_ts.metamodel.meta_model import MetaModel
93 |
94 | dataset, events = load_credit_card_fraud()
95 |
96 | meta_model = MetaModel(dataset=dataset, events=events, width=2, step=1,
97 | output_dir='credit_card_fraud', batch_size=3200, s_h=0.01, models=[(FFN, 1)],
98 | hyperparams_ffn=(1, 1, 20, 20, "sigmoid"))
99 |
100 | meta_model.fit()
101 |
102 | ```
103 | - Martian Bow Shock:
104 | ```python
105 | from eventdetector_ts import load_martian_bow_shock, FFN
106 | from eventdetector_ts.metamodel.meta_model import MetaModel
107 |
108 | dataset, events = load_martian_bow_shock()
109 |
110 | meta_model = MetaModel(output_dir="mex_bow_shocks", dataset=dataset, events=events, width=76, step=1,
111 | time_window=5400.0, batch_size=3000, models=[(FFN, 1)],
112 | hyperparams_ffn=(1, 1, 20, 20, "sigmoid"))
113 |
114 | meta_model.fit()
115 |
116 | ```
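Both examples above use bundled loaders. To run the same pipeline on your own recordings, any pandas DataFrame with a datetime index plus a table of start/end event dates will do. A hedged sketch (the file names, column names and the chosen `width` are illustrative assumptions only):

```python
import pandas as pd
from eventdetector_ts.metamodel.meta_model import MetaModel

# Your time series: one row per timestamp, indexed by datetime
dataset = pd.read_csv("my_timeseries.csv", index_col=0, parse_dates=True)

# Your annotations: one row per event with its start and end dates
events = pd.read_csv("my_events.csv", parse_dates=["start", "end"])

meta_model = MetaModel(output_dir="my_events_run", dataset=dataset, events=events,
                       width=45, step=1)
meta_model.fit()
```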
117 |
118 | ### Performance Evaluation and Outputs
119 |
120 | #### Comparison of Our Method with Deep Learning Methods
121 |
122 | ##### Credit Card Frauds
123 |
124 | | Method | Number of Parameters | Precision | Recall | F1-Score |
125 | |---------------------|----------------------|-----------|--------|----------|
126 | | CNN [[1]](#1) | 119,457 | 0.89 | 0.68 | 0.77 |
127 | | FFN+SMOTE [[2]](#2) | 5,561 | 0.79 | 0.81 | 0.80 |
128 | | FFN+SMOTE [[3]](#3) | N/A | 0.82 | 0.79 | 0.81 |
129 | | Ours | 1,201 | 0.98 | 0.74 | 0.85 |
130 |
131 | ##### Bow Shock Crossings
132 |
133 | | Method | Number of Parameters | Precision | Recall | F1-Score |
134 | |--------------------|----------------------|-----------|---------------|---------------|
135 | | ResNet18 [[4]](#4) | 29,886,979 | 0.99 | [0.83 , 0.88] | [0.91 , 0.94] |
136 | | Ours | 6,121 | 0.95 | 0.96 | 0.95 |
137 |
138 | #### Training and Validation Losses
139 |
140 | The figures below showcase the training and validation losses of the FFNs on the Bow Shock Crossings and Credit Card Frauds datasets.
141 | The low losses observed in both cases indicate that the metamodel has successfully learned the underlying patterns,
142 | which is consistent with the good metrics reported above.
143 |
144 | *(Figures: training and validation loss curves for the Bow Shock Crossings and Credit Card Frauds datasets.)*