├── gtime
│ ├── utils
│ │ ├── __init__.py
│ │ ├── hypothesis
│ │ │ ├── __init__.py
│ │ │ ├── tests
│ │ │ │ ├── __init__.py
│ │ │ │ └── test_general_strategies.py
│ │ │ ├── utils.py
│ │ │ ├── general_strategies.py
│ │ │ └── feature_matrices.py
│ │ ├── testing_constants.py
│ │ ├── trends.py
│ │ └── fixtures.py
│ ├── external
│ │ ├── __init__.py
│ │ └── make_holidays.py
│ ├── causality
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── common.py
│ │ │ ├── test_granger_causality.py
│ │ │ ├── test_pearson_correlation.py
│ │ │ └── test_linear_coefficient.py
│ │ ├── __init__.py
│ │ ├── pearson_correlation.py
│ │ ├── linear_coefficient.py
│ │ └── base.py
│ ├── experimental
│ │ ├── __init__.py
│ │ └── trend_models
│ │   └── function_trend.py
│ ├── metrics
│ │ ├── tests
│ │ │ └── __init__.py
│ │ └── __init__.py
│ ├── plotting
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── test_plotting.py
│ │ │ └── test_preprocessing.py
│ │ └── __init__.py
│ ├── explainability
│ │ ├── tests
│ │ │ └── __init__.py
│ │ └── __init__.py
│ ├── forecasting
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── test_online.py
│ │ │ ├── test_trend.py
│ │ │ └── test_naive.py
│ │ ├── __init__.py
│ │ ├── trend.py
│ │ └── online.py
│ ├── hierarchical
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── test_naive.py
│ │ │ └── test_bottom_up.py
│ │ ├── __init__.py
│ │ ├── base.py
│ │ └── naive.py
│ ├── preprocessing
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ └── utils.py
│ │ ├── time_series_resampling.py
│ │ └── __init__.py
│ ├── regressors
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── test_linear_regressor.py
│ │ │ └── test_explainable.py
│ │ ├── __init__.py
│ │ ├── linear_regressor.py
│ │ ├── explainable.py
│ │ └── multi_output.py
│ ├── feature_extraction
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── test_trend.py
│ │ │ ├── test_sorted_density.py
│ │ │ └── test_crest_factor_detrending.py
│ │ ├── __init__.py
│ │ ├── trend.py
│ │ └── custom.py
│ ├── feature_generation
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── test_calendar.py
│ │ │ └── test_external.py
│ │ └── __init__.py
│ ├── model_selection
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ └── test_splitters.py
│ │ ├── __init__.py
│ │ ├── horizon_shift.py
│ │ └── splitters.py
│ ├── time_series_models
│ │ ├── tests
│ │ │ ├── __init__.py
│ │ │ ├── test_simple_models.py
│ │ │ └── test_cv_pipeline.py
│ │ ├── __init__.py
│ │ ├── ar.py
│ │ └── simple_models.py
│ ├── compose
│ │ ├── __init__.py
│ │ ├── tests
│ │ │ └── test_feature_creation.py
│ │ └── feature_creation.py
│ ├── __init__.py
│ ├── _version.py
│ └── base.py
├── doc-requirements.txt
├── docs
│ ├── source
│ │ ├── modules
│ │ │ ├── compose.rst
│ │ │ ├── metrics.rst
│ │ │ ├── regressors.rst
│ │ │ ├── causality.rst
│ │ │ ├── forecasting.rst
│ │ │ ├── preprocessing.rst
│ │ │ ├── model_selection.rst
│ │ │ ├── feature_extraction.rst
│ │ │ ├── feature_generation.rst
│ │ │ ├── time_series_models.rst
│ │ │ └── index.rst
│ │ ├── index.rst
│ │ └── conf.py
│ ├── index.html
│ ├── .nojekyll
│ ├── Makefile
│ └── make.bat
├── setup.cfg
├── MANIFEST.in
├── dev-requirements.txt
├── requirements.txt
├── CODE_AUTHORS
├── .pre-commit-config.yaml
├── .coveragerc
├── conftest.py
├── GOVERNANCE.rst
├── .github
│ ├── workflows
│ │ ├── build_and_publish.yml
│ │ ├── deploy_github_pages.yml
│ │ └── ci.yml
│ └── ISSUE_TEMPLATE
│   └── bug_report.md
├── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── setup.py
├── examples
│ └── hierarchical_model.ipynb
├── CODE_OF_CONDUCT.rst
├── README.md
└── CONTRIBUTING.rst
/gtime/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/external/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/causality/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/experimental/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/metrics/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/plotting/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/explainability/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/forecasting/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/hierarchical/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/preprocessing/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/regressors/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/utils/hypothesis/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/feature_extraction/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/feature_generation/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/model_selection/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/time_series_models/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/gtime/utils/hypothesis/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/doc-requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx
2 | numpydoc
3 | sphinx_issues
4 | sphinx_rtd_theme
5 |
--------------------------------------------------------------------------------
/docs/source/modules/compose.rst:
--------------------------------------------------------------------------------
1 | Compose
2 | ==================
3 |
4 | .. automodule:: gtime.compose
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/source/modules/metrics.rst:
--------------------------------------------------------------------------------
1 | Metrics
2 | ==================
3 |
4 | .. automodule:: gtime.metrics
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/docs/source/modules/regressors.rst:
--------------------------------------------------------------------------------
1 | Regressors
2 | ==================
3 |
4 | .. automodule:: gtime.regressors
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/source/modules/causality.rst:
--------------------------------------------------------------------------------
1 | Causality Tests
2 | ==================
3 |
4 | .. automodule:: gtime.causality
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/source/modules/forecasting.rst:
--------------------------------------------------------------------------------
1 | Forecasting
2 | ==================
3 |
4 | .. automodule:: gtime.forecasting
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/source/modules/preprocessing.rst:
--------------------------------------------------------------------------------
1 | Preprocessing
2 | ==================
3 |
4 | .. automodule:: gtime.preprocessing
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/source/modules/model_selection.rst:
--------------------------------------------------------------------------------
1 | Model Selection
2 | ==================
3 |
4 | .. automodule:: gtime.model_selection
5 | :members:
6 |
--------------------------------------------------------------------------------
/gtime/explainability/__init__.py:
--------------------------------------------------------------------------------
1 | from .explainer import _ShapExplainer, _LimeExplainer
2 |
3 | __all__ = ["_ShapExplainer", "_LimeExplainer"]
4 |
--------------------------------------------------------------------------------
/docs/source/modules/feature_extraction.rst:
--------------------------------------------------------------------------------
1 | Feature Extraction
2 | ==================
3 |
4 | .. automodule:: gtime.feature_extraction
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/source/modules/feature_generation.rst:
--------------------------------------------------------------------------------
1 | Feature Generation
2 | ==================
3 |
4 | .. automodule:: gtime.feature_generation
5 | :members:
6 |
--------------------------------------------------------------------------------
/docs/source/modules/time_series_models.rst:
--------------------------------------------------------------------------------
1 | Time Series Models
2 | ==================
3 |
4 | .. automodule:: gtime.time_series_models
5 | :members:
6 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
3 |
4 | [tool:pytest]
5 | addopts =
6 |     --ignore doc
7 |     --ignore gtime/experimental
8 |     -ra
9 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Include the README
2 | include README.md
3 |
4 | # Include the license file
5 | include LICENSE
6 |
7 | # Include the requirements file
8 | include requirements.txt
--------------------------------------------------------------------------------
/dev-requirements.txt:
--------------------------------------------------------------------------------
1 | hypothesis==5.5.3
2 | black
3 | pre-commit
4 | pytest
5 | pytest-cov
6 | pytest-xdist
7 | pytest-lazy-fixture
8 | flake8
9 | mypy
10 | nbconvert
11 | jupyter
12 |
--------------------------------------------------------------------------------
/gtime/utils/testing_constants.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | DEFAULT_START = pd.Timestamp("1970-01-01")
4 | DEFAULT_END = pd.Timestamp("2020-01-01")
5 | DEFAULT_FREQ = pd.Timedelta("1D")
6 |
--------------------------------------------------------------------------------
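
Example (illustration, not a repository file): the three constants pin down the
default daily index used by the test helpers, and `pd.date_range` accepts a
Timedelta as `freq`:

    import pandas as pd

    from gtime.utils.testing_constants import DEFAULT_START, DEFAULT_END, DEFAULT_FREQ

    # One entry per day from 1970-01-01 through 2020-01-01.
    index = pd.date_range(start=DEFAULT_START, end=DEFAULT_END, freq=DEFAULT_FREQ)
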
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas>=0.25.3
2 | scipy>=0.17.0
3 | scikit-learn>=0.22.0
4 | matplotlib>=3.1.0
5 | lime>=0.2.0.0
6 | shap>=0.35
7 | holidays>=0.10.2
8 | lunarcalendar>=0.0.9
9 | giotto-tda
10 |
--------------------------------------------------------------------------------
/gtime/plotting/__init__.py:
--------------------------------------------------------------------------------
1 | from .plotting import seasonal_plot, seasonal_subplots, lag_plot, acf_plot
2 |
3 | __all__ = [
4 | "seasonal_plot",
5 | "seasonal_subplots",
6 | "acf_plot",
7 | "lag_plot",
8 | ]
9 |
--------------------------------------------------------------------------------
/gtime/compose/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.compose` module contains meta-estimators for building composite models
3 | with transformers.
4 | """
5 |
6 | from .feature_creation import FeatureCreation
7 |
8 | __all__ = ["FeatureCreation"]
9 |
--------------------------------------------------------------------------------
/gtime/preprocessing/time_series_resampling.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | # FIXME: TBD
5 | class _TimeSeriesResampler:
6 | def __init__(self):
7 | pass
8 |
9 | def transform(self, X: pd.Series):
10 | raise NotImplementedError
11 |
--------------------------------------------------------------------------------
/gtime/utils/trends.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def polynomial(X, weights):
5 | return np.poly1d(weights)(X)
6 |
7 |
8 | def exponential(X, exponent):
9 | return np.exp(X * exponent)
10 |
11 |
12 | TRENDS = {"polynomial": polynomial, "exponential": exponential}
13 |
--------------------------------------------------------------------------------
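
Example (illustration, not a repository file) of calling through the TRENDS
registry; the weight values below are arbitrary:

    import numpy as np

    from gtime.utils.trends import TRENDS

    X = np.arange(5)
    # `polynomial` delegates to np.poly1d, so weights run from the highest
    # degree down: [2.0, 1.0] means 2*x + 1.
    y_poly = TRENDS["polynomial"](X, [2.0, 1.0])
    # `exponential` computes exp(X * exponent) elementwise.
    y_exp = TRENDS["exponential"](X, 0.1)
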
/CODE_AUTHORS:
--------------------------------------------------------------------------------
1 | # The following is the list of the code authors of the giotto-time python
2 | # package. Where component authors are known, add them here.
3 |
4 | Alessio Baccelli a.baccelli@l2f.ch
5 | Stefano Savarè s.savare@l2f.ch
6 | Benjamin Russell b.russell@l2f.ch
7 | Matteo Caorsi m.caorsi@giotto.ai
8 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/psf/black
3 | rev: stable
4 | hooks:
5 | - id: black
6 | language_version: python3.7
7 | - repo: https://github.com/pre-commit/pre-commit-hooks.git
8 | sha: v0.9.5
9 | hooks:
10 | - id: no-commit-to-branch
--------------------------------------------------------------------------------
/gtime/__init__.py:
--------------------------------------------------------------------------------
1 | from gtime._version import __version__
2 |
3 | __all__ = [
4 | "causality",
5 | "compose",
6 | "feature_extraction",
7 | "feature_generation",
8 | "forecasting",
9 | "metrics",
10 | "model_selection",
11 | "preprocessing",
12 | "regressors",
13 | "time_series_models",
14 | "utils",
15 | ]
16 |
--------------------------------------------------------------------------------
/gtime/feature_generation/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.feature_generation` module deals with the creation of features that do
3 | not depend on the input data, but just on its index.
4 | """
5 |
6 | from .calendar import Calendar
7 | from .external import PeriodicSeasonal, Constant
8 |
9 | __all__ = ["PeriodicSeasonal", "Constant", "Calendar"]
10 |
--------------------------------------------------------------------------------
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = True
3 | source = gtime
4 | parallel = True
5 | omit =
6 | **/experimental/*
7 | **/setup.py
8 | **/tests/*
9 |
10 | [report]
11 | exclude_lines =
12 | # Have to re-enable the standard pragma
13 | pragma: no cover
14 |
15 | # Don't complain if tests don't hit defensive assertion code:
16 | raise NotImplementedError
17 |
--------------------------------------------------------------------------------
/gtime/regressors/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.regressors` module contains regression models.
3 | """
4 |
5 | from .linear_regressor import LinearRegressor
6 | from .multi_output import MultiFeatureMultiOutputRegressor
7 | from .explainable import ExplainableRegressor
8 |
9 | __all__ = [
10 | "LinearRegressor",
11 | "MultiFeatureMultiOutputRegressor",
12 | "ExplainableRegressor",
13 | ]
14 |
--------------------------------------------------------------------------------
/gtime/causality/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.causality` module deals with the causality tests for time
3 | series data.
4 | """
5 |
6 | from .linear_coefficient import ShiftedLinearCoefficient
7 | from .pearson_correlation import ShiftedPearsonCorrelation
8 | from .granger_causality import GrangerCausality
9 |
10 |
11 | __all__ = ["ShiftedLinearCoefficient", "ShiftedPearsonCorrelation", "GrangerCausality"]
12 |
--------------------------------------------------------------------------------
/gtime/model_selection/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.model_selection` module deals with model selection.
3 | """
4 |
5 | from .horizon_shift import horizon_shift
6 | from .splitters import FeatureSplitter
7 | from .cross_validation import time_series_split, blocking_time_series_split
8 |
9 | __all__ = [
10 | "FeatureSplitter",
11 | "horizon_shift",
12 | "time_series_split",
13 | "blocking_time_series_split",
14 | ]
15 |
--------------------------------------------------------------------------------
/gtime/preprocessing/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.preprocessing` module deals with the preprocessing of time series
3 | data.
4 | """
5 |
6 | from .time_series_conversion import (
7 | _SequenceToTimeIndexSeries,
8 | _PandasSeriesToTimeIndexSeries,
9 | _TimeIndexSeriesToPeriodIndexSeries,
10 | )
11 |
12 | from .time_series_preparation import TimeSeriesPreparation
13 |
14 | __all__ = [
15 | "TimeSeriesPreparation",
16 | ]
17 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. giotto documentation master file, created by
2 | sphinx-quickstart on Mon Jun 3 11:56:46 2019.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | Welcome to giotto-time's API reference!
7 | ========================================
8 |
9 | .. toctree::
10 | :maxdepth: 3
11 | :caption: Contents:
12 |
13 | modules/index
14 |
15 | References
16 | ----------
17 |
18 | * :ref:`genindex`
19 | * :ref:`modindex`
20 |
21 |
--------------------------------------------------------------------------------
/gtime/hierarchical/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.hierarchical` module contains hierarchical time series models.
3 | """
4 |
5 | from .base import HierarchicalBase
6 | from .naive import HierarchicalNaive
7 | from .bottom_up import HierarchicalBottomUp
8 | from .top_down import HierarchicalTopDown
9 | from .middle_out import HierarchicalMiddleOut
10 |
11 | __all__ = [
12 | "HierarchicalBase",
13 | "HierarchicalNaive",
14 | "HierarchicalBottomUp",
15 | "HierarchicalTopDown",
16 | "HierarchicalMiddleOut",
17 | ]
18 |
--------------------------------------------------------------------------------
/gtime/time_series_models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.time_series_models` module contains time series models.
3 | """
4 |
5 | from .base import TimeSeriesForecastingModel
6 | from .ar import AR
7 | from .simple_models import (
8 | Naive,
9 | SeasonalNaive,
10 | Average,
11 | Drift,
12 | )
13 | from .cv_pipeline import CVPipeline
14 |
15 | __all__ = [
16 | "TimeSeriesForecastingModel",
17 | "AR",
18 | "Naive",
19 | "SeasonalNaive",
20 | "Average",
21 | "Drift",
22 | "CVPipeline",
23 | ]
24 |
--------------------------------------------------------------------------------
/conftest.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | from hypothesis import settings, Verbosity, HealthCheck
4 |
5 | settings.register_profile(
6 | "ci",
7 | max_examples=100,
8 | suppress_health_check=(HealthCheck.too_slow,),
9 | deadline=timedelta(milliseconds=1000),
10 | )
11 | settings.register_profile(
12 | "dev",
13 | max_examples=7,
14 | suppress_health_check=(HealthCheck.too_slow,),
15 | deadline=timedelta(milliseconds=1000),
16 | )
17 | settings.register_profile("debug", max_examples=7, verbosity=Verbosity.verbose)
18 |
--------------------------------------------------------------------------------
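
Example (illustration, not a repository file): the profiles above are only
registered, not activated. One is selected either in code or with the
equivalent `pytest --hypothesis-profile=dev` flag:

    from hypothesis import settings

    # Use the lightweight "dev" profile registered in conftest.py
    # (7 examples per test instead of the CI profile's 100).
    settings.load_profile("dev")
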
/gtime/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.metrics` module contains a collection of different metrics.
3 | """
4 |
5 | from .metrics import (
6 | non_zero_smape,
7 | smape,
8 | max_error,
9 | mse,
10 | log_mse,
11 | r_square,
12 | mae,
13 | mape,
14 | rmse,
15 | rmsle,
16 | gmae,
17 | )
18 |
19 | __all__ = [
20 | "non_zero_smape",
21 | "smape",
22 | "max_error",
23 | "mse",
24 | "rmse",
25 | "log_mse",
26 | "rmsle",
27 | "r_square",
28 | "mae",
29 | "mape",
30 | "gmae",
31 | ]
32 |
--------------------------------------------------------------------------------
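
Usage sketch (illustration only, assuming these metrics follow the
conventional `metric(y_true, y_pred)` argument order):

    import numpy as np

    from gtime.metrics import rmse, smape

    y_true = np.array([1.0, 2.0, 3.0])
    y_pred = np.array([1.1, 1.9, 3.3])
    # Each metric is expected to reduce the pair of series to a single score.
    print(rmse(y_true, y_pred), smape(y_true, y_pred))
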
/GOVERNANCE.rst:
--------------------------------------------------------------------------------
1 | This file describes the governance of the Giotto Time project.
2 |
3 | Project owner:
4 | --------------
5 |
6 | - L2F SA
7 |
8 | Authors:
9 | --------
10 |
11 | - Please refer to the ``CODE_AUTHORS`` file
12 |
13 | Giotto Time Project Team:
14 | -------------------------
15 |
16 | - Alessio Baccelli a.baccelli@l2f.ch (Developer)
17 | - Stefano Savarè s.savare@l2f.ch (Developer)
18 | - Philippe Nguyen p.nguyen@l2f.ch (Developer)
19 |
20 | Former Project Team Members:
21 | ----------------------------
22 |
23 | - Benjamin Russell b.russell@l2f.ch
24 |
--------------------------------------------------------------------------------
/docs/source/modules/index.rst:
--------------------------------------------------------------------------------
1 | API reference
2 | =============
3 | This page contains a list of the available features in the library.
4 |
5 | .. toctree::
6 | :maxdepth: 3
7 |
8 | causality
9 |
10 | compose
11 |
12 | explainability
13 |
14 | external
15 |
16 | feature_extraction
17 |
18 | feature_generation
19 |
20 | forecasting
21 |
22 | hierarchical
23 |
24 | metrics
25 |
26 | model_selection
27 |
28 | plotting
29 |
30 | preprocessing
31 |
32 | regressors
33 |
34 | time_series_models
35 |
36 | utils
37 |
38 | References
39 | ----------
40 |
41 | * :ref:`genindex`
42 | * :ref:`modindex`
--------------------------------------------------------------------------------
/gtime/_version.py:
--------------------------------------------------------------------------------
1 | """
2 | ``giotto-time`` is a set of python methods to perform time series forecasting
3 | in a machine learning framework.
4 | """
5 | # License: Apache 2.0
6 |
7 | # PEP0440 compatible formatted version, see:
8 | # https://www.python.org/dev/peps/pep-0440/
9 | #
10 | # Generic release markers:
11 | # X.Y
12 | # X.Y.Z # For bugfix releases
13 | #
14 | # Admissible pre-release markers:
15 | # X.YaN # Alpha release
16 | # X.YbN # Beta release
17 | # X.YrcN # Release Candidate
18 | # X.Y # Final release
19 | #
20 | # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer.
21 | # 'X.Y.dev0' is the canonical version of 'X.Y.dev'
22 | #
23 |
24 | __version__ = "0.2.2"
25 |
--------------------------------------------------------------------------------
/gtime/causality/tests/common.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import pandas as pd
4 |
5 | import pandas.util.testing as testing
6 |
7 |
8 | def make_df_from_expected_shifts(expected_shifts: List[int]) -> pd.DataFrame:
9 | testing.N, testing.K = 500, 1
10 |
11 | df = testing.makeTimeDataFrame(freq="D")
12 | for sh, k in zip(expected_shifts, range(3)):
13 | df[f"shift_{k}"] = df["A"].shift(-sh)
14 | df = df.dropna()
15 |
16 | return df
17 |
18 |
19 | def shift_df_from_expected_shifts(
20 | df: pd.DataFrame, expected_shifts: List[int]
21 | ) -> pd.DataFrame:
22 | for sh, k in zip(expected_shifts, range(3)):
23 | df[f"shift_{k}"] = df[f"shift_{k}"].shift(-sh)
24 | return df.dropna()
25 |
--------------------------------------------------------------------------------
/docs/.nojekyll:
--------------------------------------------------------------------------------
1 | # Compiled python modules.
2 | *.pyc
3 | *.pyo
4 | *.pyd
5 | **/__pycache__
6 |
7 | # Setuptools distribution folder.
8 | /dist/
9 |
10 | # Python egg metadata, regenerated from source files by setuptools.
11 | /*.egg-info
12 | *.so
13 | build
14 |
15 | # Python jupyter notebooks
16 | examples/dask-worker-space
17 | examples/.ipynb_checkpoints
18 |
19 | # Data files
20 | *.pkl
21 | *.csv
22 | *.pqt
23 | data/*
24 |
25 | # Output files
26 | *.out
27 |
28 | # External
29 | **.DS_Store
30 | .idea/*
31 | .vscode/*
32 | *~
33 |
34 | # Unit test
35 | .pytest_cache/
36 | .hypothesis/
37 |
38 | # Pytest output files
39 | test-output.xml
40 |
41 | # Latex
42 | *.aux
43 | *.bbl
44 | *.blg
45 | *.brf
46 | *.log
47 | *.pdf
48 | *.synctex.gz
49 | *.toc
--------------------------------------------------------------------------------
/gtime/forecasting/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.forecasting` module contains a collection of machine learning models,
3 | for dealing with time series data.
4 | """
5 |
6 | from .gar import GAR, GARFF, MultiFeatureMultiOutputRegressor, MultiFeatureGAR
7 | from .trend import TrendForecaster
8 | from .online import HedgeForecaster
9 | from .naive import (
10 | NaiveForecaster,
11 | SeasonalNaiveForecaster,
12 | DriftForecaster,
13 | AverageForecaster,
14 | )
15 |
16 | __all__ = [
17 | "GAR",
18 | "GARFF",
19 | "MultiFeatureGAR",
20 | "TrendForecaster",
21 | "HedgeForecaster",
22 | "NaiveForecaster",
23 | "SeasonalNaiveForecaster",
24 | "DriftForecaster",
25 | "AverageForecaster",
26 | "MultiFeatureMultiOutputRegressor",
27 | ]
28 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
22 | clean:
23 | rm -rf build/ generated/ reference/generated/
24 |
--------------------------------------------------------------------------------
/gtime/feature_extraction/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | The :mod:`gtime.feature_extraction` module deals with the creation of features
3 | starting from a time series.
4 | """
5 |
6 | from gtime.feature_generation.calendar import Calendar
7 | from .standard import (
8 | Shift,
9 | MovingAverage,
10 | MovingMedian,
11 | Max,
12 | Min,
13 | MovingCustomFunction,
14 | Polynomial,
15 | Exogenous,
16 | CustomFeature,
17 | )
18 | from .custom import SortedDensity, CrestFactorDetrending
19 |
20 | from .trend import Detrender
21 |
22 | __all__ = [
23 | "Shift",
24 | "MovingAverage",
25 | "MovingMedian",
26 | "Max",
27 | "Min",
28 | "MovingCustomFunction",
29 | "Polynomial",
30 | "Exogenous",
31 | "Calendar",
32 | "Detrender",
33 | "CustomFeature",
34 | "SortedDensity",
35 | "CrestFactorDetrending",
36 | ]
37 |
--------------------------------------------------------------------------------
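
Usage sketch (illustration, not a repository file): the transformers follow
the scikit-learn fit/transform convention and, per FeatureMixin, each output
column is suffixed with `__<ClassName>`:

    import pandas as pd

    from gtime.feature_extraction import MovingAverage, Shift

    ts = pd.DataFrame({"A": range(6)}, index=pd.date_range("2020-01-01", periods=6))
    shifted = Shift(1).fit_transform(ts)                       # column "A__Shift"
    smoothed = MovingAverage(window_size=3).fit_transform(ts)  # column "A__MovingAverage"
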
/gtime/forecasting/tests/test_online.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from gtime.forecasting.online import HedgeForecaster
4 |
5 |
6 | def test_hedge_fit_predict():
7 | time_index = pd.date_range("2020-01-01", "2020-01-20")
8 | X_np = np.concatenate(
9 | (np.random.randint(4, size=(20, 2)), np.array([100] * 20).reshape(-1, 1)),
10 | axis=1,
11 | )
12 | X = pd.DataFrame(X_np, index=time_index)
13 | y = pd.DataFrame(
14 | np.random.randint(4, size=(20, 1)), index=time_index, columns=["y_1"]
15 | )
16 | hr = HedgeForecaster(random_state=42)
17 |
18 | preds = hr.fit_predict(X, y)
19 | np.testing.assert_equal(preds.shape, y.shape)
20 | np.testing.assert_almost_equal(hr.weights_[0], hr.weights_[1], decimal=2)
21 | assert hr.weights_[2] < hr.weights_[0]
22 | assert hr.weights_[2] < hr.weights_[1]
23 |
--------------------------------------------------------------------------------
/gtime/compose/tests/test_feature_creation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 | if pd.__version__ >= "1.0.0":
4 | import pandas._testing as testing
5 | else:
6 | import pandas.util.testing as testing
7 | from numpy.testing import assert_array_equal
8 |
9 | from gtime.compose import FeatureCreation
10 | from gtime.feature_extraction import Shift, MovingAverage
11 |
12 |
13 | def test_feature_creation_transform():
14 | data = testing.makeTimeDataFrame(freq="s")
15 |
16 | shift = Shift(1)
17 | ma = MovingAverage(window_size=3)
18 |
19 | col_name = "A"
20 |
21 | fc = FeatureCreation([("s1", shift, [col_name]), ("ma3", ma, [col_name]),])
22 | res = fc.fit(data).transform(data)
23 |
24 | assert_array_equal(
25 | res.columns.values,
26 | [
27 | f"s1__{col_name}__{shift.__class__.__name__}",
28 | f"ma3__{col_name}__{ma.__class__.__name__}",
29 | ],
30 | )
31 |
--------------------------------------------------------------------------------
/gtime/base.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
3 | from sklearn.utils.validation import check_is_fitted
4 |
5 |
6 | def add_class_name(func):
7 | @functools.wraps(func)
8 | def wrapper_add_class_name(*args, **kwargs):
9 | value = func(*args, **kwargs)
10 | return value.add_suffix("__" + args[0].__class__.__name__)
11 |
12 | return wrapper_add_class_name
13 |
14 |
15 | class FeatureMixin:
16 | """Mixin class for all feature extraction estimators in giotto-time."""
17 |
18 | _estimator_type = "feature_extractor"
19 |
20 | def get_feature_names(self):
21 | """Return feature names for output features.
22 |
23 | Returns
24 | -------
25 | output_feature_names : ndarray, shape (n_output_features,)
26 | Array of feature names.
27 |
28 | """
29 | check_is_fitted(self)
30 |
31 | return [f"{name}__{self.__class__.__name__}" for name in self.columns_]
32 |
--------------------------------------------------------------------------------
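
Example (illustration with a hypothetical `Doubler` transformer) of what
`add_class_name` does to output columns:

    import pandas as pd

    from gtime.base import add_class_name

    class Doubler:
        # The decorator takes the DataFrame returned by `transform` and
        # appends "__<ClassName>" of `self` (args[0]) to every column name.
        @add_class_name
        def transform(self, X: pd.DataFrame) -> pd.DataFrame:
            return X * 2

    df = pd.DataFrame({"a": [1, 2]})
    print(Doubler().transform(df).columns)  # Index(['a__Doubler'], dtype='object')
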
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/.github/workflows/build_and_publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Upload Python Package
5 |
6 | on: [workflow_dispatch]
7 |
8 | jobs:
9 | deploy:
10 |
11 | runs-on: ubuntu-latest
12 |
13 | steps:
14 | - uses: actions/checkout@v2
15 | - name: Set up Python
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: '3.x'
19 | - name: Install dependencies
20 | run: |
21 | python -m pip install --upgrade pip
22 | pip install setuptools wheel twine
23 | - name: Build and publish
24 | env:
25 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
26 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
27 | run: |
28 | python setup.py sdist bdist_wheel
29 | twine check dist/*
30 | twine upload dist/*
--------------------------------------------------------------------------------
/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
5 |
6 | #### Reference Issues/PRs
7 |
13 |
14 |
15 | #### What does this implement/fix? Explain your changes.
16 |
17 |
18 | #### Any other comments?
19 |
20 |
21 |
29 |
--------------------------------------------------------------------------------
/gtime/causality/tests/test_granger_causality.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pytest
3 | import pandas as pd
4 |
5 |
6 | if pd.__version__ >= "1.0.0":
7 | import pandas._testing as testing
8 | else:
9 | import pandas.util.testing as testing
10 | from gtime.causality import GrangerCausality
11 |
12 |
13 | # Expected values from the results of statsmodels
14 | @pytest.mark.parametrize(
15 | "test_input, expected",
16 | [
17 | (["ssr_f"], 0.8420421667509344),
18 | (["ssr_chi2"], 0.8327660223526767),
19 | (["likelihood_chi2"], 0.8341270186135072),
20 | (["zero_f"], 0.8420421667508992),
21 | ],
22 | )
23 | def test_granger_pvalues_ssr_f(test_input, expected):
24 | # Set random seed, otherwise testing creates a new dataframe each time.
25 | np.random.seed(12)
26 |
27 | data = testing.makeTimeDataFrame(freq="s", nper=1000)
28 | granger = (
29 | GrangerCausality(target_col="A", x_col="B", max_shift=10, statistics=test_input)
30 | .fit(data)
31 | .results_[0]
32 | )
33 |
34 | p_value = granger.values[1]
35 | # Not exactly equal, but tested up to 7 decimal places
36 | np.testing.assert_almost_equal(p_value, expected, decimal=7)
37 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | #### Description
4 |
6 |
7 | #### Steps/Code to Reproduce
8 |
12 |
13 | #### Expected Results
14 |
15 |
16 | #### Actual Results
17 |
18 |
19 | #### Versions
20 |
31 |
32 |
33 |
34 |
--------------------------------------------------------------------------------
/gtime/forecasting/tests/test_trend.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | import pandas.util.testing as testing
5 |
6 | from gtime.forecasting import TrendForecaster
7 |
8 |
9 | def test_polynomial_trend():
10 | testing.N, testing.K = 500, 1
11 | df = testing.makeTimeDataFrame(freq="D")
12 |
13 | df["A"] = df["A"] + 0.0005 * pd.Series(
14 | index=df.index, data=range(df.shape[0])
15 | ) * pd.Series(index=df.index, data=range(df.shape[0]))
16 |
17 | tm = TrendForecaster(trend="polynomial", trend_x0=0.0)
18 | tm.fit(df["A"])
19 | # the exact fitted parameters are too unstable to assert every time
20 | # assert np.allclose(tm.best_trend_params_, [0.0] * len(tm.best_trend_params_))
21 | assert len(tm.best_trend_params_) == 1
22 |
23 |
24 | def test_exponential_trend():
25 | testing.N, testing.K = 500, 1
26 | df = testing.makeTimeDataFrame(freq="D")
27 |
28 | df["A"] = df["A"] + 0.0005 * pd.Series(
29 | index=df.index, data=range(df.shape[0])
30 | ).apply(lambda x: np.exp(0.03 * x))
31 |
32 | tm = TrendForecaster(trend="exponential", trend_x0=4 * [0.0])
33 | tm.fit(df)
34 | # the exact fitted parameters are too unstable to assert every time
35 | # assert np.allclose(tm.best_trend_params_, [0.0] * len(tm.best_trend_params_))
36 | assert len(tm.best_trend_params_) == 4
37 |
38 | # TODO: predicting tests
39 |
--------------------------------------------------------------------------------
/.github/workflows/deploy_github_pages.yml:
--------------------------------------------------------------------------------
1 | # This workflow builds the Sphinx documentation and deploys it to GitHub Pages
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Deploy to gh-pages
5 | on: [workflow_dispatch]
6 | jobs:
7 | build:
8 |
9 | runs-on: ubuntu-latest
10 |
11 | steps:
12 | - uses: actions/checkout@v2
13 | - name: Set up Python 3.8
14 | uses: actions/setup-python@v2
15 | with:
16 | python-version: 3.8
17 | - name: Install dependencies
18 | run: |
19 | python -m pip install --upgrade pip
20 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
21 | if [ -f doc-requirements.txt ]; then pip install -r doc-requirements.txt; fi
22 | - name: Install giotto-time
23 | run: |
24 | pip install -e .
25 | - name: Git checkout and build sphinx docs
26 | run: |
27 | git config --global user.name "github-pages[bot]"
28 | git config --global user.email "41898281+github-pages[bot]@users.noreply.github.com"
29 | git fetch
30 | git checkout gh-pages
31 | git checkout master
32 | cd docs
33 | make html
34 | - name: push to gh-pages
35 | run: |
36 | git symbolic-ref HEAD refs/heads/gh-pages
37 | git reset --mixed gh-pages
38 | git add --all
39 | git add -f docs/build
40 | git commit -m "push sphinx build"
41 | git push origin gh-pages
42 |
--------------------------------------------------------------------------------
/gtime/causality/tests/test_pearson_correlation.py:
--------------------------------------------------------------------------------
1 | from random import randint
2 |
3 | import numpy as np
4 |
5 | from gtime.causality import ShiftedPearsonCorrelation
6 | from gtime.causality.tests.common import make_df_from_expected_shifts
7 |
8 |
9 | def test_pearson_correlation():
10 | expected_shifts = [randint(2, 6) * 2 for _ in range(3)]
11 | df = make_df_from_expected_shifts(expected_shifts)
12 |
13 | spc = ShiftedPearsonCorrelation(target_col="A", max_shift=12)
14 | spc.fit(df)
15 |
16 | shifts = spc.best_shifts_["A"][4:].values
17 | np.testing.assert_array_equal(shifts, expected_shifts)
18 |
19 |
20 | def test_pearson_bootstrap_p_values():
21 | expected_shifts = [randint(2, 9) * 2 for _ in range(3)]
22 | df = make_df_from_expected_shifts(expected_shifts)
23 | shifted_test = ShiftedPearsonCorrelation(
24 | target_col="A", max_shift=5, bootstrap_iterations=500,
25 | )
26 | shifted_test.fit(df)
27 |
28 | pearson_p_values = shifted_test.bootstrap_p_values_
29 | for col_index in range(len(pearson_p_values.columns)):
30 | assert pearson_p_values.iloc[col_index, col_index] == 0
31 |
32 |
33 | def test_pearson_permutation_p_values():
34 | expected_shifts = [randint(2, 9) * 2 for _ in range(3)]
35 | df = make_df_from_expected_shifts(expected_shifts)
36 | shifted_test = ShiftedPearsonCorrelation(
37 | target_col="A", max_shift=5, permutation_iterations=50,
38 | )
39 | shifted_test.fit(df)
40 |
41 | pearson_p_values = shifted_test.permutation_p_values_
42 | for col_index in range(len(pearson_p_values.columns)):
43 | assert pearson_p_values.iloc[col_index, col_index] == 0
44 |
--------------------------------------------------------------------------------
/gtime/hierarchical/base.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | from typing import Any, Dict, Union
3 |
4 | import pandas as pd
5 | from sklearn.base import BaseEstimator, RegressorMixin
6 |
7 |
8 | class HierarchicalBase(BaseEstimator, RegressorMixin):
9 | """ Base class for hierarchical models.
10 |
11 | Parameters
12 | ----------
13 | model : BaseEstimator, required
14 | base model applied to all the time series
15 | hierarchy_tree: Union[str, Dict[str, Any]], optional, default = ``'infer'``
16 | hierarchy structure between the time series. If ``'infer'``, a standard
17 | structure is inferred; the inference logic depends on the subclass.
18 | """
19 |
20 | def __init__(
21 | self, model: BaseEstimator, hierarchy_tree: Union[str, Dict[str, Any]] = "infer"
22 | ):
23 | self.model = model
24 | self.hierarchy_tree = hierarchy_tree
25 |
26 | @abstractmethod
27 | def fit(self, X: Dict[str, pd.DataFrame], y=None):
28 | raise NotImplementedError
29 |
30 | @abstractmethod
31 | def predict(self, X: Dict[str, pd.DataFrame] = None):
32 | raise NotImplementedError
33 |
34 | @staticmethod
35 | def _check_is_dict_of_dataframes_with_str_key(X: Any):
36 | if not isinstance(X, dict):
37 | raise ValueError(
38 | f"X must be a dictionary of pd.DataFrame. Detected: {type(X)}"
39 | )
40 | if not all(isinstance(key, str) for key in X):
41 | raise ValueError("All X keys must be string")
42 | if not all(isinstance(df, pd.DataFrame) for df in X.values()):
43 | raise ValueError("All values of X must be pd.DataFrame")
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | notebooks/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *.cover
46 |
47 | # Translations
48 | *.mo
49 | *.pot
50 |
51 | # Django stuff:
52 | *.log
53 |
54 | # Sphinx documentation
55 | docs/_build/
56 | doc/build/
57 | doc/generated/
58 | doc/reference/generated/
59 |
60 | # PyBuilder
61 | target/
62 |
63 | # DotEnv configuration
64 | .env
65 |
66 | # Database
67 | *.db
68 | *.rdb
69 |
70 | # Pycharm
71 | .idea
72 |
73 | # VS Code
74 | .vscode/
75 |
76 | # Spyder
77 | .spyproject/
78 |
79 | # Jupyter NB Checkpoints
80 | .ipynb_checkpoints/
81 | Untitled*
82 |
83 | # exclude data from source control by default
84 | /data/
85 |
86 | # Mac OS-specific storage files
87 | .DS_Store
88 |
89 | # vim
90 | *.swp
91 | *.swo
92 |
93 | # Mypy cache
94 | .mypy_cache/
95 |
96 | # ignore huge time_series_models
97 | models/*.joblib
98 |
99 | # Hypothesis
100 | .hypothesis/
101 |
102 | # PyTest
103 | .pytest_cache/
104 |
105 | # Excel temporary
106 | ~$*.xls*
107 |
--------------------------------------------------------------------------------
/gtime/feature_generation/tests/test_calendar.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | from hypothesis import given, settings
5 |
6 | from gtime.feature_extraction import Calendar
7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series
8 |
9 |
10 | def test_empty_and_non_finite_kernel_error():
11 | with pytest.raises(ValueError):
12 | Calendar(
13 | start_date="ignored",
14 | end_date="ignored",
15 | country="Brazil",
16 | kernel=np.array([]),
17 | )
18 |
19 | with pytest.raises(ValueError):
20 | Calendar(
21 | start_date="ignored",
22 | end_date="ignored",
23 | country="Brazil",
24 | kernel=np.array([np.nan, 1]),
25 | )
26 |
27 |
28 | def test_unevenly_spaced_time_series():
29 | unevenly_spaced_ts = pd.DataFrame(
30 | index=[
31 | pd.Period("2012-01-01"),
32 | pd.Period("2012-01-03"),
33 | pd.Period("2012-01-10"),
34 | ]
35 | )
36 | cal_feature = Calendar(
37 | start_date="ignored",
38 | end_date="ignored",
39 | country="Brazil",
40 | kernel=np.array([0, 1]),
41 | )
42 |
43 | with pytest.raises(ValueError):
44 | cal_feature.fit_transform(unevenly_spaced_ts)
45 |
46 |
47 | @settings(deadline=pd.Timedelta(milliseconds=5000), max_examples=7)
48 | @given(giotto_time_series(min_length=2, max_length=30))
49 | def test_correct_index_random_ts(ts):
50 | cal_feature = Calendar(
51 | start_date="ignored",
52 | end_date="ignored",
53 | country="Brazil",
54 | kernel=np.array([1, 2]),
55 | )
56 | Xt = cal_feature.fit_transform(ts)
57 | np.testing.assert_array_equal(Xt.index, ts.index)
58 |
--------------------------------------------------------------------------------
/gtime/time_series_models/ar.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union, Optional
2 |
3 | import numpy as np
4 | from sklearn.compose import make_column_selector
5 | from sklearn.linear_model import LinearRegression
6 |
7 | from gtime.feature_extraction import Shift
8 | from gtime.forecasting import GAR
9 | from gtime.time_series_models import TimeSeriesForecastingModel
10 |
11 |
12 | class AR(TimeSeriesForecastingModel):
13 | """ Standard AR model for time series
14 |
15 | Parameters
16 | ----------
17 | p: int, required
18 | p parameter in AR
19 | horizon: int, required
20 | how many steps to predict in the future
21 |
22 | Examples
23 | --------
24 | >>> import pandas._testing as testing
25 | >>> from gtime.time_series_models import AR
26 | >>>
27 | >>> testing.N, testing.K = 20, 1
28 | >>> data = testing.makeTimeDataFrame(freq="s")
29 | >>> ar = AR(p=2, horizon=3)
30 | >>>
31 | >>> ar.fit(data)
32 | >>> ar.predict()
33 | y_1 y_2 y_3
34 | 2000-01-01 00:00:17 0.037228 0.163446 -0.237299
35 | 2000-01-01 00:00:18 -0.139627 -0.018082 0.063273
36 | 2000-01-01 00:00:19 -0.107707 0.052031 -0.105526
37 | """
38 |
39 | def __init__(
40 | self,
41 | p: int,
42 | horizon: Union[int, List[int]],
43 | explainer_type: Optional[str] = None,
44 | ):
45 | self.p = p
46 | self.explainer_type = explainer_type
47 | features = [
48 | tuple((f"s{i}", Shift(i), make_column_selector(dtype_include=np.number)))
49 | for i in range(p)
50 | ]
51 | model = GAR(LinearRegression(), explainer_type=explainer_type)
52 | super().__init__(features=features, horizon=horizon, model=model)
53 |
--------------------------------------------------------------------------------
/gtime/model_selection/horizon_shift.py:
--------------------------------------------------------------------------------
1 | from typing import List, Union
2 |
3 | import pandas as pd
4 |
5 | from gtime.feature_extraction import Shift
6 |
7 |
8 | def horizon_shift(
9 | time_series: pd.DataFrame, horizon: Union[int, List[int]] = 5
10 | ) -> pd.DataFrame:
11 | """Perform a shift of the original ``time_series`` for each time step between 1 and
12 | ``horizon``.
13 |
14 | Parameters
15 | ----------
16 | time_series : pd.DataFrame, shape (n_samples, n_features), required
17 | The time series whose values are shifted to build the target matrix.
18 |
19 | horizon : int or list of int, optional, default: ``5``
20 | How far into the future it is necessary to predict. This corresponds
21 | to the number of shifts that are going to be performed on y.
22 |
23 | Returns
24 | -------
25 | y : pd.DataFrame, shape (n_samples, horizon)
26 | The shifted time series.
27 |
28 | Examples
29 | --------
30 | >>> import pandas as pd
31 | >>> from gtime.model_selection import horizon_shift
32 | >>> X = pd.DataFrame(range(0, 5), index=pd.date_range("2020-01-01", "2020-01-05"))
33 | >>> horizon_shift(X, horizon=2)
34 | y_1 y_2
35 | 2020-01-01 1.0 2.0
36 | 2020-01-02 2.0 3.0
37 | 2020-01-03 3.0 4.0
38 | 2020-01-04 4.0 NaN
39 | 2020-01-05 NaN NaN
40 | >>> horizon_shift(X, horizon=[2])
41 | y_2
42 | 2020-01-01 2.0
43 | 2020-01-02 3.0
44 | 2020-01-03 4.0
45 | 2020-01-04 NaN
46 | 2020-01-05 NaN
47 |
48 | """
49 | horizon = range(1, horizon + 1) if isinstance(horizon, (int, float)) else horizon
50 | y = pd.DataFrame(index=time_series.index)
51 | for k in sorted(horizon):
52 | shift_feature = Shift(-k)
53 | y[f"y_{k}"] = shift_feature.fit_transform(time_series)
54 |
55 | return y
56 |
--------------------------------------------------------------------------------
/gtime/utils/hypothesis/utils.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from typing import Union, Tuple
3 |
4 | import hypothesis.strategies as st
5 | import pandas as pd
6 |
7 |
8 | def initialize_start_date_end_date(
9 | start: datetime, end: datetime
10 | ) -> Tuple[datetime, datetime]:
11 | start = start if start is not None else pd.Timestamp("1980-01-01")
12 | end = end if end is not None else pd.Timestamp("2020-01-01")
13 | return start, end
14 |
15 |
16 | def initialize_start_timedelta_end_timedelta(start: pd.Timedelta, end: pd.Timedelta):
17 | start = start if start is not None else pd.Timedelta(0)
18 | end = end if end is not None else pd.Timedelta("40Y")
19 | return start, end
20 |
21 |
22 | def order_pair(element1, element2):
23 | return st.builds(
24 | lambda start, end: (start, end), start=element1, end=element2
25 | ).filter(lambda x: x[0] < x[1])
26 |
27 |
28 | def expected_start_date_from(
29 | end: Union[datetime, pd.Period], periods: int, freq: pd.Timedelta
30 | ) -> Union[datetime, pd.Period]:
31 | return end - periods * freq
32 |
33 |
34 | def expected_end_date_from(
35 | start: Union[datetime, pd.Period], periods: int, freq: pd.Timedelta
36 | ) -> Union[datetime, pd.Period]:
37 | return start + periods * freq
38 |
39 |
40 | def expected_index_length_from(
41 | start: Union[datetime, pd.Period],
42 | end: Union[datetime, pd.Period],
43 | freq: pd.Timedelta,
44 | ) -> int:
45 | expected_index_length = (end - start) // freq
46 | return expected_index_length
47 |
48 |
49 | def freq_to_timedelta(
50 | freq: str, approximate_if_non_uniform: bool = True
51 | ) -> pd.Timedelta:
52 | try:
53 | return pd.to_timedelta(f"1{freq}")
54 | except ValueError as e:
55 | if approximate_if_non_uniform:
56 | correspondences = {
57 | "B": pd.Timedelta(1, unit="D"),
58 | "Q": pd.Timedelta(90, unit="D"),
59 | "A": pd.Timedelta(365, unit="D"),
60 | }
61 | return correspondences[freq]
62 | else:
63 | raise e
64 |
--------------------------------------------------------------------------------
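
Example (illustration, not a repository file) of `freq_to_timedelta`'s two
paths:

    from gtime.utils.hypothesis.utils import freq_to_timedelta

    freq_to_timedelta("D")  # parsed directly: Timedelta('1 days 00:00:00')
    # "B" (business day) has no uniform length, so it falls back to the
    # 1-day approximation in the `correspondences` table.
    freq_to_timedelta("B")  # Timedelta('1 days 00:00:00')
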
/gtime/utils/hypothesis/tests/test_general_strategies.py:
--------------------------------------------------------------------------------
1 | from typing import Tuple
2 |
3 | import pytest
4 | from hypothesis import given
5 | from hypothesis.strategies import integers, data
6 |
7 | from gtime.utils.hypothesis.general_strategies import (
8 | ordered_pair,
9 | shape_matrix,
10 | shape_X_y_matrices,
11 | regressors,
12 | )
13 |
14 |
15 | @given(ordered_pair(0, 10))
16 | def test_ordered_pair(pair: Tuple[int, int]):
17 | assert pair[0] < pair[1]
18 |
19 |
20 | @given(ordered_pair(27, 132))
21 | def test_ordered_pair_values(pair: Tuple[int, int]):
22 | assert pair[0] >= 27
23 | assert pair[1] <= 132
24 |
25 |
26 | @given(data=data(), value=integers(0, 10))
27 | def test_ordered_pair_min_equal_max(data, value):
28 | with pytest.raises(ValueError):
29 | data.draw(ordered_pair(value, value))
30 |
31 |
32 | @given(data=data(), shape_0=ordered_pair(10, 100), shape_1=ordered_pair(1, 8))
33 | def test_shape_X(data, shape_0, shape_1):
34 | shape = data.draw(shape_matrix(*shape_0, *shape_1))
35 | assert shape_0[0] <= shape[0] <= shape_0[1]
36 | assert shape_1[0] <= shape[1] <= shape_1[1]
37 |
38 |
39 | @given(shape_X_y_matrices(123, 243, 12, 34, 1, 6, y_as_vector=False))
40 | def test_shape_X_y_matrices_y_matrix(shape_X_y):
41 | shape_X, shape_y = shape_X_y
42 | assert shape_X[0] == shape_y[0]
43 | assert 12 <= shape_X[1] <= 34
44 | assert 1 <= shape_y[1] <= 6
45 |
46 |
47 | @given(shape_X_y_matrices(123, 243, 12, 34, 1, 6, y_as_vector=True))
48 | def test_shape_X_y_matrices_y_vector(shape_X_y):
49 | shape_X, shape_y = shape_X_y
50 | assert shape_X[0] == shape_y[0]
51 | assert 12 <= shape_X[1] <= 34
52 | assert len(shape_y) == 1
53 |
54 |
55 | @given(shape_X_y_matrices(10, 20, 10, 20, 1, 6))
56 | def test_shape_1_X_smaller_shape_0(shape_X_y):
57 | shape_X, shape_y = shape_X_y
58 | assert shape_X[0] > shape_X[1]
59 |
60 |
61 | @given(data=data())
62 | def test_shape_X_Y_value_error(data):
63 | with pytest.raises(ValueError):
64 | data.draw(shape_X_y_matrices(1, 8, 9, 10, 10, 20))
65 |
66 |
67 | @given(regressors())
68 | def test_regressors(regressor):
69 | assert hasattr(regressor, "fit")
70 | assert hasattr(regressor, "predict")
71 |
--------------------------------------------------------------------------------
/gtime/utils/hypothesis/general_strategies.py:
--------------------------------------------------------------------------------
1 | from hypothesis import assume
2 | from hypothesis.strategies import tuples, integers, floats, sampled_from
3 | import hypothesis.strategies as st
4 | from sklearn.ensemble import (
5 | BaggingRegressor,
6 | AdaBoostRegressor,
7 | GradientBoostingRegressor,
8 | RandomForestRegressor,
9 | )
10 | from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
11 | from sklearn.tree import ExtraTreeRegressor
12 |
13 |
14 | def ordered_pair(min_value: int, max_value: int):
15 | if min_value == max_value:
16 | raise ValueError("min_value and max_value cannot be the same")
17 | return (
18 | tuples(integers(min_value, max_value), integers(min_value, max_value))
19 | .map(sorted)
20 | .filter(lambda x: x[0] < x[1])
21 | )
22 |
23 |
24 | def shape_matrix(min_shape_0=30, max_shape_0=200, min_shape_1=5, max_shape_1=10):
25 | return tuples(
26 | integers(min_shape_0, max_shape_0), integers(min_shape_1, max_shape_1)
27 | ).filter(lambda x: x[0] > x[1])
28 |
29 |
30 | @st.composite
31 | def shape_X_y_matrices(
32 | draw,
33 | min_shape_0=30,
34 | max_shape_0=200,
35 | min_shape_1_X=5,
36 | max_shape_1_X=10,
37 | min_shape_1_y=1,
38 | max_shape_1_y=3,
39 | y_as_vector=True,
40 | ):
41 | if max_shape_0 <= min_shape_1_X:
42 | raise ValueError(
43 | f"max_shape_0 must be greater than min_shape_1_X: "
44 | f"{max_shape_0}, {min_shape_1_X}"
45 | )
46 | shape_0 = draw(integers(min_shape_0, max_shape_0))
47 | shape_X = draw(shape_matrix(shape_0, shape_0, min_shape_1_X, max_shape_1_X))
48 | if y_as_vector:
49 | shape_y = (shape_0,)
50 | else:
51 | shape_y = draw(shape_matrix(shape_0, shape_0, min_shape_1_y, max_shape_1_y))
52 | assume(shape_X[1] < shape_X[0])
53 | return shape_X, shape_y
54 |
55 |
56 | @st.composite
57 | def regressors(draw):
58 | regressors = [
59 | LinearRegression(),
60 | Ridge(alpha=draw(floats(0.00001, 2))),
61 | BayesianRidge(),
62 | ExtraTreeRegressor(),
63 | GradientBoostingRegressor(),
64 | RandomForestRegressor(),
65 | ]
66 | return draw(sampled_from(regressors))
67 |
--------------------------------------------------------------------------------
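
Sketch (illustration only) of these strategies driving a property-based test:

    import numpy as np
    from hypothesis import given

    from gtime.utils.hypothesis.general_strategies import (
        regressors,
        shape_X_y_matrices,
    )

    @given(regressor=regressors(), shapes=shape_X_y_matrices(y_as_vector=True))
    def test_every_sampled_regressor_fits(regressor, shapes):
        shape_X, shape_y = shapes
        # Constant data is enough here; any sampled regressor must accept it.
        X, y = np.ones(shape_X), np.ones(shape_y)
        regressor.fit(X, y)
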
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: CI
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | pull_request:
10 | branches: [ master ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | matrix:
18 | python-version: ['3.8', '3.9', '3.10']
19 |
20 | steps:
21 | - uses: actions/checkout@v2
22 | - name: Set up Python ${{ matrix.python-version }}
23 | uses: actions/setup-python@v2
24 | with:
25 | python-version: ${{ matrix.python-version }}
26 | - name: Install dependencies
27 | run: |
28 | python -m pip install --upgrade pip
29 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
30 | if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi
31 | pip install -e .
32 | - name: Lint with flake8
33 | run: |
34 | # stop the build if there are Python syntax errors or undefined names
35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 | - name: Type checking with mypy
39 | run: |
40 | mypy --ignore-missing-imports . || {
41 |           status=$?; echo "Type checking errors! (exit code: $status)"
42 | }
43 | - name: Test with pytest
44 | continue-on-error: true
45 | run: |
46 | pytest --maxfail=10
47 | - name: Integration tests
48 | run: |
49 | set -e
50 | for n in examples/*.ipynb
51 | do
52 | jupyter nbconvert --to notebook --execute $n
53 | done
54 | - name: Build and install wheels
55 | run: |
56 | set -e
57 | python -m pip install wheel
58 | python setup.py bdist_wheel
59 | python -m pip install dist/*.whl
60 | - name: Upload artifacts
61 | uses: actions/upload-artifact@v2
62 | with:
63 | name: pip_wheel_${{ matrix.python-version }}
64 | path: dist
65 |
--------------------------------------------------------------------------------
/gtime/utils/fixtures.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import pytest
4 | from pytest import fixture
5 | import numpy as np
6 | from sklearn.compose import make_column_selector
7 | from sklearn.linear_model import LinearRegression, Ridge
8 |
9 | from gtime.feature_extraction import Shift, MovingAverage
10 | from gtime.forecasting import GAR
11 | from gtime.time_series_models import TimeSeriesForecastingModel
12 |
13 |
14 | @fixture(scope="function")
15 | def features1():
16 | return [
17 | ("shift_0", Shift(0), make_column_selector(dtype_include=np.number)),
18 | ("shift_1", Shift(1), make_column_selector(dtype_include=np.number)),
19 | (
20 | "moving_average_3",
21 | MovingAverage(window_size=3),
22 | make_column_selector(dtype_include=np.number),
23 | ),
24 | ]
25 |
26 |
27 | @fixture(scope="function")
28 | def features2():
29 | return [
30 | ("shift_0", Shift(0), make_column_selector(dtype_include=np.number)),
31 | ("shift_1", Shift(1), make_column_selector(dtype_include=np.number)),
32 | ]
33 |
34 |
35 | @fixture(scope="function")
36 | def model1():
37 | lr = LinearRegression()
38 | return GAR(lr)
39 |
40 |
41 | @fixture(scope="function")
42 | def model2():
43 | lr = Ridge(alpha=0.1)
44 | return GAR(lr)
45 |
46 |
47 | @fixture(scope="function")
48 | def time_series_forecasting_model1_no_cache(features1, model1):
49 | return TimeSeriesForecastingModel(
50 | features=features1, horizon=2, model=model1, cache_features=False,
51 | )
52 |
53 |
54 | @fixture(scope="function")
55 | def time_series_forecasting_model1_cache(features1, model1):
56 | return TimeSeriesForecastingModel(
57 | features=features1, horizon=2, model=model1, cache_features=True,
58 | )
59 |
60 |
61 | @pytest.fixture(scope="function")
62 | def estimator():
63 | return LinearRegression()
64 |
65 |
66 | def _single_element_lazy_fixtures(*args):
67 | return [pytest.lazy_fixture(arg.__name__) for arg in args[0]]
68 |
69 |
70 | def lazy_fixtures(*args):
71 | if isinstance(args[0], tuple):
72 | raise NotImplementedError
73 | # return [tuple([pytest.lazy_fixture(arg[0].__name__), *arg[1:]]) for arg in args]
74 | else:
75 | return _single_element_lazy_fixtures(*args)
76 |
--------------------------------------------------------------------------------
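
A sketch of how `lazy_fixtures` is meant to be consumed, assuming the `pytest-lazy-fixture` plugin (which provides `pytest.lazy_fixture`) is installed and that the fixtures above are discoverable by pytest, e.g. re-exported through `conftest.py`:

```python
import pytest

from gtime.utils.fixtures import (  # noqa: F401 -- fixtures must be in scope
    lazy_fixtures,
    time_series_forecasting_model1_cache,
    time_series_forecasting_model1_no_cache,
)


@pytest.mark.parametrize(
    "model",
    lazy_fixtures(
        [time_series_forecasting_model1_no_cache, time_series_forecasting_model1_cache]
    ),
)
def test_models_expose_sklearn_api(model):
    # Each lazy fixture resolves to the fixture's return value at test time.
    assert hasattr(model, "fit") and hasattr(model, "predict")
```
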
/gtime/external/make_holidays.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (c) Facebook, Inc. and its affiliates.
3 |
4 | # This source code is licensed under the MIT license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | from __future__ import absolute_import, division, print_function
8 |
9 | import warnings
10 |
11 | import numpy as np
12 | import pandas as pd
13 |
14 | import holidays as hdays_part1
15 |
16 | import gtime.external.hdays as hdays_part2
17 |
18 |
19 | def get_holiday_names(country):
20 |     """Return all possible holiday names of a given country
21 |     Parameters
22 |     ----------
23 |     country: country name
24 |     Returns
25 |     -------
26 |     A set of all possible holiday names of the given country
27 |     """
28 | years = np.arange(1995, 2045)
29 | try:
30 | with warnings.catch_warnings():
31 | warnings.simplefilter("ignore")
32 | holiday_names = getattr(hdays_part2, country)(years=years).values()
33 | except AttributeError:
34 | try:
35 | holiday_names = getattr(hdays_part1, country)(years=years).values()
36 | except AttributeError as e:
37 | raise AttributeError(
38 | "Holidays in {} are not currently supported!".format(country)
39 | ) from e
40 | return set(holiday_names)
41 |
42 |
43 | def make_holidays_df(year_list, country, province=None):
44 |     """Make a dataframe of holidays for given years and a country
45 |     Parameters
46 |     ----------
47 |     year_list: a list of years
48 |     country: country name
49 |     province: province name, optional
50 |     Returns
51 |     -------
52 |     Dataframe with 'ds' and 'holiday' columns, ready for the 'holidays' parameter in Prophet
53 |     """
54 | try:
55 | holidays = getattr(hdays_part2, country)(years=year_list)
56 | except AttributeError:
57 | try:
58 | holidays = getattr(hdays_part1, country)(prov=province, years=year_list)
59 | except AttributeError as e:
60 | raise AttributeError(
61 | "Holidays in {} are not currently supported!".format(country)
62 | ) from e
63 | holidays_df = pd.DataFrame(list(holidays.items()), columns=["ds", "holiday"])
64 | holidays_df.reset_index(inplace=True, drop=True)
65 | holidays_df["ds"] = pd.to_datetime(holidays_df["ds"])
66 | return holidays_df
67 |
--------------------------------------------------------------------------------
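
An illustrative usage sketch for the two helpers above; the exact rows returned depend on the installed `holidays` package:

```python
from gtime.external.make_holidays import get_holiday_names, make_holidays_df

# Every holiday name the backing packages know for the country.
swiss_names = get_holiday_names("Switzerland")

# One row per holiday occurrence: a datetime 'ds' column and a 'holiday' column.
swiss_holidays = make_holidays_df(year_list=[2020, 2021], country="Switzerland")
print(swiss_holidays.head())
```
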
/gtime/regressors/tests/test_linear_regressor.py:
--------------------------------------------------------------------------------
1 | from random import random
2 | from typing import Optional, Tuple
3 | import numpy as np
4 | import pandas as pd
5 | from hypothesis import given, settings
6 | from hypothesis.extra.numpy import arrays
7 | from hypothesis.strategies import floats
8 |
9 | from gtime.regressors import LinearRegressor
10 |
11 |
12 | class TestLinearRegressor:
13 | def test_linear_regressor(self):
14 | train, test = train_test_dataframe()
15 |
16 | predictions = compute_predictions_for_train_test(train, test)
17 | expected = compute_expectation_from_test(test)
18 |
19 | np.testing.assert_array_almost_equal(predictions, expected, decimal=2)
20 |
21 | @settings(deadline=None)
22 | @given(
23 | arrays(
24 | dtype=float,
25 | shape=(100, 1),
26 | elements=floats(allow_nan=False, allow_infinity=False, width=16),
27 | )
28 | )
29 | def test_linear_regressor_random_array(self, random_array):
30 | train, test = train_test_dataframe(random_array)
31 |
32 | predictions = compute_predictions_for_train_test(train, test)
33 | expected = compute_expectation_from_test(test)
34 |
35 | np.testing.assert_array_almost_equal(predictions, expected, decimal=0)
36 |
37 |
38 | def train_test_dataframe(
39 |     random_array: Optional[np.ndarray] = None,
40 | ) -> Tuple[pd.DataFrame, pd.DataFrame]:
41 | random_array = (
42 | random_array if random_array is not None else [random() for _ in range(100)]
43 | )
44 |
45 | a1, a2, b = random() * 10, random() * 100, 2 * (1 - random())
46 |
47 | df = pd.DataFrame()
48 | df["x1"] = list(range(100))
49 | df["x2"] = random_array
50 | df["y"] = [b + a1 * t for t in range(100)]
51 | df["y"] = df["y"] + a2 * df["x2"]
52 |
53 | train = df[:90]
54 | test = df[90:]
55 |
56 | return train, test
57 |
58 |
59 | def compute_predictions_for_train_test(
60 | train: pd.DataFrame, test: pd.DataFrame
61 | ) -> np.ndarray:
62 | lr = LinearRegressor()
63 |
64 | lr.fit(train[["x1", "x2"]], train["y"], x0=[0, 0, 0])
65 |
66 | preds_y = lr.predict(test[["x1", "x2"]])
67 | preds_y = preds_y / np.sum(preds_y)
68 |
69 | return preds_y
70 |
71 |
72 | def compute_expectation_from_test(test: pd.DataFrame) -> np.ndarray:
73 | test_y = test["y"].values
74 | test_y = test_y / np.sum(test_y)
75 | return test_y
76 |
--------------------------------------------------------------------------------
/gtime/causality/tests/test_linear_coefficient.py:
--------------------------------------------------------------------------------
1 | from random import randint
2 |
3 | import numpy as np
4 | import pytest
5 | from hypothesis import given, strategies as st
6 | from pandas.util import testing as testing
7 |
8 | from gtime.causality import ShiftedLinearCoefficient
9 | from gtime.causality.tests.common import make_df_from_expected_shifts
10 |
11 |
12 | def test_linear_coefficient():
13 | expected_shifts = [randint(2, 6) * 2 for _ in range(3)]
14 |
15 | df = make_df_from_expected_shifts(expected_shifts)
16 | slc = ShiftedLinearCoefficient(target_col="A", max_shift=12)
17 | slc.fit(df)
18 |
19 | shifts = slc.best_shifts_["A"][4:].values
20 | np.testing.assert_array_equal(shifts, expected_shifts)
21 |
22 |
23 | # TODO: tests refactor TBD
24 | @given(st.integers(1, 20))
25 | @pytest.mark.skip(reason="TODO: Write proper test, increase hypothesis max duration")
26 | def test_linear_coefficient_hyp(shift):
27 | testing.N, testing.K = 500, 1
28 | df = testing.makeTimeDataFrame(freq="D")
29 | df["shifted"] = df["A"].shift(shift)
30 |
31 | slc = ShiftedLinearCoefficient(target_col="A", max_shift=20)
32 | slc.fit(df).transform(df)
33 |
34 |
35 | def test_linear_bootstrap_p_values():
36 | # This test and the next one just test if the p_values on the diagonal are equal
37 |     # to 0. It is hard to assert much more, since the bootstrapping always
38 |     # gives different results. However, other properties could be tested.
39 | expected_shifts = [randint(2, 4) * 2 for _ in range(3)]
40 | df = make_df_from_expected_shifts(expected_shifts)
41 | shifted_test = ShiftedLinearCoefficient(
42 | target_col="A", max_shift=8, bootstrap_iterations=500,
43 | )
44 | shifted_test.fit(df)
45 |
46 | linear_p_values = shifted_test.bootstrap_p_values_
47 | for col_index in range(len(linear_p_values.columns)):
48 | assert linear_p_values.iloc[col_index, col_index] == 0
49 |
50 |
51 | def test_linear_permutation_p_values():
52 | expected_shifts = [randint(2, 4) * 2 for _ in range(3)]
53 | df = make_df_from_expected_shifts(expected_shifts)
54 | shifted_test = ShiftedLinearCoefficient(
55 | target_col="A", max_shift=8, permutation_iterations=50,
56 | )
57 | shifted_test.fit(df)
58 |
59 | linear_p_values = shifted_test.permutation_p_values_
60 | for col_index in range(len(linear_p_values.columns)):
61 | assert linear_p_values.iloc[col_index, col_index] == 0
62 |
--------------------------------------------------------------------------------
/gtime/feature_extraction/tests/test_trend.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 | from gtime.feature_extraction import Detrender
5 |
6 |
7 | def test_polynomial_detrend():
8 | time_index = pd.date_range(start="2020-01-01", end="2020-01-20")
9 | ts = pd.DataFrame(range(0, 20), index=time_index)
10 |
11 | detrend_feature = Detrender(trend="polynomial", trend_x0=np.zeros(3))
12 | feature_name = detrend_feature.__class__.__name__
13 | ts_t = detrend_feature.fit_transform(ts)
14 | expected_ts = pd.DataFrame(
15 | [
16 | 1.22681324e-05,
17 | 8.34525141e-06,
18 | 4.86108426e-06,
19 | 1.81563099e-06,
20 | -7.91108403e-07,
21 | -2.95913392e-06,
22 | -4.68844555e-06,
23 | -5.97904330e-06,
24 | -6.83092717e-06,
25 | -7.24409716e-06,
26 | -7.21855327e-06,
27 | -6.75429551e-06,
28 | -5.85132385e-06,
29 | -4.50963832e-06,
30 | -2.72923891e-06,
31 | -5.10125625e-07,
32 | 2.14770155e-06,
33 | 5.24424260e-06,
34 | 8.77949753e-06,
35 | 1.27534663e-05,
36 | ],
37 | columns=[f"0__{feature_name}"],
38 | index=time_index,
39 | )
40 | pd.testing.assert_frame_equal(ts_t, expected_ts, check_less_precise=3)
41 |
42 |
43 | def test_exponential_detrend():
44 | time_index = pd.date_range(start="2020-01-01", end="2020-01-20")
45 | ts = pd.DataFrame(range(0, 20), index=time_index)
46 |
47 | detrend_feature = Detrender(trend="exponential", trend_x0=0)
48 | feature_name = detrend_feature.__class__.__name__
49 | ts_t = detrend_feature.fit_transform(ts)
50 | expected_ts = pd.DataFrame(
51 | [
52 | -1.0,
53 | -0.18238542,
54 | 0.60196471,
55 | 1.34698345,
56 | 2.04549733,
57 | 2.68902453,
58 | 3.26753629,
59 | 3.76917473,
60 | 4.1799193,
61 | 4.48319226,
62 | 4.65939237,
63 | 4.68534338,
64 | 4.53364205,
65 | 4.17188719,
66 | 3.5617681,
67 | 2.65798675,
68 | 1.40698343,
69 | -0.25457009,
70 | -2.40155216,
71 | -5.1224979,
72 | ],
73 | columns=[f"0__{feature_name}"],
74 | index=time_index,
75 | )
76 | pd.testing.assert_frame_equal(ts_t, expected_ts)
77 |
--------------------------------------------------------------------------------
/gtime/feature_extraction/tests/test_sorted_density.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pandas.util.testing as testing
4 | import pytest
5 |
6 | from gtime.feature_extraction.custom import SortedDensity
7 |
8 |
9 | def get_input_data():
10 | input_data = pd.DataFrame.from_dict({"x_1": [0, 7, 2], "x_2": [2, 10, 4]})
11 | input_data.index = [
12 | pd.Timestamp(2000, 1, 1),
13 | pd.Timestamp(2000, 2, 1),
14 | pd.Timestamp(2000, 3, 1),
15 | ]
16 | return input_data
17 |
18 |
19 | def get_output_causal():
20 | custom_feature = SortedDensity(window_size=2, is_causal=True)
21 | feature_name = custom_feature.__class__.__name__
22 | output_causal = pd.DataFrame.from_dict(
23 | {
24 | f"x_1__{feature_name}": [np.nan, 0.5, 0.6111111111111112],
25 | f"x_2__{feature_name}": [np.nan, 0.5833333333333334, 0.6428571428571429],
26 | }
27 | )
28 | output_causal.index = [
29 | pd.Timestamp(2000, 1, 1),
30 | pd.Timestamp(2000, 2, 1),
31 | pd.Timestamp(2000, 3, 1),
32 | ]
33 | return output_causal
34 |
35 |
36 | def get_output_anticausal():
37 | custom_feature = SortedDensity(window_size=2, is_causal=False)
38 | feature_name = custom_feature.__class__.__name__
39 | output_anticausal = pd.DataFrame.from_dict(
40 | {
41 | f"x_1__{feature_name}": [0.5, 0.6111111111111112],
42 | f"x_2__{feature_name}": [0.5833333333333334, 0.6428571428571429],
43 | }
44 | )
45 | output_anticausal.index = [
46 | pd.Timestamp(2000, 2, 1),
47 | pd.Timestamp(2000, 3, 1),
48 | ]
49 | return output_anticausal
50 |
51 |
52 | input_data = get_input_data()
53 | output_causal = get_output_causal()
54 | output_anticausal = get_output_anticausal()
55 |
56 |
57 | class TestSortedDensity:
58 | @pytest.mark.parametrize("test_input, expected", [(input_data, output_causal)])
59 |     def test_sorted_density_causal(self, test_input, expected):
60 | feature = SortedDensity(window_size=2, is_causal=True)
61 | output = feature.fit_transform(test_input)
62 | testing.assert_frame_equal(output, expected)
63 |
64 | @pytest.mark.parametrize("test_input, expected", [(input_data, output_anticausal)])
65 |     def test_sorted_density_anticausal(self, test_input, expected):
66 | feature = SortedDensity(window_size=2, is_causal=False)
67 | output = feature.fit_transform(test_input)
68 | testing.assert_frame_equal(output, expected)
69 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | """Toolbox for Time Series Analysis."""
3 |
4 | import os
5 | import codecs
6 |
7 | from setuptools import setup, find_packages
8 |
9 | from gtime import __version__
10 |
11 | version_file = os.path.join("gtime", "_version.py")
12 | with open(version_file) as f:
13 | exec(f.read())
14 |
15 | with open("requirements.txt") as f:
16 | requirements = f.read().splitlines()
17 | with open("doc-requirements.txt") as f:
18 | doc_requirements = f.read().splitlines()
19 | with open("dev-requirements.txt") as f:
20 | dev_requirements = f.read().splitlines()
21 |
22 | DISTNAME = "giotto-time"
23 | DESCRIPTION = "Toolbox for Time Series analysis and integration with Machine Learning."
24 | with codecs.open("README.md", encoding="utf-8-sig") as f:
25 | LONG_DESCRIPTION = f.read()
26 | LONG_DESCRIPTION_TYPE = "text/markdown"
27 | MAINTAINER = "Alessio Baccelli"
28 | MAINTAINER_EMAIL = "maintainers@giotto.ai"
29 | URL = "https://github.com/giotto-ai/giotto-time"
30 | LICENSE = "AGPLv3"
31 | DOWNLOAD_URL = "https://github.com/giotto-ai/giotto-time/tarball/v0.0a0"
32 | VERSION = __version__
33 | CLASSIFIERS = [
34 | "Intended Audience :: Information Technology",
35 | "Intended Audience :: Developers",
36 | "License :: OSI Approved",
37 | "Programming Language :: Python",
38 | "Topic :: Software Development",
39 | "Topic :: Scientific/Engineering",
40 | "Operating System :: Microsoft :: Windows",
41 | "Operating System :: POSIX",
42 | "Operating System :: Unix",
43 | "Operating System :: MacOS",
44 | "Programming Language :: Python :: 3.7",
45 | "Programming Language :: Python :: 3.8",
46 | "Programming Language :: Python :: 3.9",
47 | ]
48 | KEYWORDS = (
49 |     "machine learning time series data analysis topology, persistence diagrams"
50 | )
51 | INSTALL_REQUIRES = requirements
52 | EXTRAS_REQUIRE = {
53 | "tests": dev_requirements,
54 | "doc": doc_requirements,
55 | "examples": [],
56 | }
57 |
58 |
59 | setup(
60 | name=DISTNAME,
61 | maintainer=MAINTAINER,
62 | maintainer_email=MAINTAINER_EMAIL,
63 | description=DESCRIPTION,
64 | license=LICENSE,
65 | url=URL,
66 | version=VERSION,
67 | download_url=DOWNLOAD_URL,
68 | long_description=LONG_DESCRIPTION,
69 | long_description_content_type=LONG_DESCRIPTION_TYPE,
70 | zip_safe=False,
71 | classifiers=CLASSIFIERS,
72 | packages=find_packages(),
73 | keywords=KEYWORDS,
74 | install_requires=INSTALL_REQUIRES,
75 | extras_require=EXTRAS_REQUIRE,
76 | )
77 |
--------------------------------------------------------------------------------
/gtime/feature_extraction/tests/test_crest_factor_detrending.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pandas.util.testing as testing
4 | import pytest
5 |
6 | from gtime.feature_extraction.custom import CrestFactorDetrending
7 |
8 |
9 | def get_input_data():
10 | input_data = pd.DataFrame.from_dict({"x_1": [0, 7, 2], "x_2": [2, 10, 4]})
11 | input_data.index = [
12 | pd.Timestamp(2000, 1, 1),
13 | pd.Timestamp(2000, 2, 1),
14 | pd.Timestamp(2000, 3, 1),
15 | ]
16 | return input_data
17 |
18 |
19 | def get_output_causal():
20 | custom_feature = CrestFactorDetrending(window_size=2, is_causal=True)
21 | feature_name = custom_feature.__class__.__name__
22 | output_causal = pd.DataFrame.from_dict(
23 | {
24 | f"x_1__{feature_name}": [np.nan, 1.0, 0.07547169811320754],
25 | f"x_2__{feature_name}": [np.nan, 0.9615384615384616, 0.13793103448275862],
26 | }
27 | )
28 | output_causal.index = [
29 | pd.Timestamp(2000, 1, 1),
30 | pd.Timestamp(2000, 2, 1),
31 | pd.Timestamp(2000, 3, 1),
32 | ]
33 | return output_causal
34 |
35 |
36 | def get_output_anticausal():
37 | custom_feature = CrestFactorDetrending(window_size=2, is_causal=False)
38 | feature_name = custom_feature.__class__.__name__
39 | output_anticausal = pd.DataFrame.from_dict(
40 | {
41 | f"x_1__{feature_name}": [1.0, 0.07547169811320754],
42 | f"x_2__{feature_name}": [0.9615384615384616, 0.13793103448275862],
43 | }
44 | )
45 | output_anticausal.index = [
46 | pd.Timestamp(2000, 2, 1),
47 | pd.Timestamp(2000, 3, 1),
48 | ]
49 | return output_anticausal
50 |
51 |
52 | input_data = get_input_data()
53 | output_causal = get_output_causal()
54 | output_anticausal = get_output_anticausal()
55 |
56 |
57 | class TestCrestFactorDetrending:
58 | @pytest.mark.parametrize("test_input, expected", [(input_data, output_causal)])
59 | def test_crest_factor_detrending_causal(self, test_input, expected):
60 | feature = CrestFactorDetrending(window_size=2, is_causal=True)
61 | output = feature.fit_transform(test_input)
62 | testing.assert_frame_equal(output, expected)
63 |
64 | @pytest.mark.parametrize("test_input, expected", [(input_data, output_anticausal)])
65 | def test_crest_factor_detrending_anticausal(self, test_input, expected):
66 | feature = CrestFactorDetrending(window_size=2, is_causal=False)
67 | output = feature.fit_transform(test_input)
68 | testing.assert_frame_equal(output, expected)
69 |
--------------------------------------------------------------------------------
/gtime/time_series_models/tests/test_simple_models.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import pytest
4 | from pandas.util import testing as testing
5 | from hypothesis import given, note
6 | import hypothesis.strategies as st
7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series
8 |
9 |
10 | from gtime.time_series_models import (
11 | Naive,
12 | SeasonalNaive,
13 | Average,
14 | Drift,
15 | )
16 |
17 |
18 | @st.composite
19 | def forecast_input(draw, max_length):
20 |     length = draw(st.integers(min_value=4, max_value=max_length))
21 |     horizon = draw(st.integers(min_value=1, max_value=length - 1))
22 |     window = draw(st.integers(min_value=1, max_value=length - horizon))
23 |     df = draw(
24 |         giotto_time_series(
25 |             min_length=horizon + window,
26 |             max_length=max_length,
27 | allow_nan=False,
28 | allow_infinity=False,
29 | )
30 | )
31 | return df, horizon, window
32 |
33 |
34 | class TestNaiveForecast:
35 | @given(x=forecast_input(50))
36 | def test_fit_predict(self, x):
37 | df, horizon, _ = x
38 | model = Naive(horizon=horizon)
39 | model.fit(df)
40 | y_pred = model.predict()
41 | assert y_pred.shape == (horizon, horizon)
42 | res = np.broadcast_to(df.iloc[-horizon:], (horizon, horizon))
43 | y_cols = ["y_" + str(x + 1) for x in range(horizon)]
44 | expected_df = pd.DataFrame(res, index=model.X_test_.index, columns=y_cols)
45 | testing.assert_frame_equal(y_pred, expected_df)
46 |
47 |
48 | class TestSeasonalNaiveForecast:
49 | @given(x=forecast_input(50))
50 | def test_fit_predict(self, x):
51 | df, horizon, seasonal_length = x
52 | model = SeasonalNaive(horizon=horizon, seasonal_length=seasonal_length)
53 | model.fit(df)
54 | y_pred = model.predict()
55 | note(y_pred)
56 | assert y_pred.shape[1] == horizon
57 | if seasonal_length < horizon:
58 | assert all(y_pred.iloc[:, 0] == y_pred.iloc[:, seasonal_length])
59 |
60 |
61 | class TestAverageForecast:
62 | @given(x=forecast_input(50))
63 | def test_fit_predict(self, x):
64 | df, horizon, _ = x
65 | model = Average(horizon=horizon)
66 | model.fit(df)
67 | y_pred = model.predict()
68 | note(y_pred)
69 | assert y_pred.shape == (horizon, horizon)
70 | assert pytest.approx(y_pred.diff(axis=1).sum().sum()) == 0
71 | means = [df.mean()] + [df.iloc[:-i].mean() for i in range(1, horizon)]
72 |
73 |
74 | class TestDriftForecast:
75 | @given(x=forecast_input(50))
76 | def test_fit_predict(self, x):
77 | df, horizon, _ = x
78 | model = Drift(horizon=horizon)
79 | model.fit(df)
80 | y_pred = model.predict()
81 | note(y_pred)
82 | assert len(y_pred) == horizon
83 | # assert pytest.approx(y_pred.diff().diff().sum().sum()) == 0
84 |
--------------------------------------------------------------------------------
/gtime/compose/feature_creation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.compose import ColumnTransformer
3 |
4 |
5 | class FeatureCreation(ColumnTransformer):
6 | """Applies transformers to columns of a pandas DataFrame.
7 |
8 | This estimator is a wrapper of sklearn.compose.ColumnTransformer, the only
9 | difference is the output type of fit_transform and transform methods which is a
10 | DataFrame instead of an array.
11 |
12 | """
13 |
14 | def fit_transform(self, X: pd.DataFrame, y: pd.DataFrame = None):
15 | """Fit all transformers, transform the data and concatenate results.
16 |
17 | Parameters
18 | ----------
19 | X : pd.DataFrame, shape (n_samples, n_features), required
20 | Input data, of which specified subsets are used to fit the
21 | transformers.
22 |
23 | y : pd.DataFrame, shape (n_samples, ...), optional, default: ``None``
24 | Targets for supervised learning.
25 |
26 | Examples
27 | --------
28 | >>> import pandas.util.testing as testing
29 | >>> from gtime.compose import FeatureCreation
30 | >>> from gtime.feature_extraction import Shift, MovingAverage
31 | >>> data = testing.makeTimeDataFrame(freq="s")
32 | >>> fc = FeatureCreation([
33 | ... ('s1', Shift(1), ['A']),
34 | ... ('ma3', MovingAverage(window_size=3), ['B']),
35 | ... ])
36 | >>> fc.fit_transform(data).head()
37 | s1__A__Shift ma3__B__MovingAverage
38 | 2000-01-01 00:00:00 NaN NaN
39 | 2000-01-01 00:00:01 0.211403 NaN
40 | 2000-01-01 00:00:02 -0.313854 0.085045
41 | 2000-01-01 00:00:03 0.502018 -0.239269
42 | 2000-01-01 00:00:04 -0.225324 -0.144625
43 |
44 | Returns
45 | -------
46 | X_t_df : pd.DataFrame, shape (n_samples, sum_n_components)
47 | hstack of results of transformers. sum_n_components is the
48 | sum of n_components (output dimension) over transformers.
49 |
50 | """
51 | X_t = super().fit_transform(X, y)
52 | X_t_df = pd.DataFrame(data=X_t, columns=self.get_feature_names(), index=X.index)
53 | return X_t_df
54 |
55 | def transform(self, X: pd.DataFrame):
56 | """Transform X separately by each transformer, concatenate results.
57 |
58 | Parameters
59 | ----------
60 | X : pd.DataFrame, shape (n_samples, n_features), required
61 | The data to be transformed by subset.
62 |
63 | Returns
64 | -------
65 | X_t_df : DataFrame, shape (n_samples, sum_n_components)
66 | hstack of results of transformers. sum_n_components is the
67 | sum of n_components (output dimension) over transformers. If
68 | any result is a sparse matrix, everything will be converted to
69 | sparse matrices.
70 |
71 | """
72 | X_t = super().transform(X)
73 | X_t_df = pd.DataFrame(data=X_t, columns=self.get_feature_names(), index=X.index)
74 | return X_t_df
75 |
--------------------------------------------------------------------------------
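
Since `FeatureCreation` keeps the `ColumnTransformer` contract, the usual fit-then-transform split also works on unseen data; a minimal sketch reusing the toy data helper from the docstring above:

```python
import pandas.util.testing as testing

from gtime.compose import FeatureCreation
from gtime.feature_extraction import Shift

fc = FeatureCreation([("s1", Shift(1), ["A"])])

train = testing.makeTimeDataFrame(freq="s")
new_data = testing.makeTimeDataFrame(freq="s")

fc.fit(train)                   # fit the wrapped transformers on the training frame
X_new = fc.transform(new_data)  # DataFrame output with the original index
assert list(X_new.columns) == ["s1__A__Shift"]
```
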
/gtime/model_selection/tests/test_splitters.py:
--------------------------------------------------------------------------------
1 | import hypothesis.strategies as st
2 | import numpy as np
3 | import pytest
4 | from hypothesis import given, settings, HealthCheck
5 | from sklearn.compose import make_column_selector
6 |
7 | from gtime.compose import FeatureCreation
8 | from gtime.feature_extraction import Shift, MovingAverage
9 | from gtime.model_selection import horizon_shift
10 | from gtime.model_selection.splitters import FeatureSplitter
11 | from gtime.utils.hypothesis.feature_matrices import X_y_matrices
12 |
13 | # TODO: refactor, make hypothesis generator instead of a full pipeline
14 | from gtime.utils.hypothesis.time_indexes import giotto_time_series
15 |
16 | df_transformer = FeatureCreation(
17 | [
18 | ("shift_0", Shift(0), make_column_selector(dtype_include=np.number)),
19 | ("shift_1", Shift(1), make_column_selector(dtype_include=np.number)),
20 | (
21 | "moving_average_3",
22 | MovingAverage(window_size=3),
23 | make_column_selector(dtype_include=np.number),
24 | ),
25 | ]
26 | )
27 |
28 | horizon = 4
29 |
30 |
31 | class TestFeatureSplitter:
32 | def test_constructor(self):
33 | FeatureSplitter()
34 |
35 | @given(st.text().filter(lambda x: x != "any"))
36 | def test_constructor_wrong_parameter(self, drop_na_mode: str):
37 | with pytest.raises(ValueError):
38 | FeatureSplitter(drop_na_mode)
39 |
40 | @settings(suppress_health_check=(HealthCheck.too_slow,))
41 | @given(
42 | X_y_matrices(
43 | horizon=horizon, df_transformer=df_transformer, allow_nan_infinity=False,
44 | )
45 | )
46 | def test_transform(self, X_y):
47 | X, y = X_y
48 | feature_splitter = FeatureSplitter()
49 | X_train, y_train, X_test, y_test = feature_splitter.transform(X, y)
50 |
51 | assert X_train.shape[0] == max(0, X.shape[0] - 2 - horizon)
52 | assert y_train.shape[0] == X_train.shape[0]
53 | assert X_test.shape[0] == min(max(0, X.shape[0] - 2), horizon)
54 | assert y_test.shape[0] == X_test.shape[0]
55 |
56 |
57 | class TestHorizonShift:
58 | @given(
59 | giotto_time_series(min_length=10, allow_infinity=False, allow_nan=False),
60 | st.integers(1, 8),
61 | )
62 | def test_horizon_int(self, time_series, horizon):
63 | y_shifted = horizon_shift(time_series, horizon)
64 | assert y_shifted.shape[1] == horizon
65 |
66 | # Check first line of y_shifted
67 | for i in range(1, horizon + 1):
68 | assert time_series.iloc[i, 0] == y_shifted.iloc[0, i - 1]
69 |
70 | @given(
71 | giotto_time_series(min_length=10, allow_infinity=False, allow_nan=False),
72 | st.sets(elements=st.integers(1, 8), min_size=1, max_size=8),
73 | )
74 | def test_horizon_list(self, time_series, horizon):
75 | horizon = list(sorted(horizon))
76 | y_shifted = horizon_shift(time_series, horizon)
77 | assert y_shifted.shape[1] == len(horizon)
78 |
79 | # Check first line of y_shifted
80 | for i, elem in enumerate(horizon):
81 | assert time_series.iloc[elem, 0] == y_shifted.iloc[0, i]
82 |
--------------------------------------------------------------------------------
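
For reference, the behavior pinned down by `TestHorizonShift`, as a small sketch with illustrative values:

```python
import numpy as np
import pandas as pd

from gtime.model_selection import horizon_shift

ts = pd.DataFrame(
    np.arange(10.0),
    index=pd.period_range("2020-01-01", periods=10),
    columns=["ts"],
)

# An int horizon yields one column per step ahead: y_1 is ts shifted by -1, etc.
y_all = horizon_shift(ts, horizon=3)   # columns y_1, y_2, y_3

# A list of steps keeps only the requested horizons.
y_some = horizon_shift(ts, horizon=[2, 5])
assert y_some.shape[1] == 2
```
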
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # http://www.sphinx-doc.org/en/master/config
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | import os
14 | import sys
15 | import sphinx_rtd_theme # noqa
16 |
17 | sys.path.insert(0, os.path.abspath(os.path.join("..", "..")))
18 | # sys.path.insert(0, os.path.abspath("../"))
19 |
20 | # -- Project information -----------------------------------------------------
21 |
22 | project = "giotto-time"
23 | copyright = "2022, L2F"
24 |
25 | # The full version, including alpha/beta/rc tags
26 | from gtime import __version__
27 |
28 | release = __version__
29 |
30 | # -- General configuration ---------------------------------------------------
31 |
32 | # Add any Sphinx extension module names here, as strings. They can be
33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
34 | # ones.
35 | extensions = [
36 | "sphinx.ext.autodoc",
37 | "sphinx_rtd_theme",
38 | ]
39 |
40 | # this is needed for some reason...
41 | # see https://github.com/numpy/numpydoc/issues/69
42 | # numpydoc_class_members_toctree = False
43 |
44 | # Add any paths that contain templates here, relative to this directory.
45 | templates_path = ["_templates"]
46 |
47 | # generate autosummary even if no references
48 | #autosummary_generate = True
49 |
50 | # The suffix of source filenames.
51 | # source_suffix = ".rst"
52 |
53 | # The encoding of source files.
54 | # source_encoding = 'utf-8'
55 |
56 | # The master toctree document.
57 | # master_doc = "index"
58 |
59 | # List of patterns, relative to source directory, that match files and
60 | # directories to ignore when looking for source files.
61 | # This pattern also affects html_static_path and html_extra_path.
62 | exclude_patterns = []
63 |
64 | # If true, '()' will be appended to :func: etc. cross-reference text.
65 | # add_function_parentheses = False
66 |
67 | # If true, the current module name will be prepended to all description
68 | # unit titles (such as .. function::).
69 | # add_module_names = True
70 |
71 | # If true, sectionauthor and moduleauthor directives will be shown in the
72 | # output. They are ignored by default.
73 | # show_authors = False
74 |
75 | # The name of the Pygments (syntax highlighting) style to use.
76 | # pygments_style = "sphinx"
77 |
78 | # A list of ignored prefixes for module index sorting.
79 | # modindex_common_prefix = []
80 | # -- Options for HTML output -------------------------------------------------
81 |
82 | # The theme to use for HTML and HTML Help pages. See the documentation for
83 | # a list of builtin themes.
84 | #
85 | html_theme = "sphinx_rtd_theme"
86 |
87 | # Add any paths that contain custom static files (such as style sheets) here,
88 | # relative to this directory. They are copied after the builtin static files,
89 | # so a file named "default.css" will overwrite the builtin "default.css".
90 | html_static_path = [] # ['_static']
91 |
--------------------------------------------------------------------------------
/gtime/forecasting/tests/test_naive.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | from pandas.util import testing as testing
5 | from hypothesis import given, note
6 | import hypothesis.strategies as st
7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series
8 | from gtime.model_selection import horizon_shift, FeatureSplitter
9 |
10 | from gtime.forecasting import (
11 | NaiveForecaster,
12 | SeasonalNaiveForecaster,
13 | DriftForecaster,
14 | AverageForecaster,
15 | )
16 |
17 |
18 | @st.composite
19 | def forecast_input(draw, max_length):
20 |     length = draw(st.integers(min_value=2, max_value=max_length))
21 |     horizon = draw(st.integers(min_value=1, max_value=length - 1))
22 |     X = draw(
23 |         giotto_time_series(
24 |             min_length=length,
25 |             max_length=max_length,
26 | allow_nan=False,
27 | allow_infinity=False,
28 | )
29 | )
30 | y = horizon_shift(X, horizon=horizon)
31 | X_train, y_train, X_test, y_test = FeatureSplitter().transform(X, y)
32 | return X_train, y_train, X_test
33 |
34 |
35 | class SimplePipelineTest:
36 | def setup(self, data, Model):
37 | X_train, y_train, X_test = data
38 | self.model = Model
39 | self.model.fit(X_train, y_train)
40 | self.X_test = X_test
41 | self.y_pred = self.model.predict(X_test)
42 |
43 | def test_fit_horizon(self):
44 | assert self.model.horizon_ == len(self.X_test)
45 |
46 | def test_predict_shape(self):
47 | assert self.y_pred.shape == (self.model.horizon_, self.model.horizon_)
48 |
49 |
50 | class TestNaiveModel(SimplePipelineTest):
51 | @given(data=forecast_input(50))
52 | def setup(self, data):
53 | super().setup(data, NaiveForecaster())
54 |
55 | def test_predict_df(self):
56 | horizon = len(self.X_test)
57 | y_cols = ["y_" + str(x + 1) for x in range(len(self.X_test))]
58 | res = np.broadcast_to(self.X_test, (horizon, horizon))
59 | expected_df = pd.DataFrame(res, index=self.X_test.index, columns=y_cols)
60 | testing.assert_frame_equal(self.y_pred, expected_df)
61 |
62 |
63 | class TestSeasonalNaiveModel(SimplePipelineTest):
64 | @given(data=forecast_input(50), season_length=st.data())
65 | def setup(self, data, season_length):
66 | season_length = season_length.draw(
67 | st.integers(min_value=1, max_value=len(data[0]))
68 | )
69 | self.season_length = season_length
70 | super().setup(data, SeasonalNaiveForecaster(seasonal_length=season_length))
71 |
72 | def test_predict_seasonality(self):
73 | if self.season_length < self.model.horizon_:
74 | assert all(
75 | self.y_pred.iloc[:, 0] == self.y_pred.iloc[:, self.season_length]
76 | )
77 |
78 |
79 | class TestDriftModel(SimplePipelineTest):
80 | @given(data=forecast_input(50))
81 | def setup(self, data):
82 | super().setup(data, DriftForecaster())
83 |
84 | def test_predict_drift(self):
85 | pytest.approx(self.y_pred.diff().diff().sum().sum())
86 | # assert pytest.approx(self.y_pred.diff().diff().sum().sum()) == 0
87 |
88 |
89 | class TestAverageModel(SimplePipelineTest):
90 | @given(data=forecast_input(50))
91 | def setup(self, data):
92 | super().setup(data, AverageForecaster())
93 |
94 | def test_predict_difference(self):
95 | assert pytest.approx(self.y_pred.diff(axis=1).sum().sum()) == 0
96 |
--------------------------------------------------------------------------------
/examples/hierarchical_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Hierarchical model\n",
8 |     "This example shows how the hierarchical model can be used"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import sys\n",
18 | "sys.path.append('../')"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import pandas as pd\n",
28 | "import numpy as np\n",
29 | "import matplotlib.pyplot as plt\n",
30 | "import networkx as nx\n",
31 | "%matplotlib inline \n",
32 | "\n",
33 | "from gtime.hierarchical import HierarchicalMiddleOut\n",
34 | "from gtime.hierarchical import HierarchicalTopDown\n",
35 | "from gtime.hierarchical import HierarchicalBottomUp\n",
36 | "import pandas._testing as testing\n",
37 | "from gtime.time_series_models import AR"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "testing.N, testing.K = 20, 1\n",
47 | "\n",
48 | "data1 = testing.makeTimeDataFrame(freq=\"s\")\n",
49 | "data2 = testing.makeTimeDataFrame(freq=\"s\")\n",
50 | "data3 = testing.makeTimeDataFrame(freq=\"s\")\n",
51 | "data4 = testing.makeTimeDataFrame(freq=\"s\")\n",
52 | "data5 = testing.makeTimeDataFrame(freq=\"s\")\n",
53 | "data6 = testing.makeTimeDataFrame(freq=\"s\")\n",
54 | "data = {'data1': data1, 'data2': data2, 'data3' : data3, 'data4' : data4, 'data5' : data5, 'data6' : data6}"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "tree_adj = {'data1' : ['data2','data3'], 'data2': ['data4', 'data5'], 'data3':['data6'], 'data4':[], 'data5':[], 'data6':[]} "
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "stat_model = AR(p=2, horizon=3)\n",
73 | "middle_out_model = HierarchicalMiddleOut(model=stat_model, hierarchy_tree=tree_adj, method='tdsga', level=0)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "fitting_middle_out = middle_out_model.fit(data)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "scrolled": true
90 | },
91 | "outputs": [],
92 | "source": [
93 | "fitting_middle_out.predict(data)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": []
102 | }
103 | ],
104 | "metadata": {
105 | "kernelspec": {
106 | "display_name": "Python 3 (ipykernel)",
107 | "language": "python",
108 | "name": "python3"
109 | },
110 | "language_info": {
111 | "codemirror_mode": {
112 | "name": "ipython",
113 | "version": 3
114 | },
115 | "file_extension": ".py",
116 | "mimetype": "text/x-python",
117 | "name": "python",
118 | "nbconvert_exporter": "python",
119 | "pygments_lexer": "ipython3",
120 | "version": "3.9.13"
121 | }
122 | },
123 | "nbformat": 4,
124 | "nbformat_minor": 4
125 | }
126 |
--------------------------------------------------------------------------------
/gtime/regressors/linear_regressor.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from scipy.optimize import minimize
4 | from sklearn.metrics import mean_squared_error
5 | from sklearn.utils.validation import check_is_fitted
6 |
7 |
8 | class LinearRegressor:
9 | """Implementation of a LinearRegressor that takes a custom loss function.
10 |
11 | Parameters
12 | ----------
13 | loss : Callable, optional, default: ``mean_squared_error``
14 | The loss function to use when fitting the model. The loss function must accept
15 | y_true, y_pred and return a single real number.
16 |
17 | Examples
18 | --------
19 | >>> from gtime.regressors.linear_regressor import LinearRegressor
20 | >>> from gtime.metrics import max_error
21 | >>> import numpy as np
22 | >>> import pandas as pd
23 | >>> X = np.random.random((100, 10))
24 | >>> y = np.random.random(100)
25 | >>> lr = LinearRegressor(loss=max_error)
26 | >>> X_train, y_train = X[:90], y[:90]
27 | >>> X_test, y_test = X[90:], y[90:]
28 | >>> x0 = [0]*11
29 | >>> lr.fit(X_train, y_train, x0=x0)
30 | >>> lr.predict(X_test)
31 | array([0.62987155, 0.46971378, 0.50421395, 0.5543149 , 0.50848151,
32 | 0.54768797, 0.50968854, 0.50500384, 0.58069366, 0.54912972])
33 |
34 | """
35 |
36 | def __init__(self, loss=mean_squared_error):
37 | self.loss = loss
38 |
39 | def fit(self, X: pd.DataFrame, y: pd.DataFrame, **kwargs) -> "LinearRegressor":
40 |     """Fit the linear model on ``X`` and ``y`` with the given loss function. The
41 |     minimization is done via ``scipy.optimize.minimize``; additional keyword
42 |     arguments are forwarded to it. For the available options, please refer to
43 |     the scipy `documentation
44 |     <https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html>`_.
45 |
46 | Parameters
47 | ----------
48 | X : pd.DataFrame, shape (n_samples, n_features), required
49 | The X matrix used as features in the fitting procedure.
50 |
51 | y : pd.DataFrame, shape (n_samples, 1), required
52 | The y matrix to use as target values in the fitting procedure.
53 |
54 | kwargs: dict, optional.
55 | Optional arguments to pass to the ``minimize`` function of scipy.
56 |
57 | Returns
58 | -------
59 | self: LinearRegressor
60 | The fitted model.
61 |
62 | """
63 |
64 | if isinstance(X, pd.DataFrame):
65 | X = X.values
66 |
67 | if isinstance(y, pd.DataFrame):
68 | y = y.values
69 |
70 | def prediction_error(model_weights):
71 | predictions = [
72 | model_weights[0] + np.dot(model_weights[1:], row) for row in X
73 | ]
74 | return self.loss(y, predictions)
75 |
76 | res = minimize(prediction_error, **kwargs)
77 |
78 | self.model_weights_ = res["x"]
79 |
80 | return self
81 |
82 | def predict(self, X: pd.DataFrame) -> pd.DataFrame:
83 | """Predict the y values associated to the features ``X``.
84 |
85 | Parameters
86 | ----------
87 | X : pd.DataFrame, shape (n_samples, n_features), required
88 | The features used to predict.
89 |
90 | Returns
91 | -------
92 |         predictions : np.ndarray, shape (n_samples,)
93 |             The predictions of the model.
94 |
95 | """
96 | check_is_fitted(self)
97 |
98 | predictions = self.model_weights_[0] + np.dot(X, self.model_weights_[1:])
99 | return predictions
100 |
--------------------------------------------------------------------------------
/gtime/plotting/tests/test_plotting.py:
--------------------------------------------------------------------------------
1 | from hypothesis import given, settings
2 | import hypothesis.strategies as st
3 | import pytest
4 | import pandas as pd
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series
8 |
9 | from gtime.plotting import lag_plot, acf_plot, seasonal_subplots, seasonal_plot
10 | from gtime.plotting.preprocessing import seasonal_split
11 |
12 |
13 | @pytest.fixture()
14 | def time_series():
15 | idx = pd.period_range(start="2000-01-01", end="2003-01-01")
16 | df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=["ts"])
17 | return df
18 |
19 |
20 | class TestLagplots:
21 | @pytest.mark.parametrize("lags", [1, 5, [1], [1, 3, 5, 100]])
22 | def test_subplots_number(self, time_series, lags):
23 | ax = lag_plot(time_series, lags=lags)
24 | num_plots = sum(map(lambda x: x.has_data(), ax.flatten()))
25 | if isinstance(lags, int):
26 | expected_num_plots = lags
27 | else:
28 | expected_num_plots = len(lags)
29 | assert num_plots == expected_num_plots
30 | plt.close("all")
31 |
32 | @pytest.mark.parametrize("lags", [1, 5, [1], [1, 3, 5, 100]])
33 | @pytest.mark.parametrize("plots_per_row", [1, 3, 10])
34 | def test_rows_and_cols(self, time_series, lags, plots_per_row):
35 | ax = lag_plot(time_series, lags=lags, plots_per_row=plots_per_row)
36 | if isinstance(lags, int):
37 | lag_length = lags
38 | else:
39 | lag_length = len(lags)
40 | assert ax.shape == (
41 | (lag_length - 1) // plots_per_row + 1,
42 | min(lag_length, plots_per_row),
43 | )
44 | plt.close("all")
45 |
46 |
47 | class TestACFplots:
48 | @pytest.mark.parametrize("maxlags", [1, 5, 100])
49 | @pytest.mark.parametrize("ci", [0.0, 0.05])
50 | @pytest.mark.parametrize("partial", [True, False])
51 | def test_ci_lines(self, time_series, maxlags, ci, partial):
52 | ax = acf_plot(time_series, max_lags=maxlags, ci=ci, partial=partial)
53 | assert len(ax.lines) == 3
54 | plt.close("all")
55 |
56 | @pytest.mark.parametrize("maxlags", [1, 5, 100])
57 | @pytest.mark.parametrize("ci", [0.0, 0.05])
58 | @pytest.mark.parametrize("partial", [True, False])
59 | def test_num_bars(self, time_series, maxlags, ci, partial):
60 | ax = acf_plot(time_series, maxlags, ci, partial)
61 | assert len(ax.containers[0]) == min(len(time_series), maxlags)
62 | plt.close("all")
63 |
64 |
65 | class TestSubplots:
66 | @pytest.mark.parametrize("cycle", ["year", "6M"])
67 | @pytest.mark.parametrize("freq", ["M"])
68 | @pytest.mark.parametrize("box", [True, False])
69 | def test_subplots_number(self, time_series, cycle, freq, box):
70 | ax = seasonal_subplots(time_series, cycle=cycle, freq=freq, box=box)
71 | split = seasonal_split(time_series, cycle=cycle, freq=freq)
72 | assert ax.size == split.shape[0]
73 | plt.close("all")
74 |
75 |
76 | class TestSeasonalPlots:
77 | @pytest.mark.parametrize("cycle", ["year", "6M"])
78 | @pytest.mark.parametrize("freq", ["M", None])
79 | @pytest.mark.parametrize("polar", [True, False])
80 | @pytest.mark.parametrize("new_ax", [True, False])
81 | def test_seasonal_num_lines(self, time_series, cycle, freq, polar, new_ax):
82 | if new_ax:
83 | if polar:
84 | ax = plt.subplot(111, projection="polar")
85 | else:
86 | ax = plt.subplot(111)
87 | else:
88 | ax = None
89 | ax = seasonal_plot(time_series, cycle=cycle, freq=freq, polar=polar, ax=ax)
90 | split = seasonal_split(time_series, cycle=cycle, freq=freq)
91 | assert len(ax.lines) == split.shape[1]
92 | plt.close("all")
93 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.rst:
--------------------------------------------------------------------------------
1 | CONTRIBUTOR CODE OF CONDUCT
2 | ===========================
3 | (Code of Conduct)
4 | -----------------
5 |
6 |
7 | Our Pledge
8 | ----------
9 |
10 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation.
11 |
12 | Our Standards
13 | -------------
14 |
15 | Examples of behavior that contributes to creating a positive environment include:
16 |
17 | * Using welcoming and inclusive language;
18 | * Being respectful of differing viewpoints and experiences;
19 | * Gracefully accepting constructive criticism;
20 | * Focusing on what is best for the community;
21 | * Showing empathy towards other community members.
22 |
23 | Examples of unacceptable behavior by participants include:
24 |
25 | * The use of sexualized language or imagery and unwelcome sexual attention or advances;
26 | * Trolling, insulting/derogatory comments, and personal or political attacks;
27 | * Public or private harassment;
28 | * Publishing others’ private information, such as a physical or electronic address, without explicit permission;
29 | * Other conduct which could reasonably be considered inappropriate in a professional setting.
30 |
31 | Our Responsibilities
32 | --------------------
33 |
34 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
35 |
36 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
37 |
38 | Scope
39 | -----
40 |
41 | This Code of Conduct applies within all Giotto’s project spaces, to all content on , Giotto’s GitHub organization, or any other official Giotto web presence allowing for community interactions, and it also applies when an individual is representing the project or its community in public spaces.
42 |
43 | Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
44 |
45 | Enforcement
46 | -----------
47 |
48 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at . All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. Sanctions may include written warnings, expulsions from the project, project-sponsored spaces, or project forums, or any other sanction which is deemed appropriate. [The project team] is obligated to maintain confidentiality with regard to the reporter of an incident. If the act is ongoing (such as someone engaging in harassment) or involves a threat to anyone's safety (e.g. threats of violence), the project team may issue sanctions without notice. Further details of specific enforcement policies may be posted separately.
49 |
50 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by the project leader.
51 |
52 | Attribution
53 | -----------
54 |
55 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at , and includes some aspects of the TensorFlow Code of Conduct, available at
56 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | [](https://github.com/giotto-ai/giotto-time/actions/workflows/deploy_github_pages.yml)
3 | [](https://github.com/giotto-ai/giotto-time/actions/workflows/build_and_publish.yml)
4 | [](https://github.com/giotto-ai/giotto-time/actions/workflows/ci.yml)
5 | [](https://badge.fury.io/py/giotto-time)
6 | [](https://slack.giotto.ai/)
7 |
8 | # giotto-time
9 |
10 | giotto-time is a machine-learning-based time series forecasting toolbox in Python.
11 | It is part of the [Giotto](https://github.com/giotto-ai) collection of open-source projects and aims to provide
12 | feature extraction, analysis, causality testing and forecasting models based on the
13 | [scikit-learn](https://scikit-learn.org/stable/) API.
14 |
15 | ## License
16 |
17 | giotto-time is distributed under the AGPLv3 [license](https://github.com/giotto-ai/giotto-time/blob/master/LICENSE).
18 | If you need a different distribution license, please contact the L2F team at business@l2f.ch.
19 |
20 | ## Documentation
21 |
22 | - API reference (stable release): https://giotto-ai.github.io/giotto-time/
23 |
24 | ## Getting started
25 |
26 | Get started with giotto-time by following the installation steps below.
27 | Simple tutorials and real-world use cases are available as notebooks in the examples folder.
28 |
29 | ## Installation
30 |
31 | ### User installation
32 |
33 | Run this command in your favourite python environment
34 | ```
35 | pip install giotto-time
36 | ```
37 |
38 | ### Developer installation
39 |
40 | Get the latest state of the source code with the command
41 |
42 | ```
43 | git clone https://github.com/giotto-ai/giotto-time.git
44 | cd giotto-time
45 | pip install -e ".[tests, doc]"
46 | ```
47 |
48 | ## Example
49 |
50 | ```python
51 | from gtime import *
52 | from gtime.feature_extraction import *
53 | import pandas as pd
54 | import numpy as np
55 | from sklearn.linear_model import LinearRegression
56 |
57 | # Create random DataFrame with DatetimeIndex
58 | X_dt = pd.DataFrame(np.random.randint(4, size=(20)),
59 | index=pd.date_range("2019-12-20", "2020-01-08"),
60 | columns=['time_series'])
61 |
62 | # Convert the DatetimeIndex to PeriodIndex and create y matrix
63 | X = preprocessing.TimeSeriesPreparation().transform(X_dt)
64 | y = model_selection.horizon_shift(X, horizon=2)
65 |
66 | # Create some features
67 | cal = feature_generation.Calendar(region="europe", country="Switzerland", kernel=np.array([1, 2]))
68 | X_f = compose.FeatureCreation(
69 | [('s_2', Shift(2), ['time_series']),
70 | ('ma_3', MovingAverage(window_size=3), ['time_series']),
71 | ('cal', cal, ['time_series'])]).fit_transform(X)
72 |
73 | # Train/test split
74 | X_train, y_train, X_test, y_test = model_selection.FeatureSplitter().transform(X_f, y)
75 |
76 | # Try sklearn's MultiOutputRegressor as time-series forecasting model
77 | gar = forecasting.GAR(LinearRegression())
78 | gar.fit(X_train, y_train).predict(X_test)
79 |
80 | ```
81 |
82 |
83 | ## Contributing
84 |
85 | We welcome new contributors of all experience levels. The Giotto
86 | community goals are to be helpful, welcoming, and effective. To learn more about
87 | making a contribution to giotto-time, please see the [CONTRIBUTING.rst](https://github.com/giotto-ai/giotto-time/blob/master/CONTRIBUTING.rst)
88 | file.
89 |
90 | ## Links
91 |
92 | - Official source code repo: https://github.com/giotto-ai/giotto-time
93 | - Download releases: https://pypi.org/project/giotto-time/
94 | - Issue tracker: https://github.com/giotto-ai/giotto-time/issues
95 |
96 | ## Community
97 |
98 | Giotto Slack workspace: https://slack.giotto.ai/
99 |
100 | ## Contacts
101 |
102 | maintainers@giotto.ai
103 |
--------------------------------------------------------------------------------
/gtime/experimental/trend_models/function_trend.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from gtime.experimental.trend_models.base import TrendModel
3 | from scipy.optimize import minimize
4 | from sklearn.metrics import mean_squared_error
5 |
6 |
7 | class FunctionTrend(TrendModel):
8 |     """A model for fitting, predicting and removing a custom functional trend
9 | from a time series. The transformed time series created will be trend
10 | stationary with respect to the specific function. To have more details,
11 | you can check this `link `_.
12 |
13 | Parameters
14 | ----------
15 | loss : ``Callable``, optional, (default=``mean_squared_error``).
16 | The loss function to use when fitting the model. The loss function must
17 | accept y_true, y_pred and return a single real number.
18 |
19 | """
20 |
21 | def __init__(self, model_form, loss=mean_squared_error):
22 | self.model_form = model_form
23 | self.loss = loss
24 |
25 | def fit(
26 | self, time_series: pd.DataFrame, x0: list, method: str = "BFGS"
27 | ) -> TrendModel:
28 | """Fit the model on the ``time_series``, with respect to the provided
29 | ``loss`` and using the provided ``method``. In order to see which
30 |         methods are available, please check the scipy `documentation
31 |         <https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html>`_.
32 |
33 | Parameters
34 | ----------
35 | time_series : ``pd.DataFrame``, required.
36 | The time series on which to fit the model.
37 |
38 | x0 : ``list``, required.
39 |     The initial guess for the trend parameters, passed to ``scipy.optimize.minimize``.
40 | method : ``str``, optional, (default=``'BFGS'``).
41 | The method to use in order to minimize the loss function.
42 |
43 | Returns
44 | -------
45 | self : ``TrendModel``
46 | The fitted object.
47 |
48 | """
49 |
50 | def prediction_error(model_weights):
51 | predictions = [
52 | self.model_form(t, model_weights)
53 | for t in range(0, time_series.shape[0])
54 | ]
55 | return self.loss(time_series.values, predictions)
56 |
57 | res = minimize(prediction_error, x0, method=method, options={"disp": False})
58 |
59 | self.model_weights_ = res["x"]
60 |
61 | self.t0_ = time_series.index[0]
62 | freq = time_series.index.freq
63 | if freq is not None:
64 | self.period_ = freq
65 | else:
66 | self.period_ = time_series.index[1] - time_series.index[0]
67 |
68 | return self
69 |
70 | def predict(self, t):
71 | """Using the fitted model, predict the values starting from ``X``.
72 |
73 | Parameters
74 | ----------
75 | t : int or float, required.
76 |     The time step at which to evaluate the fitted trend.
77 |
78 | Returns
79 | -------
80 | prediction
81 |     The value of ``model_form`` evaluated at ``t`` with the fitted weights.
82 |
83 | Raises
84 | ------
85 | ``NotFittedError``
86 | Raised if the model is not fitted yet.
87 |
88 | """
89 | # TODO: raise NotFittedError (as documented above) if fit has not been called
90 | return self.model_form(t, self.model_weights_)
91 |
92 | def transform(self, time_series):
93 | """Transform the ``time_series`` by removing the trend.
94 |
95 | Parameters
96 | ----------
97 | time_series : ``pd.DataFrame``, required.
98 | The time series to transform.
99 |
100 | Returns
101 | -------
102 | transformed_time_series : ``pd.DataFrame``
103 | The transformed time series, without the trend.
104 |
105 | """
106 | # TODO: raise NotFittedError if fit has not been called
107 |
108 | ts = (time_series.index - self.t0_) / self.period_
109 |
110 | predictions = pd.Series(
111 | index=time_series.index,
112 | data=[self.model_form(t, self.model_weights_) for t in ts],
113 | )
114 |
115 | return time_series.sub(predictions, axis=0)
116 |
--------------------------------------------------------------------------------
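
A minimal usage sketch for `FunctionTrend` above (an editor's illustration, not from the library docs): it fits a linear `model_form` on synthetic data and removes the trend. The import path mirrors this file's location and assumes the module's own imports resolve.

```python
import numpy as np
import pandas as pd

from gtime.experimental.trend_models.function_trend import FunctionTrend

time_index = pd.date_range("2020-01-01", "2020-01-10")
ts = pd.DataFrame(1.0 + 2.0 * np.arange(10.0), index=time_index)

# model_form maps a time step t and a weight vector w to the trend value
linear_trend = FunctionTrend(model_form=lambda t, w: w[0] + w[1] * t)
linear_trend.fit(ts, x0=[0.0, 0.0])

detrended = linear_trend.transform(ts)  # residuals should be close to zero
```
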
/gtime/forecasting/trend.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from scipy.optimize import minimize
6 | from sklearn.base import BaseEstimator, RegressorMixin
7 | from sklearn.metrics import mean_squared_error
8 | from sklearn.utils.validation import check_is_fitted
9 |
10 | from gtime.utils.trends import TRENDS
11 |
12 |
13 | class TrendForecaster(BaseEstimator, RegressorMixin):
14 | """Trend forecasting model.
15 |
16 | This estimator optimizes a trend function on train data and will forecast using this trend function with optimized
17 | parameters.
18 |
19 | Parameters
20 | ----------
21 | trend : ``"polynomial"`` | ``"exponential"``, required
22 | The kind of trend removal to apply.
23 |
24 | trend_x0 : np.array, required
25 | Initialisation parameters passed to the trend function
26 |
27 | loss : Callable, optional, default: ``mean_squared_error``
28 | Loss function to minimize.
29 |
30 | method : str, optional, default: ``"BFGS"``
31 | Loss function optimisation method
32 |
33 | Examples
34 | --------
35 | >>> import pandas as pd
36 | >>> import numpy as np
37 | >>> from gtime.model_selection import horizon_shift, FeatureSplitter
38 | >>> from gtime.forecasting import TrendForecaster
39 | >>>
40 | >>> X = pd.DataFrame(np.random.random((10, 1)), index=pd.date_range("2020-01-01", "2020-01-10"))
41 | >>> y = horizon_shift(X, horizon=2)
42 | >>> X_train, y_train, X_test, y_test = FeatureSplitter().transform(X, y)
43 | >>>
44 | >>> tf = TrendForecaster(trend='polynomial', trend_x0=np.zeros(2))
45 | >>> tf.fit(X_train).predict(X_test)
46 | array([[0.39703029],
47 | [0.41734957]])
48 |
49 | """
50 |
51 | def __init__(
52 | self,
53 | trend: str,
54 | trend_x0: np.array,
55 | loss: Callable = mean_squared_error,
56 | method: str = "BFGS",
57 | ):
58 | self.trend = trend
59 | self.trend_x0 = trend_x0
60 | self.loss = loss
61 | self.method = method
62 |
63 | def fit(self, X: pd.DataFrame, y=None) -> "TrendForecaster":
64 | """Fit the estimator.
65 |
66 | Parameters
67 | ----------
68 | X : pd.DataFrame, shape (n_samples, n_features), required
69 | Input data.
70 |
71 | y : None
72 | There is no need of a target in a transformer, yet the pipeline API
73 | requires this parameter.
74 |
75 | Returns
76 | -------
77 | self : object
78 | Returns self.
79 |
80 | """
81 |
82 | if self.trend not in TRENDS:
83 | raise ValueError(
84 | "The trend '%s' is not supported. Supported "
85 | "trends are %s." % (self.trend, list(sorted(TRENDS)))
86 | )
88 | self.best_trend_params_ = minimize(
89 | lambda opt: self.loss(
90 | X.values, [TRENDS[self.trend](t, opt) for t in range(0, X.shape[0])]
91 | ),
92 | self.trend_x0,
93 | method=self.method,
94 | options={"disp": False},
95 | )["x"]
96 |
97 | return self
98 |
99 | def predict(self, X: pd.DataFrame) -> pd.DataFrame:
100 | """Using the fitted polynomial, predict the values starting from ``X``.
101 |
102 | Parameters
103 | ----------
104 | X: pd.DataFrame, shape (n_samples, 1), required
105 | The time series on which to predict.
106 |
107 | Returns
108 | -------
109 | predictions : pd.DataFrame, shape (n_samples, 1)
110 | The output predictions.
111 |
112 | Raises
113 | ------
114 | NotFittedError
115 | Raised if the model is not fitted yet.
116 |
117 | """
118 | check_is_fitted(self)
119 |
120 | predictions = TRENDS[self.trend](X.values, self.best_trend_params_)
121 | return predictions
122 |
--------------------------------------------------------------------------------
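
The `loss` and `method` parameters of `TrendForecaster` are documented above but not exercised in its example; here is a hedged sketch (synthetic data, illustrative choices) combining `mean_absolute_error` with scipy's Nelder-Mead optimizer.

```python
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error

from gtime.forecasting import TrendForecaster
from gtime.model_selection import FeatureSplitter, horizon_shift

X = pd.DataFrame(np.arange(20.0), index=pd.date_range("2020-01-01", "2020-01-20"))
y = horizon_shift(X, horizon=2)
X_train, y_train, X_test, y_test = FeatureSplitter().transform(X, y)

# Any sklearn-style loss and any scipy.optimize.minimize method can be used
tf = TrendForecaster(
    trend="polynomial",
    trend_x0=np.zeros(2),
    loss=mean_absolute_error,
    method="Nelder-Mead",
)
predictions = tf.fit(X_train).predict(X_test)
```
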
/gtime/hierarchical/naive.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import Dict
3 |
4 | import pandas as pd
5 | from sklearn.base import BaseEstimator
6 | from sklearn.utils.validation import check_is_fitted
7 |
8 | from gtime.hierarchical.base import HierarchicalBase
9 |
10 |
11 | class HierarchicalNaive(HierarchicalBase):
12 | """ Simplest hierarchical model possible.
13 | It does not perform any aggregation of the results.
14 | Each time series is fitted and predicted independently.
15 |
16 | Parameters
17 | ----------
18 | model: BaseEstimator, required
19 |     time series forecasting model that is applied to each of the time series. A cross validation model can also be passed.
20 | 
21 | Examples
22 | --------
23 | >>> import pandas._testing as testing
24 | >>> from gtime.time_series_models import AR
25 | >>> from gtime.hierarchical import HierarchicalNaive
26 | >>>
27 | >>> testing.N, testing.K = 20, 1
28 | >>> data1 = testing.makeTimeDataFrame(freq="s")
29 | >>> data2 = testing.makeTimeDataFrame(freq="s")
30 | >>> data = {'data1': data1, 'data2': data2}
31 | >>> time_series_model = AR(p=2, horizon=3)
32 | >>>
33 | >>> hierarchical_model = HierarchicalNaive(model=time_series_model)
34 | >>> hierarchical_model.fit(data)
35 | >>> hierarchical_model.predict()
36 | {'data1': y_1 y_2 y_3
37 | 2000-01-01 00:00:17 0.475903 0.834633 0.649467
38 | 2000-01-01 00:00:18 0.644168 0.610287 0.383904
39 | 2000-01-01 00:00:19 0.180920 0.596606 0.696133, 'data2': y_1 y_2 y_3
40 | 2000-01-01 00:00:17 -0.117342 0.006594 -0.638133
41 | 2000-01-01 00:00:18 -0.394193 -0.607146 0.323875
42 | 2000-01-01 00:00:19 -0.381479 0.088210 -0.356775}
43 | """
44 |
45 | def __init__(self, model: BaseEstimator):
46 | super().__init__(model=model, hierarchy_tree="infer")
47 |
48 | def fit(self, X: Dict[str, pd.DataFrame], y: pd.DataFrame = None):
49 | """ Fit method
50 |
51 | Parameters
52 | ----------
53 | X : Dict[str, pd.DataFrame], required
54 | A dictionary of time series. Each is fitted independently
55 | y : pd.DataFrame, optional, default = ``None``
56 | only for compatibility
57 |
58 | Returns
59 | -------
60 | self
61 | """
62 | self._check_is_dict_of_dataframes_with_str_key(X)
63 | self._infer_hierarchy_tree(X)
64 | self._initialize_models(X)
65 | for key, time_series in X.items():
66 | self.models_[key].fit(time_series)
67 | return self
68 |
69 | def predict(self, X: Dict[str, pd.DataFrame] = None):
70 | """ Predict method
71 |
72 | Parameters
73 | ----------
74 | X : Dict[str, pd.DataFrame], optional, default = ``None``
75 | time series to predict. If ``None`` all the fitted time series are predicted.
76 | The keys in ``X`` have to match the ones used to fit.
77 |
78 | Returns
79 | -------
80 | predictions : Dict[str, pd.DataFrame]
81 | """
82 | check_is_fitted(self)
83 | if X is None:
84 | return self._predict_fitted_time_series()
85 | else:
86 | return self._predict_new_time_series(X)
87 |
88 | def _initialize_models(self, X: Dict[str, pd.DataFrame]):
89 | # one independent copy of the model per time series key
90 | self.models_ = {key: deepcopy(self.model) for key in X}
91 |
92 | def _infer_hierarchy_tree(self, X: Dict[str, pd.DataFrame]):
93 | self.hierarchy_tree_ = set(
94 | X.keys()
95 | ) # No need of a proper hierarchy tree for HierarchicalNaive
96 |
97 | def _predict_fitted_time_series(self) -> Dict[str, pd.DataFrame]:
98 | return {key: model.predict() for key, model in self.models_.items()}
99 |
100 | def _predict_new_time_series(self, X: pd.DataFrame) -> Dict[str, pd.DataFrame]:
101 | return {
102 | key: self.models_[key].predict(time_series)
103 | for key, time_series in X.items()
104 | }
105 |
--------------------------------------------------------------------------------
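
As the `predict` docstring above notes, the keys passed at prediction time must match the fitted ones, and a subset is allowed. A short sketch of that subset path (reusing the `AR` setup from the class example):

```python
import pandas._testing as testing

from gtime.hierarchical import HierarchicalNaive
from gtime.time_series_models import AR

testing.N, testing.K = 20, 1
data = {"data1": testing.makeTimeDataFrame(freq="s"),
        "data2": testing.makeTimeDataFrame(freq="s")}

model = HierarchicalNaive(model=AR(p=2, horizon=3))
model.fit(data)

# Predict only one of the fitted series; its key must match a key used in fit
subset_predictions = model.predict({"data1": data["data1"]})
```
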
/gtime/regressors/tests/test_explainable.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 |
3 | import pytest
4 | from hypothesis import given, settings
5 | from sklearn import clone
6 | from sklearn.base import BaseEstimator
7 | from sklearn.cluster import DBSCAN, KMeans, SpectralClustering
8 | from sklearn.decomposition import PCA
9 | from sklearn.exceptions import NotFittedError
10 | import numpy as np
11 | import pandas as pd
12 |
13 | from gtime.explainability import _LimeExplainer, _ShapExplainer
14 | from gtime.forecasting.tests.test_gar import df_transformer
15 | from gtime.model_selection import FeatureSplitter
16 | from gtime.regressors import ExplainableRegressor
17 | from gtime.utils.hypothesis.feature_matrices import (
18 | numpy_X_matrices,
19 | numpy_X_y_matrices,
20 | X_y_matrices,
21 | )
22 | from gtime.utils.hypothesis.general_strategies import regressors
23 | from gtime.utils.hypothesis.time_indexes import samples_from
24 |
25 |
26 | def bad_regressors():
27 | return samples_from([DBSCAN(), SpectralClustering(), PCA(),])
28 |
29 |
30 | @given(bad_regressors())
31 | def test_bad_regressors(bad_regressor):
32 | assert hasattr(bad_regressor, "fit")
33 | assert not hasattr(bad_regressor, "predict")
34 |
35 |
36 | class TestExplainableRegressor:
37 | @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
38 | @given(estimator=regressors())
39 | def test_constructor(self, estimator, explainer_type):
40 | regressor = ExplainableRegressor(estimator, explainer_type)
41 | if explainer_type == "lime":
42 | assert isinstance(regressor.explainer, _LimeExplainer)
43 | elif explainer_type == "shap":
44 | assert isinstance(regressor.explainer, _ShapExplainer)
45 |
46 | @given(estimator=regressors())
47 | def test_constructor_bad_explainer(self, estimator):
48 | with pytest.raises(ValueError):
49 | ExplainableRegressor(estimator, "bad")
50 |
51 | @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
52 | @given(bad_estimator=bad_regressors())
53 | def test_constructor_bad_regressor(self, bad_estimator, explainer_type):
54 | with pytest.raises(TypeError):
55 | ExplainableRegressor(bad_estimator, explainer_type)
56 |
57 | @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
58 | @given(estimator=regressors(), X=numpy_X_matrices())
59 | def test_error_predict_not_fitted(self, estimator, explainer_type, X):
60 | regressor = ExplainableRegressor(estimator, explainer_type)
61 | with pytest.raises(NotFittedError):
62 | regressor.predict(X)
63 |
64 | def _get_fit_attributes(self, estimator: BaseEstimator) -> List[str]:
65 | return [
66 | v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
67 | ]
68 |
69 | @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
70 | @given(
71 | estimator=regressors(), X_y=numpy_X_y_matrices(min_value=-100, max_value=100)
72 | )
73 | def test_fit_values(self, estimator, explainer_type, X_y):
74 | X, y = X_y
75 | regressor = ExplainableRegressor(estimator, explainer_type)
76 | regressor.fit(X, y)
77 |
78 | cloned_estimator = clone(estimator)
79 | cloned_estimator.fit(X, y)
80 |
81 | estimator_fit_attributes = self._get_fit_attributes(regressor.estimator)
82 | cloned_estimator_fit_attributes = self._get_fit_attributes(cloned_estimator)
83 |
84 | np.testing.assert_array_equal(
85 | estimator_fit_attributes, cloned_estimator_fit_attributes
86 | )
87 |
88 | @settings(deadline=pd.Timedelta(milliseconds=5000), max_examples=7)
89 | @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
90 | @given(
91 | estimator=regressors(), X_y=numpy_X_y_matrices(min_value=-100, max_value=100)
92 | )
93 | def test_predict_values(self, estimator, explainer_type, X_y):
94 | X, y = X_y
95 | X_test = X[:1, :]
96 | regressor = ExplainableRegressor(estimator, explainer_type)
97 | regressor_predictions = regressor.fit(X, y).predict(X_test)
98 |
99 | cloned_estimator = clone(estimator)
100 | estimator_predictions = cloned_estimator.fit(X, y).predict(X_test)
101 |
102 | assert regressor_predictions.shape == estimator_predictions.shape
103 | assert regressor_predictions.shape[0] == len(regressor.explanations_)
104 |
--------------------------------------------------------------------------------
/gtime/regressors/explainable.py:
--------------------------------------------------------------------------------
1 | from typing import Union, List, Tuple
2 |
3 | from sklearn.base import BaseEstimator, RegressorMixin
4 | import numpy as np
5 | from sklearn.utils.validation import check_is_fitted
6 | import pandas as pd
7 |
8 | from gtime.explainability import _LimeExplainer, _ShapExplainer
9 |
10 |
11 | class ExplainableRegressor(BaseEstimator, RegressorMixin):
12 | """ Wraps the most commons scikit-learn regressor to offer a nice to use interface to fit/predict
13 | models and at the same time to explain the predictions.
14 |
15 | Since it follows the fit/predict interface of scikit-learn model it is compatible with
16 | scikit-learn pipelines, etc..
17 |
18 | 2 explainers are available: LIME and SHAP
19 |
20 | You can get the explanation by accessing to `regressor.explainer_.explanations_` after
21 | the predict function,
22 |
23 | Parameters
24 | ----------
25 | estimator: RegressorMixin, required
26 | the scikit-learn model
27 | explainer_type: str, required
28 | 'lime' or 'shap'
29 |
30 | Examples
31 | --------
32 | >>> import numpy as np
33 | >>> from gtime.regressors import ExplainableRegressor
34 | >>> from sklearn.ensemble import RandomForestRegressor
35 | >>> X = np.random.random((30, 5))
36 | >>> y = np.random.random(30)
37 | >>> X_train, y_train = X[:20], y[:20]
38 | >>> X_test, y_test = X[20:], y[20:]
39 | >>>
40 | >>> random_forest = RandomForestRegressor()
41 | >>> explainable_regressor = ExplainableRegressor(random_forest, 'shap')
42 | >>>
43 | >>> explainable_regressor.fit(X_train, y_train, feature_names=['a', 'b', 'c', 'd', 'e'])
44 | >>> explainable_regressor.predict(X_test)
45 | array([0.41323105, 0.40386639, 0.46462663, 0.3795568 , 0.57571486,
46 | 0.37079003, 0.54756082, 0.35160197, 0.30881165, 0.48201442])
47 | >>> explainable_regressor.explainer_.explanations_[0]
48 | {'a': -0.019896434698603117, 'b': 0.029814649814215954, 'c': 0.02447547087613202, 'd': 0.021313815648682066, 'e': -0.10778800140251406}
49 | """
50 |
51 | def __init__(self, estimator: RegressorMixin, explainer_type: str):
52 | self.estimator = self._check_estimator(estimator)
53 | self.explainer_type = explainer_type
54 | self.explainer = self._initialize_explainer()
55 |
56 | def _check_estimator(self, estimator: RegressorMixin) -> RegressorMixin:
57 | if not hasattr(estimator, "fit") or not hasattr(estimator, "predict"):
58 | raise TypeError(f"Estimator not compatible: {estimator}")
59 | return estimator
60 |
61 | def _initialize_explainer(self) -> Union[_LimeExplainer, _ShapExplainer]:
62 | if self.explainer_type == "lime":
63 | return _LimeExplainer()
64 | elif self.explainer_type == "shap":
65 | return _ShapExplainer()
66 | else:
67 | raise ValueError(f"Explainer not available: {self.explainer_type}")
68 |
69 | def fit(
70 | self, X: np.ndarray, y: np.ndarray, feature_names: List[str] = None,
71 | ):
72 | """ Fit function that calls the fit on the estimator and on the explainer.
73 |
74 | Parameters
75 | ----------
76 | X: np.ndarray, required
77 | train matrix
78 | y: np.ndarray, required
79 | train true values
80 | feature_names: List[str], optional, (default=`None`)
81 |     the names of the feature columns of X
82 |
83 | Returns
84 | -------
85 | Fitted `ExplainableRegressor`
86 | """
87 | self.estimator_ = self.estimator.fit(X, y)
88 | self.explainer_ = self.explainer.fit(
89 | self.estimator_, X, feature_names=feature_names
90 | )
91 | return self
92 |
93 | def predict(self, X: np.ndarray):
94 | """ Predict function that call the predict function of the explainer.
95 |
96 | You can access the explanations of the predictions via the
97 | `regressor.explainer_.explanations_` attribute
98 |
99 | Parameters
100 | ----------
101 | X: np.ndarray, required
102 | test matrix
103 |
104 | Returns
105 | -------
106 | predictions: np.ndarray
107 | """
108 | check_is_fitted(self)
109 | predictions = self.explainer_.predict(X)
110 | self.explanations_ = self.explainer_.explanations_
111 | return predictions
112 |
--------------------------------------------------------------------------------
/gtime/causality/pearson_correlation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.base import TransformerMixin, BaseEstimator
3 |
4 | from gtime.causality.base import CausalityMixin
5 |
6 |
7 | class ShiftedPearsonCorrelation(BaseEstimator, TransformerMixin, CausalityMixin):
8 | """Class responsible for assessing the shifted Pearson correlations (PPMCC) between
9 | two or more series. For more info about the test, click
10 | `here `_.
11 |
12 | Parameters
13 | ----------
14 | min_shift : int, optional, default: ``1``
15 | The minimum number of shifts to check for.
16 |
17 | max_shift : int, optional, default: ``10``
18 | The maximum number of shifts to check for.
19 |
20 | target_col : str, optional, default: ``None``
21 | The column to use as the reference (i.e., the column which is not
22 | shifted).
23 |
24 | dropna : bool, optional, default: ``False``
25 | Determines if the NaN values created by shifting are retained or dropped.
26 |
27 | bootstrap_iterations : int, optional, default: ``None``
28 | If not None, compute the p_values of the test by performing bootstrapping of
29 | the original data (sampling with replacement).
30 |
31 | permutation_iterations : int, optional, default: ``None``
32 | If not None, compute the p_values of the test by performing permutations of
33 | the original data.
34 |
35 | Examples
36 | --------
37 | >>> from gtime.causality.pearson_correlation import ShiftedPearsonCorrelation
38 | >>> import pandas.util.testing as testing
39 | >>> data = testing.makeTimeDataFrame(freq="s")
40 | >>> spc = ShiftedPearsonCorrelation(target_col="A")
41 | >>> spc.fit(data)
42 | >>> spc.best_shifts_
43 | y A B C D
44 | x
45 | A 8 9 6 5
46 | B 7 4 4 6
47 | C 3 4 9 9
48 | D 7 1 9 1
49 | >>> spc.max_corrs_
50 | y A B C D
51 | x
52 | A 0.383800 0.260627 0.343628 0.360151
53 | B 0.311608 0.307203 0.255969 0.298523
54 | C 0.373613 0.267335 0.211913 0.140034
55 | D 0.496535 0.204770 0.402473 0.310065
56 | """
57 |
58 | def __init__(
59 | self,
60 | min_shift: int = 1,
61 | max_shift: int = 10,
62 | target_col: str = None,
63 | dropna: bool = False,
64 | bootstrap_iterations: int = None,
65 | permutation_iterations: int = None,
66 | ):
67 | super().__init__(
68 | bootstrap_iterations=bootstrap_iterations,
69 | permutation_iterations=permutation_iterations,
70 | )
71 | self.min_shift = min_shift
72 | self.max_shift = max_shift
73 | self.target_col = target_col
74 | self.dropna = dropna
75 |
76 | def fit(self, data: pd.DataFrame) -> "ShiftedPearsonCorrelation":
77 | """Create the dataframe of shifts of each time series which maximize the
78 | Pearson correlation (PPMCC).
79 |
80 | Parameters
81 | ----------
82 | data : pd.DataFrame, shape (n_samples, n_time_series), required
83 | The DataFrame containing the time series on which to compute the shifted
84 | correlations.
85 |
86 | Returns
87 | -------
88 | self : ``ShiftedPearsonCorrelation``
89 |
90 | """
91 | best_shifts = self._compute_best_shifts(data, self._get_max_corr_shift)
92 |
93 | pivot_tables = self._create_pivot_tables(best_shifts)
94 |
95 | self.best_shifts_ = pivot_tables["best_shifts"]
96 | self.max_corrs_ = pivot_tables["max_corrs"]
97 |
98 | if self.bootstrap_iterations:
99 | self.bootstrap_p_values_ = pivot_tables["bootstrap_p_values"]
100 |
101 | if self.permutation_iterations:
102 | self.permutation_p_values_ = pivot_tables["permutation_p_values"]
103 |
104 | return self
105 |
106 | def _get_max_corr_shift(self, data: pd.DataFrame, x, y):
107 | shifts = pd.DataFrame()
108 |
109 | for shift in range(self.min_shift, self.max_shift + 1):
110 | shifts[shift] = data[x].shift(shift)
111 |
112 | shifts = shifts.dropna()
113 | self.shifted_corrs = shifts.corrwith(data[y])
114 |
115 | return self.shifted_corrs.max(), self.shifted_corrs.idxmax()
117 |
--------------------------------------------------------------------------------
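
The `bootstrap_iterations` and `permutation_iterations` options described above enable the p-value attributes set in `fit`; a short sketch (the iteration counts are illustrative):

```python
import pandas.util.testing as testing

from gtime.causality.pearson_correlation import ShiftedPearsonCorrelation

data = testing.makeTimeDataFrame(freq="s")
spc = ShiftedPearsonCorrelation(
    target_col="A", bootstrap_iterations=100, permutation_iterations=100
)
spc.fit(data)

bootstrap_p = spc.bootstrap_p_values_      # p-values via resampling with replacement
permutation_p = spc.permutation_p_values_  # p-values via permutation of the data
```
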
/gtime/model_selection/splitters.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
5 | class FeatureSplitter:
6 | """Splits the feature matrices X and y in X_train, y_train, X_test, y_test.
7 |
8 | X and y are the feature matrices obtained from the FeatureCreation class.
9 |
10 | Parameters
11 | ----------
12 | drop_na_mode : str, optional, default: ``'any'``
13 | How to drop the NaN values contained in the ``X`` and ``y`` matrices. Only 'any' is
14 | supported for the moment.
15 |
16 | Examples
17 | --------
18 | >>> import pandas as pd
19 | >>> import numpy as np
20 | >>> from gtime.model_selection import FeatureSplitter
21 | >>> X = pd.DataFrame.from_dict({"feature_0": [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8],
22 | ... "feature_1": [np.nan, np.nan, 0.5, 1.5, 2.5, 3.5,
23 | ... 4.5, 5.5, 6.5, 7.5, ]
24 | ... })
25 | >>> y = pd.DataFrame.from_dict({"y_0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
26 | ... "y_1": [1, 2, 3, 4, 5, 6, 7, 8, 9, np.nan],
27 | ... "y_2": [2, 3, 4, 5, 6, 7, 8, 9, np.nan, np.nan]
28 | ... })
29 | >>> feature_splitter = FeatureSplitter()
30 | >>> X_train, y_train, X_test, y_test = feature_splitter.transform(X, y)
31 | >>> X_train
32 | feature_0 feature_1
33 | 2 1.0 0.5
34 | 3 2.0 1.5
35 | 4 3.0 2.5
36 | 5 4.0 3.5
37 | 6 5.0 4.5
38 | 7 6.0 5.5
39 | >>> y_train
40 | y_0 y_1 y_2
41 | 2 2 3.0 4.0
42 | 3 3 4.0 5.0
43 | 4 4 5.0 6.0
44 | 5 5 6.0 7.0
45 | 6 6 7.0 8.0
46 | 7 7 8.0 9.0
47 | >>> X_test
48 | feature_0 feature_1
49 | 8 7.0 6.5
50 | 9 8.0 7.5
51 | >>> y_test
52 | y_0 y_1 y_2
53 | 8 8 9.0 NaN
54 | 9 9 NaN NaN
55 |
56 | """
57 |
58 | def __init__(self, drop_na_mode: str = "any"):
59 | if drop_na_mode != "any":
60 | raise ValueError(
61 | f'Only drop_na_mode="any" is supported. Detected: {drop_na_mode}'
62 | )
63 | self.drop_na_mode = drop_na_mode
64 |
65 | def transform(
66 | self, X: pd.DataFrame, y: pd.DataFrame
67 | ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
68 | """Split the feature matrices X and y in X_train, y_train, X_test, y_test.
69 |
70 | ``X`` and ``y`` are the feature matrices obtained from the FeatureCreation
71 | class.
72 |
73 | Parameters
74 | ----------
75 | X : pd.DataFrame, shape (n_samples, n_features), required
76 | The feature matrix.
77 |
78 | y : pd.DataFrame, shape (n_samples, horizon), required
79 | The y matrix.
80 |
81 | Returns
82 | -------
83 | X_train, y_train, X_test, y_test : Tuple[pd.DataFrame, pd.DataFrame,
84 | pd.DataFrame, pd.DataFrame]
85 | The X and y, split between train and test.
86 |
87 | """
88 | X, y = self._drop_X_na(X, y)
89 | X_train, y_train, X_test, y_test = self._split_train_test(X, y)
90 | return X_train, y_train, X_test, y_test
91 |
92 | def _drop_X_na(
93 | self, X: pd.DataFrame, y: pd.DataFrame
94 | ) -> (pd.DataFrame, pd.DataFrame):
95 |
96 | X = X.dropna(axis=0, how=self.drop_na_mode)
97 | y = y.loc[X.index]
98 | return X, y
99 |
100 | def _split_train_test(
101 | self, X: pd.DataFrame, y: pd.DataFrame
102 | ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
103 |
104 | train_indexes, test_indexes = self._get_train_test_indexes_from_y(y)
105 | X_train, y_train = X.loc[train_indexes], y.loc[train_indexes]
106 | X_test, y_test = X.loc[test_indexes], y.loc[test_indexes]
107 | return X_train, y_train, X_test, y_test
108 |
109 | def _get_train_test_indexes_from_y(self, y):
110 | last_train_index = self._last_non_nan_y_index(y)
111 | train_indexes = y.loc[:last_train_index].index if last_train_index else []
112 | test_indexes = y.index.difference(train_indexes)
113 | return train_indexes, test_indexes
114 |
115 | def _last_non_nan_y_index(self, y: pd.DataFrame) -> pd.Period:
116 | y_nan = y.isnull().any(axis=1).replace(True, np.nan)
117 | return y_nan.last_valid_index()
118 |
--------------------------------------------------------------------------------
/gtime/regressors/multi_output.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 |
3 | import numpy as np
4 | from sklearn.base import RegressorMixin
5 | from sklearn.multioutput import (
6 | MultiOutputRegressor,
7 | _MultiOutputEstimator,
8 | _fit_estimator,
9 | )
10 | from sklearn.utils import check_X_y, check_array
11 | from sklearn.utils.validation import check_is_fitted
12 | 
13 | 
16 | class MultiFeatureMultiOutputRegressor(RegressorMixin, _MultiOutputEstimator):
17 | """ Multi target regression with option to choose the features for each target.
18 |
19 | This strategy consists of fitting one regressor per target. It is built over
20 | sklearn.multioutput.MultiOutputRegressor. Compared to it, this class allows choosing
21 | different features for each regressor.
22 |
23 | Parameters
24 | ----------
25 | estimator: RegressorMixin, required
26 | An estimator object implementing fit and predict.
27 |
28 | Examples
29 | --------
30 | >>> import numpy as np
31 | >>> from gtime.regressors import MultiFeatureMultiOutputRegressor
32 | >>> from sklearn.ensemble import RandomForestRegressor
33 | >>> X = np.random.random((30, 5))
34 | >>> y = np.random.random((30, 3))
35 | >>> X_train, y_train = X[:20], y[:20]
36 | >>> X_test, y_test = X[20:], y[20:]
37 | >>>
38 | >>> random_forest = RandomForestRegressor()
39 | >>> regressor = MultiFeatureMultiOutputRegressor(estimator=random_forest)
40 | >>>
41 | >>> target_to_features_dict = {0: [0,1,2], 1: [0,1,3], 2: [0,1,4]}
42 | >>> regressor.fit(X_train, y_train, target_to_features_dict=target_to_features_dict)
43 | >>>
44 | >>> predictions = regressor.predict(X_test)
45 | >>> predictions.shape
46 | (10, 3)
47 |
48 | """
49 |
50 | def __init__(
51 | self,
52 | estimator: RegressorMixin,
53 | target_to_features_dict: Dict[int, List[int]] = None,
54 | ):
55 | super().__init__(estimator=estimator, n_jobs=1)
56 | self.target_to_features_dict = target_to_features_dict
57 |
58 | def fit(self, X: np.ndarray, y: np.ndarray, **kwargs):
59 | """Fit the model.
60 |
61 | Train the models, one for each target variable in y.
62 |
63 | Parameters
64 | ----------
65 | X : np.ndarray, shape (n_samples, n_features), required.
66 | The data.
67 | y : np.ndarray, shape (n_samples, horizon), required.
68 | The matrix containing the target variables.
69 |
70 | Returns
71 | -------
72 | self : object
73 |
74 |
75 | """
76 | target_to_features_dict = kwargs.get(
77 | "target_to_features_dict", self.target_to_features_dict
78 | )
79 | if target_to_features_dict is None:
80 | super().fit(X, y)
81 | self.target_to_features_dict_ = None
82 | return self
83 |
84 | X, y = check_X_y(X, y, multi_output=True, accept_sparse=True)
85 |
86 | if y.ndim == 1:
87 | raise ValueError("y must have at least two dimensions")
88 |
89 | self.estimators_ = [
90 | _fit_estimator(self.estimator, X[:, target_to_features_dict[i]], y[:, i])
91 | for i in range(y.shape[1])
92 | ]
93 | self.target_to_features_dict_ = target_to_features_dict
94 | self.expected_X_shape_ = X.shape[1]
95 | return self
96 |
97 | def predict(self, X: np.ndarray) -> np.ndarray:
98 | """For each row in ``X``, make a prediction for each fitted model
99 |
100 | Parameters
101 | ----------
102 | X : np.ndarray, shape (n_samples, n_features), required
103 | The data.
104 |
105 | Returns
106 | -------
107 | predictions : np.ndarray, shape (n_samples, horizon)
108 | The predictions
109 |
110 | """
111 | check_is_fitted(self)
112 | if self.target_to_features_dict_ is None:
113 | return super().predict(X)
114 |
115 | X = check_array(X, accept_sparse=True)
116 | if X.shape[1] != self.expected_X_shape_:
117 | raise ValueError(
118 | f"Expected X shape is {self.expected_X_shape_}. Detected {X.shape[1]}"
119 | )
120 | y = [
121 | estimator.predict(X[:, self.target_to_features_dict_[i]])
122 | for i, estimator in enumerate(self.estimators_)
123 | ]
124 |
125 | return np.asarray(y).T
126 |
--------------------------------------------------------------------------------
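
When `target_to_features_dict` is left as `None`, `MultiFeatureMultiOutputRegressor.fit` falls back to the plain `MultiOutputRegressor` behaviour, so every target is trained on every feature. A minimal sketch of that default path (an editor's illustration):

```python
import numpy as np
from sklearn.linear_model import LinearRegression

from gtime.regressors import MultiFeatureMultiOutputRegressor

X = np.random.random((30, 5))
y = np.random.random((30, 3))

# With target_to_features_dict=None each of the three regressors is
# trained on all five features, as in sklearn's MultiOutputRegressor.
regressor = MultiFeatureMultiOutputRegressor(estimator=LinearRegression())
predictions = regressor.fit(X, y).predict(X)
assert predictions.shape == (30, 3)
```
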
/gtime/hierarchical/tests/test_naive.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | import sklearn
5 | from hypothesis import given
6 | import hypothesis.strategies as st
7 | from hypothesis.extra.numpy import arrays
8 | from pytest import fixture
9 |
10 | from gtime.hierarchical import HierarchicalNaive, HierarchicalBase
11 | from gtime.utils.fixtures import (
12 | time_series_forecasting_model1_no_cache,
13 | features1,
14 | model1,
15 | )
16 | from gtime.utils.hypothesis.time_indexes import giotto_time_series, period_indexes
17 |
18 |
19 | @fixture(scope="function")
20 | def hierarchical_naive_model(time_series_forecasting_model1_no_cache):
21 | return HierarchicalNaive(time_series_forecasting_model1_no_cache)
22 |
23 |
24 | @st.composite
25 | def n_time_series_with_same_index(
26 | draw, min_length: int = 5, min_n: int = 1, max_n: int = 5,
27 | ):
28 | n = draw(st.integers(min_value=min_n, max_value=max_n))
29 | index = draw(period_indexes(min_length=min_length))
30 | dictionary = {}
31 | for i in range(n):
32 | key = str(i)
33 | df_values = draw(
34 | arrays(
35 | dtype=np.float64,
36 | shape=index.shape[0],
37 | elements=st.floats(allow_nan=False, allow_infinity=False, width=32),
38 | )
39 | )
40 | value = pd.DataFrame(index=index, data=df_values)
41 | dictionary[key] = value
42 | return dictionary
43 |
44 |
45 | class TestHierarchicalBase:
46 | def test_class_abstract(self, model1):
47 | HierarchicalBase(model1, {})
48 |
49 |
50 | class TestHierarchicalNaive:
51 | def test_constructor(self, time_series_forecasting_model1_no_cache):
52 | HierarchicalNaive(model=time_series_forecasting_model1_no_cache)
53 |
54 | def test_constructor_no_hierarchy_tree(
55 | self, time_series_forecasting_model1_no_cache
56 | ):
57 | hierarchy_tree = {}
58 | with pytest.raises(TypeError):
59 | HierarchicalNaive(
60 | model=time_series_forecasting_model1_no_cache,
61 | hierarchy_tree=hierarchy_tree,
62 | )
63 |
64 | @given(time_series=giotto_time_series(min_length=5))
65 | def test_error_fit_dataframe(self, time_series, hierarchical_naive_model):
66 | with pytest.raises(ValueError):
67 | hierarchical_naive_model.fit(time_series)
68 |
69 | @given(time_series=giotto_time_series(min_length=5))
70 | def test_error_fit_key_not_string(self, time_series, hierarchical_naive_model):
71 | with pytest.raises(ValueError):
72 | hierarchical_naive_model.fit({1: time_series})
73 |
74 | def test_error_fit_value_not_dataframe(self, hierarchical_naive_model):
75 | with pytest.raises(ValueError):
76 | hierarchical_naive_model.fit({"wrong_field": 12})
77 |
78 | @given(dataframes=n_time_series_with_same_index())
79 | def test_fit_n_dataframes(self, dataframes, hierarchical_naive_model):
80 | hierarchical_naive_model.fit(dataframes)
81 |
82 | @given(dataframes=n_time_series_with_same_index())
83 | def test_fit_predict_n_dataframes_on_different_data(
84 | self, dataframes, hierarchical_naive_model
85 | ):
86 | hierarchical_naive_model.fit(dataframes).predict(dataframes)
87 |
88 | @given(dataframes=n_time_series_with_same_index())
89 | def test_fit_predict_n_dataframes(self, dataframes, hierarchical_naive_model):
90 | hierarchical_naive_model.fit(dataframes).predict()
91 |
92 | @given(dataframes=n_time_series_with_same_index())
93 | def test_fit_predict_on_subset_of_time_series(
94 | self, dataframes, hierarchical_naive_model
95 | ):
96 | key = np.random.choice(list(dataframes.keys()), 1)[0]
97 | hierarchical_naive_model.fit(dataframes)
98 | hierarchical_naive_model.predict({key: dataframes[key]})
99 |
100 | def test_error_predict_not_fitted(self, hierarchical_naive_model):
101 | with pytest.raises(sklearn.exceptions.NotFittedError):
102 | hierarchical_naive_model.predict()
103 |
104 | @given(dataframes=n_time_series_with_same_index())
105 | def test_error_with_bad_predict_key(self, dataframes, hierarchical_naive_model):
106 | correct_key = np.random.choice(list(dataframes.keys()), 1)[0]
107 | bad_key = "".join(dataframes.keys()) + "bad_key"
108 | hierarchical_naive_model.fit(dataframes)
109 | with pytest.raises(KeyError):
110 | hierarchical_naive_model.predict({bad_key: dataframes[correct_key]})
111 |
--------------------------------------------------------------------------------
/gtime/causality/linear_coefficient.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.base import BaseEstimator, TransformerMixin
4 | from sklearn.linear_model import LinearRegression
5 |
6 | from gtime.causality.base import CausalityMixin
7 |
8 |
9 | class ShiftedLinearCoefficient(BaseEstimator, TransformerMixin, CausalityMixin):
10 | """Test the shifted linear fit coefficients between two or more time series.
11 |
12 | Parameters
13 | ----------
14 | min_shift : int, optional, default: ``1``
15 | The minimum number of shifts to check for.
16 |
17 | max_shift : int, optional, default: ``10``
18 | The maximum number of shifts to check for.
19 |
20 | target_col : str, optional, default: ``None``
21 | The column to use as the reference (i.e., the column which is not
22 | shifted).
23 |
24 | dropna : bool, optional, default: ``False``
25 | Determines if the NaN values created by shifting are retained or dropped.
26 |
27 | bootstrap_iterations : int, optional, default: ``None``
28 | If not None, compute the p_values of the test by performing bootstrapping of
29 | the original data (sampling with replacement).
30 |
31 | permutation_iterations : int, optional, default: ``None``
32 | If not None, compute the p_values of the test by performing permutations of
33 | the original data.
34 |
35 | Examples
36 | --------
37 |
38 | >>> from gtime.causality.linear_coefficient import ShiftedLinearCoefficient
39 | >>> import pandas.util.testing as testing
40 | >>> data = testing.makeTimeDataFrame(freq="s")
41 | >>> slc = ShiftedLinearCoefficient(target_col="A")
42 | >>> slc.fit(data)
43 | >>> slc.best_shifts_
44 | y A B C D
45 | x
46 | A 3 6 8 5
47 | B 9 9 4 1
48 | C 8 2 4 9
49 | D 3 9 4 3
50 | >>> slc.max_corrs_
51 | y A B C D
52 | x
53 | A 0.460236 0.420005 0.339370 0.267143
54 | B 0.177856 0.300350 0.367150 0.550490
55 | C 0.484860 0.263036 0.456046 0.251342
56 | D 0.580068 0.344688 0.253626 0.256220
57 | """
58 |
59 | def __init__(
60 | self,
61 | min_shift: int = 1,
62 | max_shift: int = 10,
63 | target_col: str = None,
64 | dropna: bool = False,
65 | bootstrap_iterations: int = None,
66 | permutation_iterations: int = None,
67 | ):
68 | super().__init__(
69 | bootstrap_iterations=bootstrap_iterations,
70 | permutation_iterations=permutation_iterations,
71 | )
72 | self.min_shift = min_shift
73 | self.max_shift = max_shift
74 | self.target_col = target_col
75 | self.dropna = dropna
76 |
77 | def fit(self, data: pd.DataFrame) -> "ShiftedLinearCoefficient":
78 | """Create the DataFrame of shifts of each time series which maximize the shifted
79 | linear fit coefficients.
80 |
81 | Parameters
82 | ----------
83 | data : pd.DataFrame, shape (n_samples, n_time_series), required
84 | The DataFrame containing the time-series on which to compute the shifted
85 | linear fit coefficients.
86 |
87 | Returns
88 | -------
89 | self : ``ShiftedLinearCoefficient``
90 |
91 | """
92 | best_shifts = self._compute_best_shifts(data, self._get_max_coeff_shift)
93 | pivot_tables = self._create_pivot_tables(best_shifts)
94 |
95 | self.best_shifts_ = pivot_tables["best_shifts"]
96 | self.max_corrs_ = pivot_tables["max_corrs"]
97 |
98 | if self.bootstrap_iterations:
99 | self.bootstrap_p_values_ = pivot_tables["bootstrap_p_values"]
100 |
101 | if self.permutation_iterations:
102 | self.permutation_p_values_ = pivot_tables["permutation_p_values"]
103 |
104 | return self
105 |
106 | def _get_max_coeff_shift(self, data: pd.DataFrame, x, y):
107 | shifts = pd.DataFrame()
108 | shifts[x] = data[x]
109 | shifts[y] = data[y]
112 | for shift in range(self.min_shift, self.max_shift + 1):
114 | shifts[shift] = data[x].shift(shift)
115 |
116 | shifts = shifts.dropna()
117 |
118 | lf = LinearRegression().fit(
119 | shifts[range(self.min_shift, self.max_shift + 1)].values, shifts[y].values
120 | )
121 |
122 | return lf.coef_.max(), np.argmax(lf.coef_) + self.min_shift
124 |
--------------------------------------------------------------------------------
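
The `min_shift`/`max_shift` parameters above bound the searched lag window; a short sketch narrowing the default range (an editor's illustration, reusing the data setup of the class example):

```python
import pandas.util.testing as testing

from gtime.causality.linear_coefficient import ShiftedLinearCoefficient

data = testing.makeTimeDataFrame(freq="s")

# Restrict the searched shift window to 1..5 instead of the default 1..10
slc = ShiftedLinearCoefficient(target_col="A", min_shift=1, max_shift=5)
slc.fit(data)
best = slc.best_shifts_  # every entry now lies in the range 1..5
```
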
/gtime/feature_extraction/trend.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 |
3 | import numpy as np
4 | import pandas as pd
5 | from scipy.optimize import minimize
6 | from sklearn.base import BaseEstimator, TransformerMixin
7 | from sklearn.metrics import mean_squared_error
8 | from sklearn.utils.validation import check_is_fitted
9 |
10 | from gtime.base import FeatureMixin, add_class_name
11 | from gtime.utils.trends import TRENDS
12 |
13 | __all__ = ["Detrender"]
14 |
15 |
16 | class Detrender(BaseEstimator, TransformerMixin, FeatureMixin):
17 | """Apply a de-trend transformation to a time series.
18 |
19 | The purpose of the class is to fit a model, defined through the `trend` parameter, in
20 | order to find a trend in the time series. The trend can then be removed by subtracting
21 | the predictions of the fitted model.
22 |
23 | Parameters
24 | ----------
25 | trend : ``'polynomial'`` | ``'exponential'``, required
26 | The kind of trend removal to apply.
27 |
28 | trend_x0 : np.array, required
29 | Initialisation parameters passed to the trend function. This is used to select
30 | a starting point in order to minimize the `loss` function.
31 |
32 | loss : Callable, optional, default: ``mean_squared_error``
33 | The loss function to minimize.
34 |
35 | method : string, optional, default: ``"BFGS"``
36 | Loss function optimisation method.
37 |
38 | Examples
39 | --------
40 | >>> import pandas as pd
41 | >>> import numpy as np
42 | >>> from gtime.feature_extraction import Detrender
43 | >>> detrender = Detrender(trend='polynomial', trend_x0=np.zeros(2))
44 | >>> time_index = pd.date_range("2020-01-01", "2020-01-10")
45 | >>> X = pd.DataFrame(range(0, 10), index=time_index)
46 | >>> detrender.fit_transform(X)
47 | 0__Detrender
48 | 2020-01-01 9.180937e-07
49 | 2020-01-02 8.020709e-07
50 | 2020-01-03 6.860481e-07
51 | 2020-01-04 5.700253e-07
52 | 2020-01-05 4.540024e-07
53 | 2020-01-06 3.379796e-07
54 | 2020-01-07 2.219568e-07
55 | 2020-01-08 1.059340e-07
56 | 2020-01-09 -1.008878e-08
57 | 2020-01-10 -1.261116e-07
58 |
59 | """
60 |
61 | def __init__(
62 | self,
63 | trend: str,
64 | trend_x0: np.array,
65 | loss: Callable = mean_squared_error,
66 | method: str = "BFGS",
67 | ):
68 | self.trend = trend
69 | self.trend_x0 = trend_x0
70 | self.loss = loss
71 | self.method = method
72 |
73 | def fit(self, X: pd.DataFrame, y=None) -> "Detrender":
74 | """Fit the estimator.
75 |
76 | Parameters
77 | ----------
78 | X : pd.DataFrame, shape (n_samples, n_features)
79 | Input data.
80 |
81 | y : None
82 | There is no need of a target in a transformer, yet the pipeline API
83 | requires this parameter.
84 |
85 | Returns
86 | -------
87 | self : object
88 | Returns self.
89 |
90 | """
91 |
92 | # TODO: create validation function
93 | if self.trend not in TRENDS:
94 | raise ValueError(
95 | "The trend '%s' is not supported. Supported "
96 | "trends are %s." % (self.trend, list(sorted(TRENDS)))
97 | )
98 |
99 | self.best_trend_params_ = minimize(
100 | lambda opt: self.loss(
101 | X.values, [TRENDS[self.trend](t, opt) for t in range(0, X.shape[0])]
102 | ),
103 | self.trend_x0,
104 | method=self.method,
105 | options={"disp": False},
106 | )["x"]
107 |
108 | self.t0_ = X.index[0]
109 | freq = X.index.freq
110 | if freq is not None:
111 | self.period_ = freq
112 | else:
113 | self.period_ = X.index[1] - X.index[0]
114 |
115 | return self
116 |
117 | @add_class_name
118 | def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
119 | """Transform the ``time_series`` by removing the trend.
120 |
121 | Parameters
122 | ----------
123 | time_series: pd.DataFrame, shape (n_samples, 1), required
124 | The time series to transform.
125 |
126 | Returns
127 | -------
128 | time_series_t : pd.DataFrame, shape (n_samples, n_features)
129 | The transformed time series, without the trend.
130 |
131 | """
132 | check_is_fitted(self)
133 |
134 | time_steps = (time_series.index - self.t0_) / self.period_
135 |
136 | predictions = pd.Series(
137 | index=time_series.index,
138 | data=np.array(
139 | [TRENDS[self.trend](t, self.best_trend_params_) for t in time_steps]
140 | ).flatten(),
141 | )
142 |
143 | return time_series.sub(predictions, axis=0)
144 |
--------------------------------------------------------------------------------
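
Because `Detrender.fit` stores `t0_` and `period_`, `transform` can also detrend a window that starts after the training data; a sketch with synthetic linear data (an editor's illustration):

```python
import numpy as np
import pandas as pd

from gtime.feature_extraction import Detrender

X_train = pd.DataFrame(
    np.arange(10.0), index=pd.date_range("2020-01-01", "2020-01-10")
)
detrender = Detrender(trend="polynomial", trend_x0=np.zeros(2))
detrender.fit(X_train)

# Later dates are mapped onto the same time scale via the fitted t0_ and
# period_, so the linear trend extrapolates before being subtracted.
X_future = pd.DataFrame(
    np.arange(10.0, 15.0), index=pd.date_range("2020-01-11", "2020-01-15")
)
residual = detrender.transform(X_future)  # values should be close to zero
```
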
/gtime/plotting/tests/test_preprocessing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import re
4 | import pytest
5 | import hypothesis.strategies as st
6 | from hypothesis import given, settings, example
7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series, period_indexes
8 | from gtime.plotting.preprocessing import (
9 | seasonal_split,
10 | acf,
11 | pacf,
12 | _get_cycle_names,
13 | _get_season_names,
14 | _autocorrelation,
15 | _normalize,
16 | _solve_yw_equation,
17 | _week_of_year,
18 | yule_walker,
19 | )
20 |
21 |
22 | class TestSplits:
23 | @given(t=period_indexes(min_length=1, max_length=1))
24 | @example(t=pd.PeriodIndex(["1974-12-31"], freq="W"))
25 | @example(t=pd.PeriodIndex(["1972-01-01"], freq="W"))
26 | @settings(deadline=None)
27 | def test_week_of_year(self, t):
28 | period = t[0]
29 | week = _week_of_year(period)
30 | assert re.match(r"\d{4}_\d\d?$", week)
31 |
32 | @given(
33 | df=giotto_time_series(min_length=3, max_length=500),
34 | cycle=st.one_of(
35 | st.sampled_from(["year", "quarter", "month", "week"]),
36 | st.from_regex(r"[1-9][DWMQY]", fullmatch=True),
37 | ),
38 | )
39 | @settings(deadline=None)
40 | def test__get_cycle_names_size(self, df, cycle):
41 | cycle = _get_cycle_names(df, cycle)
42 | assert len(cycle) == len(df)
43 |
44 | @given(
45 | df=giotto_time_series(min_length=3, max_length=500),
46 | cycle=st.one_of(
47 | st.sampled_from(["year", "quarter", "month", "week"]),
48 | st.from_regex(r"[1-9][DWMQY]", fullmatch=True),
49 | ),
50 | freq=st.from_regex(r"[1-9]?[DWMQ]", fullmatch=True),
51 | )
52 | @settings(deadline=None)
53 | def test__get_season_names_size(self, df, cycle, freq):
54 | seasons = _get_season_names(df, cycle, freq)
55 | assert len(seasons) == len(df)
56 |
57 | @given(
58 | df=giotto_time_series(min_length=3, max_length=500),
59 | cycle=st.one_of(
60 | st.sampled_from(["year", "quarter", "month", "week"]),
61 | st.from_regex(r"[1-9][DWMQY]", fullmatch=True),
62 | ),
63 | freq=st.one_of(st.from_regex(r"[1-9]?[DWMQ]", fullmatch=True), st.none()),
64 | agg=st.sampled_from(["mean", "sum", "last"]),
65 | )
66 | @settings(deadline=None)
67 | def test_seasonal_split_shape_named(self, df, cycle, freq, agg):
68 | split = seasonal_split(df, cycle=cycle, freq=freq, agg=agg)
69 | if freq is None:
70 | freq = df.index.freqstr
71 | assert split.stack().shape == df.resample(freq).agg(agg).dropna().shape
72 |
73 |
74 | class TestAcf:
75 | @given(x=st.lists(st.floats(allow_nan=False), min_size=1))
76 | def test_autocorrelation(self, x):
77 | autocorr = _autocorrelation(np.array(x))
78 | expected = np.correlate(x, x, mode="full")[-len(x) :] / len(x)
79 | np.testing.assert_array_equal(autocorr, expected)
80 |
81 | @given(
82 | x=st.lists(
83 | st.floats(
84 | allow_nan=False, allow_infinity=False, max_value=1e20, min_value=-1e20
85 | ),
86 | min_size=1,
87 | )
88 | )
89 | def test_scale(self, x):
90 | scaled_x = _normalize(np.array(x))
91 | assert scaled_x.mean() == pytest.approx(0.0)
92 | assert scaled_x.std() == pytest.approx(1.0) or scaled_x.std() == pytest.approx(
93 | 0.0
94 | )
95 |
96 | @given(x=st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=2))
97 | def test_solve_yw(self, x):
98 | rho = _solve_yw_equation(np.array(x))
99 | if not np.isnan(np.sum(rho)):
100 | assert len(rho) == len(x) - 1
101 |
102 | @given(
103 | x=st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=2),
104 | order=st.integers(min_value=1),
105 | )
106 | def test_yule_walker_abs(self, x, order):
107 | pacf = yule_walker(np.array(x), order)
108 | if not (np.isnan(np.sum(pacf)) or len(pacf) == 0):
109 | assert all(abs(pacf) <= 2)
110 |
111 | @given(
112 | df=giotto_time_series(min_length=1, allow_nan=False, allow_infinity=False),
113 | max_lag=st.one_of(st.integers(min_value=1, max_value=100), st.none()),
114 | )
115 | def test_acf_len(self, df, max_lag):
116 | df_array = np.ravel(df.values)
117 | res = acf(df_array, max_lag)
118 | if max_lag is None:
119 | max_lag = len(df)
120 | assert len(res) == min(max_lag, len(df))
121 |
122 | @given(
123 | df=giotto_time_series(
124 | min_length=1, allow_nan=False, allow_infinity=False, max_length=50
125 | ),
126 | max_lag=st.one_of(st.integers(min_value=1, max_value=100), st.none()),
127 | )
128 | def test_pacf_len(self, df, max_lag):
129 | df_array = np.ravel(df.values)
130 | res = pacf(df_array, max_lag)
131 | if max_lag is None:
132 | max_lag = len(df)
133 | assert len(res) == min(max_lag, len(df))
134 |
--------------------------------------------------------------------------------
/gtime/forecasting/online.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.base import BaseEstimator
4 | from sklearn.utils.validation import check_array, check_X_y, check_random_state
5 |
6 |
7 | def l1(a, b):
8 | return np.abs(np.subtract(a, b))
9 |
10 |
11 | class HedgeForecaster(BaseEstimator):
12 | """Regressor model using Hedge algorithm.
13 |
14 | This algorithm is based on a multiplicative weight update method to create a dynamic combination of regression
15 | models. In theory, there is no common training phase on the data; only the loss is needed to update the model.
16 |
17 | Parameters
18 | ----------
19 |
20 | learning_rate : float, (default=0.001)
21 | The factor to use for the weight update.
22 |
23 | loss : callable, optional (default=`gtime.forecasting.online.l1`)
24 | Loss function used to compute the loss matrix.
25 |
26 | random_state : int, RandomState instance or None, optional (default=None)
27 |     Controls the randomness of the expert selected at each time step:
28 |     experts are sampled with probability proportional to their current weights.
33 |
34 | Attributes
35 | ----------
36 | loss_matrix_ : array, (n_samples, n_experts)
37 | Loss matrix between X and y.
38 |
39 | total_loss_ : int or float,
40 | Sum of losses based on Hedge algorithm decisions.
41 |
42 | weights_ : array, (n_experts)
43 | Last weight of each expert.
44 |
45 | decisions_ : array, (n_samples)
46 | Indices of chosen expert depending on weights.
47 |
48 | Examples
49 | --------
50 | >>> import pandas as pd
51 | >>> import numpy as np
52 | >>> from gtime.forecasting.online import HedgeForecaster
53 | >>> time_index = pd.date_range("2020-01-01", "2020-01-20")
54 | >>> X = pd.DataFrame(np.random.randint(4, size=(20, 3)), index=time_index)
55 | >>> y = pd.DataFrame(np.random.randint(4, size=(20, 1)), index=time_index, columns=["y_1"])
56 | >>> hr = HedgeForecaster(random_state=42)
57 | >>> hr.fit_predict(X, y).head()
58 | 0
59 | 2020-01-01 2
60 | 2020-01-02 0
61 | 2020-01-03 3
62 | 2020-01-04 3
63 | 2020-01-05 2
64 | >>> print(f"Estimator weights: {hr.weights_}")
65 | Estimator weights: [0.97713925 0.97723619 0.97980439]
66 | >>> print(f"Decisions: {hr.decisions_}")
67 | Decisions: [1 2 2 1 0 0 0 2 1 2 0 2 2 0 0 0 0 1 1 0]
68 | >>> print(f"Total loss: {hr.total_loss_}")
69 | Total loss: 30
70 |
71 | """
72 |
73 | def __init__(
74 | self, learning_rate: float = 0.001, loss: callable = l1, random_state=None
75 | ):
76 | # store the constructor arguments unchanged (scikit-learn convention)
77 | self.learning_rate = learning_rate
78 | self.loss = loss
79 | self.random_state = random_state
80 |
81 | def hedge(self, timestamps, n_experts, loss, eps, random_state):
82 | weights = np.ones(n_experts)
83 | self.decisions_ = np.zeros(timestamps, dtype=int)
84 |
85 | total_loss = 0
86 | for t in range(timestamps):
87 | self.decisions_[t] = random_state.choice(
88 | n_experts, p=weights / np.sum(weights)
89 | )
90 | total_loss += loss[t][int(self.decisions_[t])]
91 | weights *= np.exp(-eps * loss[t])
92 | return total_loss, weights
93 |
94 | def fit(self, X, y):
95 | """ Fit the model to data, compute weights and decisions iteratively.
96 |
97 | Parameters
98 | ----------
99 | X : array-like, shape (n_samples, n_features)
100 |     Data, one column per expert.
101 | y : array-like, shape (n_samples, n_outputs). True values used to compute the loss matrix.
102 | Returns
103 | -------
104 | self : object
105 | """
106 |
107 | random_state = check_random_state(self.random_state)
108 |
109 | # FIXME: multi_output is not currently supported but mono-column dataframe is 2D (n, 1) so multi_output=True
110 | # makes it easier to handle
111 | X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
112 |
113 | self.loss_matrix_ = self.loss(X, y)
114 |
115 | timestamps = len(X)
116 | n_experts = X.shape[1]
117 |
118 | self.total_loss_, self.weights_ = self.hedge(
119 | timestamps=timestamps,
120 | n_experts=n_experts,
121 | loss=self.loss_matrix_,
122 | eps=self.learning_rate,
123 | random_state=random_state,
124 | )
125 |
126 | return self
127 |
128 | def fit_predict(self, X, y):
129 | """Fit and predict variable using Hedge algorithm.
130 |
131 | Parameters
132 | ----------
133 | X : (sparse) array-like, shape (n_samples, n_features)
134 | Data.
135 |
136 | y : (sparse) array-like, shape (n_samples, n_outputs)
137 | Predictions.
138 |
139 | Returns
140 | -------
141 | predictions : pd.DataFrame
142 | Predictions.
143 | """
144 | self.fit(X, y)
145 |
146 | predictions = pd.DataFrame(
147 | np.take_along_axis(check_array(X), self.decisions_.reshape(-1, 1), axis=1),
148 | index=X.index,
149 | )
150 |
151 | return predictions
152 |
--------------------------------------------------------------------------------
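
A standalone numpy sketch of the multiplicative weight update behind `HedgeForecaster` (illustrative numbers, not the library code): each round an expert is drawn with probability proportional to its weight, then every weight decays by `exp(-learning_rate * loss)`.

```python
import numpy as np

rng = np.random.default_rng(0)
learning_rate = 0.5
weights = np.ones(3)                      # three experts, uniform start
loss_matrix = np.array([[0.0, 1.0, 2.0],  # per-round loss of each expert
                        [2.0, 0.0, 1.0]])

for round_losses in loss_matrix:
    p = weights / weights.sum()             # sampling distribution over experts
    chosen = rng.choice(len(weights), p=p)  # the expert used this round
    weights = weights * np.exp(-learning_rate * round_losses)

print(weights / weights.sum())  # experts with lower cumulative loss dominate
```
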
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
1 | Contributing guidelines
2 | =======================
3 |
4 | Pull Request Checklist
5 | ----------------------
6 |
7 | Before sending your pull requests, make sure you have followed this checklist.
8 | - Read the `contributing guidelines `_.
9 | - Read the `code of conduct `_.
10 | - Ensure you have signed the `contributor license agreement (CLA) `_.
11 | - Check if the changes are consistent with the guidelines.
12 | - Changes are consistent with the Coding Style.
13 | - Run Unit Tests.
14 |
15 | How to become a contributor and submit your own code
16 | ----------------------------------------------------
17 |
18 | Contributor License Agreements
19 | ------------------------------
20 |
21 | In order to become a contributor of Giotto, the first step is to sign the
22 | `contributor license agreement (CLA) `_.
23 | **NOTE**: Only original source code from you and other people that have signed
24 | the CLA can be accepted into the main repository.
25 |
26 | Contributing code
27 | -----------------
28 |
29 | If you have improvements to Giotto, do not hesitate to send us pull requests!
30 | Please follow the GitHub how-to (https://help.github.com/articles/using-pull-requests/).
31 | The Giotto Team will review your pull requests. Once the pull requests are approved and pass continuous integration checks, the
32 | Giotto team will work on getting your pull request submitted to our GitHub
33 | repository. Eventually, your pull request will be merged automatically on GitHub.
34 | If you want to contribute, start working through the Giotto codebase,
35 | navigate to the `GitHub issue tab `_
36 | and start looking through interesting issues. These are issues that we believe
37 | are particularly well suited for outside contributions, often because we
38 | probably won't get to them right now. If you decide to start on an issue, leave
39 | a comment so that other people know that you're working on it. If you want to
40 | help out but would rather not work alone, use the issue comment thread to coordinate.
41 |
42 | Contribution guidelines and standards
43 | -------------------------------------
44 |
45 | Before sending your pull request for review, make sure your changes are
46 | consistent with the guidelines and follow the coding style below.
47 |
48 | General guidelines and philosophy for contribution
49 | --------------------------------------------------
50 |
51 | * Include unit tests when you contribute new features, as they help to
52 | a) prove that your code works correctly, and
53 | b) guard against future breaking changes to lower the maintenance cost.
54 | * Bug fixes also generally require unit tests, because the presence of bugs
55 | usually indicates insufficient test coverage.
56 | * Keep API compatibility in mind when you change code in core Giotto.
57 | * Clearly define your exceptions using the utils functions and test the exceptions.
58 | * When you contribute a new feature to Giotto, the maintenance burden is
59 | (by default) transferred to the Giotto team. This means that the benefit
60 | of the contribution must be compared against the cost of maintaining the
61 | feature.
62 |
63 | C++ coding style
64 | ----------------
65 |
66 | Changes to Giotto C/C++ code should conform to `Google C++ Style Guide `_.
67 | Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on
68 | ubuntu:16.04, do:
69 |
70 |
71 | .. code-block:: bash
72 |
73 | apt-get install -y clang-tidy
74 |
75 | You can check a C/C++ file by doing:
76 |
77 | .. code-block:: bash
78 |
79 | clang-format --style=google my_cc_file.cc > /tmp/my_cc_file.cc && diff my_cc_file.cc /tmp/my_cc_file.cc
80 |
81 | Python coding style
82 | -------------------
83 |
84 | Changes to Giotto Python code should conform to PEP8 directives.
85 | Use `flake8` to check your Python changes. To install `flake8` just do
86 |
87 | .. code-block:: bash
88 |
89 | pip install flake8
90 |
91 | You can use `flake8` on your python code via the following instructions:
92 |
93 | .. code-block:: bash
94 |
95 | flake8 name_of_your_script.py
96 |
97 | Git pre-commit hook
98 | -------------------
99 | We provide a pre-commit git hook that prevents accidental commits to the master branch and automatically formats
100 | the code using `black`. To activate it, install the `pre-commit` library.
101 |
102 | Development requirements
103 | ------------------------
104 | In order to contribute to giotto-time, some additional Python packages are required beyond the standard
105 | requirements. To install them, do
106 |
107 | .. code-block:: python
108 |
109 | pip install -r dev-requirements.txt
110 |
111 | Running unit tests
112 | ------------------
113 |
114 | There are two ways to run Giotto unit tests.
115 |
1. Using tools and libraries installed directly on your system. The
   recommended tool is `pytest`. To install `pytest`, run:

   .. code-block:: bash

       pip install pytest

   You can then run `pytest` on a test file as follows (see below for running
   the whole suite):

   .. code-block:: bash

       pytest name_of_your_script.py
127 |
128 | 2. Using Azure (azure-pipelines.yml) and Giotto's CI scripts.
129 |
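To run the whole test suite locally from the repository root (assuming
`pytest` and the development requirements are installed), the following
should suffice:

.. code-block:: bash

    pytest gtime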
--------------------------------------------------------------------------------
/gtime/hierarchical/tests/test_bottom_up.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | import sklearn
5 | from hypothesis import given
6 | import networkx as nx
8 | import hypothesis.strategies as st
9 | from hypothesis.extra.numpy import arrays
10 | from pytest import fixture
11 |
12 | from gtime.hierarchical import HierarchicalBottomUp
13 | from gtime.utils.fixtures import (
14 | time_series_forecasting_model1_no_cache,
15 | features1,
16 | model1,
17 | )
18 | from gtime.utils.hypothesis.time_indexes import giotto_time_series, period_indexes
19 |
20 |
21 | @st.composite
22 | def n_time_series_with_same_index(
23 | draw, min_length: int = 5, min_n: int = 1, max_n: int = 5,
24 | ):
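    """Draw a dict mapping string keys to single-column DataFrames that all
    share the same period index."""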
25 | n = draw(st.integers(min_value=min_n, max_value=max_n))
26 | index = draw(period_indexes(min_length=min_length))
27 | dictionary = {}
28 | for i in range(n):
29 | key = str(i)
30 | df_values = draw(
31 | arrays(
32 | dtype=np.float64,
33 | shape=index.shape[0],
34 | elements=st.floats(allow_nan=False, allow_infinity=False, width=32),
35 | )
36 | )
37 | value = pd.DataFrame(index=index, data=df_values)
38 | dictionary[key] = value
39 | return dictionary
40 |
41 |
@st.composite
def tree_construction(draw, dictionary):
    # Build a random directed tree over the dictionary keys. Using ``draw``
    # (rather than the ``random`` module) keeps the generated trees
    # reproducible and shrinkable under hypothesis.
    tree_nodes = list(dictionary.keys())
    tree = nx.DiGraph()
    n = len(tree_nodes)
    for _ in range(n):
        selected_key = draw(st.sampled_from(tree_nodes))
        if len(tree) > 0:
            selected_node = draw(st.sampled_from(list(tree.nodes)))
            tree.add_edge(selected_node, selected_key)
        tree.add_node(selected_key)
        tree_nodes.remove(selected_key)
    return tree
55 |
56 |
57 | @st.composite
58 | def hierarchical_bottom_up_model(draw, time_series_forecasting_model1_no_cache):
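    """Build a ``HierarchicalBottomUp`` model over a random hierarchy tree
    drawn for a freshly generated set of time series."""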
59 | dataframes = draw(n_time_series_with_same_index(min_n=5))
60 | tree = draw(tree_construction(dataframes))
61 | return HierarchicalBottomUp(time_series_forecasting_model1_no_cache, tree)
62 |
63 |
64 | @fixture(scope="function")
65 | def hierarchical_basic_bottom_up_model(time_series_forecasting_model1_no_cache):
66 | return HierarchicalBottomUp(time_series_forecasting_model1_no_cache, "infer")
67 |
68 |
69 | class TestHierarchicalBottomUp:
70 | def test_basic_constructor(self, time_series_forecasting_model1_no_cache):
71 | HierarchicalBottomUp(
72 | model=time_series_forecasting_model1_no_cache, hierarchy_tree="infer"
73 | )
74 |
75 | @given(dataframes=n_time_series_with_same_index(min_n=5))
76 | def test_fit_predict_basic_bottom_up_on_different_data(
77 | self, dataframes, hierarchical_basic_bottom_up_model
78 | ):
79 | hierarchical_basic_bottom_up_model.fit(dataframes).predict(dataframes)
80 |
81 | @given(dataframes=n_time_series_with_same_index(min_n=5))
82 | def test_fit_predict_basic_bottom_up(
83 | self, dataframes, hierarchical_basic_bottom_up_model
84 | ):
85 | hierarchical_basic_bottom_up_model.fit(dataframes).predict()
86 |
    @given(data=st.data(), dataframes=n_time_series_with_same_index())
    def test_constructor(
        self, data, time_series_forecasting_model1_no_cache, dataframes
    ):
        # tree_construction is a strategy: draw a concrete tree from it
        # instead of passing the strategy object to the constructor.
        tree = data.draw(tree_construction(dataframes))
        HierarchicalBottomUp(time_series_forecasting_model1_no_cache, tree)
91 |
92 | @given(data=st.data(), dataframes=n_time_series_with_same_index(min_n=5))
93 | def test_fit_predict_bottom_up(
94 | self, data, dataframes, time_series_forecasting_model1_no_cache
95 | ):
96 | model = data.draw(
97 | hierarchical_bottom_up_model(time_series_forecasting_model1_no_cache)
98 | )
99 | prediction = model.fit(dataframes).predict()
        for key in dataframes.keys():
            # every fitted series must appear in the prediction output
            assert key in prediction.keys()
103 |
104 | @given(dataframes=n_time_series_with_same_index(min_n=5))
105 | def test_fit_predict_on_subset_of_time_series(
106 | self, dataframes, hierarchical_basic_bottom_up_model
107 | ):
108 | key = np.random.choice(list(dataframes.keys()), 1)[0]
109 | hierarchical_basic_bottom_up_model.fit(dataframes)
110 | hierarchical_basic_bottom_up_model.predict({key: dataframes[key]})
111 |
112 | def test_error_predict_not_fitted(self, hierarchical_basic_bottom_up_model):
113 | with pytest.raises(sklearn.exceptions.NotFittedError):
114 | hierarchical_basic_bottom_up_model.predict()
115 |
116 | @given(dataframes=n_time_series_with_same_index())
117 | def test_error_with_bad_predict_key(
118 | self, dataframes, hierarchical_basic_bottom_up_model
119 | ):
120 | correct_key = np.random.choice(list(dataframes.keys()), 1)[0]
121 | bad_key = "".join(dataframes.keys()) + "bad_key"
122 | hierarchical_basic_bottom_up_model.fit(dataframes)
123 | with pytest.raises(KeyError):
124 | hierarchical_basic_bottom_up_model.predict(
125 | {bad_key: dataframes[correct_key]}
126 | )
127 |
128 | @given(time_series=giotto_time_series(min_length=5))
129 | def test_error_fit_dataframe(self, time_series, hierarchical_basic_bottom_up_model):
130 | with pytest.raises(ValueError):
131 | hierarchical_basic_bottom_up_model.fit(time_series)
132 |
133 | @given(time_series=giotto_time_series(min_length=5))
134 | def test_error_fit_key_not_string(
135 | self, time_series, hierarchical_basic_bottom_up_model
136 | ):
137 | with pytest.raises(ValueError):
138 | hierarchical_basic_bottom_up_model.fit({1: time_series})
139 |
140 | def test_error_fit_value_not_dataframe(self, hierarchical_basic_bottom_up_model):
141 | with pytest.raises(ValueError):
142 | hierarchical_basic_bottom_up_model.fit({"wrong_field": 12})
143 |
--------------------------------------------------------------------------------
/gtime/utils/hypothesis/feature_matrices.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import hypothesis.strategies as st
4 | import pandas as pd
5 | from hypothesis.extra.numpy import arrays
6 | from hypothesis.strategies import tuples, integers, floats
7 |
8 | from gtime.utils.hypothesis.general_strategies import shape_X_y_matrices, shape_matrix
9 | from .time_indexes import giotto_time_series
10 | from ...compose import FeatureCreation
11 | from ...model_selection import horizon_shift
12 |
13 |
14 | @st.composite
15 | def X_y_matrices(
16 | draw,
17 | horizon: int,
18 | df_transformer: FeatureCreation,
19 | min_length: Optional[int] = None,
20 | allow_nan_infinity: bool = True,
21 | ):
22 | """ Returns a strategy that generates X and y feature matrices.
23 |
24 | Parameters
25 | ----------
26 | horizon : ``int``, required
27 | The number of steps to forecast in the future. It affects the y shape.
28 |
    df_transformer : ``FeatureCreation``, required
        The ``FeatureCreation`` transformer used to build the X matrix from the
        generated time series.
31 |
32 | min_length : ``int``, optional, (default=``None``)
33 | Minimum length of the matrices
34 |
35 | allow_nan_infinity : ``bool``, optional, (default=``True``)
36 | Allow nan and infinity in the starting time series
37 |
38 | Returns
39 | -------
40 | X : pd.DataFrame
41 | X feature matrix
42 |
43 | y : pd.DataFrame
44 | y feature matrix
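
    Notes
    -----
    A sketch of intended usage (assuming ``FeatureCreation`` accepts a
    ColumnTransformer-style list of ``(name, transformer, columns)`` triples)::

        from sklearn.compose import make_column_selector
        from gtime.compose import FeatureCreation
        from gtime.feature_extraction import Shift

        dft = FeatureCreation([("s1", Shift(1), make_column_selector())])
        X, y = X_y_matrices(horizon=3, df_transformer=dft, min_length=10).example()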
45 | """
46 | min_length = min_length if min_length is not None else 1
47 | period_index_series = draw(
48 | giotto_time_series(
49 | min_length=min_length,
50 | allow_nan=allow_nan_infinity,
51 | allow_infinity=allow_nan_infinity,
52 | )
53 | )
54 | X = df_transformer.fit_transform(period_index_series)
55 | y = horizon_shift(period_index_series, horizon=horizon)
56 |
57 | return X, y
58 |
59 |
60 | @st.composite
61 | def X_matrices(
62 | draw,
63 | df_transformer: FeatureCreation,
64 | min_length: Optional[int] = None,
65 | allow_nan_infinity: bool = True,
66 | ):
67 | """ Returns a strategy that generates the X feature matrix.
68 |
69 | Parameters
70 | ----------
    df_transformer : ``FeatureCreation``, required
        the ``FeatureCreation`` transformer used to build the X matrix from
        the generated time series
74 |
75 | min_length : ``int``, optional, (default=``None``)
76 | minimum length of the matrices
77 |
78 | allow_nan_infinity : ``bool``, optional, (default=``True``)
79 | allow nan and infinity in the starting time series
80 |
81 | Returns
82 | -------
83 | X : ``pd.DataFrame``
84 | X feature matrix
85 | """
86 | min_length = min_length if min_length is not None else 1
87 | period_index_series = draw(
88 | giotto_time_series(
89 | min_length=min_length,
90 | allow_nan=allow_nan_infinity,
91 | allow_infinity=allow_nan_infinity,
92 | )
93 | )
94 |
95 | X = df_transformer.fit_transform(period_index_series)
96 | return X
97 |
98 |
99 | @st.composite
100 | def y_matrices(
101 | draw,
102 | horizon: int = 3,
103 | min_length: Optional[int] = None,
104 | allow_nan_infinity: bool = True,
105 | ):
106 | """ Returns a strategy that generates the y feature matrix.
107 |
108 | Parameters
109 | ----------
110 | horizon : ``int``, optional, (default=3)
111 | the number of steps to forecast in the future. It affects the y shape.
112 |
113 | min_length : ``int``, optional, (default=``None``)
114 | minimum length of the matrices
115 |
116 | allow_nan_infinity : ``bool``, optional, (default=``True``)
117 | allow nan and infinity in the starting time series
118 |
119 | Returns
120 | -------
121 | y : ``pd.DataFrame``
122 | y feature matrix
123 | """
124 | min_length = min_length if min_length is not None else 1
125 | period_index_series = draw(
126 | giotto_time_series(
127 | min_length=min_length,
128 | allow_nan=allow_nan_infinity,
129 | allow_infinity=allow_nan_infinity,
130 | )
131 | )
132 |
133 | y = horizon_shift(period_index_series, horizon=horizon)
134 |
135 | return y
136 |
137 |
138 | @st.composite
139 | def numpy_X_y_matrices(
140 | draw,
141 | X_y_shapes=shape_X_y_matrices(),
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
144 | allow_nan: bool = False,
145 | allow_infinity: bool = False,
146 | ):
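    """Draw a pair of float numpy matrices ``(X, y)`` with the same number of
    rows and with X having more rows than columns."""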
    if isinstance(X_y_shapes, (tuple, list)):
148 | X_shape, y_shape = X_y_shapes
149 | else:
150 | X_shape, y_shape = draw(X_y_shapes)
151 | if X_shape[0] != y_shape[0]:
152 | raise ValueError(f"X.shape[0] must be == y.shape[0]: {X_shape}, {y_shape}")
    if X_shape[0] <= X_shape[1]:
        raise ValueError(f"X.shape[0] must be > X.shape[1]: {X_shape}")
155 |
156 | elements = floats(
157 | min_value=min_value,
158 | max_value=max_value,
159 | allow_nan=allow_nan,
160 | allow_infinity=allow_infinity,
161 | )
162 | X = draw(arrays(dtype=float, shape=X_shape, elements=elements,))
163 | y = draw(arrays(dtype=float, shape=y_shape, elements=elements,))
164 | return X, y
165 |
166 |
167 | @st.composite
168 | def numpy_X_matrices(
169 | draw,
170 | shape=shape_matrix(),
    min_value: Optional[float] = None,
    max_value: Optional[float] = None,
173 | allow_nan: bool = False,
174 | allow_infinity: bool = False,
175 | ):
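    """Draw a float numpy matrix ``X`` with more rows than columns."""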
    if not isinstance(shape, (tuple, list)):
177 | shape = draw(shape)
    if shape[0] <= shape[1]:
        raise ValueError(f"X.shape[0] must be > X.shape[1]: {shape}")
180 |
181 | elements = floats(
182 | min_value=min_value,
183 | max_value=max_value,
184 | allow_nan=allow_nan,
185 | allow_infinity=allow_infinity,
186 | )
187 |
188 | X = draw(arrays(dtype=float, shape=shape, elements=elements,))
189 | return X
190 |
--------------------------------------------------------------------------------
/gtime/causality/base.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from itertools import product
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from scipy import stats
7 | from sklearn.utils.validation import check_is_fitted
8 |
9 |
10 | class CausalityMixin:
11 | """ Base class for causality tests. """
12 |
13 | def __init__(self, bootstrap_iterations, permutation_iterations):
14 | self.bootstrap_iterations = bootstrap_iterations
15 | self.permutation_iterations = permutation_iterations
16 |
17 | def transform(self, data: pd.DataFrame) -> pd.DataFrame:
18 | """Shifts each input time series by the amount which optimizes correlation with
19 | the selected 'target_col' column. If no target column is specified, the first
20 | column of the DataFrame is taken as the target.
21 |
22 | Parameters
23 | ----------
24 | data : pd.DataFrame, shape (n_samples, n_time_series), required
25 | The DataFrame containing the time series on which to perform the
26 | transformation.
27 |
        Returns
        -------
        data_t : pd.DataFrame, shape (n_samples, n_time_series)
            The input DataFrame with each non-target column shifted by the lag
            that maximizes its correlation with the target column.
33 |
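        Notes
        -----
        A sketch of typical usage through a concrete subclass (the class name
        below is assumed from ``gtime.causality``; adapt it to the causality
        test you actually use)::

            from gtime.causality import ShiftedPearsonCorrelation

            spc = ShiftedPearsonCorrelation(target_col="y", max_shift=10)
            shifted = spc.fit(data).transform(data)
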
34 | """
35 | check_is_fitted(self)
36 | data_t = data.copy()
37 |
38 | if self.target_col is None:
39 | self.target_col = data_t.columns[0]
40 | warnings.warn(
41 | "The target column was not specified. Therefore, the first "
42 | f"column {self.target_col } of the DataFrame was taken as "
43 | "target column. If you want to transform with respect to "
44 | "another column, please use it as a target column."
45 | )
46 |
47 | for col in data_t:
48 | if col != self.target_col:
49 | data_t[col] = data_t[col].shift(self.best_shifts_[self.target_col][col])
50 | if self.dropna:
51 | data_t = data_t.dropna()
52 |
53 | return data_t
54 |
55 | def _initialize_table(self):
56 | best_shifts = pd.DataFrame(columns=["x", "y", "shift", "max_corr"])
        # "x" and "y" hold column names; "max_corr" is a correlation in [-1, 1]
        column_types = {
            "x": str,
            "y": str,
            "shift": np.int64,
            "max_corr": np.float64,
        }
63 |
        if self.bootstrap_iterations:
            best_shifts = best_shifts.reindex(
                best_shifts.columns.tolist() + ["bootstrap_p_values"], axis=1
            )
            column_types["bootstrap_p_values"] = np.float64

        if self.permutation_iterations:
            best_shifts = best_shifts.reindex(
                best_shifts.columns.tolist() + ["permutation_p_values"], axis=1
            )
            column_types["permutation_p_values"] = np.float64
69 |
70 | best_shifts = best_shifts.astype(column_types)
71 | return best_shifts
72 |
73 | def _compute_best_shifts(self, data, shift_func):
74 | best_shifts = self._initialize_table()
75 |
76 | if self.target_col is None:
77 | columns_to_shift = [(x, y) for x, y in product(data.columns, repeat=2)]
78 |
79 | else:
80 | columns_to_shift = [(col, self.target_col) for col in data.columns]
81 |
82 | for (x, y) in columns_to_shift:
83 | res = shift_func(data, x=x, y=y)
84 | best_shift = res[1]
85 | max_corr = res[0]
86 | tables = {
87 | "x": x,
88 | "y": y,
89 | "shift": best_shift,
90 | "max_corr": max_corr,
91 | }
92 | if self.bootstrap_iterations:
93 | bootstrap_p_value = self._compute_p_values(
94 | data=data, x=x, y=y, shift=best_shift, test_type="bootstrap"
95 | )
96 | tables["bootstrap_p_values"] = bootstrap_p_value
97 |
98 | if self.permutation_iterations:
99 | bootstrap_p_value = self._compute_p_values(
100 | data=data, x=x, y=y, shift=best_shift, test_type="permutation"
101 | )
102 | tables["permutation_p_values"] = bootstrap_p_value
103 |
104 | best_shifts = best_shifts.append(tables, ignore_index=True,)
105 |
106 | return best_shifts
107 |
108 | def _compute_p_values(self, data, x, y, shift, test_type):
109 | data_t = data.copy()
110 | data_t[x] = data_t.shift(shift)[x]
111 | data_t.dropna(axis=0, inplace=True)
112 | rhos = []
113 | n_iterations = (
114 | self.permutation_iterations
115 | if test_type == "permutation"
116 | else self.bootstrap_iterations
117 | )
118 |
        for _ in range(n_iterations):
            if test_type == "permutation":
                # Permute x only: shuffling whole rows would leave each
                # (x, y) pair intact and the correlation unchanged.
                samples = data_t.copy()
                samples[x] = np.random.permutation(samples[x].values)
            else:
                # Bootstrap: resample (x, y) pairs with replacement.
                samples = data_t.sample(n=len(data_t), replace=True)

            rhos.append(stats.pearsonr(samples[x], samples[y])[0])

        # Fraction of resampled correlations below zero; the two-sided p-value
        # doubles the smaller tail.
        percentile = stats.percentileofscore(rhos, 0) / 100
        p_value = 2 * percentile if percentile < 0.5 else 2 * (1 - percentile)

        return p_value
132 |
133 | def _create_pivot_tables(self, best_shifts):
134 | pivot_best_shifts = pd.pivot_table(
135 | best_shifts, index=["x"], columns=["y"], values="shift"
136 | )
137 | max_corrs = pd.pivot_table(
138 | best_shifts, index=["x"], columns=["y"], values="max_corr"
139 | )
140 |
141 | pivot_tables = {"best_shifts": pivot_best_shifts, "max_corrs": max_corrs}
142 |
143 | if self.bootstrap_iterations:
144 | bootstrap_p_values = pd.pivot_table(
145 | best_shifts, index=["x"], columns=["y"], values="bootstrap_p_values"
146 | )
147 | pivot_tables["bootstrap_p_values"] = bootstrap_p_values
148 |
149 | if self.permutation_iterations:
150 | permutation_p_values = pd.pivot_table(
151 | best_shifts, index=["x"], columns=["y"], values="permutation_p_values"
152 | )
153 | pivot_tables["permutation_p_values"] = permutation_p_values
154 |
155 | return pivot_tables
156 |
--------------------------------------------------------------------------------
/gtime/preprocessing/tests/utils.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from typing import List, Union, Optional, Tuple
3 |
4 | import numpy as np
5 | import pandas as pd
6 | from pandas.testing import assert_series_equal
7 |
8 | from gtime.preprocessing.time_series_conversion import (
9 | _SequenceToTimeIndexSeries,
10 | _PandasSeriesToTimeIndexSeries,
11 | _TimeIndexSeriesToPeriodIndexSeries,
12 | count_not_none,
13 | )
14 | from gtime.utils.testing_constants import DEFAULT_START, DEFAULT_FREQ
15 |
16 | PandasDate = Union[datetime, pd.Timestamp, str]
17 |
18 |
19 | def compare_output_of_input_sequence_to_expected_one(
20 | input_sequence, start, end, freq,
21 | ):
22 | computed_pandas_series = transform_sequence_into_time_index_series(
23 | input_sequence, start, end, freq
24 | )
25 | expected_pandas_series = pandas_series_with_period_index(
26 | input_sequence, start, end, freq
27 | )
28 | assert_series_equal(computed_pandas_series, expected_pandas_series)
29 |
30 |
31 | def compare_output_of_input_series_to_expected_one(
32 | input_sequence, start, end, freq,
33 | ):
34 | computed_pandas_series = transform_series_into_time_index_series(
35 | input_sequence, start, end, freq
36 | )
37 | expected_pandas_series = pandas_series_with_period_index(
38 | input_sequence.values, start, end, freq
39 | )
40 | assert_series_equal(computed_pandas_series, expected_pandas_series)
41 |
42 |
43 | def transform_sequence_into_time_index_series(
    array_like_object: Union[np.ndarray, list, pd.Series],
45 | start: Optional[str] = None,
46 | end: Optional[str] = None,
47 | freq: Optional[str] = None,
48 | ) -> pd.Series:
49 | time_series_conversion = _SequenceToTimeIndexSeries(start, end, freq)
50 | return time_series_conversion.transform(array_like_object)
51 |
52 |
53 | def transform_series_into_time_index_series(
    array_like_object: Union[np.ndarray, list, pd.Series],
55 | start: Optional[str] = None,
56 | end: Optional[str] = None,
57 | freq: Optional[str] = None,
58 | ) -> pd.Series:
59 | time_series_conversion = _PandasSeriesToTimeIndexSeries(start, end, freq)
60 | return time_series_conversion.transform(array_like_object)
61 |
62 |
63 | def transform_time_index_series_into_period_index_series(
64 | series: pd.Series, freq: pd.Timedelta = None,
65 | ) -> pd.Series:
66 | to_period_conversion = _TimeIndexSeriesToPeriodIndexSeries(freq=freq)
67 | return to_period_conversion.transform(series)
68 |
69 |
70 | def pandas_series_with_period_index(
    values: Union[np.ndarray, List[float]],
72 | start: Optional[datetime] = None,
73 | end: Optional[datetime] = None,
74 | freq: Optional[pd.Timedelta] = None,
75 | ) -> pd.Series:
76 | start, end, freq = _initialize_start_end_freq(start, end, freq)
77 | index = pd.period_range(start=start, end=end, periods=len(values), freq=freq,)
78 | return pd.Series(index=index, data=values, dtype=np.float64)
79 |
80 |
81 | def _initialize_start_end_freq(
82 | start: PandasDate, end: PandasDate, freq: pd.Timedelta
83 | ) -> Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]:
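    """Fill in whichever of ``start``/``end``/``freq`` are missing with the
    test defaults, so that ``pd.period_range`` receives a valid combination."""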
84 | not_none_params = count_not_none(start, end, freq)
85 | if not_none_params == 0:
86 | start, end, freq = _default_params_initialization()
87 | elif not_none_params == 1:
88 | start, end, freq = _one_not_none_param_initialization(start, end, freq)
89 | elif not_none_params == 2:
90 | start, end, freq = _two_not_none_params_initialization(start, end, freq)
91 | else:
92 | raise ValueError(
93 | "Of the three parameters: start, end, and "
94 | "freq, exactly two must be specified"
95 | )
96 | return start, end, freq
97 |
98 |
99 | def _default_params_initialization() -> Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]:
100 | start = DEFAULT_START
101 | end = None
102 | freq = DEFAULT_FREQ
103 | return start, end, freq
104 |
105 |
def _one_not_none_param_initialization(
    start, end, freq
) -> Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]:
    # Exactly one of the three parameters is set; fill in the defaults and
    # leave the remaining parameter as None so that pandas infers it.
    if start is not None or end is not None:
        freq = DEFAULT_FREQ
    else:
        start = DEFAULT_START
    return start, end, freq
122 |
123 |
def _two_not_none_params_initialization(
    start, end, freq
) -> Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]:
    # With two of the three parameters given, pandas can infer the third:
    # pass everything through unchanged.
    return start, end, freq
131 |
132 |
133 | def datetime_index_series_to_period_index_series(
134 | datetime_index_series: pd.Series, freq: Optional[pd.Timedelta] = None
135 | ) -> pd.Series:
    if datetime_index_series.index.freq is not None:
        return pd.Series(
            index=pd.PeriodIndex(datetime_index_series.index),
            data=datetime_index_series.values,
        )
145 | else:
146 | freq = "1D" if freq is None else freq
147 | return pd.Series(
148 | index=pd.PeriodIndex(datetime_index_series.index, freq=freq),
149 | data=datetime_index_series.values,
150 | )
151 |
152 |
153 | def timedelta_index_to_datetime(
154 | index: pd.TimedeltaIndex, start: datetime = datetime(year=1970, month=1, day=1),
155 | ) -> pd.DatetimeIndex:
156 | return start + index
157 |
158 |
159 | def timedelta_index_series_to_period_index_series(
160 | timedelta_index_series: pd.Series, freq: Optional[pd.Timedelta] = None
161 | ) -> pd.Series:
162 | datetime_index = timedelta_index_to_datetime(timedelta_index_series.index)
163 | if datetime_index.freq is None:
164 | freq = "1D" if freq is None else freq
165 | period_index = pd.PeriodIndex(datetime_index, freq=freq)
166 | else:
167 | period_index = pd.PeriodIndex(datetime_index)
168 | return pd.Series(index=period_index, data=timedelta_index_series.values)
169 |
--------------------------------------------------------------------------------
/gtime/time_series_models/simple_models.py:
--------------------------------------------------------------------------------
1 | from gtime.compose import FeatureCreation
2 | from sklearn.compose import make_column_selector
3 | from gtime.feature_extraction import Shift, MovingAverage, MovingCustomFunction
4 | from gtime.time_series_models import TimeSeriesForecastingModel
5 | from gtime.forecasting import (
6 | NaiveForecaster,
7 | SeasonalNaiveForecaster,
8 | DriftForecaster,
9 | AverageForecaster,
10 | )
11 |
12 |
13 | class Naive(TimeSeriesForecastingModel):
14 | """ Naive model pipeline, no feature creation and ``NaiveModel()`` as a model
15 |
16 | Parameters
17 | ----------
18 | horizon: int - prediction horizon, in time series periods
19 |
20 | Examples
21 | --------
22 | >>> import pandas as pd
23 | >>> import numpy as np
24 | >>> from gtime.time_series_models import Naive
25 | >>> idx = pd.period_range(start='2011-01-01', end='2012-01-01')
26 | >>> np.random.seed(0)
27 | >>> df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=['1'])
28 | >>> model = Naive(horizon=4)
29 | >>> model.fit(df)
30 | >>> model.predict()
31 | y_1 y_2 y_3 y_4
32 | 2011-12-29 0.543806 0.543806 0.543806 0.543806
33 | 2011-12-30 0.456911 0.456911 0.456911 0.456911
34 | 2011-12-31 0.882041 0.882041 0.882041 0.882041
35 | 2012-01-01 0.458604 0.458604 0.458604 0.458604
36 | """
37 |
38 | def __init__(self, horizon: int):
39 | features = [
40 | ("s1", Shift(0), make_column_selector()),
41 | ]
42 | super().__init__(features=features, horizon=horizon, model=NaiveForecaster())
43 |
44 |
45 | class Average(TimeSeriesForecastingModel):
46 | """ Average model pipeline, no feature creation and ``AverageModel()`` as a model
47 |
48 | Parameters
49 | ----------
50 | horizon: int - prediction horizon, in time series periods
51 |
52 | Examples
53 | --------
54 | >>> import pandas as pd
55 | >>> import numpy as np
56 | >>> from gtime.time_series_models import Average
57 | >>> idx = pd.period_range(start='2011-01-01', end='2012-01-01')
58 | >>> np.random.seed(0)
59 | >>> df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=['1'])
60 | >>> model = Average(horizon=5)
61 | >>> model.fit(df)
62 | >>> model.predict()
63 | y_1 y_2 y_3 y_4 y_5
64 | 2011-12-28 0.558475 0.558475 0.558475 0.558475 0.558475
65 | 2011-12-29 0.556379 0.556379 0.556379 0.556379 0.556379
66 | 2011-12-30 0.543946 0.543946 0.543946 0.543946 0.543946
67 | 2011-12-31 0.581512 0.581512 0.581512 0.581512 0.581512
68 | 2012-01-01 0.569221 0.569221 0.569221 0.569221 0.569221
69 |
70 | """
71 |
72 | def __init__(self, horizon: int):
73 | features = [
74 | ("s1", Shift(0), make_column_selector()),
75 | ]
76 | super().__init__(features=features, horizon=horizon, model=AverageForecaster())
77 |
78 |
79 | class SeasonalNaive(TimeSeriesForecastingModel):
80 | """ Seasonal naive model pipeline, no feature creation and ``SeasonalNaiveModel()`` as a model
81 |
82 | Parameters
83 | ----------
84 | horizon: int - prediction horizon, in time series periods
85 | seasonal_length: int - full season cycle length, in time series periods
86 |
87 | Examples
88 | --------
89 |
90 | >>> import pandas as pd
91 | >>> import numpy as np
92 | >>> from gtime.time_series_models import SeasonalNaive
93 | >>> idx = pd.period_range(start='2011-01-01', end='2012-01-01')
94 | >>> np.random.seed(0)
95 | >>> df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=['1'])
96 | >>> model = SeasonalNaive(horizon=5, seasonal_length=4)
97 | >>> model.fit(df)
98 | >>> model.predict()
100 | y_1 y_2 y_3 y_4 y_5
101 | 2011-12-28 0.392676 0.956406 0.187131 0.128861 0.392676
102 | 2011-12-29 0.956406 0.187131 0.128861 0.392676 0.956406
103 | 2011-12-30 0.187131 0.128861 0.392676 0.956406 0.187131
104 | 2011-12-31 0.128861 0.392676 0.956406 0.187131 0.128861
105 | 2012-01-01 0.392676 0.956406 0.187131 0.128861 0.392676
106 | """
107 |
108 | def __init__(self, horizon: int, seasonal_length: int):
109 | features = [
110 | ("s1", Shift(0), make_column_selector()),
111 | ]
112 | self.seasonal_length = seasonal_length
113 | self.horizon = horizon
114 | super().__init__(
115 | features=features,
116 | horizon=horizon,
117 | model=SeasonalNaiveForecaster(seasonal_length),
118 | )
119 |
120 |
121 | class Drift(TimeSeriesForecastingModel):
122 | """ Simple drift model pipeline, no feature creation and ``DriftModel()`` as a model
123 |
124 | Parameters
125 | ----------
126 | horizon: int - prediction horizon, in time series periods
127 |
128 | Examples
129 | --------
130 |
131 | >>> import pandas as pd
132 | >>> import numpy as np
133 | >>> from gtime.time_series_models import Drift
134 | >>> idx = pd.period_range(start='2011-01-01', end='2012-01-01')
135 | >>> np.random.seed(0)
136 | >>> df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=['1'])
137 | >>> model = Drift(horizon=5)
138 | >>> model.fit(df)
139 | >>> model.predict()
141 | y_1 y_2 y_3 y_4 y_5
142 | 2011-12-28 0.903984 0.902982 0.901980 0.900978 0.899976
143 | 2011-12-29 0.543806 0.542804 0.541802 0.540800 0.539798
144 | 2011-12-30 0.456911 0.455910 0.454908 0.453906 0.452904
145 | 2011-12-31 0.882041 0.881040 0.880038 0.879036 0.878034
146 | 2012-01-01 0.458604 0.457602 0.456600 0.455598 0.454596
147 |
148 | """
149 |
150 | def __init__(self, horizon: int):
151 | features = [
152 | ("s1", Shift(0), make_column_selector()),
153 | ]
154 | super().__init__(features=features, horizon=horizon, model=DriftForecaster())
155 |
--------------------------------------------------------------------------------
/gtime/time_series_models/tests/test_cv_pipeline.py:
--------------------------------------------------------------------------------
1 | import pytest
2 | import pandas as pd
3 | import numpy as np
4 | from hypothesis import given, settings
5 | import hypothesis.strategies as st
6 | from gtime.time_series_models import CVPipeline
7 | from gtime.metrics import max_error, mae, rmse, log_mse
8 | from gtime.time_series_models import (
9 | AR,
10 | Naive,
11 | SeasonalNaive,
12 | TimeSeriesForecastingModel,
13 | )
14 | from gtime.feature_extraction import MovingAverage, Shift
15 | from gtime.forecasting import NaiveForecaster, DriftForecaster
16 |
17 |
18 | @st.composite
19 | def draw_unique_subset(draw, lst):
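    """Draw a non-empty selection of elements from ``lst``; elements may
    repeat, and callers collapse duplicates by building a dict of the result."""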
20 | return draw(st.lists(st.sampled_from(lst), min_size=1, max_size=len(lst)))
21 |
22 |
23 | @st.composite
24 | def naive_model(draw):
25 | horizon = draw(
26 | st.lists(
27 | st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True
28 | )
29 | )
30 | return (Naive, {"horizon": horizon})
31 |
32 |
33 | @st.composite
34 | def seasonal_naive_model(draw):
35 | horizon = draw(
36 | st.lists(
37 | st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True
38 | )
39 | )
40 | seasonal_length = draw(
41 | st.lists(
42 | st.integers(min_value=1, max_value=10), min_size=1, max_size=4, unique=True
43 | )
44 | )
45 | return (SeasonalNaive, {"horizon": horizon, "seasonal_length": seasonal_length})
46 |
47 |
48 | @st.composite
49 | def ar_model(draw):
50 | horizon = draw(
51 | st.lists(
52 | st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True
53 | )
54 | )
55 | p = draw(
56 | st.lists(
57 | st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True
58 | )
59 | )
60 | explainer = draw(st.sampled_from([None, "lime", "shap"]))
61 | return (AR, {"horizon": horizon, "p": p, "explainer_type": [explainer]})
62 |
63 |
64 | @st.composite
65 | def models_grid(draw):
66 | model_list = [draw(ar_model()), draw(seasonal_naive_model()), draw(naive_model())]
67 | return dict(draw(draw_unique_subset(model_list)))
68 |
69 |
70 | @st.composite
71 | def metrics(draw):
72 | metric_list = [max_error, mae, rmse, log_mse]
73 | metrics = draw(draw_unique_subset(metric_list))
74 | metrics_dict = dict(zip([x.__name__ for x in metrics], metrics))
75 | return metrics_dict
76 |
77 |
78 | class TestCVPipeline:
79 | @given(
80 | models=models_grid(),
81 | n_splits=st.integers(min_value=2, max_value=10),
82 | blocking=st.booleans(),
83 | metrics=metrics(),
84 | )
85 | def test_constructor(self, models, n_splits, blocking, metrics):
86 | cv_pipeline = CVPipeline(
87 | models_sets=models, n_splits=n_splits, blocking=blocking, metrics=metrics
88 | )
89 | list_len = np.sum(
90 | [np.prod([len(y) for y in x.values()]) for x in models.values()]
91 | )
92 | assert list_len == len(cv_pipeline.model_list)
93 | assert len(metrics) == len(cv_pipeline.metrics)
94 |
95 | @pytest.mark.parametrize(
96 | "models", [{Naive: {"horizon": [3]}, AR: {"horizon": [3], "p": [2, 3]}}]
97 | )
98 | @pytest.mark.parametrize("metrics", [{"RMSE": rmse, "MAE": mae}])
99 | @pytest.mark.parametrize("n_splits", [3, 5])
100 | @pytest.mark.parametrize("blocking", [True, False])
101 | @pytest.mark.parametrize("seed", [5, 1000])
102 | def test_fit_predict(self, models, n_splits, blocking, metrics, seed):
103 | cv_pipeline = CVPipeline(
104 | models_sets=models, n_splits=n_splits, blocking=blocking, metrics=metrics
105 | )
106 | np.random.seed(seed)
107 | idx = pd.period_range(start="2011-01-01", end="2012-01-01")
108 | df = pd.DataFrame(
109 | np.random.standard_normal((len(idx), 1)), index=idx, columns=["1"]
110 | )
111 | cv_pipeline.fit(df)
112 | assert cv_pipeline.cv_results_.shape == (
113 | len(cv_pipeline.model_list) * len(metrics),
114 | 4,
115 | )
116 | y_pred = cv_pipeline.predict()
117 | horizon = cv_pipeline.best_model_.horizon
118 | assert y_pred.shape == (horizon, horizon)
119 |
120 | @pytest.mark.parametrize(
121 | "models",
122 | [
123 | {
124 | TimeSeriesForecastingModel: {
125 | "features": [
126 | [("s3", Shift(1), ["1"])],
127 | [("ma10", MovingAverage(10), ["1"])],
128 | ],
129 | "horizon": [4],
130 | "model": [NaiveForecaster(), DriftForecaster()],
131 | }
132 | }
133 | ],
134 | )
135 | @pytest.mark.parametrize("metrics", [{"RMSE": rmse, "MAE": mae}])
136 | @pytest.mark.parametrize("n_splits", [5])
137 | def test_model_assembly(self, models, n_splits, metrics):
138 | cv_pipeline = CVPipeline(models_sets=models, n_splits=n_splits, metrics=metrics)
139 | idx = pd.period_range(start="2011-01-01", end="2012-01-01")
140 | df = pd.DataFrame(
141 | np.random.standard_normal((len(idx), 1)), index=idx, columns=["1"]
142 | )
143 | cv_pipeline.fit(df)
144 | assert cv_pipeline.cv_results_.shape == (
145 | len(cv_pipeline.model_list) * len(metrics),
146 | 4,
147 | )
148 | y_pred = cv_pipeline.predict()
149 | horizon = cv_pipeline.best_model_.horizon
150 | assert y_pred.shape == (horizon, horizon)
151 |
152 | @pytest.mark.parametrize(
153 | "models", [{Naive: {"horizon": [3]}, AR: {"horizon": [3], "p": [2, 3]}}]
154 | )
155 | @pytest.mark.parametrize("refit", ["all", "best", ["Naive: {'horizon': 3}"]])
156 | def test_models_refit(self, models, refit):
157 | cv_pipeline = CVPipeline(models_sets=models)
158 | idx = pd.period_range(start="2011-01-01", end="2012-01-01")
159 | df = pd.DataFrame(
160 | np.random.standard_normal((len(idx), 1)), index=idx, columns=["1"]
161 | )
162 | cv_pipeline.fit(df, refit=refit)
163 | assert cv_pipeline.cv_results_.shape == (len(cv_pipeline.model_list), 4,)
164 | y_pred = cv_pipeline.predict()
165 | horizon = cv_pipeline.best_model_.horizon
166 | assert y_pred.shape == (horizon, horizon)
167 |
--------------------------------------------------------------------------------
/gtime/feature_generation/tests/test_external.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import pytest
3 | from hypothesis import given, strategies as st
4 |
5 | if pd.__version__ >= "1.0.0":
6 | import pandas._testing as testing
7 | else:
8 | import pandas.util.testing as testing
9 |
10 | from gtime.feature_generation import Constant, PeriodicSeasonal
11 | from gtime.utils.hypothesis.time_indexes import giotto_time_series
12 |
13 |
class TestPeriodicSeasonalFeature:
15 | def test_missing_start_date_or_period(self):
16 | periodic_feature = PeriodicSeasonal()
17 | with pytest.raises(ValueError):
18 | periodic_feature.transform()
19 |
20 | periodic_feature = PeriodicSeasonal(index_period=1)
21 | with pytest.raises(ValueError):
22 | periodic_feature.transform()
23 |
24 | periodic_feature = PeriodicSeasonal(start_date="2010-01-01")
25 | with pytest.raises(ValueError):
26 | periodic_feature.transform()
27 |
28 | def test_string_period(self):
29 | testing.N, testing.K = 20, 1
30 | ts = testing.makeTimeDataFrame(freq="s")
31 | periodic_feature = PeriodicSeasonal(period="1 days")
32 | periodic_feature.transform(ts)
33 |
34 | assert type(periodic_feature.period) == pd.Timedelta
35 |
36 | def test_correct_start_date(self):
37 | testing.N, testing.K = 20, 1
38 | ts = testing.makeTimeDataFrame(freq="s")
39 | start_date = "2018-01-01"
40 | periodic_feature = PeriodicSeasonal(period="1 days", start_date=start_date)
41 | periodic_feature.transform(ts)
42 |
43 | assert periodic_feature.start_date == ts.index.values[0]
44 |
45 | periodic_feature = PeriodicSeasonal(
46 | period="3 days", index_period=10, start_date=start_date
47 | )
48 | periodic_feature.transform()
49 | assert periodic_feature.start_date == pd.to_datetime(start_date)
50 |
51 | start_date = pd.to_datetime("2018-01-01")
52 | periodic_feature = PeriodicSeasonal(
53 | period="3 days", index_period=10, start_date=start_date
54 | )
55 | periodic_feature.transform()
56 | assert periodic_feature.start_date == start_date
57 |
58 | def test_too_high_sampling_frequency(self):
59 | start_date = "2018-01-01"
60 | periodic_feature = PeriodicSeasonal(
61 | period="2 days",
62 | start_date=start_date,
63 | index_period=pd.date_range(start=start_date, end="2020-01-01", freq="W"),
64 | )
65 | with pytest.raises(ValueError):
66 | periodic_feature.transform()
67 |
68 | def test_correct_sinusoide(self):
69 | testing.N, testing.K = 30, 1
70 | ts = testing.makeTimeDataFrame(freq="MS")
71 | start_date = "2018-01-01"
72 | periodic_feature = PeriodicSeasonal(
73 | period="365 days",
74 | start_date=start_date,
75 | index_period=pd.date_range(start=start_date, end="2020-01-01", freq="W"),
76 | )
77 | output_sin = periodic_feature.transform(ts)
78 | expected_index = pd.DatetimeIndex(
79 | [
80 | "2000-01-01",
81 | "2000-02-01",
82 | "2000-03-01",
83 | "2000-04-01",
84 | "2000-05-01",
85 | "2000-06-01",
86 | "2000-07-01",
87 | "2000-08-01",
88 | "2000-09-01",
89 | "2000-10-01",
90 | "2000-11-01",
91 | "2000-12-01",
92 | "2001-01-01",
93 | "2001-02-01",
94 | "2001-03-01",
95 | "2001-04-01",
96 | "2001-05-01",
97 | "2001-06-01",
98 | "2001-07-01",
99 | "2001-08-01",
100 | "2001-09-01",
101 | "2001-10-01",
102 | "2001-11-01",
103 | "2001-12-01",
104 | "2002-01-01",
105 | "2002-02-01",
106 | "2002-03-01",
107 | "2002-04-01",
108 | "2002-05-01",
109 | "2002-06-01",
110 | ],
111 | dtype="datetime64[ns]",
112 | freq="MS",
113 | )
114 | expected_df = pd.DataFrame.from_dict(
115 | {
116 | f"0__{periodic_feature.__class__.__name__}": [
117 | 0.0,
118 | 0.25433547,
119 | 0.42938198,
120 | 0.49999537,
121 | 0.43585316,
122 | 0.25062091,
123 | 0.0043035,
124 | -0.25062091,
125 | -0.43585316,
126 | -0.49999537,
127 | -0.42938198,
128 | -0.24688778,
129 | 0.00860668,
130 | 0.2617078,
131 | 0.42938198,
132 | 0.49999537,
133 | 0.43585316,
134 | 0.25062091,
135 | 0.0043035,
136 | -0.25062091,
137 | -0.43585316,
138 | -0.49999537,
139 | -0.42938198,
140 | -0.24688778,
141 | 0.00860668,
142 | 0.2617078,
143 | 0.42938198,
144 | 0.49999537,
145 | 0.43585316,
146 | 0.25062091,
147 | ]
148 | }
149 | )
150 | expected_df.index = expected_index
151 | pd.testing.assert_frame_equal(output_sin, expected_df)
152 |
153 |
154 | class TestConstantFeature:
155 | def test_correct_constant_feature(self):
156 | constant = 12
157 | df = pd.DataFrame.from_dict({"old_name": [0, 1, 2, 3, 4, 5]})
158 |
159 | constant_feature = Constant(constant=constant)
160 |
161 | df_constant = constant_feature.fit_transform(df)
162 | expected_df_constant = pd.DataFrame.from_dict(
163 | {
164 | f"0__{constant_feature.__class__.__name__}": [
165 | constant,
166 | constant,
167 | constant,
168 | constant,
169 | constant,
170 | constant,
171 | ]
172 | }
173 | )
174 |
175 | testing.assert_frame_equal(expected_df_constant, df_constant, check_dtype=False)
176 |
177 | @given(
178 | giotto_time_series(
179 | min_length=1,
180 | start_date=pd.Timestamp(2000, 1, 1),
181 | end_date=pd.Timestamp(2010, 1, 1),
182 | ),
183 | st.integers(0, 100),
184 | )
    def test_random_ts_and_constant(self, df: pd.DataFrame, constant: int):
        constant_feature = Constant(constant=constant)
        df_constant = constant_feature.fit_transform(df)

        # The output keeps the input's length and contains only the constant
        assert df_constant.shape == (len(df), 1)
        assert (df_constant == constant).all().all()
191 |
--------------------------------------------------------------------------------
/gtime/feature_extraction/custom.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
2 | from sklearn.utils.validation import check_is_fitted
3 |
4 | from ..base import add_class_name
5 | from gtime.feature_extraction import MovingCustomFunction
6 |
7 |
8 | class CrestFactorDetrending(MovingCustomFunction):
9 | """Crest factor detrending model.
10 | This class removes the trend from the data by using the crest factor definition.
11 | Each sample is normalize by its weighted surrounding.
12 | Generalized detrending is defined in (eq. 1) of: H. P. Tukuljac, V. Pulkki,
13 | H. Gamper, K. Godin, I. J. Tashev and N. Raghuvanshi, "A Sparsity Measure for Echo
14 | Density Growth in General Environments," ICASSP 2019 - 2019 IEEE International
15 | Conference on Acoustics, Speech and Signal Processing (ICASSP), Brighton, United
16 | Kingdom, 2019, pp. 1-5.
17 | Parameters
18 | ----------
19 | window_size : int, optional, default: ``1``
20 | The number of previous points on which to compute the crest factor detrending.
21 | is_causal : bool, optional, default: ``True``
        Whether the current sample is computed based only on the past or also on the future.

    Examples
    --------
    >>> import pandas as pd
    >>> from gtime.feature_extraction import CrestFactorDetrending
26 | >>> ts = pd.DataFrame([0, 1, 2, 3, 4, 5])
27 | >>> gnrl_dtr = CrestFactorDetrending(window_size=2)
28 | >>> gnrl_dtr.fit_transform(ts)
29 | 0__CrestFactorDetrending
30 | 0 NaN
31 | 1 1.000000
32 | 2 0.800000
33 | 3 0.692308
34 | 4 0.640000
35 | 5 0.609756
37 | """
38 |
    def __init__(self, window_size: int = 1, is_causal: bool = True):
        def detrend(signal):
            N = 2
            signal = np.array(signal)
            large_signal_segment = signal ** N
46 | large_segment_mean = np.sum(large_signal_segment)
47 | if self.is_causal:
48 | ref_index = -1
49 | else:
50 | ref_index = int(len(signal) / 2)
51 | small_signal_segment = signal[ref_index] ** N
52 | return small_signal_segment / large_segment_mean # (eq. 1)
53 |
54 | super().__init__(detrend)
55 | self.window_size = window_size
56 | self.is_causal = is_causal
57 |
58 | @add_class_name
59 | def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
60 | """For every row of ``time_series``, compute the moving crest factor detrending function of the
61 | previous ``window_size`` elements.
62 | Parameters
63 | ----------
64 | time_series : pd.DataFrame, shape (n_samples, 1), required
65 | The DataFrame on which to compute the rolling moving custom function.
66 | Returns
67 | -------
68 | time_series_t : pd.DataFrame, shape (n_samples, 1)
69 | A DataFrame, with the same length as ``time_series``, containing the rolling
70 | moving custom function for each element.
71 | """
72 | check_is_fitted(self)
73 |
74 | if self.is_causal:
75 | time_series_mvg_dtr = time_series.rolling(self.window_size).apply(
76 | self.custom_feature_function, raw=self.raw
77 | )
78 | else:
79 | time_series_mvg_dtr = time_series.rolling(
80 | self.window_size, min_periods=int(self.window_size / 2)
81 | ).apply(self.custom_feature_function, raw=self.raw)
82 | time_series_mvg_dtr = time_series_mvg_dtr.dropna()
83 |
84 | time_series_t = time_series_mvg_dtr
85 | return time_series_t
86 |
87 |
88 | class SortedDensity(MovingCustomFunction):
89 | """For each row in ``time_series``, compute the sorted density function of the
90 | previous ``window_size`` rows. If there are not enough rows, the value is ``Nan``.
    Sorted density measure is defined in (eq. 2) of: H. P. Tukuljac, V. Pulkki,
92 | H. Gamper, K. Godin, I. J. Tashev and N. Raghuvanshi, "A Sparsity Measure for Echo
93 | Density Growth in General Environments," ICASSP 2019 - 2019 IEEE International
94 | Conference on Acoustics, Speech and Signal Processing (ICASSP), Brighton, United
95 | Kingdom, 2019, pp. 1-5.
96 | Parameters
97 | ----------
98 | window_size : int, optional, default: ``1``
99 | The number of previous points on which to compute the sorted density.
100 | is_causal : bool, optional, default: ``True``
101 | Whether the current sample is computed based only on the past or also on the future.
102 | Examples
103 | --------
104 | >>> import pandas as pd
105 | >>> from gtime.feature_extraction import SortedDensity
106 | >>> ts = pd.DataFrame([0, 1, 2, 3, 4, 5])
107 | >>> mv_avg = SortedDensity(window_size=2)
108 | >>> mv_avg.fit_transform(ts)
109 | 0__SortedDensity
110 | 0 NaN
111 | 1 0.500000
112 | 2 0.666667
113 | 3 0.700000
114 | 4 0.714286
115 | 5 0.722222
117 | """
118 |
    def __init__(self, window_size: int = 1, is_causal: bool = True):
        def sorted_density(signal):
123 | t = np.array(range(len(signal))) + 1
124 | signal = signal[signal.argsort()[::-1]]
125 | t = np.reshape(t, signal.shape)
126 | SD = np.sum(np.multiply(t, signal)) / np.sum(signal) # (eq. 2)
127 | SD = SD / (len(signal))
128 | return SD
129 |
130 | super().__init__(sorted_density)
131 | self.window_size = window_size
132 | self.is_causal = is_causal
133 |
134 | @add_class_name
135 | def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
136 | """For every row of ``time_series``, compute the moving sorted density function of the
137 | previous ``window_size`` elements.
138 | Parameters
139 | ----------
140 | time_series : pd.DataFrame, shape (n_samples, 1), required
141 | The DataFrame on which to compute the rolling moving custom function.
142 | Returns
143 | -------
144 | time_series_t : pd.DataFrame, shape (n_samples, 1)
145 | A DataFrame, with the same length as ``time_series``, containing the rolling
146 | moving custom function for each element.
147 | """
148 | check_is_fitted(self)
149 |
150 | if self.is_causal:
151 | time_series_mvg_sd = time_series.rolling(self.window_size).apply(
152 | self.custom_feature_function, raw=self.raw
153 | )
154 | else:
155 | time_series_mvg_sd = time_series.rolling(
156 | self.window_size, min_periods=int(self.window_size / 2)
157 | ).apply(self.custom_feature_function, raw=self.raw)
158 | time_series_mvg_sd = time_series_mvg_sd.dropna()
159 |
160 | time_series_t = time_series_mvg_sd
161 | return time_series_t
162 |
--------------------------------------------------------------------------------