├── gtime
│   ├── utils
│   │   ├── __init__.py
│   │   ├── hypothesis
│   │   │   ├── __init__.py
│   │   │   ├── tests
│   │   │   │   ├── __init__.py
│   │   │   │   └── test_general_strategies.py
│   │   │   ├── utils.py
│   │   │   ├── general_strategies.py
│   │   │   └── feature_matrices.py
│   │   ├── testing_constants.py
│   │   ├── trends.py
│   │   └── fixtures.py
│   ├── external
│   │   ├── __init__.py
│   │   └── make_holidays.py
│   ├── causality
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── common.py
│   │   │   ├── test_granger_causality.py
│   │   │   ├── test_pearson_correlation.py
│   │   │   └── test_linear_coefficient.py
│   │   ├── __init__.py
│   │   ├── pearson_correlation.py
│   │   ├── linear_coefficient.py
│   │   └── base.py
│   ├── experimental
│   │   ├── __init__.py
│   │   └── trend_models
│   │       └── function_trend.py
│   ├── metrics
│   │   ├── tests
│   │   │   └── __init__.py
│   │   └── __init__.py
│   ├── plotting
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── test_plotting.py
│   │   │   └── test_preprocessing.py
│   │   └── __init__.py
│   ├── explainability
│   │   ├── tests
│   │   │   └── __init__.py
│   │   └── __init__.py
│   ├── forecasting
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── test_online.py
│   │   │   ├── test_trend.py
│   │   │   └── test_naive.py
│   │   ├── __init__.py
│   │   ├── trend.py
│   │   └── online.py
│   ├── hierarchical
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── test_naive.py
│   │   │   └── test_bottom_up.py
│   │   ├── __init__.py
│   │   ├── base.py
│   │   └── naive.py
│   ├── preprocessing
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   └── utils.py
│   │   ├── time_series_resampling.py
│   │   └── __init__.py
│   ├── regressors
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── test_linear_regressor.py
│   │   │   └── test_explainable.py
│   │   ├── __init__.py
│   │   ├── linear_regressor.py
│   │   ├── explainable.py
│   │   └── multi_output.py
│   ├── feature_extraction
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── test_trend.py
│   │   │   ├── test_sorted_density.py
│   │   │   └── test_crest_factor_detrending.py
│   │   ├── __init__.py
│   │   ├── trend.py
│   │   └── custom.py
│   ├── feature_generation
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── test_calendar.py
│   │   │   └── test_external.py
│   │   └── __init__.py
│   ├── model_selection
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   └── test_splitters.py
│   │   ├── __init__.py
│   │   ├── horizon_shift.py
│   │   └── splitters.py
│   ├── time_series_models
│   │   ├── tests
│   │   │   ├── __init__.py
│   │   │   ├── test_simple_models.py
│   │   │   └── test_cv_pipeline.py
│   │   ├── __init__.py
│   │   ├── ar.py
│   │   └── simple_models.py
│   ├── compose
│   │   ├── __init__.py
│   │   ├── tests
│   │   │   └── test_feature_creation.py
│   │   └── feature_creation.py
│   ├── __init__.py
│   ├── _version.py
│   └── base.py
├── doc-requirements.txt
├── docs
│   ├── source
│   │   ├── modules
│   │   │   ├── compose.rst
│   │   │   ├── metrics.rst
│   │   │   ├── regressors.rst
│   │   │   ├── causality.rst
│   │   │   ├── forecasting.rst
│   │   │   ├── preprocessing.rst
│   │   │   ├── model_selection.rst
│   │   │   ├── feature_extraction.rst
│   │   │   ├── feature_generation.rst
│   │   │   ├── time_series_models.rst
│   │   │   └── index.rst
│   │   ├── index.rst
│   │   └── conf.py
│   ├── index.html
│   ├── .nojekyll
│   ├── Makefile
│   └── make.bat
├── setup.cfg
├── MANIFEST.in
├── dev-requirements.txt
├── requirements.txt
├── CODE_AUTHORS
├── .pre-commit-config.yaml
├── .coveragerc
├── conftest.py
├── GOVERNANCE.rst
├── .github
│   ├── workflows
│   │   ├── build_and_publish.yml
│   │   ├── deploy_github_pages.yml
│   │   └── ci.yml
│   └── ISSUE_TEMPLATE
│       └── bug_report.md
├── PULL_REQUEST_TEMPLATE.md
├── .gitignore
├── setup.py
├── examples
│   └── hierarchical_model.ipynb
├── CODE_OF_CONDUCT.rst
├── README.md
└── CONTRIBUTING.rst

/gtime/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/external/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /gtime/causality/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/experimental/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/metrics/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/plotting/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/explainability/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/forecasting/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/hierarchical/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/preprocessing/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/regressors/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/utils/hypothesis/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/feature_extraction/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/feature_generation/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/model_selection/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/time_series_models/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gtime/utils/hypothesis/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc-requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | numpydoc 3 | sphinx_issues 4 | sphinx_rtd_theme 5 | -------------------------------------------------------------------------------- /docs/source/modules/compose.rst: -------------------------------------------------------------------------------- 1 | Compose 2 | 
================== 3 | 4 | .. automodule:: gtime.compose 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/metrics.rst: -------------------------------------------------------------------------------- 1 | Metrics 2 | ================== 3 | 4 | .. automodule:: gtime.metrics 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/source/modules/regressors.rst: -------------------------------------------------------------------------------- 1 | Regressors 2 | ================== 3 | 4 | .. automodule:: gtime.regressors 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/causality.rst: -------------------------------------------------------------------------------- 1 | Causality Tests 2 | ================== 3 | 4 | .. automodule:: gtime.causality 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/forecasting.rst: -------------------------------------------------------------------------------- 1 | Forecasting 2 | ================== 3 | 4 | .. automodule:: gtime.forecasting 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/preprocessing.rst: -------------------------------------------------------------------------------- 1 | Preprocessing 2 | ================== 3 | 4 | .. automodule:: gtime.preprocessing 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/model_selection.rst: -------------------------------------------------------------------------------- 1 | Model Selection 2 | ================== 3 | 4 | .. automodule:: gtime.model_selection 5 | :members: 6 | -------------------------------------------------------------------------------- /gtime/explainability/__init__.py: -------------------------------------------------------------------------------- 1 | from .explainer import _ShapExplainer, _LimeExplainer 2 | 3 | __all__ = ["_ShapExplainer", "_LimeExplainer"] 4 | -------------------------------------------------------------------------------- /docs/source/modules/feature_extraction.rst: -------------------------------------------------------------------------------- 1 | Feature Extraction 2 | ================== 3 | 4 | .. automodule:: gtime.feature_extraction 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/feature_generation.rst: -------------------------------------------------------------------------------- 1 | Feature Generation 2 | ================== 3 | 4 | .. automodule:: gtime.feature_generation 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/time_series_models.rst: -------------------------------------------------------------------------------- 1 | Time Series Models 2 | ================== 3 | 4 | .. 
automodule:: gtime.time_series_models 5 | :members: 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.rst 3 | 4 | [tool:pytest] 5 | addopts = 6 | --ignore doc 7 | -ra 8 | 9 | --ignore gtime/experimental -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include the README 2 | include README.rst 3 | 4 | # Include the license file 5 | include LICENSE 6 | 7 | # Include the requirements file 8 | include requirements.txt -------------------------------------------------------------------------------- /dev-requirements.txt: -------------------------------------------------------------------------------- 1 | hypothesis==5.5.3 2 | black 3 | pre-commit 4 | pytest 5 | pytest-cov 6 | pytest-xdist 7 | pytest-lazy-fixture 8 | flake8 9 | mypy 10 | nbconvert 11 | jupyter 12 | -------------------------------------------------------------------------------- /gtime/utils/testing_constants.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | DEFAULT_START = pd.Timestamp("1970-01-01") 4 | DEFAULT_END = pd.Timestamp("2020-01-01") 5 | DEFAULT_FREQ = pd.Timedelta("1D") 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas>=0.25.3 2 | scipy>=0.17.0 3 | scikit-learn>=0.22.0 4 | matplotlib>=3.1.0 5 | lime>=0.2.0.0 6 | shap>=0.35 7 | holidays>=0.10.2 8 | lunarcalendar>=0.0.9 9 | giotto-tda 10 | -------------------------------------------------------------------------------- /gtime/plotting/__init__.py: -------------------------------------------------------------------------------- 1 | from .plotting import seasonal_plot, seasonal_subplots, lag_plot, acf_plot 2 | 3 | __all__ = [ 4 | "seasonal_plot", 5 | "seasonal_subplots", 6 | "acf_plot", 7 | "lag_plot", 8 | ] 9 | -------------------------------------------------------------------------------- /gtime/compose/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.compose` module contains meta-estimators for building composite models 3 | with transformers. 
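A minimal doctest-style sketch of the intended usage (``X`` is assumed to be a numeric, time-indexed ``pd.DataFrame`` with a column ``"A"``; the expected output columns mirror the assertions in ``gtime/compose/tests/test_feature_creation.py``):

>>> from gtime.compose import FeatureCreation
>>> from gtime.feature_extraction import Shift, MovingAverage
>>> fc = FeatureCreation([("s1", Shift(1), ["A"]), ("ma3", MovingAverage(window_size=3), ["A"])])
>>> Xt = fc.fit(X).transform(X)
>>> list(Xt.columns)
['s1__A__Shift', 'ma3__A__MovingAverage']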
4 | """ 5 | 6 | from .feature_creation import FeatureCreation 7 | 8 | __all__ = ["FeatureCreation"] 9 | -------------------------------------------------------------------------------- /gtime/preprocessing/time_series_resampling.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | # FIXME: TBD 5 | class _TimeSeriesResampler: 6 | def __init__(self): 7 | pass 8 | 9 | def transform(self, X: pd.Series): 10 | raise NotImplementedError 11 | -------------------------------------------------------------------------------- /gtime/utils/trends.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def polynomial(X, weights): 5 | return np.poly1d(weights)(X) 6 | 7 | 8 | def exponential(X, exponent): 9 | return np.exp(X * exponent) 10 | 11 | 12 | TRENDS = {"polynomial": polynomial, "exponential": exponential} 13 | -------------------------------------------------------------------------------- /CODE_AUTHORS: -------------------------------------------------------------------------------- 1 | # The following is the list of the code authors of the giotto-time python 2 | # package. Where component authors are known, add them here. 3 | 4 | Alessio Baccelli a.baccelli@l2f.ch 5 | Stefano Savarè s.savare@l2f.ch 6 | Benjamin Russell b.russell@l2f.ch 7 | Matteo Caorsi m.caorsi@giotto.ai 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: stable 4 | hooks: 5 | - id: black 6 | language_version: python3.7 7 | - repo: https://github.com/pre-commit/pre-commit-hooks.git 8 | sha: v0.9.5 9 | hooks: 10 | - id: no-commit-to-branch -------------------------------------------------------------------------------- /gtime/__init__.py: -------------------------------------------------------------------------------- 1 | from gtime._version import __version__ 2 | 3 | __all__ = [ 4 | "causality", 5 | "compose", 6 | "feature_extraction", 7 | "feature_generation", 8 | "forecasting", 9 | "metrics", 10 | "model_selection", 11 | "preprocessing", 12 | "regressors", 13 | "time_series_models", 14 | "utils", 15 | ] 16 | -------------------------------------------------------------------------------- /gtime/feature_generation/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.feature_generation` module deals with the creation of features that do 3 | not depend on the input data, but just on its index. 
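A minimal sketch, following the usage in ``gtime/feature_generation/tests/test_calendar.py`` (``ts`` is assumed to be an evenly-spaced, period-indexed ``pd.DataFrame``):

>>> import numpy as np
>>> from gtime.feature_generation import Calendar
>>> cal = Calendar(start_date="ignored", end_date="ignored", country="Brazil",
...                kernel=np.array([1, 2]))
>>> features = cal.fit_transform(ts)  # output shares ts's index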
4 | """ 5 | 6 | from .calendar import Calendar 7 | from .external import PeriodicSeasonal, Constant 8 | 9 | __all__ = ["PeriodicSeasonal", "Constant", "Calendar"] 10 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = gtime 4 | parallel = True 5 | omit = 6 | **/experimental/* 7 | **/setup.py 8 | **/tests/* 9 | 10 | [report] 11 | exclude_lines = 12 | # Have to re-enable the standard pragma 13 | pragma: no cover 14 | 15 | # Don't complain if tests don't hit defensive assertion code: 16 | raise NotImplementedError 17 | -------------------------------------------------------------------------------- /gtime/regressors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.regressors` module contains regression models. 3 | """ 4 | 5 | from .linear_regressor import LinearRegressor 6 | from .multi_output import MultiFeatureMultiOutputRegressor 7 | from .explainable import ExplainableRegressor 8 | 9 | __all__ = [ 10 | "LinearRegressor", 11 | "MultiFeatureMultiOutputRegressor", 12 | "ExplainableRegressor", 13 | ] 14 | -------------------------------------------------------------------------------- /gtime/causality/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.causality` module deals with the causality tests for time 3 | series data. 4 | """ 5 | 6 | from .linear_coefficient import ShiftedLinearCoefficient 7 | from .pearson_correlation import ShiftedPearsonCorrelation 8 | from .granger_causality import GrangerCausality 9 | 10 | 11 | __all__ = ["ShiftedLinearCoefficient", "ShiftedPearsonCorrelation", "GrangerCausality"] 12 | -------------------------------------------------------------------------------- /gtime/model_selection/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.model_selection` module deals with model selection. 3 | """ 4 | 5 | from .horizon_shift import horizon_shift 6 | from .splitters import FeatureSplitter 7 | from .cross_validation import time_series_split, blocking_time_series_split 8 | 9 | __all__ = [ 10 | "FeatureSplitter", 11 | "horizon_shift", 12 | "time_series_split", 13 | "blocking_time_series_split", 14 | ] 15 | -------------------------------------------------------------------------------- /gtime/preprocessing/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.preprocessing` module deals with the preprocessing of time series 3 | data. 4 | """ 5 | 6 | from .time_series_conversion import ( 7 | _SequenceToTimeIndexSeries, 8 | _PandasSeriesToTimeIndexSeries, 9 | _TimeIndexSeriesToPeriodIndexSeries, 10 | ) 11 | 12 | from .time_series_preparation import TimeSeriesPreparation 13 | 14 | __all__ = [ 15 | "TimeSeriesPreparation", 16 | ] 17 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. giotto documentation master file, created by 2 | sphinx-quickstart on Mon Jun 3 11:56:46 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to giotto-time's API reference! 
7 | ======================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 3 11 | :caption: Contents: 12 | 13 | modules/index 14 | 15 | References 16 | ---------- 17 | 18 | * :ref:`genindex` 19 | * :ref:`modindex` 20 | 21 | -------------------------------------------------------------------------------- /gtime/hierarchical/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.hierarchical` module contains hierarchical time series models. 3 | """ 4 | 5 | from .base import HierarchicalBase 6 | from .naive import HierarchicalNaive 7 | from .bottom_up import HierarchicalBottomUp 8 | from .top_down import HierarchicalTopDown 9 | from .middle_out import HierarchicalMiddleOut 10 | 11 | __all__ = [ 12 | "HierarchicalBase", 13 | "HierarchicalNaive", 14 | "HierarchicalBottomUp", 15 | "HierarchicalTopDown", 16 | "HierarchicalMiddleOut", 17 | ] 18 | -------------------------------------------------------------------------------- /gtime/time_series_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.time_series_models` module contains time series models. 3 | """ 4 | 5 | from .base import TimeSeriesForecastingModel 6 | from .ar import AR 7 | from .simple_models import ( 8 | Naive, 9 | SeasonalNaive, 10 | Average, 11 | Drift, 12 | ) 13 | from .cv_pipeline import CVPipeline 14 | 15 | __all__ = [ 16 | "TimeSeriesForecastingModel", 17 | "AR", 18 | "Naive", 19 | "SeasonalNaive", 20 | "Average", 21 | "Drift", 22 | "CVPipeline", 23 | ] 24 | -------------------------------------------------------------------------------- conftest.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from hypothesis import settings, Verbosity, HealthCheck 4 | 5 | settings.register_profile( 6 | "ci", 7 | max_examples=100, 8 | suppress_health_check=(HealthCheck.too_slow,), 9 | deadline=timedelta(milliseconds=1000), 10 | ) 11 | settings.register_profile( 12 | "dev", 13 | max_examples=7, 14 | suppress_health_check=(HealthCheck.too_slow,), 15 | deadline=timedelta(milliseconds=1000), 16 | ) 17 | settings.register_profile("debug", max_examples=7, verbosity=Verbosity.verbose) 18 | -------------------------------------------------------------------------------- /gtime/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.metrics` module contains a collection of different metrics. 3 | """ 4 | 5 | from .metrics import ( 6 | non_zero_smape, 7 | smape, 8 | max_error, 9 | mse, 10 | log_mse, 11 | r_square, 12 | mae, 13 | mape, 14 | rmse, 15 | rmsle, 16 | gmae, 17 | ) 18 | 19 | __all__ = [ 20 | "non_zero_smape", 21 | "smape", 22 | "max_error", 23 | "mse", 24 | "rmse", 25 | "log_mse", 26 | "rmsle", 27 | "r_square", 28 | "mae", 29 | "mape", 30 | "gmae", 31 | ] 32 | -------------------------------------------------------------------------------- GOVERNANCE.rst: -------------------------------------------------------------------------------- 1 | This file describes the governance of the Giotto Time project. 
2 | 3 | Project owner: 4 | -------------- 5 | 6 | - L2F SA 7 | 8 | Authors: 9 | -------- 10 | 11 | - Please refer to the `authors `_ file 12 | 13 | Giotto Time Project Team: 14 | ------------------------- 15 | 16 | - Alessio Baccelli a.baccelli@l2f.ch (Developer) 17 | - Stefano Savarè s.savare@l2f.ch (Developer) 18 | - Philippe Nguyen p.nguyen@l2f.ch (Developer) 19 | 20 | Former Project Team Members: 21 | ---------------------------- 22 | 23 | - Benjamin Russell b.russell@l2f.ch 24 | -------------------------------------------------------------------------------- docs/source/modules/index.rst: -------------------------------------------------------------------------------- 1 | API reference 2 | ============= 3 | This page contains a list of available features in the library. 4 | 5 | .. toctree:: 6 | :maxdepth: 3 7 | 8 | causality 9 | 10 | compose 11 | 12 | explainability 13 | 14 | external 15 | 16 | feature_extraction 17 | 18 | feature_generation 19 | 20 | forecasting 21 | 22 | hierarchical 23 | 24 | metrics 25 | 26 | model_selection 27 | 28 | plotting 29 | 30 | preprocessing 31 | 32 | regressors 33 | 34 | time_series_models 35 | 36 | utils 37 | 38 | References 39 | ---------- 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` -------------------------------------------------------------------------------- /gtime/_version.py: -------------------------------------------------------------------------------- 1 | """ 2 | ``giotto-time`` is a set of python methods to perform time series forecasting 3 | in a machine learning framework. 4 | """ 5 | # License: Apache 2.0 6 | 7 | # PEP0440 compatible formatted version, see: 8 | # https://www.python.org/dev/peps/pep-0440/ 9 | # 10 | # Generic release markers: 11 | # X.Y 12 | # X.Y.Z # For bugfix releases 13 | # 14 | # Admissible pre-release markers: 15 | # X.YaN # Alpha release 16 | # X.YbN # Beta release 17 | # X.YrcN # Release Candidate 18 | # X.Y # Final release 19 | # 20 | # Dev branch marker is: 'X.Y.dev' or 'X.Y.devN' where N is an integer. 21 | # 'X.Y.dev0' is the canonical version of 'X.Y.dev' 22 | # 23 | 24 | __version__ = "0.2.2" 25 | -------------------------------------------------------------------------------- /gtime/causality/tests/common.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pandas as pd 4 | 5 | import pandas.util.testing as testing 6 | 7 | 8 | def make_df_from_expected_shifts(expected_shifts: List[int]) -> pd.DataFrame: 9 | testing.N, testing.K = 500, 1 10 | 11 | df = testing.makeTimeDataFrame(freq="D") 12 | for sh, k in zip(expected_shifts, range(3)): 13 | df[f"shift_{k}"] = df["A"].shift(-sh) 14 | df = df.dropna() 15 | 16 | return df 17 | 18 | 19 | def shift_df_from_expected_shifts( 20 | df: pd.DataFrame, expected_shifts: List[int] 21 | ) -> pd.DataFrame: 22 | for sh, k in zip(expected_shifts, range(3)): 23 | df[f"shift_{k}"] = df[f"shift_{k}"].shift(-sh) 24 | return df.dropna() 25 | -------------------------------------------------------------------------------- docs/.nojekyll: -------------------------------------------------------------------------------- 1 | # Compiled python modules. 2 | *.pyc 3 | *.pyo 4 | *.pyd 5 | **/__pycache__ 6 | 7 | # Setuptools distribution folder. 8 | /dist/ 9 | 10 | # Python egg metadata, regenerated from source files by setuptools. 
11 | /*.egg-info 12 | *.so 13 | build 14 | 15 | # Python jupyter notebooks 16 | examples/dask-worker-space 17 | examples/.ipynb_checkpoints 18 | 19 | # Data files 20 | *.pkl 21 | *.csv 22 | *.pqt 23 | data/* 24 | 25 | # Output files 26 | *.out 27 | 28 | # External 29 | **.DS_Store 30 | .idea/* 31 | .vscode/* 32 | *~ 33 | 34 | # Unit test 35 | .pytest_cache/ 36 | .hypothesis/ 37 | 38 | # Pytest output files 39 | test-output.xml 40 | 41 | # Latex 42 | *.aux 43 | *.bbl 44 | *.blg 45 | *.brf 46 | *.log 47 | *.pdf 48 | *.synctex.gz 49 | *.toc -------------------------------------------------------------------------------- /gtime/forecasting/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.forecasting` module contains a collection of machine learning models, 3 | for dealing with time series data. 4 | """ 5 | 6 | from .gar import GAR, GARFF, MultiFeatureMultiOutputRegressor, MultiFeatureGAR 7 | from .trend import TrendForecaster 8 | from .online import HedgeForecaster 9 | from .naive import ( 10 | NaiveForecaster, 11 | SeasonalNaiveForecaster, 12 | DriftForecaster, 13 | AverageForecaster, 14 | ) 15 | 16 | __all__ = [ 17 | "GAR", 18 | "GARFF", 19 | "MultiFeatureGAR", 20 | "TrendForecaster", 21 | "HedgeForecaster", 22 | "NaiveForecaster", 23 | "SeasonalNaiveForecaster", 24 | "DriftForecaster", 25 | "AverageForecaster", 26 | "MultiFeatureMultiOutputRegressor", 27 | ] 28 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | 22 | clean: 23 | rm -rf build/ generated/ reference/generated/ 24 | -------------------------------------------------------------------------------- /gtime/feature_extraction/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | The :mod:`gtime.feature_extraction` module deals with the creation of features 3 | starting from a time series. 
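A minimal sketch (``ts`` is assumed to be a numeric, time-indexed ``pd.DataFrame``; output columns are suffixed with ``__<ClassName>``, as in ``FeatureMixin.get_feature_names``):

>>> from gtime.feature_extraction import Shift, MovingAverage
>>> lags = Shift(1).fit_transform(ts)                          # lagged copy of each column
>>> rolling = MovingAverage(window_size=3).fit_transform(ts)   # rolling-mean features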
4 | """ 5 | 6 | from gtime.feature_generation.calendar import Calendar 7 | from .standard import ( 8 | Shift, 9 | MovingAverage, 10 | MovingMedian, 11 | Max, 12 | Min, 13 | MovingCustomFunction, 14 | Polynomial, 15 | Exogenous, 16 | CustomFeature, 17 | ) 18 | from .custom import SortedDensity, CrestFactorDetrending 19 | 20 | from .trend import Detrender 21 | 22 | __all__ = [ 23 | "Shift", 24 | "MovingAverage", 25 | "MovingMedian", 26 | "Max", 27 | "Min", 28 | "MovingCustomFunction", 29 | "Polynomial", 30 | "Exogenous", 31 | "Calendar", 32 | "Detrender", 33 | "CustomFeature", 34 | "SortedDensity", 35 | "CrestFactorDetrending", 36 | ] 37 | -------------------------------------------------------------------------------- /gtime/forecasting/tests/test_online.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from gtime.forecasting.online import HedgeForecaster 4 | 5 | 6 | def test_hedge_fit_predict(): 7 | time_index = pd.date_range("2020-01-01", "2020-01-20") 8 | X_np = np.concatenate( 9 | (np.random.randint(4, size=(20, 2)), np.array([100] * 20).reshape(-1, 1)), 10 | axis=1, 11 | ) 12 | X = pd.DataFrame(X_np, index=time_index) 13 | y = pd.DataFrame( 14 | np.random.randint(4, size=(20, 1)), index=time_index, columns=["y_1"] 15 | ) 16 | hr = HedgeForecaster(random_state=42) 17 | 18 | preds = hr.fit_predict(X, y) 19 | np.testing.assert_equal(preds.shape, y.shape) 20 | np.testing.assert_almost_equal(hr.weights_[0], hr.weights_[1], decimal=2) 21 | assert hr.weights_[2] < hr.weights_[0] 22 | assert hr.weights_[2] < hr.weights_[1] 23 | -------------------------------------------------------------------------------- /gtime/compose/tests/test_feature_creation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | if pd.__version__ >= "1.0.0": 4 | import pandas._testing as testing 5 | else: 6 | import pandas.util.testing as testing 7 | from numpy.testing import assert_array_equal 8 | 9 | from gtime.compose import FeatureCreation 10 | from gtime.feature_extraction import Shift, MovingAverage 11 | 12 | 13 | def test_feature_creation_transform(): 14 | data = testing.makeTimeDataFrame(freq="s") 15 | 16 | shift = Shift(1) 17 | ma = MovingAverage(window_size=3) 18 | 19 | col_name = "A" 20 | 21 | fc = FeatureCreation([("s1", shift, [col_name]), ("ma3", ma, [col_name]),]) 22 | res = fc.fit(data).transform(data) 23 | 24 | assert_array_equal( 25 | res.columns.values, 26 | [ 27 | f"s1__{col_name}__{shift.__class__.__name__}", 28 | f"ma3__{col_name}__{ma.__class__.__name__}", 29 | ], 30 | ) 31 | -------------------------------------------------------------------------------- /gtime/base.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | from sklearn.utils.validation import check_is_fitted 4 | 5 | 6 | def add_class_name(func): 7 | @functools.wraps(func) 8 | def wrapper_add_class_name(*args, **kwargs): 9 | value = func(*args, **kwargs) 10 | return value.add_suffix("__" + args[0].__class__.__name__) 11 | 12 | return wrapper_add_class_name 13 | 14 | 15 | class FeatureMixin: 16 | """Mixin class for all feature extraction estimators in giotto-time.""" 17 | 18 | _estimator_type = "feature_extractor" 19 | 20 | def get_feature_names(self): 21 | """Return feature names for output features. 22 | 23 | Returns 24 | ------- 25 | output_feature_names : ndarray, shape (n_output_features,) 26 | Array of feature names. 
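        Examples
        --------
        A hypothetical sketch (assumes a feature transformer such as ``Shift``
        was fitted on a DataFrame with a single column ``"A"``):

        >>> transformer.get_feature_names()
        ['A__Shift']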
27 | 28 | """ 29 | check_is_fitted(self) 30 | 31 | return [f"{name}__{self.__class__.__name__}" for name in self.columns_] 32 | -------------------------------------------------------------------------------- docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- .github/workflows/build_and_publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: [workflow_dispatch] 7 | 8 | jobs: 9 | deploy: 10 | 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: '3.x' 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install setuptools wheel twine 23 | - name: Build and publish 24 | env: 25 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 26 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 27 | run: | 28 | python setup.py sdist bdist_wheel 29 | twine check dist/* 30 | twine upload dist/* -------------------------------------------------------------------------------- PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | <!-- 5 | --> 6 | #### Reference Issues/PRs 7 | <!-- 13 | --> 14 | 15 | 16 | #### What does this implement/fix? Explain your changes. 17 | 18 | #### Any other comments?
19 | 20 | 21 | <!-- 29 | --> 30 | -------------------------------------------------------------------------------- /gtime/causality/tests/test_granger_causality.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pytest 3 | import pandas as pd 4 | 5 | 6 | if pd.__version__ >= "1.0.0": 7 | import pandas._testing as testing 8 | else: 9 | import pandas.util.testing as testing 10 | from gtime.causality import GrangerCausality 11 | 12 | 13 | # Expected values from results of statsmodels 14 | @pytest.mark.parametrize( 15 | "test_input, expected", 16 | [ 17 | (["ssr_f"], 0.8420421667509344), 18 | (["ssr_chi2"], 0.8327660223526767), 19 | (["likelihood_chi2"], 0.8341270186135072), 20 | (["zero_f"], 0.8420421667508992), 21 | ], 22 | ) 23 | def test_granger_pvalues_ssr_f(test_input, expected): 24 | # Set random seed, otherwise testing creates a new dataframe each time. 25 | np.random.seed(12) 26 | 27 | data = testing.makeTimeDataFrame(freq="s", nper=1000) 28 | granger = ( 29 | GrangerCausality(target_col="A", x_col="B", max_shift=10, statistics=test_input) 30 | .fit(data) 31 | .results_[0] 32 | ) 33 | 34 | p_value = granger.values[1] 35 | # Not exactly equal, but test up to 7 digits 36 | np.testing.assert_almost_equal(p_value, expected, decimal=7) 37 | -------------------------------------------------------------------------------- .github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | <!-- 2 | 3 | #### Description 4 | <!-- 6 | 7 | #### Steps/Code to Reproduce 8 | <!-- 12 | 13 | #### Expected Results 14 | <!-- 15 | 16 | #### Actual Results 17 | <!-- 18 | 19 | #### Versions 20 | <!-- 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /gtime/forecasting/tests/test_trend.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | import pandas.util.testing as testing 5 | 6 | from gtime.forecasting import TrendForecaster 7 | 8 | 9 | def test_polynomial_trend(): 10 | testing.N, testing.K = 500, 1 11 | df = testing.makeTimeDataFrame(freq="D") 12 | 13 | df["A"] = df["A"] + 0.0005 * pd.Series( 14 | index=df.index, data=range(df.shape[0]) 15 | ) * pd.Series(index=df.index, data=range(df.shape[0])) 16 | 17 | tm = TrendForecaster(trend="polynomial", trend_x0=0.0) 18 | tm.fit(df["A"]) 19 | # too hard to expect this result every time 20 | # assert np.allclose(tm.best_trend_params_, [0.0] * len(tm.best_trend_params_)) 21 | assert len(tm.best_trend_params_) == 1 22 | 23 | 24 | def test_exponential_trend(): 25 | testing.N, testing.K = 500, 1 26 | df = testing.makeTimeDataFrame(freq="D") 27 | 28 | df["A"] = df["A"] + 0.0005 * pd.Series( 29 | index=df.index, data=range(df.shape[0]) 30 | ).apply(lambda x: np.exp(0.03 * x)) 31 | 32 | tm = TrendForecaster(trend="exponential", trend_x0=4 * [0.0]) 33 | tm.fit(df) 34 | # too hard to expect this result every time 35 | # assert np.allclose(tm.best_trend_params_, [0.0] * len(tm.best_trend_params_)) 36 | assert len(tm.best_trend_params_) == 4 37 | 38 | # TODO: predicting tests 39 | -------------------------------------------------------------------------------- .github/workflows/deploy_github_pages.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: 
https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Deploy to gh-pages 5 | on: [workflow_dispatch] 6 | jobs: 7 | build: 8 | 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: Set up Python 3.8 14 | uses: actions/setup-python@v2 15 | with: 16 | python-version: 3.8 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 21 | if [ -f doc-requirements.txt ]; then pip install -r doc-requirements.txt; fi 22 | - name: Install giotto-time 23 | run: | 24 | pip install -e . 25 | - name: Git checkout and build sphinx docs 26 | run: | 27 | git config --global user.name "github-pages[bot]" 28 | git config --global user.email "41898281+github-pages[bot]@users.noreply.github.com" 29 | git fetch 30 | git checkout gh-pages 31 | git checkout master 32 | cd docs 33 | make html 34 | - name: push to gh-pages 35 | run: | 36 | git symbolic-ref HEAD refs/heads/gh-pages 37 | git reset --mixed gh-pages 38 | git add --all 39 | git add -f docs/build 40 | git commit -m "push sphinx build" 41 | git push origin gh-pages 42 | -------------------------------------------------------------------------------- /gtime/causality/tests/test_pearson_correlation.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | 3 | import numpy as np 4 | 5 | from gtime.causality import ShiftedPearsonCorrelation 6 | from gtime.causality.tests.common import make_df_from_expected_shifts 7 | 8 | 9 | def test_pearson_correlation(): 10 | expected_shifts = [randint(2, 6) * 2 for _ in range(3)] 11 | df = make_df_from_expected_shifts(expected_shifts) 12 | 13 | spc = ShiftedPearsonCorrelation(target_col="A", max_shift=12) 14 | spc.fit(df) 15 | 16 | shifts = spc.best_shifts_["A"][4:].values 17 | np.testing.assert_array_equal(shifts, expected_shifts) 18 | 19 | 20 | def test_pearson_bootstrap_p_values(): 21 | expected_shifts = [randint(2, 9) * 2 for _ in range(3)] 22 | df = make_df_from_expected_shifts(expected_shifts) 23 | shifted_test = ShiftedPearsonCorrelation( 24 | target_col="A", max_shift=5, bootstrap_iterations=500, 25 | ) 26 | shifted_test.fit(df) 27 | 28 | pearson_p_values = shifted_test.bootstrap_p_values_ 29 | for col_index in range(len(pearson_p_values.columns)): 30 | assert pearson_p_values.iloc[col_index, col_index] == 0 31 | 32 | 33 | def test_pearson_permutation_p_values(): 34 | expected_shifts = [randint(2, 9) * 2 for _ in range(3)] 35 | df = make_df_from_expected_shifts(expected_shifts) 36 | shifted_test = ShiftedPearsonCorrelation( 37 | target_col="A", max_shift=5, permutation_iterations=50, 38 | ) 39 | shifted_test.fit(df) 40 | 41 | pearson_p_values = shifted_test.permutation_p_values_ 42 | for col_index in range(len(pearson_p_values.columns)): 43 | assert pearson_p_values.iloc[col_index, col_index] == 0 44 | -------------------------------------------------------------------------------- /gtime/hierarchical/base.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | from typing import Any, Dict, Union 3 | 4 | import pandas as pd 5 | from sklearn.base import BaseEstimator, RegressorMixin 6 | 7 | 8 | class HierarchicalBase(BaseEstimator, RegressorMixin): 9 | """ Base class for hierarchical models. 
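    Inputs are dictionaries mapping each series name (a string) to its
    ``pd.DataFrame`` (see ``_check_is_dict_of_dataframes_with_str_key`` below).
    A hypothetical sketch with the ``HierarchicalNaive`` subclass, where
    ``time_series_model``, ``df_total`` and ``df_region_a`` are assumed to exist:

    >>> from gtime.hierarchical import HierarchicalNaive
    >>> hierarchical = HierarchicalNaive(model=time_series_model)
    >>> hierarchical.fit({"total": df_total, "region_a": df_region_a}).predict()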
10 | 11 | Parameters 12 | ---------- 13 | model : BaseEstimator, required 14 | base model applied to all the time series 15 | hierarchy_tree: Union[str, Dict[str, Any]], optional, default = ``'infer'`` 16 | hierarchy structure between time series. If ``'infer'``, a standard structure is 17 | inferred; how it is inferred depends on the subclass. 18 | """ 19 | 20 | def __init__( 21 | self, model: BaseEstimator, hierarchy_tree: Union[str, Dict[str, Any]] = "infer" 22 | ): 23 | self.model = model 24 | self.hierarchy_tree = hierarchy_tree 25 | 26 | @abstractmethod 27 | def fit(self, X: Dict[str, pd.DataFrame], y=None): 28 | raise NotImplementedError 29 | 30 | @abstractmethod 31 | def predict(self, X: Dict[str, pd.DataFrame] = None): 32 | raise NotImplementedError 33 | 34 | @staticmethod 35 | def _check_is_dict_of_dataframes_with_str_key(X: Any): 36 | if not isinstance(X, dict): 37 | raise ValueError( 38 | f"X must be a dictionary of pd.DataFrame. Detected: {type(X)}" 39 | ) 40 | if not all(isinstance(key, str) for key in X): 41 | raise ValueError("All X keys must be string") 42 | if not all(isinstance(df, pd.DataFrame) for df in X.values()): 43 | raise ValueError("All values of X must be pd.DataFrame") 44 | -------------------------------------------------------------------------------- .gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | notebooks/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/_build/ 56 | doc/build/ 57 | doc/generated/ 58 | doc/reference/generated/ 59 | 60 | # PyBuilder 61 | target/ 62 | 63 | # DotEnv configuration 64 | .env 65 | 66 | # Database 67 | *.db 68 | *.rdb 69 | 70 | # Pycharm 71 | .idea 72 | 73 | # VS Code 74 | .vscode/ 75 | 76 | # Spyder 77 | .spyproject/ 78 | 79 | # Jupyter NB Checkpoints 80 | .ipynb_checkpoints/ 81 | Untitled* 82 | 83 | # exclude data from source control by default 84 | /data/ 85 | 86 | # Mac OS-specific storage files 87 | .DS_Store 88 | 89 | # vim 90 | *.swp 91 | *.swo 92 | 93 | # Mypy cache 94 | .mypy_cache/ 95 | 96 | # ignore huge time_series_models 97 | models/*.joblib 98 | 99 | # Hypothesis 100 | .hypothesis/ 101 | 102 | # PyTest 103 | .pytest_cache/ 104 | 105 | # Excel temporary 106 | ~$*.xls* 107 | -------------------------------------------------------------------------------- /gtime/feature_generation/tests/test_calendar.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from hypothesis import given, settings 5 | 6 | from gtime.feature_extraction import Calendar 7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series 8 | 9 | 10 | def test_empty_and_non_finite_kernel_error(): 11 | with pytest.raises(ValueError): 12 | Calendar( 13 | start_date="ignored", 14 | end_date="ignored", 15 | country="Brazil", 16 | kernel=np.array([]), 17 | ) 18 | 19 | with pytest.raises(ValueError): 20 | Calendar( 21 | start_date="ignored", 22 | end_date="ignored", 23 | country="Brazil", 24 | kernel=np.array([np.nan, 1]), 25 | ) 26 | 27 | 28 | def test_unevenly_spaced_time_series(): 29 | unevenly_spaced_ts = pd.DataFrame( 30 | index=[ 31 | pd.Period("2012-01-01"), 32 | pd.Period("2012-01-03"), 33 | pd.Period("2012-01-10"), 34 | ] 35 | ) 36 | cal_feature = Calendar( 37 | start_date="ignored", 38 | end_date="ignored", 39 | country="Brazil", 40 | kernel=np.array([0, 1]), 41 | ) 42 | 43 | with pytest.raises(ValueError): 44 | cal_feature.fit_transform(unevenly_spaced_ts) 45 | 46 | 47 | @settings(deadline=pd.Timedelta(milliseconds=5000), max_examples=7) 48 | @given(giotto_time_series(min_length=2, max_length=30)) 49 | def test_correct_index_random_ts(ts): 50 | cal_feature = Calendar( 51 | start_date="ignored", 52 | end_date="ignored", 53 | country="Brazil", 54 | kernel=np.array([1, 2]), 55 | ) 56 | Xt = cal_feature.fit_transform(ts) 57 | np.testing.assert_array_equal(Xt.index, ts.index) 58 | -------------------------------------------------------------------------------- /gtime/time_series_models/ar.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union, Optional 2 | 3 | import numpy as np 4 | from sklearn.compose import make_column_selector 5 | from sklearn.linear_model import LinearRegression 6 | 7 | from gtime.feature_extraction import Shift 8 | from gtime.forecasting import GAR 9 | from gtime.time_series_models import TimeSeriesForecastingModel 10 | 11 | 12 | class AR(TimeSeriesForecastingModel): 13 | """ Standard AR model for time series 14 | 15 | Parameters 16 | ---------- 17 | p: 
int, required 18 | the AR order, i.e. the number of lagged features built from the series 19 | horizon: int or list of int, required 20 | how many steps to predict in the future 21 | 22 | Examples 23 | -------- 24 | >>> import pandas._testing as testing 25 | >>> from gtime.time_series_models import AR 26 | >>> 27 | >>> testing.N, testing.K = 20, 1 28 | >>> data = testing.makeTimeDataFrame(freq="s") 29 | >>> ar = AR(p=2, horizon=3) 30 | >>> 31 | >>> ar.fit(data) 32 | >>> ar.predict() 33 | y_1 y_2 y_3 34 | 2000-01-01 00:00:17 0.037228 0.163446 -0.237299 35 | 2000-01-01 00:00:18 -0.139627 -0.018082 0.063273 36 | 2000-01-01 00:00:19 -0.107707 0.052031 -0.105526 37 | """ 38 | 39 | def __init__( 40 | self, 41 | p: int, 42 | horizon: Union[int, List[int]], 43 | explainer_type: Optional[str] = None, 44 | ): 45 | self.p = p 46 | self.explainer_type = explainer_type 47 | features = [ 48 | tuple((f"s{i}", Shift(i), make_column_selector(dtype_include=np.number))) 49 | for i in range(p) 50 | ] 51 | model = GAR(LinearRegression(), explainer_type=explainer_type) 52 | super().__init__(features=features, horizon=horizon, model=model) 53 | -------------------------------------------------------------------------------- /gtime/model_selection/horizon_shift.py: -------------------------------------------------------------------------------- 1 | from typing import List, Union 2 | 3 | import pandas as pd 4 | 5 | from gtime.feature_extraction import Shift 6 | 7 | 8 | def horizon_shift( 9 | time_series: pd.DataFrame, horizon: Union[int, List[int]] = 5 10 | ) -> pd.DataFrame: 11 | """Perform a shift of the original ``time_series`` for each time step between 1 and 12 | ``horizon``. 13 | 14 | Parameters 15 | ---------- 16 | time_series : pd.DataFrame, shape (n_samples, n_features), required 17 | The time series whose columns are shifted to build the target matrix. 18 | 19 | horizon : int or list of int, optional, default: ``5`` 20 | How far into the future to predict: each step between 1 and ``horizon`` produces 21 | one shifted column of ``y``; a list of steps selects only those shifts. 22 | 23 | Returns 24 | ------- 25 | y : pd.DataFrame, shape (n_samples, horizon) 26 | The shifted time series.
27 | 28 | Examples 29 | -------- 30 | >>> import pandas as pd 31 | >>> from gtime.model_selection import horizon_shift 32 | >>> X = pd.DataFrame(range(0, 5), index=pd.date_range("2020-01-01", "2020-01-05")) 33 | >>> horizon_shift(X, horizon=2) 34 | y_1 y_2 35 | 2020-01-01 1.0 2.0 36 | 2020-01-02 2.0 3.0 37 | 2020-01-03 3.0 4.0 38 | 2020-01-04 4.0 NaN 39 | 2020-01-05 NaN NaN 40 | >>> horizon_shift(X, horizon=[2]) 41 | y_2 42 | 2020-01-01 2.0 43 | 2020-01-02 3.0 44 | 2020-01-03 4.0 45 | 2020-01-04 NaN 46 | 2020-01-05 NaN 47 | 48 | """ 49 | horizon = range(1, horizon + 1) if isinstance(horizon, (int, float)) else horizon 50 | y = pd.DataFrame(index=time_series.index) 51 | for k in sorted(horizon): 52 | shift_feature = Shift(-k) 53 | y[f"y_{k}"] = shift_feature.fit_transform(time_series) 54 | 55 | return y 56 | -------------------------------------------------------------------------------- /gtime/utils/hypothesis/utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Union, Tuple 3 | 4 | import hypothesis.strategies as st 5 | import pandas as pd 6 | 7 | 8 | def initialize_start_date_end_date( 9 | start: datetime, end: datetime 10 | ) -> Tuple[datetime, datetime]: 11 | start = start if start is not None else pd.Timestamp("1980-01-01") 12 | end = end if end is not None else pd.Timestamp("2020-01-01") 13 | return start, end 14 | 15 | 16 | def initialize_start_timedelta_end_timedelta(start: pd.Timedelta, end: pd.Timedelta): 17 | start = start if start is not None else pd.Timedelta(0) 18 | end = end if end is not None else pd.Timedelta("40Y") 19 | return start, end 20 | 21 | 22 | def order_pair(element1, element2): 23 | return st.builds( 24 | lambda start, end: (start, end), start=element1, end=element2 25 | ).filter(lambda x: x[0] < x[1]) 26 | 27 | 28 | def expected_start_date_from( 29 | end: Union[datetime, pd.Period], periods: int, freq: pd.Timedelta 30 | ) -> Union[datetime, pd.Period]: 31 | return end - periods * freq 32 | 33 | 34 | def expected_end_date_from( 35 | start: Union[datetime, pd.Period], periods: int, freq: pd.Timedelta 36 | ) -> Union[datetime, pd.Period]: 37 | return start + periods * freq 38 | 39 | 40 | def expected_index_length_from( 41 | start: Union[datetime, pd.Period], 42 | end: Union[datetime, pd.Period], 43 | freq: pd.Timedelta, 44 | ) -> int: 45 | expected_index_length = (end - start) // freq 46 | return expected_index_length 47 | 48 | 49 | def freq_to_timedelta( 50 | freq: str, approximate_if_non_uniform: bool = True 51 | ) -> pd.Timedelta: 52 | try: 53 | return pd.to_timedelta(f"1{freq}") 54 | except ValueError as e: 55 | if approximate_if_non_uniform: 56 | correspondences = { 57 | "B": pd.Timedelta(1, unit="D"), 58 | "Q": pd.Timedelta(90, unit="D"), 59 | "A": pd.Timedelta(365, unit="D"), 60 | } 61 | return correspondences[freq] 62 | else: 63 | raise e 64 | -------------------------------------------------------------------------------- /gtime/utils/hypothesis/tests/test_general_strategies.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import pytest 4 | from hypothesis import given 5 | from hypothesis.strategies import integers, data 6 | 7 | from gtime.utils.hypothesis.general_strategies import ( 8 | ordered_pair, 9 | shape_matrix, 10 | shape_X_y_matrices, 11 | regressors, 12 | ) 13 | 14 | 15 | @given(ordered_pair(0, 10)) 16 | def test_ordered_pair(pair: Tuple[int, int]): 17 | assert pair[0] < pair[1] 
18 | 19 | 20 | @given(ordered_pair(27, 132)) 21 | def test_ordered_pair_values(pair: Tuple[int, int]): 22 | assert pair[0] >= 27 23 | assert pair[1] <= 132 24 | 25 | 26 | @given(data=data(), value=integers(0, 10)) 27 | def test_ordered_pair_min_equal_max(data, value): 28 | with pytest.raises(ValueError): 29 | data.draw(ordered_pair(value, value)) 30 | 31 | 32 | @given(data=data(), shape_0=ordered_pair(10, 100), shape_1=ordered_pair(1, 8)) 33 | def test_shape_X(data, shape_0, shape_1): 34 | shape = data.draw(shape_matrix(*shape_0, *shape_1)) 35 | assert shape_0[0] <= shape[0] <= shape_0[1] 36 | assert shape_1[0] <= shape[1] <= shape_1[1] 37 | 38 | 39 | @given(shape_X_y_matrices(123, 243, 12, 34, 1, 6, y_as_vector=False)) 40 | def test_shape_X_y_matrices_y_matrix(shape_X_y): 41 | shape_X, shape_y = shape_X_y 42 | assert shape_X[0] == shape_y[0] 43 | assert 12 <= shape_X[1] <= 34 44 | assert 1 <= shape_y[1] <= 6 45 | 46 | 47 | @given(shape_X_y_matrices(123, 243, 12, 34, 1, 6, y_as_vector=True)) 48 | def test_shape_X_y_matrices_y_vector(shape_X_y): 49 | shape_X, shape_y = shape_X_y 50 | assert shape_X[0] == shape_y[0] 51 | assert 12 <= shape_X[1] <= 34 52 | assert len(shape_y) == 1 53 | 54 | 55 | @given(shape_X_y_matrices(10, 20, 10, 20, 1, 6)) 56 | def test_shape_1_X_smaller_shape_0(shape_X_y): 57 | shape_X, shape_y = shape_X_y 58 | assert shape_X[0] > shape_X[1] 59 | 60 | 61 | @given(data=data()) 62 | def test_shape_X_Y_value_error(data): 63 | with pytest.raises(ValueError): 64 | data.draw(shape_X_y_matrices(1, 8, 9, 10, 10, 20)) 65 | 66 | 67 | @given(regressors()) 68 | def test_regressors(regressor): 69 | assert hasattr(regressor, "fit") 70 | assert hasattr(regressor, "predict") 71 | -------------------------------------------------------------------------------- /gtime/utils/hypothesis/general_strategies.py: -------------------------------------------------------------------------------- 1 | from hypothesis import assume 2 | from hypothesis.strategies import tuples, integers, floats, sampled_from 3 | import hypothesis.strategies as st 4 | from sklearn.ensemble import ( 5 | BaggingRegressor, 6 | AdaBoostRegressor, 7 | GradientBoostingRegressor, 8 | RandomForestRegressor, 9 | ) 10 | from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge 11 | from sklearn.tree import ExtraTreeRegressor 12 | 13 | 14 | def ordered_pair(min_value: int, max_value: int): 15 | if min_value == max_value: 16 | raise ValueError("min_value and max_value can not be the same") 17 | return ( 18 | tuples(integers(min_value, max_value), integers(min_value, max_value)) 19 | .map(sorted) 20 | .filter(lambda x: x[0] < x[1]) 21 | ) 22 | 23 | 24 | def shape_matrix(min_shape_0=30, max_shape_0=200, min_shape_1=5, max_shape_1=10): 25 | return tuples( 26 | integers(min_shape_0, max_shape_0), integers(min_shape_1, max_shape_1) 27 | ).filter(lambda x: x[0] > x[1]) 28 | 29 | 30 | @st.composite 31 | def shape_X_y_matrices( 32 | draw, 33 | min_shape_0=30, 34 | max_shape_0=200, 35 | min_shape_1_X=5, 36 | max_shape_1_X=10, 37 | min_shape_1_y=1, 38 | max_shape_1_y=3, 39 | y_as_vector=True, 40 | ): 41 | if max_shape_0 <= min_shape_1_X: 42 | raise ValueError( 43 | f"max_shape_0 must be greater than min_shape_1_X: " 44 | f"{max_shape_0}, {min_shape_1_X}" 45 | ) 46 | shape_0 = draw(integers(min_shape_0, max_shape_0)) 47 | shape_X = draw(shape_matrix(shape_0, shape_0, min_shape_1_X, max_shape_1_X)) 48 | if y_as_vector: 49 | shape_y = (shape_0,) 50 | else: 51 | shape_y = draw(shape_matrix(shape_0, shape_0, min_shape_1_y, 
max_shape_1_y)) 52 | assume(shape_X[1] < shape_X[0]) 53 | return shape_X, shape_y 54 | 55 | 56 | @st.composite 57 | def regressors(draw): 58 | regressors = [ 59 | LinearRegression(), 60 | Ridge(alpha=draw(floats(0.00001, 2))), 61 | BayesianRidge(), 62 | ExtraTreeRegressor(), 63 | GradientBoostingRegressor(), 64 | RandomForestRegressor(), 65 | ] 66 | return draw(sampled_from(regressors)) 67 | -------------------------------------------------------------------------------- .github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: CI 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ['3.8', '3.9', '3.10'] 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 30 | if [ -f dev-requirements.txt ]; then pip install -r dev-requirements.txt; fi 31 | pip install -e . 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Type checking with mypy 39 | run: | 40 | mypy --ignore-missing-imports . || { 41 | status=$?; echo "Type checking errors!"  # report type errors without failing the job
42 | } 43 | - name: Test with pytest 44 | continue-on-error: true 45 | run: | 46 | pytest --maxfail=10 47 | - name: Integration tests 48 | run: | 49 | set -e 50 | for n in examples/*.ipynb 51 | do 52 | jupyter nbconvert --to notebook --execute $n 53 | done 54 | - name: Build and install wheels 55 | run: | 56 | set -e 57 | python -m pip install wheel 58 | python setup.py bdist_wheel 59 | python -m pip install dist/*.whl 60 | - name: Upload artifacts 61 | uses: actions/upload-artifact@v2 62 | with: 63 | name: pip_wheel_${{ matrix.python-version }} 64 | path: dist 65 | -------------------------------------------------------------------------------- /gtime/utils/fixtures.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import pytest 4 | from pytest import fixture 5 | import numpy as np 6 | from sklearn.compose import make_column_selector 7 | from sklearn.linear_model import LinearRegression, Ridge 8 | 9 | from gtime.feature_extraction import Shift, MovingAverage 10 | from gtime.forecasting import GAR 11 | from gtime.time_series_models import TimeSeriesForecastingModel 12 | 13 | 14 | @fixture(scope="function") 15 | def features1(): 16 | return [ 17 | ("shift_0", Shift(0), make_column_selector(dtype_include=np.number)), 18 | ("shift_1", Shift(1), make_column_selector(dtype_include=np.number)), 19 | ( 20 | "moving_average_3", 21 | MovingAverage(window_size=3), 22 | make_column_selector(dtype_include=np.number), 23 | ), 24 | ] 25 | 26 | 27 | @fixture(scope="function") 28 | def features2(): 29 | return [ 30 | ("shift_0", Shift(0), make_column_selector(dtype_include=np.number)), 31 | ("shift_1", Shift(1), make_column_selector(dtype_include=np.number)), 32 | ] 33 | 34 | 35 | @fixture(scope="function") 36 | def model1(): 37 | lr = LinearRegression() 38 | return GAR(lr) 39 | 40 | 41 | @fixture(scope="function") 42 | def model2(): 43 | lr = Ridge(alpha=0.1) 44 | return GAR(lr) 45 | 46 | 47 | @fixture(scope="function") 48 | def time_series_forecasting_model1_no_cache(features1, model1): 49 | return TimeSeriesForecastingModel( 50 | features=features1, horizon=2, model=model1, cache_features=False, 51 | ) 52 | 53 | 54 | @fixture(scope="function") 55 | def time_series_forecasting_model1_cache(features1, model1): 56 | return TimeSeriesForecastingModel( 57 | features=features1, horizon=2, model=model1, cache_features=True, 58 | ) 59 | 60 | 61 | @pytest.fixture(scope="function") 62 | def estimator(): 63 | return LinearRegression() 64 | 65 | 66 | def _single_element_lazy_fixtures(*args): 67 | return [pytest.lazy_fixture(arg.__name__) for arg in args[0]] 68 | 69 | 70 | def lazy_fixtures(*args): 71 | if isinstance(args[0], tuple): 72 | raise NotImplementedError 73 | # return [tuple([pytest.lazy_fixture(arg[0].__name__), *arg[1:]]) for arg in args] 74 | else: 75 | return _single_element_lazy_fixtures(*args) 76 | -------------------------------------------------------------------------------- /gtime/external/make_holidays.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | 4 | # This source code is licensed under the MIT license found in the 5 | # LICENSE file in the root directory of this source tree. 
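# Usage sketch: the two public helpers below can be driven as follows
# ("US" is just an example country; support comes from the `holidays`
# package and the extensions in gtime.external.hdays):
#
#     from gtime.external.make_holidays import get_holiday_names, make_holidays_df
#
#     names = get_holiday_names("US")             # set of holiday names, sampled over 1995-2044
#     holidays_df = make_holidays_df([2020, 2021], "US")   # DataFrame with 'ds' and 'holiday' columns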
6 | 7 | from __future__ import absolute_import, division, print_function 8 | 9 | import warnings 10 | 11 | import numpy as np 12 | import pandas as pd 13 | 14 | import holidays as hdays_part1 15 | 16 | import gtime.external.hdays as hdays_part2 17 | 18 | 19 | def get_holiday_names(country): 20 | """Return all possible holiday names of given country 21 | Parameters 22 | ---------- 23 | country: country name 24 | Returns 25 | ------- 26 | A set of all possible holiday names of given country 27 | """ 28 | years = np.arange(1995, 2045) 29 | try: 30 | with warnings.catch_warnings(): 31 | warnings.simplefilter("ignore") 32 | holiday_names = getattr(hdays_part2, country)(years=years).values() 33 | except AttributeError: 34 | try: 35 | holiday_names = getattr(hdays_part1, country)(years=years).values() 36 | except AttributeError as e: 37 | raise AttributeError( 38 | "Holidays in {} are not currently supported!".format(country) 39 | ) from e 40 | return set(holiday_names) 41 | 42 | 43 | def make_holidays_df(year_list, country, province=None): 44 | """Make dataframe of holidays for given years and countries 45 | Parameters 46 | ---------- 47 | year_list: a list of years 48 | country: country name 49 | Returns 50 | ------- 51 | Dataframe with 'ds' and 'holiday', which can directly feed 52 | to 'holidays' params in Prophet 53 | """ 54 | try: 55 | holidays = getattr(hdays_part2, country)(years=year_list) 56 | except AttributeError: 57 | try: 58 | holidays = getattr(hdays_part1, country)(prov=province, years=year_list) 59 | except AttributeError as e: 60 | raise AttributeError( 61 | "Holidays in {} are not currently supported!".format(country) 62 | ) from e 63 | holidays_df = pd.DataFrame(list(holidays.items()), columns=["ds", "holiday"]) 64 | holidays_df.reset_index(inplace=True, drop=True) 65 | holidays_df["ds"] = pd.to_datetime(holidays_df["ds"]) 66 | return holidays_df 67 | -------------------------------------------------------------------------------- /gtime/regressors/tests/test_linear_regressor.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from hypothesis import given, settings 6 | from hypothesis.extra.numpy import arrays 7 | from hypothesis.strategies import floats 8 | 9 | from gtime.regressors import LinearRegressor 10 | 11 | 12 | class TestLinearRegressor: 13 | def test_linear_regressor(self): 14 | train, test = train_test_dataframe() 15 | 16 | predictions = compute_predictions_for_train_test(train, test) 17 | expected = compute_expectation_from_test(test) 18 | 19 | np.testing.assert_array_almost_equal(predictions, expected, decimal=2) 20 | 21 | @settings(deadline=None) 22 | @given( 23 | arrays( 24 | dtype=float, 25 | shape=(100, 1), 26 | elements=floats(allow_nan=False, allow_infinity=False, width=16), 27 | ) 28 | ) 29 | def test_linear_regressor_random_array(self, random_array): 30 | train, test = train_test_dataframe(random_array) 31 | 32 | predictions = compute_predictions_for_train_test(train, test) 33 | expected = compute_expectation_from_test(test) 34 | 35 | np.testing.assert_array_almost_equal(predictions, expected, decimal=0) 36 | 37 | 38 | def train_test_dataframe( 39 | random_array: np.ndarray = None, 40 | ) -> (pd.DataFrame, pd.DataFrame): 41 | random_array = ( 42 | random_array if random_array is not None else [random() for _ in range(100)] 43 | ) 44 | 45 | a1, a2, b = random() * 10, random() * 100, 2 * (1 - random()) 46 | 47 | df = pd.DataFrame() 48 | 
df["x1"] = list(range(100)) 49 | df["x2"] = random_array 50 | df["y"] = [b + a1 * t for t in range(100)] 51 | df["y"] = df["y"] + a2 * df["x2"] 52 | 53 | train = df[:90] 54 | test = df[90:] 55 | 56 | return train, test 57 | 58 | 59 | def compute_predictions_for_train_test( 60 | train: pd.DataFrame, test: pd.DataFrame 61 | ) -> np.ndarray: 62 | lr = LinearRegressor() 63 | 64 | lr.fit(train[["x1", "x2"]], train["y"], x0=[0, 0, 0]) 65 | 66 | preds_y = lr.predict(test[["x1", "x2"]]) 67 | preds_y = preds_y / np.sum(preds_y) 68 | 69 | return preds_y 70 | 71 | 72 | def compute_expectation_from_test(test: pd.DataFrame) -> np.ndarray: 73 | test_y = test["y"].values 74 | test_y = test_y / np.sum(test_y) 75 | return test_y 76 | -------------------------------------------------------------------------------- /gtime/causality/tests/test_linear_coefficient.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | 3 | import numpy as np 4 | import pytest 5 | from hypothesis import given, strategies as st 6 | from pandas.util import testing as testing 7 | 8 | from gtime.causality import ShiftedLinearCoefficient 9 | from gtime.causality.tests.common import make_df_from_expected_shifts 10 | 11 | 12 | def test_linear_coefficient(): 13 | expected_shifts = [randint(2, 6) * 2 for _ in range(3)] 14 | 15 | df = make_df_from_expected_shifts(expected_shifts) 16 | slc = ShiftedLinearCoefficient(target_col="A", max_shift=12) 17 | slc.fit(df) 18 | 19 | shifts = slc.best_shifts_["A"][4:].values 20 | np.testing.assert_array_equal(shifts, expected_shifts) 21 | 22 | 23 | # TODO: tests refactor TBD 24 | @given(st.integers(1, 20)) 25 | @pytest.mark.skip(reason="TODO: Write proper test, increase hypothesis max duration") 26 | def test_linear_coefficient_hyp(shift): 27 | testing.N, testing.K = 500, 1 28 | df = testing.makeTimeDataFrame(freq="D") 29 | df["shifted"] = df["A"].shift(shift) 30 | 31 | slc = ShiftedLinearCoefficient(target_col="A", max_shift=20) 32 | slc.fit(df).transform(df) 33 | 34 | 35 | def test_linear_bootstrap_p_values(): 36 | # This test and the next one just test if the p_values on the diagonal are equal 37 | # to 0. Is hard to implement other unittest, since the bootstrapping always 38 | # gives different result. 
However, other properties could be tested 39 | expected_shifts = [randint(2, 4) * 2 for _ in range(3)] 40 | df = make_df_from_expected_shifts(expected_shifts) 41 | shifted_test = ShiftedLinearCoefficient( 42 | target_col="A", max_shift=8, bootstrap_iterations=500, 43 | ) 44 | shifted_test.fit(df) 45 | 46 | linear_p_values = shifted_test.bootstrap_p_values_ 47 | for col_index in range(len(linear_p_values.columns)): 48 | assert linear_p_values.iloc[col_index, col_index] == 0 49 | 50 | 51 | def test_linear_permutation_p_values(): 52 | expected_shifts = [randint(2, 4) * 2 for _ in range(3)] 53 | df = make_df_from_expected_shifts(expected_shifts) 54 | shifted_test = ShiftedLinearCoefficient( 55 | target_col="A", max_shift=8, permutation_iterations=50, 56 | ) 57 | shifted_test.fit(df) 58 | 59 | linear_p_values = shifted_test.permutation_p_values_ 60 | for col_index in range(len(linear_p_values.columns)): 61 | assert linear_p_values.iloc[col_index, col_index] == 0 62 | -------------------------------------------------------------------------------- /gtime/feature_extraction/tests/test_trend.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | from gtime.feature_extraction import Detrender 5 | 6 | 7 | def test_polynomial_detrend(): 8 | time_index = pd.date_range(start="2020-01-01", end="2020-01-20") 9 | ts = pd.DataFrame(range(0, 20), index=time_index) 10 | 11 | detrend_feature = Detrender(trend="polynomial", trend_x0=np.zeros(3)) 12 | feature_name = detrend_feature.__class__.__name__ 13 | ts_t = detrend_feature.fit_transform(ts) 14 | expected_ts = pd.DataFrame( 15 | [ 16 | 1.22681324e-05, 17 | 8.34525141e-06, 18 | 4.86108426e-06, 19 | 1.81563099e-06, 20 | -7.91108403e-07, 21 | -2.95913392e-06, 22 | -4.68844555e-06, 23 | -5.97904330e-06, 24 | -6.83092717e-06, 25 | -7.24409716e-06, 26 | -7.21855327e-06, 27 | -6.75429551e-06, 28 | -5.85132385e-06, 29 | -4.50963832e-06, 30 | -2.72923891e-06, 31 | -5.10125625e-07, 32 | 2.14770155e-06, 33 | 5.24424260e-06, 34 | 8.77949753e-06, 35 | 1.27534663e-05, 36 | ], 37 | columns=[f"0__{feature_name}"], 38 | index=time_index, 39 | ) 40 | pd.testing.assert_frame_equal(ts_t, expected_ts, check_less_precise=3) 41 | 42 | 43 | def test_exponential_detrend(): 44 | time_index = pd.date_range(start="2020-01-01", end="2020-01-20") 45 | ts = pd.DataFrame(range(0, 20), index=time_index) 46 | 47 | detrend_feature = Detrender(trend="exponential", trend_x0=0) 48 | feature_name = detrend_feature.__class__.__name__ 49 | ts_t = detrend_feature.fit_transform(ts) 50 | expected_ts = pd.DataFrame( 51 | [ 52 | -1.0, 53 | -0.18238542, 54 | 0.60196471, 55 | 1.34698345, 56 | 2.04549733, 57 | 2.68902453, 58 | 3.26753629, 59 | 3.76917473, 60 | 4.1799193, 61 | 4.48319226, 62 | 4.65939237, 63 | 4.68534338, 64 | 4.53364205, 65 | 4.17188719, 66 | 3.5617681, 67 | 2.65798675, 68 | 1.40698343, 69 | -0.25457009, 70 | -2.40155216, 71 | -5.1224979, 72 | ], 73 | columns=[f"0__{feature_name}"], 74 | index=time_index, 75 | ) 76 | pd.testing.assert_frame_equal(ts_t, expected_ts) 77 | -------------------------------------------------------------------------------- /gtime/feature_extraction/tests/test_sorted_density.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.util.testing as testing 4 | import pytest 5 | 6 | from gtime.feature_extraction.custom import SortedDensity 7 | 8 | 9 | def get_input_data(): 10 | input_data = 
pd.DataFrame.from_dict({"x_1": [0, 7, 2], "x_2": [2, 10, 4]}) 11 | input_data.index = [ 12 | pd.Timestamp(2000, 1, 1), 13 | pd.Timestamp(2000, 2, 1), 14 | pd.Timestamp(2000, 3, 1), 15 | ] 16 | return input_data 17 | 18 | 19 | def get_output_causal(): 20 | custom_feature = SortedDensity(window_size=2, is_causal=True) 21 | feature_name = custom_feature.__class__.__name__ 22 | output_causal = pd.DataFrame.from_dict( 23 | { 24 | f"x_1__{feature_name}": [np.nan, 0.5, 0.6111111111111112], 25 | f"x_2__{feature_name}": [np.nan, 0.5833333333333334, 0.6428571428571429], 26 | } 27 | ) 28 | output_causal.index = [ 29 | pd.Timestamp(2000, 1, 1), 30 | pd.Timestamp(2000, 2, 1), 31 | pd.Timestamp(2000, 3, 1), 32 | ] 33 | return output_causal 34 | 35 | 36 | def get_output_anticausal(): 37 | custom_feature = SortedDensity(window_size=2, is_causal=False) 38 | feature_name = custom_feature.__class__.__name__ 39 | output_anticausal = pd.DataFrame.from_dict( 40 | { 41 | f"x_1__{feature_name}": [0.5, 0.6111111111111112], 42 | f"x_2__{feature_name}": [0.5833333333333334, 0.6428571428571429], 43 | } 44 | ) 45 | output_anticausal.index = [ 46 | pd.Timestamp(2000, 2, 1), 47 | pd.Timestamp(2000, 3, 1), 48 | ] 49 | return output_anticausal 50 | 51 | 52 | input_data = get_input_data() 53 | output_causal = get_output_causal() 54 | output_anticausal = get_output_anticausal() 55 | 56 | 57 | class TestSortedDensity: 58 | @pytest.mark.parametrize("test_input, expected", [(input_data, output_causal)]) 59 | def test_crest_factor_detrending_causal(self, test_input, expected): 60 | feature = SortedDensity(window_size=2, is_causal=True) 61 | output = feature.fit_transform(test_input) 62 | testing.assert_frame_equal(output, expected) 63 | 64 | @pytest.mark.parametrize("test_input, expected", [(input_data, output_anticausal)]) 65 | def test_crest_factor_detrending_anticausal(self, test_input, expected): 66 | feature = SortedDensity(window_size=2, is_causal=False) 67 | output = feature.fit_transform(test_input) 68 | testing.assert_frame_equal(output, expected) 69 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | """Toolbox for Time Series Analysis.""" 3 | 4 | import os 5 | import codecs 6 | 7 | from setuptools import setup, find_packages 8 | 9 | from gtime import __version__ 10 | 11 | version_file = os.path.join("gtime", "_version.py") 12 | with open(version_file) as f: 13 | exec(f.read()) 14 | 15 | with open("requirements.txt") as f: 16 | requirements = f.read().splitlines() 17 | with open("doc-requirements.txt") as f: 18 | doc_requirements = f.read().splitlines() 19 | with open("dev-requirements.txt") as f: 20 | dev_requirements = f.read().splitlines() 21 | 22 | DISTNAME = "giotto-time" 23 | DESCRIPTION = "Toolbox for Time Series analysis and integration with Machine Learning." 
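# The long description rendered on PyPI is taken verbatim from the README;
# the "utf-8-sig" codec strips a leading BOM if one is present.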
24 | with codecs.open("README.md", encoding="utf-8-sig") as f: 25 | LONG_DESCRIPTION = f.read() 26 | LONG_DESCRIPTION_TYPE = "text/markdown" 27 | MAINTAINER = "Alessio Baccelli" 28 | MAINTAINER_EMAIL = "maintainers@giotto.ai" 29 | URL = "https://github.com/giotto-ai/giotto-time" 30 | LICENSE = "AGPLv3" 31 | DOWNLOAD_URL = "https://github.com/giotto-ai/giotto-time/tarball/v0.0a0" 32 | VERSION = __version__ 33 | CLASSIFIERS = [ 34 | "Intended Audience :: Information Technology", 35 | "Intended Audience :: Developers", 36 | "License :: OSI Approved", 37 | "Programming Language :: Python", 38 | "Topic :: Software Development", 39 | "Topic :: Scientific/Engineering", 40 | "Operating System :: Microsoft :: Windows", 41 | "Operating System :: POSIX", 42 | "Operating System :: Unix", 43 | "Operating System :: MacOS", 44 | "Programming Language :: Python :: 3.7", 45 | "Programming Language :: Python :: 3.8", 46 | "Programming Language :: Python :: 3.9", 47 | ] 48 | KEYWORDS = ( 49 | "machine learning time series data analysis " + "topology, persistence diagrams" 50 | ) 51 | INSTALL_REQUIRES = requirements 52 | EXTRAS_REQUIRE = { 53 | "tests": dev_requirements, 54 | "doc": doc_requirements, 55 | "examples": [], 56 | } 57 | 58 | 59 | setup( 60 | name=DISTNAME, 61 | maintainer=MAINTAINER, 62 | maintainer_email=MAINTAINER_EMAIL, 63 | description=DESCRIPTION, 64 | license=LICENSE, 65 | url=URL, 66 | version=VERSION, 67 | download_url=DOWNLOAD_URL, 68 | long_description=LONG_DESCRIPTION, 69 | long_description_content_type=LONG_DESCRIPTION_TYPE, 70 | zip_safe=False, 71 | classifiers=CLASSIFIERS, 72 | packages=find_packages(), 73 | keywords=KEYWORDS, 74 | install_requires=INSTALL_REQUIRES, 75 | extras_require=EXTRAS_REQUIRE, 76 | ) 77 | -------------------------------------------------------------------------------- /gtime/feature_extraction/tests/test_crest_factor_detrending.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pandas.util.testing as testing 4 | import pytest 5 | 6 | from gtime.feature_extraction.custom import CrestFactorDetrending 7 | 8 | 9 | def get_input_data(): 10 | input_data = pd.DataFrame.from_dict({"x_1": [0, 7, 2], "x_2": [2, 10, 4]}) 11 | input_data.index = [ 12 | pd.Timestamp(2000, 1, 1), 13 | pd.Timestamp(2000, 2, 1), 14 | pd.Timestamp(2000, 3, 1), 15 | ] 16 | return input_data 17 | 18 | 19 | def get_output_causal(): 20 | custom_feature = CrestFactorDetrending(window_size=2, is_causal=True) 21 | feature_name = custom_feature.__class__.__name__ 22 | output_causal = pd.DataFrame.from_dict( 23 | { 24 | f"x_1__{feature_name}": [np.nan, 1.0, 0.07547169811320754], 25 | f"x_2__{feature_name}": [np.nan, 0.9615384615384616, 0.13793103448275862], 26 | } 27 | ) 28 | output_causal.index = [ 29 | pd.Timestamp(2000, 1, 1), 30 | pd.Timestamp(2000, 2, 1), 31 | pd.Timestamp(2000, 3, 1), 32 | ] 33 | return output_causal 34 | 35 | 36 | def get_output_anticausal(): 37 | custom_feature = CrestFactorDetrending(window_size=2, is_causal=False) 38 | feature_name = custom_feature.__class__.__name__ 39 | output_anticausal = pd.DataFrame.from_dict( 40 | { 41 | f"x_1__{feature_name}": [1.0, 0.07547169811320754], 42 | f"x_2__{feature_name}": [0.9615384615384616, 0.13793103448275862], 43 | } 44 | ) 45 | output_anticausal.index = [ 46 | pd.Timestamp(2000, 2, 1), 47 | pd.Timestamp(2000, 3, 1), 48 | ] 49 | return output_anticausal 50 | 51 | 52 | input_data = get_input_data() 53 | output_causal = get_output_causal() 54 | 
output_anticausal = get_output_anticausal() 55 | 56 | 57 | class TestCrestFactorDetrending: 58 | @pytest.mark.parametrize("test_input, expected", [(input_data, output_causal)]) 59 | def test_crest_factor_detrending_causal(self, test_input, expected): 60 | feature = CrestFactorDetrending(window_size=2, is_causal=True) 61 | output = feature.fit_transform(test_input) 62 | testing.assert_frame_equal(output, expected) 63 | 64 | @pytest.mark.parametrize("test_input, expected", [(input_data, output_anticausal)]) 65 | def test_crest_factor_detrending_anticausal(self, test_input, expected): 66 | feature = CrestFactorDetrending(window_size=2, is_causal=False) 67 | output = feature.fit_transform(test_input) 68 | testing.assert_frame_equal(output, expected) 69 | -------------------------------------------------------------------------------- /gtime/time_series_models/tests/test_simple_models.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import pytest 4 | from pandas.util import testing as testing 5 | from hypothesis import given, note 6 | import hypothesis.strategies as st 7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series 8 | 9 | 10 | from gtime.time_series_models import ( 11 | Naive, 12 | SeasonalNaive, 13 | Average, 14 | Drift, 15 | ) 16 | 17 | 18 | @st.composite 19 | def forecast_input(draw, max_lenth): 20 | length = draw(st.integers(min_value=4, max_value=max_lenth)) 21 | horizon = draw(st.integers(min_value=1, max_value=length - 1)) 22 | window = draw(st.integers(min_value=1, max_value=length - horizon)) 23 | df = draw( 24 | giotto_time_series( 25 | min_length=horizon + window, 26 | max_length=max_lenth, 27 | allow_nan=False, 28 | allow_infinity=False, 29 | ) 30 | ) 31 | return df, horizon, window 32 | 33 | 34 | class TestNaiveForecast: 35 | @given(x=forecast_input(50)) 36 | def test_fit_predict(self, x): 37 | df, horizon, _ = x 38 | model = Naive(horizon=horizon) 39 | model.fit(df) 40 | y_pred = model.predict() 41 | assert y_pred.shape == (horizon, horizon) 42 | res = np.broadcast_to(df.iloc[-horizon:], (horizon, horizon)) 43 | y_cols = ["y_" + str(x + 1) for x in range(horizon)] 44 | expected_df = pd.DataFrame(res, index=model.X_test_.index, columns=y_cols) 45 | testing.assert_frame_equal(y_pred, expected_df) 46 | 47 | 48 | class TestSeasonalNaiveForecast: 49 | @given(x=forecast_input(50)) 50 | def test_fit_predict(self, x): 51 | df, horizon, seasonal_length = x 52 | model = SeasonalNaive(horizon=horizon, seasonal_length=seasonal_length) 53 | model.fit(df) 54 | y_pred = model.predict() 55 | note(y_pred) 56 | assert y_pred.shape[1] == horizon 57 | if seasonal_length < horizon: 58 | assert all(y_pred.iloc[:, 0] == y_pred.iloc[:, seasonal_length]) 59 | 60 | 61 | class TestAverageForecast: 62 | @given(x=forecast_input(50)) 63 | def test_fit_predict(self, x): 64 | df, horizon, _ = x 65 | model = Average(horizon=horizon) 66 | model.fit(df) 67 | y_pred = model.predict() 68 | note(y_pred) 69 | assert y_pred.shape == (horizon, horizon) 70 | assert pytest.approx(y_pred.diff(axis=1).sum().sum()) == 0 71 | means = [df.mean()] + [df.iloc[:-i].mean() for i in range(1, horizon)] 72 | 73 | 74 | class TestDriftForecast: 75 | @given(x=forecast_input(50)) 76 | def test_fit_predict(self, x): 77 | df, horizon, _ = x 78 | model = Drift(horizon=horizon) 79 | model.fit(df) 80 | y_pred = model.predict() 81 | note(y_pred) 82 | assert len(y_pred) == horizon 83 | # assert 
pytest.approx(y_pred.diff().diff().sum().sum()) == 0 84 | -------------------------------------------------------------------------------- /gtime/compose/feature_creation.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from sklearn.compose import ColumnTransformer 3 | 4 | 5 | class FeatureCreation(ColumnTransformer): 6 | """Applies transformers to columns of a pandas DataFrame. 7 | 8 | This estimator is a wrapper of sklearn.compose.ColumnTransformer, the only 9 | difference is the output type of fit_transform and transform methods which is a 10 | DataFrame instead of an array. 11 | 12 | """ 13 | 14 | def fit_transform(self, X: pd.DataFrame, y: pd.DataFrame = None): 15 | """Fit all transformers, transform the data and concatenate results. 16 | 17 | Parameters 18 | ---------- 19 | X : pd.DataFrame, shape (n_samples, n_features), required 20 | Input data, of which specified subsets are used to fit the 21 | transformers. 22 | 23 | y : pd.DataFrame, shape (n_samples, ...), optional, default: ``None`` 24 | Targets for supervised learning. 25 | 26 | Examples 27 | -------- 28 | >>> import pandas.util.testing as testing 29 | >>> from gtime.compose import FeatureCreation 30 | >>> from gtime.feature_extraction import Shift, MovingAverage 31 | >>> data = testing.makeTimeDataFrame(freq="s") 32 | >>> fc = FeatureCreation([ 33 | ... ('s1', Shift(1), ['A']), 34 | ... ('ma3', MovingAverage(window_size=3), ['B']), 35 | ... ]) 36 | >>> fc.fit_transform(data).head() 37 | s1__A__Shift ma3__B__MovingAverage 38 | 2000-01-01 00:00:00 NaN NaN 39 | 2000-01-01 00:00:01 0.211403 NaN 40 | 2000-01-01 00:00:02 -0.313854 0.085045 41 | 2000-01-01 00:00:03 0.502018 -0.239269 42 | 2000-01-01 00:00:04 -0.225324 -0.144625 43 | 44 | Returns 45 | ------- 46 | X_t_df : pd.DataFrame, shape (n_samples, sum_n_components) 47 | hstack of results of transformers. sum_n_components is the 48 | sum of n_components (output dimension) over transformers. 49 | 50 | """ 51 | X_t = super().fit_transform(X, y) 52 | X_t_df = pd.DataFrame(data=X_t, columns=self.get_feature_names(), index=X.index) 53 | return X_t_df 54 | 55 | def transform(self, X: pd.DataFrame): 56 | """Transform X separately by each transformer, concatenate results. 57 | 58 | Parameters 59 | ---------- 60 | X : pd.DataFrame, shape (n_samples, n_features), required 61 | The data to be transformed by subset. 62 | 63 | Returns 64 | ------- 65 | X_t_df : DataFrame, shape (n_samples, sum_n_components) 66 | hstack of results of transformers. sum_n_components is the 67 | sum of n_components (output dimension) over transformers. If 68 | any result is a sparse matrix, everything will be converted to 69 | sparse matrices. 
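Examples
--------
A sketch of typical use, assuming the ``fc`` object from the
``fit_transform`` example above has already been fitted:

>>> fc.transform(data.tail(3)).columns.tolist()
['s1__A__Shift', 'ma3__B__MovingAverage']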
70 | 71 | """ 72 | X_t = super().transform(X) 73 | X_t_df = pd.DataFrame(data=X_t, columns=self.get_feature_names(), index=X.index) 74 | return X_t_df 75 | -------------------------------------------------------------------------------- /gtime/model_selection/tests/test_splitters.py: -------------------------------------------------------------------------------- 1 | import hypothesis.strategies as st 2 | import numpy as np 3 | import pytest 4 | from hypothesis import given, settings, HealthCheck 5 | from sklearn.compose import make_column_selector 6 | 7 | from gtime.compose import FeatureCreation 8 | from gtime.feature_extraction import Shift, MovingAverage 9 | from gtime.model_selection import horizon_shift 10 | from gtime.model_selection.splitters import FeatureSplitter 11 | from gtime.utils.hypothesis.feature_matrices import X_y_matrices 12 | 13 | # TODO: refactor, make hypothesis generator instead of a full pipeline 14 | from gtime.utils.hypothesis.time_indexes import giotto_time_series 15 | 16 | df_transformer = FeatureCreation( 17 | [ 18 | ("shift_0", Shift(0), make_column_selector(dtype_include=np.number)), 19 | ("shift_1", Shift(1), make_column_selector(dtype_include=np.number)), 20 | ( 21 | "moving_average_3", 22 | MovingAverage(window_size=3), 23 | make_column_selector(dtype_include=np.number), 24 | ), 25 | ] 26 | ) 27 | 28 | horizon = 4 29 | 30 | 31 | class TestFeatureSplitter: 32 | def test_constructor(self): 33 | FeatureSplitter() 34 | 35 | @given(st.text().filter(lambda x: x != "any")) 36 | def test_constructor_wrong_parameter(self, drop_na_mode: str): 37 | with pytest.raises(ValueError): 38 | FeatureSplitter(drop_na_mode) 39 | 40 | @settings(suppress_health_check=(HealthCheck.too_slow,)) 41 | @given( 42 | X_y_matrices( 43 | horizon=horizon, df_transformer=df_transformer, allow_nan_infinity=False, 44 | ) 45 | ) 46 | def test_transform(self, X_y): 47 | X, y = X_y 48 | feature_splitter = FeatureSplitter() 49 | X_train, y_train, X_test, y_test = feature_splitter.transform(X, y) 50 | 51 | assert X_train.shape[0] == max(0, X.shape[0] - 2 - horizon) 52 | assert y_train.shape[0] == X_train.shape[0] 53 | assert X_test.shape[0] == min(max(0, X.shape[0] - 2), horizon) 54 | assert y_test.shape[0] == X_test.shape[0] 55 | 56 | 57 | class TestHorizonShift: 58 | @given( 59 | giotto_time_series(min_length=10, allow_infinity=False, allow_nan=False), 60 | st.integers(1, 8), 61 | ) 62 | def test_horizon_int(self, time_series, horizon): 63 | y_shifted = horizon_shift(time_series, horizon) 64 | assert y_shifted.shape[1] == horizon 65 | 66 | # Check first line of y_shifted 67 | for i in range(1, horizon + 1): 68 | assert time_series.iloc[i, 0] == y_shifted.iloc[0, i - 1] 69 | 70 | @given( 71 | giotto_time_series(min_length=10, allow_infinity=False, allow_nan=False), 72 | st.sets(elements=st.integers(1, 8), min_size=1, max_size=8), 73 | ) 74 | def test_horizon_list(self, time_series, horizon): 75 | horizon = list(sorted(horizon)) 76 | y_shifted = horizon_shift(time_series, horizon) 77 | assert y_shifted.shape[1] == len(horizon) 78 | 79 | # Check first line of y_shifted 80 | for i, elem in enumerate(horizon): 81 | assert time_series.iloc[elem, 0] == y_shifted.iloc[0, i] 82 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # http://www.sphinx-doc.org/en/master/config 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | import sphinx_rtd_theme # noqa 16 | 17 | sys.path.insert(0, os.path.abspath(os.path.join("..", ".."))) 18 | # sys.path.insert(0, os.path.abspath("../")) 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "giotto-time" 23 | copyright = "2022, L2F" 24 | 25 | # The full version, including alpha/beta/rc tags 26 | from gtime import __version__ 27 | 28 | release = __version__ 29 | 30 | # -- General configuration --------------------------------------------------- 31 | 32 | # Add any Sphinx extension module names here, as strings. They can be 33 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 34 | # ones. 35 | extensions = [ 36 | "sphinx.ext.autodoc", 37 | "sphinx_rtd_theme", 38 | ] 39 | 40 | # this is needed for some reason... 41 | # see https://github.com/numpy/numpydoc/issues/69 42 | # numpydoc_class_members_toctree = False 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ["_templates"] 46 | 47 | # generate autosummary even if no references 48 | #autosummary_generate = True 49 | 50 | # The suffix of source filenames. 51 | # source_suffix = ".rst" 52 | 53 | # The encoding of source files. 54 | # source_encoding = 'utf-8' 55 | 56 | # The master toctree document. 57 | # master_doc = "index" 58 | 59 | # List of patterns, relative to source directory, that match files and 60 | # directories to ignore when looking for source files. 61 | # This pattern also affects html_static_path and html_extra_path. 62 | exclude_patterns = [] 63 | 64 | # If true, '()' will be appended to :func: etc. cross-reference text. 65 | # add_function_parentheses = False 66 | 67 | # If true, the current module name will be prepended to all description 68 | # unit titles (such as .. function::). 69 | # add_module_names = True 70 | 71 | # If true, sectionauthor and moduleauthor directives will be shown in the 72 | # output. They are ignored by default. 73 | # show_authors = False 74 | 75 | # The name of the Pygments (syntax highlighting) style to use. 76 | # pygments_style = "sphinx" 77 | 78 | # A list of ignored prefixes for module index sorting. 79 | # modindex_common_prefix = [] 80 | # -- Options for HTML output ------------------------------------------------- 81 | 82 | # The theme to use for HTML and HTML Help pages. See the documentation for 83 | # a list of builtin themes. 84 | # 85 | html_theme = "sphinx_rtd_theme" 86 | 87 | # Add any paths that contain custom static files (such as style sheets) here, 88 | # relative to this directory. They are copied after the builtin static files, 89 | # so a file named "default.css" will overwrite the builtin "default.css". 
90 | html_static_path = [] # ['_static'] 91 | -------------------------------------------------------------------------------- /gtime/forecasting/tests/test_naive.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | from pandas.util import testing as testing 5 | from hypothesis import given, note 6 | import hypothesis.strategies as st 7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series 8 | from gtime.model_selection import horizon_shift, FeatureSplitter 9 | 10 | from gtime.forecasting import ( 11 | NaiveForecaster, 12 | SeasonalNaiveForecaster, 13 | DriftForecaster, 14 | AverageForecaster, 15 | ) 16 | 17 | 18 | @st.composite 19 | def forecast_input(draw, max_lenth): 20 | length = draw(st.integers(min_value=2, max_value=max_lenth)) 21 | horizon = draw(st.integers(min_value=1, max_value=length - 1)) 22 | X = draw( 23 | giotto_time_series( 24 | min_length=length, 25 | max_length=max_lenth, 26 | allow_nan=False, 27 | allow_infinity=False, 28 | ) 29 | ) 30 | y = horizon_shift(X, horizon=horizon) 31 | X_train, y_train, X_test, y_test = FeatureSplitter().transform(X, y) 32 | return X_train, y_train, X_test 33 | 34 | 35 | class SimplePipelineTest: 36 | def setup(self, data, Model): 37 | X_train, y_train, X_test = data 38 | self.model = Model 39 | self.model.fit(X_train, y_train) 40 | self.X_test = X_test 41 | self.y_pred = self.model.predict(X_test) 42 | 43 | def test_fit_horizon(self): 44 | assert self.model.horizon_ == len(self.X_test) 45 | 46 | def test_predict_shape(self): 47 | assert self.y_pred.shape == (self.model.horizon_, self.model.horizon_) 48 | 49 | 50 | class TestNaiveModel(SimplePipelineTest): 51 | @given(data=forecast_input(50)) 52 | def setup(self, data): 53 | super().setup(data, NaiveForecaster()) 54 | 55 | def test_predict_df(self): 56 | horizon = len(self.X_test) 57 | y_cols = ["y_" + str(x + 1) for x in range(len(self.X_test))] 58 | res = np.broadcast_to(self.X_test, (horizon, horizon)) 59 | expected_df = pd.DataFrame(res, index=self.X_test.index, columns=y_cols) 60 | testing.assert_frame_equal(self.y_pred, expected_df) 61 | 62 | 63 | class TestSeasonalNaiveModel(SimplePipelineTest): 64 | @given(data=forecast_input(50), season_length=st.data()) 65 | def setup(self, data, season_length): 66 | season_length = season_length.draw( 67 | st.integers(min_value=1, max_value=len(data[0])) 68 | ) 69 | self.season_length = season_length 70 | super().setup(data, SeasonalNaiveForecaster(seasonal_length=season_length)) 71 | 72 | def test_predict_seasonality(self): 73 | if self.season_length < self.model.horizon_: 74 | assert all( 75 | self.y_pred.iloc[:, 0] == self.y_pred.iloc[:, self.season_length] 76 | ) 77 | 78 | 79 | class TestDriftModel(SimplePipelineTest): 80 | @given(data=forecast_input(50)) 81 | def setup(self, data): 82 | super().setup(data, DriftForecaster()) 83 | 84 | def test_predict_drift(self): 85 | pytest.approx(self.y_pred.diff().diff().sum().sum()) 86 | # assert pytest.approx(self.y_pred.diff().diff().sum().sum()) == 0 87 | 88 | 89 | class TestAverageModel(SimplePipelineTest): 90 | @given(data=forecast_input(50)) 91 | def setup(self, data): 92 | super().setup(data, AverageForecaster()) 93 | 94 | def test_predict_difference(self): 95 | assert pytest.approx(self.y_pred.diff(axis=1).sum().sum()) == 0 96 | -------------------------------------------------------------------------------- /examples/hierarchical_model.ipynb: 
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Hierarchical model\n",
8 | "This example shows how the hierarchical model can be used."
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": null,
14 | "metadata": {},
15 | "outputs": [],
16 | "source": [
17 | "import sys\n",
18 | "sys.path.append('../')"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": null,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import pandas as pd\n",
28 | "import numpy as np\n",
29 | "import matplotlib.pyplot as plt\n",
30 | "import networkx as nx\n",
31 | "%matplotlib inline \n",
32 | "\n",
33 | "from gtime.hierarchical import HierarchicalMiddleOut\n",
34 | "from gtime.hierarchical import HierarchicalTopDown\n",
35 | "from gtime.hierarchical import HierarchicalBottomUp\n",
36 | "import pandas._testing as testing\n",
37 | "from gtime.time_series_models import AR"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "testing.N, testing.K = 20, 1\n",
47 | "\n",
48 | "data1 = testing.makeTimeDataFrame(freq=\"s\")\n",
49 | "data2 = testing.makeTimeDataFrame(freq=\"s\")\n",
50 | "data3 = testing.makeTimeDataFrame(freq=\"s\")\n",
51 | "data4 = testing.makeTimeDataFrame(freq=\"s\")\n",
52 | "data5 = testing.makeTimeDataFrame(freq=\"s\")\n",
53 | "data6 = testing.makeTimeDataFrame(freq=\"s\")\n",
54 | "data = {'data1': data1, 'data2': data2, 'data3' : data3, 'data4' : data4, 'data5' : data5, 'data6' : data6}"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "tree_adj = {'data1' : ['data2','data3'], 'data2': ['data4', 'data5'], 'data3':['data6'], 'data4':[], 'data5':[], 'data6':[]} "
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": null,
69 | "metadata": {},
70 | "outputs": [],
71 | "source": [
72 | "stat_model = AR(p=2, horizon=3)\n",
73 | "middle_out_model = HierarchicalMiddleOut(model=stat_model, hierarchy_tree=tree_adj, method='tdsga', level=0)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "fitting_middle_out = middle_out_model.fit(data)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": null,
88 | "metadata": {
89 | "scrolled": true
90 | },
91 | "outputs": [],
92 | "source": [
93 | "fitting_middle_out.predict(data)"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": null,
99 | "metadata": {},
100 | "outputs": [],
101 | "source": []
102 | }
103 | ],
104 | "metadata": {
105 | "kernelspec": {
106 | "display_name": "Python 3 (ipykernel)",
107 | "language": "python",
108 | "name": "python3"
109 | },
110 | "language_info": {
111 | "codemirror_mode": {
112 | "name": "ipython",
113 | "version": 3
114 | },
115 | "file_extension": ".py",
116 | "mimetype": "text/x-python",
117 | "name": "python",
118 | "nbconvert_exporter": "python",
119 | "pygments_lexer": "ipython3",
120 | "version": "3.9.13"
121 | }
122 | },
123 | "nbformat": 4,
124 | "nbformat_minor": 4
125 | }
126 | 
-------------------------------------------------------------------------------- /gtime/regressors/linear_regressor.py: --------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas
as pd 3 | from scipy.optimize import minimize 4 | from sklearn.metrics import mean_squared_error 5 | from sklearn.utils.validation import check_is_fitted 6 | 7 | 8 | class LinearRegressor: 9 | """Implementation of a LinearRegressor that takes a custom loss function. 10 | 11 | Parameters 12 | ---------- 13 | loss : Callable, optional, default: ``mean_squared_error`` 14 | The loss function to use when fitting the model. The loss function must accept 15 | y_true, y_pred and return a single real number. 16 | 17 | Examples 18 | -------- 19 | >>> from gtime.regressors.linear_regressor import LinearRegressor 20 | >>> from gtime.metrics import max_error 21 | >>> import numpy as np 22 | >>> import pandas as pd 23 | >>> X = np.random.random((100, 10)) 24 | >>> y = np.random.random(100) 25 | >>> lr = LinearRegressor(loss=max_error) 26 | >>> X_train, y_train = X[:90], y[:90] 27 | >>> X_test, y_test = X[90:], y[90:] 28 | >>> x0 = [0]*11 29 | >>> lr.fit(X_train, y_train, x0=x0) 30 | >>> lr.predict(X_test) 31 | array([0.62987155, 0.46971378, 0.50421395, 0.5543149 , 0.50848151, 32 | 0.54768797, 0.50968854, 0.50500384, 0.58069366, 0.54912972]) 33 | 34 | """ 35 | 36 | def __init__(self, loss=mean_squared_error): 37 | self.loss = loss 38 | 39 | def fit(self, X: pd.DataFrame, y: pd.DataFrame, **kwargs) -> "LinearRegressor": 40 | """Fit the linear model on ``X`` and ``y`` on the given loss function.To do the 41 | minimization, the ``scipy.optimize.minimize`` function is used. To have more 42 | details and check which kind of options are available, please refer to the scipy 43 | `documentation 44 | `_. 45 | 46 | Parameters 47 | ---------- 48 | X : pd.DataFrame, shape (n_samples, n_features), required 49 | The X matrix used as features in the fitting procedure. 50 | 51 | y : pd.DataFrame, shape (n_samples, 1), required 52 | The y matrix to use as target values in the fitting procedure. 53 | 54 | kwargs: dict, optional. 55 | Optional arguments to pass to the ``minimize`` function of scipy. 56 | 57 | Returns 58 | ------- 59 | self: LinearRegressor 60 | The fitted model. 61 | 62 | """ 63 | 64 | if isinstance(X, pd.DataFrame): 65 | X = X.values 66 | 67 | if isinstance(y, pd.DataFrame): 68 | y = y.values 69 | 70 | def prediction_error(model_weights): 71 | predictions = [ 72 | model_weights[0] + np.dot(model_weights[1:], row) for row in X 73 | ] 74 | return self.loss(y, predictions) 75 | 76 | res = minimize(prediction_error, **kwargs) 77 | 78 | self.model_weights_ = res["x"] 79 | 80 | return self 81 | 82 | def predict(self, X: pd.DataFrame) -> pd.DataFrame: 83 | """Predict the y values associated to the features ``X``. 84 | 85 | Parameters 86 | ---------- 87 | X : pd.DataFrame, shape (n_samples, n_features), required 88 | The features used to predict. 
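A plain ``numpy`` array with the same number of feature columns is
accepted as well, since the prediction is computed as
``model_weights_[0] + X.dot(model_weights_[1:])``.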
89 | 90 | Returns 91 | ------- 92 | predictions : pd.DataFrame, shape (n_samples, 1) 93 | The predictions of the model 94 | 95 | """ 96 | check_is_fitted(self) 97 | 98 | predictions = self.model_weights_[0] + np.dot(X, self.model_weights_[1:]) 99 | return predictions 100 | -------------------------------------------------------------------------------- /gtime/plotting/tests/test_plotting.py: -------------------------------------------------------------------------------- 1 | from hypothesis import given, settings 2 | import hypothesis.strategies as st 3 | import pytest 4 | import pandas as pd 5 | import numpy as np 6 | import matplotlib.pyplot as plt 7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series 8 | 9 | from gtime.plotting import lag_plot, acf_plot, seasonal_subplots, seasonal_plot 10 | from gtime.plotting.preprocessing import seasonal_split 11 | 12 | 13 | @pytest.fixture() 14 | def time_series(): 15 | idx = pd.period_range(start="2000-01-01", end="2003-01-01") 16 | df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=["ts"]) 17 | return df 18 | 19 | 20 | class TestLagplots: 21 | @pytest.mark.parametrize("lags", [1, 5, [1], [1, 3, 5, 100]]) 22 | def test_subplots_number(self, time_series, lags): 23 | ax = lag_plot(time_series, lags=lags) 24 | num_plots = sum(map(lambda x: x.has_data(), ax.flatten())) 25 | if isinstance(lags, int): 26 | expected_num_plots = lags 27 | else: 28 | expected_num_plots = len(lags) 29 | assert num_plots == expected_num_plots 30 | plt.close("all") 31 | 32 | @pytest.mark.parametrize("lags", [1, 5, [1], [1, 3, 5, 100]]) 33 | @pytest.mark.parametrize("plots_per_row", [1, 3, 10]) 34 | def test_rows_and_cols(self, time_series, lags, plots_per_row): 35 | ax = lag_plot(time_series, lags=lags, plots_per_row=plots_per_row) 36 | if isinstance(lags, int): 37 | lag_length = lags 38 | else: 39 | lag_length = len(lags) 40 | assert ax.shape == ( 41 | (lag_length - 1) // plots_per_row + 1, 42 | min(lag_length, plots_per_row), 43 | ) 44 | plt.close("all") 45 | 46 | 47 | class TestACFplots: 48 | @pytest.mark.parametrize("maxlags", [1, 5, 100]) 49 | @pytest.mark.parametrize("ci", [0.0, 0.05]) 50 | @pytest.mark.parametrize("partial", [True, False]) 51 | def test_ci_lines(self, time_series, maxlags, ci, partial): 52 | ax = acf_plot(time_series, max_lags=maxlags, ci=ci, partial=partial) 53 | assert len(ax.lines) == 3 54 | plt.close("all") 55 | 56 | @pytest.mark.parametrize("maxlags", [1, 5, 100]) 57 | @pytest.mark.parametrize("ci", [0.0, 0.05]) 58 | @pytest.mark.parametrize("partial", [True, False]) 59 | def test_num_bars(self, time_series, maxlags, ci, partial): 60 | ax = acf_plot(time_series, maxlags, ci, partial) 61 | assert len(ax.containers[0]) == min(len(time_series), maxlags) 62 | plt.close("all") 63 | 64 | 65 | class TestSubplots: 66 | @pytest.mark.parametrize("cycle", ["year", "6M"]) 67 | @pytest.mark.parametrize("freq", ["M"]) 68 | @pytest.mark.parametrize("box", [True, False]) 69 | def test_subplots_number(self, time_series, cycle, freq, box): 70 | ax = seasonal_subplots(time_series, cycle=cycle, freq=freq, box=box) 71 | split = seasonal_split(time_series, cycle=cycle, freq=freq) 72 | assert ax.size == split.shape[0] 73 | plt.close("all") 74 | 75 | 76 | class TestSeasonalPlots: 77 | @pytest.mark.parametrize("cycle", ["year", "6M"]) 78 | @pytest.mark.parametrize("freq", ["M", None]) 79 | @pytest.mark.parametrize("polar", [True, False]) 80 | @pytest.mark.parametrize("new_ax", [True, False]) 81 | def test_seasonal_num_lines(self, 
time_series, cycle, freq, polar, new_ax): 82 | if new_ax: 83 | if polar: 84 | ax = plt.subplot(111, projection="polar") 85 | else: 86 | ax = plt.subplot(111) 87 | else: 88 | ax = None 89 | ax = seasonal_plot(time_series, cycle=cycle, freq=freq, polar=polar, ax=ax) 90 | split = seasonal_split(time_series, cycle=cycle, freq=freq) 91 | assert len(ax.lines) == split.shape[1] 92 | plt.close("all") 93 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.rst: -------------------------------------------------------------------------------- 1 | CONTRIBUTOR CODE OF CONDUCT 2 | =========================== 3 | (Code of Conduct) 4 | ----------------- 5 | 6 | 7 | Our Pledge 8 | ---------- 9 | 10 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to make participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, sex characteristics, gender identity and expression, level of experience, education, socio-economic status, nationality, personal appearance, race, religion, or sexual identity and orientation. 11 | 12 | Our Standards 13 | ------------- 14 | 15 | Examples of behavior that contributes to creating a positive environment include: 16 | 17 | * Using welcoming and inclusive language; 18 | * Being respectful of differing viewpoints and experiences; 19 | * Gracefully accepting constructive criticism; 20 | * Focusing on what is best for the community; 21 | * Showing empathy towards other community members. 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or advances; 26 | * Trolling, insulting/derogatory comments, and personal or political attacks; 27 | * Public or private harassment; 28 | * Publishing others’ private information, such as a physical or electronic address, without explicit permission; 29 | * Other conduct which could reasonably be considered inappropriate in a professional setting. 30 | 31 | Our Responsibilities 32 | -------------------- 33 | 34 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 35 | 36 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 37 | 38 | Scope 39 | ----- 40 | 41 | This Code of Conduct applies within all Giotto’s project spaces, to all content on , Giotto’s GitHub organization, or any other official Giotto web presence allowing for community interactions, and it also applies when an individual is representing the project or its community in public spaces. 42 | 43 | Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 44 | 45 | Enforcement 46 | ----------- 47 | 48 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at . 
All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. Sanctions may include written warnings, expulsions from the project, project-sponsored spaces, or project forums, or any other sanction which is deemed appropriate. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. If the act is ongoing (such as someone engaging in harassment) or involves a threat to anyone's safety (e.g. threats of violence), the project team may issue sanctions without notice. Further details of specific enforcement policies may be posted separately.
49 | 
50 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by the project leader.
51 | 
52 | Attribution
53 | -----------
54 | 
55 | This Code of Conduct is adapted from the Contributor Covenant, version 1.4, available at , and includes some aspects of the TensorFlow Code of Conduct, available at 
56 | 
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | 
2 | [![Deploy to gh-pages](https://github.com/giotto-ai/giotto-time/actions/workflows/deploy_github_pages.yml/badge.svg)](https://github.com/giotto-ai/giotto-time/actions/workflows/deploy_github_pages.yml)
3 | [![Upload Python Package](https://github.com/giotto-ai/giotto-time/actions/workflows/build_and_publish.yml/badge.svg)](https://github.com/giotto-ai/giotto-time/actions/workflows/build_and_publish.yml)
4 | [![CI](https://github.com/giotto-ai/giotto-time/actions/workflows/ci.yml/badge.svg)](https://github.com/giotto-ai/giotto-time/actions/workflows/ci.yml)
5 | [![PyPI version](https://badge.fury.io/py/giotto-time.svg)](https://badge.fury.io/py/giotto-time)
6 | [![Slack-join](https://img.shields.io/badge/Slack-Join-blue)](https://slack.giotto.ai/)
7 | 
8 | # giotto-time
9 | 
10 | giotto-time is a machine learning based time series forecasting toolbox in Python.
11 | It is part of the [Giotto](https://github.com/giotto-ai) collection of open-source projects and aims to provide
12 | feature extraction, analysis, causality testing and forecasting models based on
13 | the [scikit-learn](https://scikit-learn.org/stable/) API.
14 | 
15 | ## License
16 | 
17 | giotto-time is distributed under the AGPLv3 [license](https://github.com/giotto-ai/giotto-time/blob/master/LICENSE).
18 | If you need a different distribution license, please contact the L2F team at business@l2f.ch.
19 | 
20 | ## Documentation
21 | 
22 | - API reference (stable release): https://giotto-ai.github.io/giotto-time/
23 | 
24 | ## Getting started
25 | 
26 | Get started with giotto-time by following the installation steps below.
27 | Simple tutorials and real-world use cases can be found as notebooks in the `examples` folder.
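For instance, [examples/hierarchical_model.ipynb](examples/hierarchical_model.ipynb) walks through fitting and predicting with a hierarchical model.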
28 | 
29 | ## Installation
30 | 
31 | ### User installation
32 | 
33 | Run this command in your favourite Python environment
34 | ```
35 | pip install giotto-time
36 | ```
37 | 
38 | ### Developer installation
39 | 
40 | Get the latest state of the source code with the command
41 | 
42 | ```
43 | git clone https://github.com/giotto-ai/giotto-time.git
44 | cd giotto-time
45 | pip install -e ".[tests, doc]"
46 | ```
47 | 
48 | ## Example
49 | 
50 | ```python
51 | from gtime import *
52 | from gtime.feature_extraction import *
53 | import pandas as pd
54 | import numpy as np
55 | from sklearn.linear_model import LinearRegression
56 | 
57 | # Create random DataFrame with DatetimeIndex
58 | X_dt = pd.DataFrame(np.random.randint(4, size=(20)), 
59 | index=pd.date_range("2019-12-20", "2020-01-08"), 
60 | columns=['time_series'])
61 | 
62 | # Convert the DatetimeIndex to PeriodIndex and create y matrix
63 | X = preprocessing.TimeSeriesPreparation().transform(X_dt)
64 | y = model_selection.horizon_shift(X, horizon=2)
65 | 
66 | # Create some features
67 | cal = feature_generation.Calendar(region="europe", country="Switzerland", kernel=np.array([1, 2]))
68 | X_f = compose.FeatureCreation(
69 | [('s_2', Shift(2), ['time_series']), 
70 | ('ma_3', MovingAverage(window_size=3), ['time_series']),
71 | ('cal', cal, ['time_series'])]).fit_transform(X)
72 | 
73 | # Train/test split
74 | X_train, y_train, X_test, y_test = model_selection.FeatureSplitter().transform(X_f, y)
75 | 
76 | # Try sklearn's MultiOutputRegressor as a time-series forecasting model
77 | gar = forecasting.GAR(LinearRegression())
78 | gar.fit(X_train, y_train).predict(X_test)
79 | 
80 | ```
81 | 
82 | 
83 | ## Contributing
84 | 
85 | We welcome new contributors of all experience levels. The Giotto
86 | community's goals are to be helpful, welcoming, and effective. To learn more about
87 | making a contribution to giotto-time, please see the [CONTRIBUTING.rst](https://github.com/giotto-ai/giotto-time/blob/master/CONTRIBUTING.rst)
88 | file.
89 | 
90 | ## Links
91 | 
92 | - Official source code repo: https://github.com/giotto-ai/giotto-time
93 | - Download releases: https://pypi.org/project/giotto-time/
94 | - Issue tracker: https://github.com/giotto-ai/giotto-time/issues
95 | 
96 | ## Community
97 | 
98 | Giotto Slack workspace: https://slack.giotto.ai/
99 | 
100 | ## Contacts
101 | 
102 | maintainers@giotto.ai
103 | 
-------------------------------------------------------------------------------- /gtime/experimental/trend_models/function_trend.py: --------------------------------------------------------------------------------
1 | import pandas as pd
2 | from gtime.experimental.trend_models.base import TrendModel
3 | from scipy.optimize import minimize
4 | from sklearn.metrics import mean_squared_error
5 | 
6 | 
7 | class FunctionTrend(TrendModel):
8 | """A model for fitting, predicting and removing a custom functional trend
9 | from a time series. The transformed time series created will be trend
10 | stationary with respect to the specific function. To have more details,
11 | you can check this `link `_.
12 | 
13 | Parameters
14 | ----------
15 | loss : ``Callable``, optional, (default=``mean_squared_error``).
16 | The loss function to use when fitting the model. The loss function must
17 | accept y_true, y_pred and return a single real number.
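model_form : ``Callable``, required.
The functional form of the trend, called as ``model_form(t, model_weights)``
for each integer time step ``t`` of the series.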
18 | 19 | """ 20 | 21 | def __init__(self, model_form, loss=mean_squared_error): 22 | self.model_form = model_form 23 | self.loss = loss 24 | 25 | def fit( 26 | self, time_series: pd.DataFrame, x0: list, method: str = "BFGS" 27 | ) -> TrendModel: 28 | """Fit the model on the ``time_series``, with respect to the provided 29 | ``loss`` and using the provided ``method``. In order to see which 30 | methods are available, please check the 'scipy' `documentation 31 | `_. 32 | 33 | Parameters 34 | ---------- 35 | time_series : ``pd.DataFrame``, required. 36 | The time series on which to fit the model. 37 | 38 | x0 : ``list``. 39 | 40 | method : ``str``, optional, (default=``'BFGS``). 41 | The method to use in order to minimize the loss function. 42 | 43 | Returns 44 | ------- 45 | self : ``TrendModel`` 46 | The fitted object. 47 | 48 | """ 49 | 50 | def prediction_error(model_weights): 51 | predictions = [ 52 | self.model_form(t, model_weights) 53 | for t in range(0, time_series.shape[0]) 54 | ] 55 | return self.loss(time_series.values, predictions) 56 | 57 | res = minimize(prediction_error, x0, method=method, options={"disp": False}) 58 | 59 | self.model_weights_ = res["x"] 60 | 61 | self.t0_ = time_series.index[0] 62 | freq = time_series.index.freq 63 | if freq is not None: 64 | self.period_ = freq 65 | else: 66 | self.period_ = time_series.index[1] - time_series.index[0] 67 | 68 | return self 69 | 70 | def predict(self, t): 71 | """Using the fitted model, predict the values starting from ``X``. 72 | 73 | Parameters 74 | ---------- 75 | X : ``pd.DataFrame``, required. 76 | The time series on which to predict. 77 | 78 | Returns 79 | ------- 80 | predictions : ``pd.DataFrame`` 81 | The output predictions. 82 | 83 | Raises 84 | ------ 85 | ``NotFittedError`` 86 | Raised if the model is not fitted yet. 87 | 88 | """ 89 | # check fit run 90 | return self.model_form(t, self.model_weights_) 91 | 92 | def transform(self, time_series): 93 | """Transform the ``time_series`` by removing the trend. 94 | 95 | Parameters 96 | ---------- 97 | time_series : ``pd.DataFrame``, required. 98 | The time series to transform. 99 | 100 | Returns 101 | ------- 102 | transformed_time_series : ``pd.DataFrame`` 103 | The transformed time series, without the trend. 104 | 105 | """ 106 | # check fit run 107 | 108 | ts = (time_series.index - self.t0_) / self.period_ 109 | 110 | predictions = pd.Series( 111 | index=time_series.index, 112 | data=[self.model_form(t, self.model_weights_) for t in ts], 113 | ) 114 | 115 | return time_series.sub(predictions, axis=0) 116 | -------------------------------------------------------------------------------- /gtime/forecasting/trend.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from scipy.optimize import minimize 6 | from sklearn.base import BaseEstimator, RegressorMixin 7 | from sklearn.metrics import mean_squared_error 8 | from sklearn.utils.validation import check_is_fitted 9 | 10 | from gtime.utils.trends import TRENDS 11 | 12 | 13 | class TrendForecaster(BaseEstimator, RegressorMixin): 14 | """Trend forecasting model. 15 | 16 | This estimator optimizes a trend function on train data and will forecast using this trend function with optimized 17 | parameters. 18 | 19 | Parameters 20 | ---------- 21 | trend : ``"polynomial"`` | ``"exponential"``, required 22 | The kind of trend removal to apply. 
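Must be one of the keys of the ``TRENDS`` dictionary defined in
``gtime.utils.trends``; any other value raises a ``ValueError`` at fit time.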
23 | 
24 |     trend_x0 : np.array, required
25 |         Initialisation parameters passed to the trend function.
26 | 
27 |     loss : Callable, optional, default: ``mean_squared_error``
28 |         Loss function to minimize.
29 | 
30 |     method : str, optional, default: ``"BFGS"``
31 |         Loss function optimisation method.
32 | 
33 |     Examples
34 |     --------
35 |     >>> import pandas as pd
36 |     >>> import numpy as np
37 |     >>> from gtime.model_selection import horizon_shift, FeatureSplitter
38 |     >>> from gtime.forecasting import TrendForecaster
39 |     >>> 
40 |     >>> X = pd.DataFrame(np.random.random((10, 1)), index=pd.date_range("2020-01-01", "2020-01-10"))
41 |     >>> y = horizon_shift(X, horizon=2)
42 |     >>> X_train, y_train, X_test, y_test = FeatureSplitter().transform(X, y)
43 |     >>> 
44 |     >>> tf = TrendForecaster(trend='polynomial', trend_x0=np.zeros(2))
45 |     >>> tf.fit(X_train).predict(X_test)
46 |     array([[0.39703029],
47 |            [0.41734957]])
48 | 
49 |     """
50 | 
51 |     def __init__(
52 |         self,
53 |         trend: str,
54 |         trend_x0: np.array,
55 |         loss: Callable = mean_squared_error,
56 |         method: str = "BFGS",
57 |     ):
58 |         self.trend = trend
59 |         self.trend_x0 = trend_x0
60 |         self.loss = loss
61 |         self.method = method
62 | 
63 |     def fit(self, X: pd.DataFrame, y=None) -> "TrendForecaster":
64 |         """Fit the estimator.
65 | 
66 |         Parameters
67 |         ----------
68 |         X : pd.DataFrame, shape (n_samples, n_features), required
69 |             Input data.
70 | 
71 |         y : None
72 |             There is no need of a target, yet the pipeline API
73 |             requires this parameter.
74 | 
75 |         Returns
76 |         -------
77 |         self : object
78 |             Returns self.
79 | 
80 |         """
81 | 
82 |         if self.trend not in TRENDS:
83 |             raise ValueError(
84 |                 "The trend '%s' is not supported. Supported "
85 |                 "trends are %s." % (self.trend, list(sorted(TRENDS)))
86 |             )
87 |         self.best_trend_params_ = minimize(
88 |             lambda opt: self.loss(
89 |                 X.values, [TRENDS[self.trend](t, opt) for t in range(0, X.shape[0])]
90 |             ),
91 |             self.trend_x0,
92 |             method=self.method,
93 |             options={"disp": False},
94 |         )["x"]
95 | 
96 |         return self
97 | 
98 |     def predict(self, X: pd.DataFrame) -> pd.DataFrame:
99 |         """Using the fitted trend, predict the values starting from ``X``.
100 | 
101 |         Parameters
102 |         ----------
103 |         X: pd.DataFrame, shape (n_samples, 1), required
104 |             The time series on which to predict.
105 | 
106 |         Returns
107 |         -------
108 |         predictions : pd.DataFrame, shape (n_samples, 1)
109 |             The output predictions.
110 | 
111 |         Raises
112 |         ------
113 |         NotFittedError
114 |             Raised if the model is not fitted yet.
115 | 
116 |         """
117 |         check_is_fitted(self)
118 | 
119 |         predictions = TRENDS[self.trend](X.values, self.best_trend_params_)
120 |         return predictions
121 | 
--------------------------------------------------------------------------------
/gtime/hierarchical/naive.py:
--------------------------------------------------------------------------------
1 | from copy import deepcopy
2 | from typing import Dict
3 | 
4 | import pandas as pd
5 | from sklearn.base import BaseEstimator
6 | from sklearn.utils.validation import check_is_fitted
7 | 
8 | from gtime.hierarchical.base import HierarchicalBase
9 | 
10 | 
11 | class HierarchicalNaive(HierarchicalBase):
12 |     """ Simplest hierarchical model possible.
13 |     It does not perform any aggregation of the results.
14 |     Each time series is fitted and predicted independently.
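    Internally, ``fit`` deep-copies the given ``model`` once per key of the
    input dictionary, so the per-series models are fully independent of one
    another.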
15 | 16 | Parameters 17 | ---------- 18 | model: BaseEstimator, required 19 | time series forecasting model that is applied to each of the time series. A cross validation model 20 | can also be passed. 21 | Examples 22 | -------- 23 | >>> import pandas._testing as testing 24 | >>> from gtime.time_series_models import AR 25 | >>> from gtime.hierarchical import HierarchicalNaive 26 | >>> 27 | >>> testing.N, testing.K = 20, 1 28 | >>> data1 = testing.makeTimeDataFrame(freq="s") 29 | >>> data2 = testing.makeTimeDataFrame(freq="s") 30 | >>> data = {'data1': data1, 'data2': data2} 31 | >>> time_series_model = AR(p=2, horizon=3) 32 | >>> 33 | >>> hierarchical_model = HierarchicalNaive(model=time_series_model) 34 | >>> hierarchical_model.fit(data) 35 | >>> hierarchical_model.predict() 36 | {'data1': y_1 y_2 y_3 37 | 2000-01-01 00:00:17 0.475903 0.834633 0.649467 38 | 2000-01-01 00:00:18 0.644168 0.610287 0.383904 39 | 2000-01-01 00:00:19 0.180920 0.596606 0.696133, 'data2': y_1 y_2 y_3 40 | 2000-01-01 00:00:17 -0.117342 0.006594 -0.638133 41 | 2000-01-01 00:00:18 -0.394193 -0.607146 0.323875 42 | 2000-01-01 00:00:19 -0.381479 0.088210 -0.356775} 43 | """ 44 | 45 | def __init__(self, model: BaseEstimator): 46 | super().__init__(model=model, hierarchy_tree="infer") 47 | 48 | def fit(self, X: Dict[str, pd.DataFrame], y: pd.DataFrame = None): 49 | """ Fit method 50 | 51 | Parameters 52 | ---------- 53 | X : Dict[str, pd.DataFrame], required 54 | A dictionary of time series. Each is fitted independently 55 | y : pd.DataFrame, optional, default = ``None`` 56 | only for compatibility 57 | 58 | Returns 59 | ------- 60 | self 61 | """ 62 | self._check_is_dict_of_dataframes_with_str_key(X) 63 | self._infer_hierarchy_tree(X) 64 | self._initialize_models(X) 65 | for key, time_series in X.items(): 66 | self.models_[key].fit(time_series) 67 | return self 68 | 69 | def predict(self, X: Dict[str, pd.DataFrame] = None): 70 | """ Predict method 71 | 72 | Parameters 73 | ---------- 74 | X : Dict[str, pd.DataFrame], optional, default = ``None`` 75 | time series to predict. If ``None`` all the fitted time series are predicted. 76 | The keys in ``X`` have to match the ones used to fit. 
77 | 
78 |         Returns
79 |         -------
80 |         predictions : Dict[str, pd.DataFrame]
81 |         """
82 |         check_is_fitted(self)
83 |         if X is None:
84 |             return self._predict_fitted_time_series()
85 |         else:
86 |             return self._predict_new_time_series(X)
87 | 
88 |     def _initialize_models(self, X: Dict[str, pd.DataFrame]):
89 |         self.models_ = {key: deepcopy(self.model) for key in X}
90 | 
91 |     def _infer_hierarchy_tree(self, X: Dict[str, pd.DataFrame]):
92 |         self.hierarchy_tree_ = set(
93 |             X.keys()
94 |         )  # No need of a proper hierarchy tree for HierarchicalNaive
95 | 
96 |     def _predict_fitted_time_series(self) -> Dict[str, pd.DataFrame]:
97 |         return {key: model.predict() for key, model in self.models_.items()}
98 | 
99 |     def _predict_new_time_series(self, X: Dict[str, pd.DataFrame]) -> Dict[str, pd.DataFrame]:
100 |         return {
101 |             key: self.models_[key].predict(time_series)
102 |             for key, time_series in X.items()
103 |         }
104 | 
--------------------------------------------------------------------------------
/gtime/regressors/tests/test_explainable.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | 
3 | import pytest
4 | from hypothesis import given, settings
5 | from sklearn import clone
6 | from sklearn.base import BaseEstimator
7 | from sklearn.cluster import DBSCAN, KMeans, SpectralClustering
8 | from sklearn.decomposition import PCA
9 | from sklearn.exceptions import NotFittedError
10 | import numpy as np
11 | import pandas as pd
12 | 
13 | from gtime.explainability import _LimeExplainer, _ShapExplainer
14 | from gtime.forecasting.tests.test_gar import df_transformer
15 | from gtime.model_selection import FeatureSplitter
16 | from gtime.regressors import ExplainableRegressor
17 | from gtime.utils.hypothesis.feature_matrices import (
18 |     numpy_X_matrices,
19 |     numpy_X_y_matrices,
20 |     X_y_matrices,
21 | )
22 | from gtime.utils.hypothesis.general_strategies import regressors
23 | from gtime.utils.hypothesis.time_indexes import samples_from
24 | 
25 | 
26 | def bad_regressors():
27 |     return samples_from([DBSCAN(), SpectralClustering(), PCA(),])
28 | 
29 | 
30 | @given(bad_regressors())
31 | def test_bad_regressors(bad_regressor):
32 |     assert hasattr(bad_regressor, "fit")
33 |     assert not hasattr(bad_regressor, "predict")
34 | 
35 | 
36 | class TestExplainableRegressor:
37 |     @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
38 |     @given(estimator=regressors())
39 |     def test_constructor(self, estimator, explainer_type):
40 |         regressor = ExplainableRegressor(estimator, explainer_type)
41 |         if explainer_type == "lime":
42 |             assert isinstance(regressor.explainer, _LimeExplainer)
43 |         elif explainer_type == "shap":
44 |             assert isinstance(regressor.explainer, _ShapExplainer)
45 | 
46 |     @given(estimator=regressors())
47 |     def test_constructor_bad_explainer(self, estimator):
48 |         with pytest.raises(ValueError):
49 |             ExplainableRegressor(estimator, "bad")
50 | 
51 |     @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
52 |     @given(bad_estimator=bad_regressors())
53 |     def test_constructor_bad_regressor(self, bad_estimator, explainer_type):
54 |         with pytest.raises(TypeError):
55 |             ExplainableRegressor(bad_estimator, explainer_type)
56 | 
57 |     @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
58 |     @given(estimator=regressors(), X=numpy_X_matrices())
59 |     def test_error_predict_not_fitted(self, estimator, explainer_type, X):
60 |         regressor = ExplainableRegressor(estimator, explainer_type)
61 |         with pytest.raises(NotFittedError):
62 | 
regressor.predict(X)
63 | 
64 |     def _get_fit_attributes(self, estimator: BaseEstimator) -> List[str]:
65 |         return [
66 |             v for v in vars(estimator) if v.endswith("_") and not v.startswith("__")
67 |         ]
68 | 
69 |     @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
70 |     @given(
71 |         estimator=regressors(), X_y=numpy_X_y_matrices(min_value=-100, max_value=100)
72 |     )
73 |     def test_fit_values(self, estimator, explainer_type, X_y):
74 |         X, y = X_y
75 |         regressor = ExplainableRegressor(estimator, explainer_type)
76 |         regressor.fit(X, y)
77 | 
78 |         cloned_estimator = clone(estimator)
79 |         cloned_estimator.fit(X, y)
80 | 
81 |         estimator_fit_attributes = self._get_fit_attributes(regressor.estimator)
82 |         cloned_estimator_fit_attributes = self._get_fit_attributes(cloned_estimator)
83 | 
84 |         np.testing.assert_array_equal(
85 |             estimator_fit_attributes, cloned_estimator_fit_attributes
86 |         )
87 | 
88 |     @settings(deadline=pd.Timedelta(milliseconds=5000), max_examples=7)
89 |     @pytest.mark.parametrize("explainer_type", ["lime", "shap"])
90 |     @given(
91 |         estimator=regressors(), X_y=numpy_X_y_matrices(min_value=-100, max_value=100)
92 |     )
93 |     def test_predict_values(self, estimator, explainer_type, X_y):
94 |         X, y = X_y
95 |         X_test = X[:1, :]
96 |         regressor = ExplainableRegressor(estimator, explainer_type)
97 |         regressor_predictions = regressor.fit(X, y).predict(X_test)
98 | 
99 |         cloned_estimator = clone(estimator)
100 |         estimator_predictions = cloned_estimator.fit(X, y).predict(X_test)
101 | 
102 |         assert regressor_predictions.shape == estimator_predictions.shape
103 |         assert regressor_predictions.shape[0] == len(regressor.explanations_)
104 | 
--------------------------------------------------------------------------------
/gtime/regressors/explainable.py:
--------------------------------------------------------------------------------
1 | from typing import Union, List, Tuple
2 | 
3 | from sklearn.base import BaseEstimator, RegressorMixin
4 | import numpy as np
5 | from sklearn.utils.validation import check_is_fitted
6 | import pandas as pd
7 | 
8 | from gtime.explainability import _LimeExplainer, _ShapExplainer
9 | 
10 | 
11 | class ExplainableRegressor(BaseEstimator, RegressorMixin):
12 |     """ Wraps the most common scikit-learn regressors to offer an easy-to-use
13 |     interface to fit/predict models and, at the same time, to explain the predictions.
14 | 
15 |     Since it follows the fit/predict interface of scikit-learn models, it is compatible with
16 |     scikit-learn pipelines, etc.
17 | 
18 |     Two explainers are available: LIME and SHAP.
19 | 
20 |     You can get the explanations by accessing the `regressor.explainer_.explanations_` attribute
21 |     after calling the predict function.
22 | 
23 |     Parameters
24 |     ----------
25 |     estimator: RegressorMixin, required
26 |         the scikit-learn model
27 |     explainer_type: str, required
28 |         'lime' or 'shap'
29 | 
30 |     Examples
31 |     --------
32 |     >>> import numpy as np
33 |     >>> from gtime.regressors import ExplainableRegressor
34 |     >>> from sklearn.ensemble import RandomForestRegressor
35 |     >>> X = np.random.random((30, 5))
36 |     >>> y = np.random.random(30)
37 |     >>> X_train, y_train = X[:20], y[:20]
38 |     >>> X_test, y_test = X[20:], y[20:]
39 |     >>> 
40 |     >>> random_forest = RandomForestRegressor()
41 |     >>> explainable_regressor = ExplainableRegressor(random_forest, 'shap')
42 |     >>> 
43 |     >>> explainable_regressor.fit(X_train, y_train, feature_names=['a', 'b', 'c', 'd', 'e'])
44 |     >>> explainable_regressor.predict(X_test)
45 |     array([0.41323105, 0.40386639, 0.46462663, 0.3795568 , 0.57571486,
46 |            0.37079003, 0.54756082, 0.35160197, 0.30881165, 0.48201442])
47 |     >>> explainable_regressor.explainer_.explanations_[0]
48 |     {'a': -0.019896434698603117, 'b': 0.029814649814215954, 'c': 0.02447547087613202, 'd': 0.021313815648682066, 'e': -0.10778800140251406}
49 |     """
50 | 
51 |     def __init__(self, estimator: RegressorMixin, explainer_type: str):
52 |         self.estimator = self._check_estimator(estimator)
53 |         self.explainer_type = explainer_type
54 |         self.explainer = self._initialize_explainer()
55 | 
56 |     def _check_estimator(self, estimator: RegressorMixin) -> RegressorMixin:
57 |         if not hasattr(estimator, "fit") or not hasattr(estimator, "predict"):
58 |             raise TypeError(f"Estimator not compatible: {estimator}")
59 |         return estimator
60 | 
61 |     def _initialize_explainer(self) -> Union[_LimeExplainer, _ShapExplainer]:
62 |         if self.explainer_type == "lime":
63 |             return _LimeExplainer()
64 |         elif self.explainer_type == "shap":
65 |             return _ShapExplainer()
66 |         else:
67 |             raise ValueError(f"Explainer not available: {self.explainer_type}")
68 | 
69 |     def fit(
70 |         self, X: np.ndarray, y: np.ndarray, feature_names: List[str] = None,
71 |     ):
72 |         """ Fit function that calls the fit on the estimator and on the explainer.
73 | 
74 |         Parameters
75 |         ----------
76 |         X: np.ndarray, required
77 |             train matrix
78 |         y: np.ndarray, required
79 |             train true values
80 |         feature_names: List[str], optional, (default=`None`)
81 |             the names of the feature columns of ``X``
82 | 
83 |         Returns
84 |         -------
85 |         Fitted `ExplainableRegressor`
86 |         """
87 |         self.estimator_ = self.estimator.fit(X, y)
88 |         self.explainer_ = self.explainer.fit(
89 |             self.estimator_, X, feature_names=feature_names
90 |         )
91 |         return self
92 | 
93 |     def predict(self, X: np.ndarray):
94 |         """ Predict function that calls the predict function of the explainer.
95 | 
96 |         You can access the explanations of the predictions via the
97 |         `regressor.explainer_.explanations_` attribute.
98 | 
99 |         Parameters
100 |         ----------
101 |         X: np.ndarray, required
102 |             test matrix
103 | 
104 |         Returns
105 |         -------
106 |         predictions: np.ndarray
107 |         """
108 |         check_is_fitted(self)
109 |         predictions = self.explainer_.predict(X)
110 |         self.explanations_ = self.explainer_.explanations_
111 |         return predictions
112 | 
--------------------------------------------------------------------------------
/gtime/causality/pearson_correlation.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.base import TransformerMixin, BaseEstimator
3 | 
4 | from gtime.causality.base import CausalityMixin
5 | 
6 | 
7 | class ShiftedPearsonCorrelation(BaseEstimator, TransformerMixin, CausalityMixin):
8 |     """Class responsible for assessing the shifted Pearson correlations (PPMCC) between
9 |     two or more series. For more info about the test, click
10 |     `here `_.
11 | 
12 |     Parameters
13 |     ----------
14 |     min_shift : int, optional, default: ``1``
15 |         The minimum number of shifts to check for.
16 | 
17 |     max_shift : int, optional, default: ``10``
18 |         The maximum number of shifts to check for.
19 | 
20 |     target_col : str, optional, default: ``None``
21 |         The column to use as the reference (i.e., the column which is not
22 |         shifted).
23 | 
24 |     dropna : bool, optional, default: ``False``
25 |         Determines if the Nan values created by shifting are retained or dropped.
26 | 
27 |     bootstrap_iterations : int, optional, default: ``None``
28 |         If not None, compute the p_values of the test by performing bootstrapping of
29 |         the original data (sampling with replacement).
30 | 
31 |     permutation_iterations : int, optional, default: ``None``
32 |         If not None, compute the p_values of the test by performing permutations of
33 |         the original data.
34 | 
35 |     Examples
36 |     --------
37 |     >>> from gtime.causality.pearson_correlation import ShiftedPearsonCorrelation
38 |     >>> import pandas.util.testing as testing
39 |     >>> data = testing.makeTimeDataFrame(freq="s")
40 |     >>> spc = ShiftedPearsonCorrelation(target_col="A")
41 |     >>> spc.fit(data)
42 |     >>> spc.best_shifts_
43 |     y  A  B  C  D
44 |     x
45 |     A  8  9  6  5
46 |     B  7  4  4  6
47 |     C  3  4  9  9
48 |     D  7  1  9  1
49 |     >>> spc.max_corrs_
50 |     y         A         B         C         D
51 |     x
52 |     A  0.383800  0.260627  0.343628  0.360151
53 |     B  0.311608  0.307203  0.255969  0.298523
54 |     C  0.373613  0.267335  0.211913  0.140034
55 |     D  0.496535  0.204770  0.402473  0.310065
56 |     """
57 | 
58 |     def __init__(
59 |         self,
60 |         min_shift: int = 1,
61 |         max_shift: int = 10,
62 |         target_col: str = None,
63 |         dropna: bool = False,
64 |         bootstrap_iterations: int = None,
65 |         permutation_iterations: int = None,
66 |     ):
67 |         super().__init__(
68 |             bootstrap_iterations=bootstrap_iterations,
69 |             permutation_iterations=permutation_iterations,
70 |         )
71 |         self.min_shift = min_shift
72 |         self.max_shift = max_shift
73 |         self.target_col = target_col
74 |         self.dropna = dropna
75 | 
76 |     def fit(self, data: pd.DataFrame) -> "ShiftedPearsonCorrelation":
77 |         """Create the DataFrame of shifts of each time series which maximize the
78 |         Pearson correlation (PPMCC).
79 | 
80 |         Parameters
81 |         ----------
82 |         data : pd.DataFrame, shape (n_samples, n_time_series), required
83 |             The DataFrame containing the time series on which to compute the shifted
84 |             correlations.
85 | 86 | Returns 87 | ------- 88 | self : ``ShiftedPearsonCorrelation`` 89 | 90 | """ 91 | best_shifts = self._compute_best_shifts(data, self._get_max_corr_shift) 92 | 93 | pivot_tables = self._create_pivot_tables(best_shifts) 94 | 95 | self.best_shifts_ = pivot_tables["best_shifts"] 96 | self.max_corrs_ = pivot_tables["max_corrs"] 97 | 98 | if self.bootstrap_iterations: 99 | self.bootstrap_p_values_ = pivot_tables["bootstrap_p_values"] 100 | 101 | if self.permutation_iterations: 102 | self.permutation_p_values_ = pivot_tables["permutation_p_values"] 103 | 104 | return self 105 | 106 | def _get_max_corr_shift(self, data: pd.DataFrame, x, y): 107 | shifts = pd.DataFrame() 108 | 109 | for shift in range(self.min_shift, self.max_shift + 1): 110 | shifts[shift] = data[x].shift(shift) 111 | 112 | shifts = shifts.dropna() 113 | self.shifted_corrs = shifts.corrwith(data[y]) 114 | 115 | q = self.shifted_corrs.max(), self.shifted_corrs.idxmax() 116 | return q 117 | -------------------------------------------------------------------------------- /gtime/model_selection/splitters.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | class FeatureSplitter: 6 | """Splits the feature matrices X and y in X_train, y_train, X_test, y_test. 7 | 8 | X and y are the feature matrices obtained from the FeatureCreation class. 9 | 10 | Parameters 11 | ---------- 12 | drop_na_mode : str, optional, default: ``'any'`` 13 | How to drop the Nan contained in the ``X`` and ``y`` matrices. Only 'any' is 14 | supported for the moment. 15 | 16 | Examples 17 | -------- 18 | >>> import pandas as pd 19 | >>> import numpy as np 20 | >>> from gtime.model_selection import FeatureSplitter 21 | >>> X = pd.DataFrame.from_dict({"feature_0": [np.nan, 0, 1, 2, 3, 4, 5, 6, 7, 8], 22 | ... "feature_1": [np.nan, np.nan, 0.5, 1.5, 2.5, 3.5, 23 | ... 4.5, 5.5, 6.5, 7.5, ] 24 | ... }) 25 | >>> y = pd.DataFrame.from_dict({"y_0": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 26 | ... "y_1": [1, 2, 3, 4, 5, 6, 7, 8, 9, np.nan], 27 | ... "y_2": [2, 3, 4, 5, 6, 7, 8, 9, np.nan, np.nan] 28 | ... }) 29 | >>> feature_splitter = FeatureSplitter() 30 | >>> X_train, y_train, X_test, y_test = feature_splitter.transform(X, y) 31 | >>> X_train 32 | feature_0 feature_1 33 | 2 1.0 0.5 34 | 3 2.0 1.5 35 | 4 3.0 2.5 36 | 5 4.0 3.5 37 | 6 5.0 4.5 38 | 7 6.0 5.5 39 | >>> y_train 40 | y_0 y_1 y_2 41 | 2 2 3.0 4.0 42 | 3 3 4.0 5.0 43 | 4 4 5.0 6.0 44 | 5 5 6.0 7.0 45 | 6 6 7.0 8.0 46 | 7 7 8.0 9.0 47 | >>> X_test 48 | feature_0 feature_1 49 | 8 7.0 6.5 50 | 9 8.0 7.5 51 | >>> y_test 52 | y_0 y_1 y_2 53 | 8 8 9.0 NaN 54 | 9 9 NaN NaN 55 | 56 | """ 57 | 58 | def __init__(self, drop_na_mode: str = "any"): 59 | if drop_na_mode != "any": 60 | raise ValueError( 61 | f'Only drop_na_mode="any" is supported. Detected: {drop_na_mode}' 62 | ) 63 | self.drop_na_mode = drop_na_mode 64 | 65 | def transform( 66 | self, X: pd.DataFrame, y: pd.DataFrame 67 | ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame): 68 | """Split the feature matrices X and y in X_train, y_train, X_test, y_test. 69 | 70 | ``X`` and ``y`` are the feature matrices obtained from the FeatureCreation 71 | class. 72 | 73 | Parameters 74 | ---------- 75 | X : pd.DataFrame, shape (n_samples, n_features), required 76 | The feature matrix. 77 | 78 | y : pd.DataFrame, shape (n_samples, horizon), required 79 | The y matrix. 
80 | 81 | Returns 82 | ------- 83 | X_train, y_train, X_test, y_test : Tuple[pd.DataFrame, pd.DataFrame, 84 | pd.DataFrame, pd.DataFrame] 85 | The X and y, split between train and test. 86 | 87 | """ 88 | X, y = self._drop_X_na(X, y) 89 | X_train, y_train, X_test, y_test = self._split_train_test(X, y) 90 | return X_train, y_train, X_test, y_test 91 | 92 | def _drop_X_na( 93 | self, X: pd.DataFrame, y: pd.DataFrame 94 | ) -> (pd.DataFrame, pd.DataFrame): 95 | 96 | X = X.dropna(axis=0, how=self.drop_na_mode) 97 | y = y.loc[X.index] 98 | return X, y 99 | 100 | def _split_train_test( 101 | self, X: pd.DataFrame, y: pd.DataFrame 102 | ) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame): 103 | 104 | train_indexes, test_indexes = self._get_train_test_indexes_from_y(y) 105 | X_train, y_train = X.loc[train_indexes], y.loc[train_indexes] 106 | X_test, y_test = X.loc[test_indexes], y.loc[test_indexes] 107 | return X_train, y_train, X_test, y_test 108 | 109 | def _get_train_test_indexes_from_y(self, y): 110 | last_train_index = self._last_non_nan_y_index(y) 111 | train_indexes = y.loc[:last_train_index].index if last_train_index else [] 112 | test_indexes = y.index.difference(train_indexes) 113 | return train_indexes, test_indexes 114 | 115 | def _last_non_nan_y_index(self, y: pd.DataFrame) -> pd.Period: 116 | y_nan = y.isnull().any(axis=1).replace(True, np.nan) 117 | return y_nan.last_valid_index() 118 | -------------------------------------------------------------------------------- /gtime/regressors/multi_output.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List 2 | 3 | import numpy as np 4 | from sklearn.base import RegressorMixin 5 | from sklearn.multioutput import ( 6 | MultiOutputRegressor, 7 | _MultiOutputEstimator, 8 | _fit_estimator, 9 | ) 10 | from sklearn.utils import check_X_y, check_array 11 | from sklearn.utils.validation import check_is_fitted 12 | 13 | from gtime.explainability.explainer import Explainer, _LimeExplainer, _ShapExplainer 14 | 15 | 16 | class MultiFeatureMultiOutputRegressor(RegressorMixin, _MultiOutputEstimator): 17 | """ Multi target regression with option to choose the features for each target. 18 | 19 | This strategy consists of fitting one regressor per target. It is built over 20 | sklearn.multioutput.MultiOutputRegressor. Compared to this, it allows to choose 21 | different features for each regressor. 22 | 23 | Parameters 24 | ---------- 25 | estimator: RegressorMixin, required 26 | An estimator object implementing fit and predict. 
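    target_to_features_dict: Dict[int, List[int]], optional, default: ``None``
        A map from the index of each target column of ``y`` to the list of the
        feature columns of ``X`` to use for that target. If ``None``, every
        regressor is trained on all the features. It can also be provided
        later as a keyword argument to ``fit``.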
27 | 28 | Examples 29 | -------- 30 | >>> import numpy as np 31 | >>> from gtime.regressors import MultiFeatureMultiOutputRegressor 32 | >>> from sklearn.ensemble import RandomForestRegressor 33 | >>> X = np.random.random((30, 5)) 34 | >>> y = np.random.random((30, 3)) 35 | >>> X_train, y_train = X[:20], y[:20] 36 | >>> X_test, y_test = X[20:], y[20:] 37 | >>> 38 | >>> random_forest = RandomForestRegressor() 39 | >>> regressor = MultiFeatureMultiOutputRegressor(estimator=random_forest) 40 | >>> 41 | >>> target_to_features_dict = {0: [0,1,2], 1: [0,1,3], 2: [0,1,4]} 42 | >>> regressor.fit(X_train, y_train, target_to_features_dict=target_to_features_dict) 43 | >>> 44 | >>> predictions = regressor.predict(X_test) 45 | >>> predictions.shape 46 | (10, 3) 47 | 48 | """ 49 | 50 | def __init__( 51 | self, 52 | estimator: RegressorMixin, 53 | target_to_features_dict: Dict[int, List[int]] = None, 54 | ): 55 | super().__init__(estimator=estimator, n_jobs=1) 56 | self.target_to_features_dict = target_to_features_dict 57 | 58 | def fit(self, X: np.ndarray, y: np.ndarray, **kwargs): 59 | """Fit the model. 60 | 61 | Train the models, one for each target variable in y. 62 | 63 | Parameters 64 | ---------- 65 | X : np.ndarray, shape (n_samples, n_features), required. 66 | The data. 67 | y : np.ndarray, shape (n_samples, horizon), required. 68 | The matrix containing the target variables. 69 | 70 | Returns 71 | ------- 72 | self : object 73 | 74 | 75 | """ 76 | target_to_features_dict = kwargs.get( 77 | "target_to_features_dict", self.target_to_features_dict 78 | ) 79 | if target_to_features_dict is None: 80 | super().fit(X, y) 81 | self.target_to_features_dict_ = None 82 | return self 83 | 84 | X, y = check_X_y(X, y, multi_output=True, accept_sparse=True) 85 | 86 | if y.ndim == 1: 87 | raise ValueError("y must have at least two dimensions") 88 | 89 | self.estimators_ = [ 90 | _fit_estimator(self.estimator, X[:, target_to_features_dict[i]], y[:, i]) 91 | for i in range(y.shape[1]) 92 | ] 93 | self.target_to_features_dict_ = target_to_features_dict 94 | self.expected_X_shape_ = X.shape[1] 95 | return self 96 | 97 | def predict(self, X: np.ndarray) -> np.ndarray: 98 | """For each row in ``X``, make a prediction for each fitted model 99 | 100 | Parameters 101 | ---------- 102 | X : np.ndarray, shape (n_samples, n_features), required 103 | The data. 104 | 105 | Returns 106 | ------- 107 | predictions : np.ndarray, shape (n_samples, horizon) 108 | The predictions 109 | 110 | """ 111 | check_is_fitted(self) 112 | if self.target_to_features_dict_ is None: 113 | return super().predict(X) 114 | 115 | X = check_array(X, accept_sparse=True) 116 | if X.shape[1] != self.expected_X_shape_: 117 | raise ValueError( 118 | f"Expected X shape is {self.expected_X_shape_}. 
Detected {X.shape[1]}" 119 | ) 120 | y = [ 121 | estimator.predict(X[:, self.target_to_features_dict_[i]]) 122 | for i, estimator in enumerate(self.estimators_) 123 | ] 124 | 125 | return np.asarray(y).T 126 | -------------------------------------------------------------------------------- /gtime/hierarchical/tests/test_naive.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import pytest 4 | import sklearn 5 | from hypothesis import given 6 | import hypothesis.strategies as st 7 | from hypothesis.extra.numpy import arrays 8 | from pytest import fixture 9 | 10 | from gtime.hierarchical import HierarchicalNaive, HierarchicalBase 11 | from gtime.utils.fixtures import ( 12 | time_series_forecasting_model1_no_cache, 13 | features1, 14 | model1, 15 | ) 16 | from gtime.utils.hypothesis.time_indexes import giotto_time_series, period_indexes 17 | 18 | 19 | @fixture(scope="function") 20 | def hierarchical_naive_model(time_series_forecasting_model1_no_cache): 21 | return HierarchicalNaive(time_series_forecasting_model1_no_cache) 22 | 23 | 24 | @st.composite 25 | def n_time_series_with_same_index( 26 | draw, min_length: int = 5, min_n: int = 1, max_n: int = 5, 27 | ): 28 | n = draw(st.integers(min_value=min_n, max_value=max_n)) 29 | index = draw(period_indexes(min_length=min_length)) 30 | dictionary = {} 31 | for i in range(n): 32 | key = str(i) 33 | df_values = draw( 34 | arrays( 35 | dtype=np.float64, 36 | shape=index.shape[0], 37 | elements=st.floats(allow_nan=False, allow_infinity=False, width=32), 38 | ) 39 | ) 40 | value = pd.DataFrame(index=index, data=df_values) 41 | dictionary[key] = value 42 | return dictionary 43 | 44 | 45 | class TestHierarchicalBase: 46 | def test_class_abstract(self, model1): 47 | HierarchicalBase(model1, {}) 48 | 49 | 50 | class TestHierarchicalNaive: 51 | def test_constructor(self, time_series_forecasting_model1_no_cache): 52 | HierarchicalNaive(model=time_series_forecasting_model1_no_cache) 53 | 54 | def test_constructor_no_hierarchy_tree( 55 | self, time_series_forecasting_model1_no_cache 56 | ): 57 | hierarchy_tree = {} 58 | with pytest.raises(TypeError): 59 | HierarchicalNaive( 60 | model=time_series_forecasting_model1_no_cache, 61 | hierarchy_tree=hierarchy_tree, 62 | ) 63 | 64 | @given(time_series=giotto_time_series(min_length=5)) 65 | def test_error_fit_dataframe(self, time_series, hierarchical_naive_model): 66 | with pytest.raises(ValueError): 67 | hierarchical_naive_model.fit(time_series) 68 | 69 | @given(time_series=giotto_time_series(min_length=5)) 70 | def test_error_fit_key_not_string(self, time_series, hierarchical_naive_model): 71 | with pytest.raises(ValueError): 72 | hierarchical_naive_model.fit({1: time_series}) 73 | 74 | def test_error_fit_value_not_dataframe(self, hierarchical_naive_model): 75 | with pytest.raises(ValueError): 76 | hierarchical_naive_model.fit({"wrong_field": 12}) 77 | 78 | @given(dataframes=n_time_series_with_same_index()) 79 | def test_fit_n_dataframes(self, dataframes, hierarchical_naive_model): 80 | hierarchical_naive_model.fit(dataframes) 81 | 82 | @given(dataframes=n_time_series_with_same_index()) 83 | def test_fit_predict_n_dataframes_on_different_data( 84 | self, dataframes, hierarchical_naive_model 85 | ): 86 | hierarchical_naive_model.fit(dataframes).predict(dataframes) 87 | 88 | @given(dataframes=n_time_series_with_same_index()) 89 | def test_fit_predict_n_dataframes(self, dataframes, hierarchical_naive_model): 90 | 
hierarchical_naive_model.fit(dataframes).predict()
91 | 
92 |     @given(dataframes=n_time_series_with_same_index())
93 |     def test_fit_predict_on_subset_of_time_series(
94 |         self, dataframes, hierarchical_naive_model
95 |     ):
96 |         key = np.random.choice(list(dataframes.keys()), 1)[0]
97 |         hierarchical_naive_model.fit(dataframes)
98 |         hierarchical_naive_model.predict({key: dataframes[key]})
99 | 
100 |     def test_error_predict_not_fitted(self, hierarchical_naive_model):
101 |         with pytest.raises(sklearn.exceptions.NotFittedError):
102 |             hierarchical_naive_model.predict()
103 | 
104 |     @given(dataframes=n_time_series_with_same_index())
105 |     def test_error_with_bad_predict_key(self, dataframes, hierarchical_naive_model):
106 |         correct_key = np.random.choice(list(dataframes.keys()), 1)[0]
107 |         bad_key = "".join(dataframes.keys()) + "bad_key"
108 |         hierarchical_naive_model.fit(dataframes)
109 |         with pytest.raises(KeyError):
110 |             hierarchical_naive_model.predict({bad_key: dataframes[correct_key]})
111 | 
--------------------------------------------------------------------------------
/gtime/causality/linear_coefficient.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.base import BaseEstimator, TransformerMixin
4 | from sklearn.linear_model import LinearRegression
5 | 
6 | from gtime.causality.base import CausalityMixin
7 | 
8 | 
9 | class ShiftedLinearCoefficient(BaseEstimator, TransformerMixin, CausalityMixin):
10 |     """Test the shifted linear fit coefficients between two or more time series.
11 | 
12 |     Parameters
13 |     ----------
14 |     min_shift : int, optional, default: ``1``
15 |         The minimum number of shifts to check for.
16 | 
17 |     max_shift : int, optional, default: ``10``
18 |         The maximum number of shifts to check for.
19 | 
20 |     target_col : str, optional, default: ``None``
21 |         The column to use as the reference (i.e., the column which is not
22 |         shifted).
23 | 
24 |     dropna : bool, optional, default: ``False``
25 |         Determines if the Nan values created by shifting are retained or dropped.
26 | 
27 |     bootstrap_iterations : int, optional, default: ``None``
28 |         If not None, compute the p_values of the test by performing bootstrapping of
29 |         the original data (sampling with replacement).
30 | 
31 |     permutation_iterations : int, optional, default: ``None``
32 |         If not None, compute the p_values of the test by performing permutations of
33 |         the original data.
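    Notes
    -----
    For every pair of columns ``(x, y)``, the target column ``y`` is regressed
    on all the shifted copies of ``x`` at once, and the shift whose linear
    coefficient is largest is reported. Conceptually, the procedure is close to
    the following sketch (illustrative only; ``data`` stands for any DataFrame
    with columns ``"x"`` and ``"y"``, and the shift range uses the defaults):

    >>> import pandas as pd
    >>> from sklearn.linear_model import LinearRegression
    >>> shifts = pd.concat({s: data["x"].shift(s) for s in range(1, 11)}, axis=1).dropna()
    >>> lr = LinearRegression().fit(shifts.values, data["y"].loc[shifts.index].values)
    >>> best_shift = lr.coef_.argmax() + 1  # + min_shift, since shifts start there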
34 | 
35 |     Examples
36 |     --------
37 | 
38 |     >>> from gtime.causality.linear_coefficient import ShiftedLinearCoefficient
39 |     >>> import pandas.util.testing as testing
40 |     >>> data = testing.makeTimeDataFrame(freq="s")
41 |     >>> slc = ShiftedLinearCoefficient(target_col="A")
42 |     >>> slc.fit(data)
43 |     >>> slc.best_shifts_
44 |     y  A  B  C  D
45 |     x
46 |     A  3  6  8  5
47 |     B  9  9  4  1
48 |     C  8  2  4  9
49 |     D  3  9  4  3
50 |     >>> slc.max_corrs_
51 |     y         A         B         C         D
52 |     x
53 |     A  0.460236  0.420005  0.339370  0.267143
54 |     B  0.177856  0.300350  0.367150  0.550490
55 |     C  0.484860  0.263036  0.456046  0.251342
56 |     D  0.580068  0.344688  0.253626  0.256220
57 |     """
58 | 
59 |     def __init__(
60 |         self,
61 |         min_shift: int = 1,
62 |         max_shift: int = 10,
63 |         target_col: str = None,
64 |         dropna: bool = False,
65 |         bootstrap_iterations: int = None,
66 |         permutation_iterations: int = None,
67 |     ):
68 |         super().__init__(
69 |             bootstrap_iterations=bootstrap_iterations,
70 |             permutation_iterations=permutation_iterations,
71 |         )
72 |         self.min_shift = min_shift
73 |         self.max_shift = max_shift
74 |         self.target_col = target_col
75 |         self.dropna = dropna
76 | 
77 |     def fit(self, data: pd.DataFrame) -> "ShiftedLinearCoefficient":
78 |         """Create the DataFrame of shifts of each time series which maximize the shifted
79 |         linear fit coefficients.
80 | 
81 |         Parameters
82 |         ----------
83 |         data : pd.DataFrame, shape (n_samples, n_time_series), required
84 |             The DataFrame containing the time-series on which to compute the shifted
85 |             linear fit coefficients.
86 | 
87 |         Returns
88 |         -------
89 |         self : ``ShiftedLinearCoefficient``
90 | 
91 |         """
92 |         best_shifts = self._compute_best_shifts(data, self._get_max_coeff_shift)
93 |         pivot_tables = self._create_pivot_tables(best_shifts)
94 | 
95 |         self.best_shifts_ = pivot_tables["best_shifts"]
96 |         self.max_corrs_ = pivot_tables["max_corrs"]
97 | 
98 |         if self.bootstrap_iterations:
99 |             self.bootstrap_p_values_ = pivot_tables["bootstrap_p_values"]
100 | 
101 |         if self.permutation_iterations:
102 |             self.permutation_p_values_ = pivot_tables["permutation_p_values"]
103 | 
104 |         return self
105 | 
106 |     def _get_max_coeff_shift(self, data: pd.DataFrame, x, y):
107 |         shifts = pd.DataFrame()
108 |         shifts[x] = data[x]
109 |         shifts[y] = data[y]
110 |         for shift in range(self.min_shift, self.max_shift + 1):
111 |             shifts[shift] = data[x].shift(shift)
112 | 
113 |         shifts = shifts.dropna()
114 | 
115 |         lf = LinearRegression().fit(
116 |             shifts[range(self.min_shift, self.max_shift + 1)].values, shifts[y].values
117 |         )
118 | 
119 |         return lf.coef_.max(), np.argmax(lf.coef_) + self.min_shift
--------------------------------------------------------------------------------
/gtime/feature_extraction/trend.py:
--------------------------------------------------------------------------------
1 | from typing import Callable
2 | 
3 | import numpy as np
4 | import pandas as pd
5 | from scipy.optimize import minimize
6 | from sklearn.base import BaseEstimator, TransformerMixin
7 | from sklearn.metrics import mean_squared_error
8 | from sklearn.utils.validation import check_is_fitted
9 | 
10 | from gtime.base import FeatureMixin, add_class_name
11 | from gtime.utils.trends import TRENDS
12 | 
13 | __all__ = ["Detrender"]
14 | 
15 | 
16 | class Detrender(BaseEstimator, TransformerMixin, FeatureMixin):
17 |     """Apply a de-trend transformation to a time series.
18 | 19 | The purpose of the class is to fit a model, define through the `trend` parameter, in 20 | order to find a trend in the time series. Then, the trend can be removed by removing 21 | the predictions of the fitted model. 22 | 23 | Parameters 24 | ---------- 25 | trend : ``'polynomial'`` | ``'exponential'``, required 26 | The kind of trend removal to apply. 27 | 28 | trend_x0 : np.array, required 29 | Initialisation parameters passed to the trend function. This is used to select 30 | a starting point in order to minimize the `loss` function. 31 | 32 | loss : Callable, optional, default: ``mean_squared_error`` 33 | The loss function to minimize. 34 | 35 | method : string, optional, default: ``"BFGS"`` 36 | Loss function optimisation method. 37 | 38 | Examples 39 | -------- 40 | >>> import pandas as pd 41 | >>> import numpy as np 42 | >>> from gtime.feature_extraction import Detrender 43 | >>> detrender = Detrender(trend='polynomial', trend_x0=np.zeros(2)) 44 | >>> time_index = pd.date_range("2020-01-01", "2020-01-10") 45 | >>> X = pd.DataFrame(range(0, 10), index=time_index) 46 | >>> detrender.fit_transform(X) 47 | 0__Detrender 48 | 2020-01-01 9.180937e-07 49 | 2020-01-02 8.020709e-07 50 | 2020-01-03 6.860481e-07 51 | 2020-01-04 5.700253e-07 52 | 2020-01-05 4.540024e-07 53 | 2020-01-06 3.379796e-07 54 | 2020-01-07 2.219568e-07 55 | 2020-01-08 1.059340e-07 56 | 2020-01-09 -1.008878e-08 57 | 2020-01-10 -1.261116e-07 58 | 59 | """ 60 | 61 | def __init__( 62 | self, 63 | trend: str, 64 | trend_x0: np.array, 65 | loss: Callable = mean_squared_error, 66 | method: str = "BFGS", 67 | ): 68 | self.trend = trend 69 | self.trend_x0 = trend_x0 70 | self.loss = loss 71 | self.method = method 72 | 73 | def fit(self, X: pd.DataFrame, y=None) -> "Detrender": 74 | """Fit the estimator. 75 | 76 | Parameters 77 | ---------- 78 | X : pd.DataFrame, shape (n_samples, n_features) 79 | Input data. 80 | 81 | y : None 82 | There is no need of a target in a transformer, yet the pipeline API 83 | requires this parameter. 84 | 85 | Returns 86 | ------- 87 | self : object 88 | Returns self. 89 | 90 | """ 91 | 92 | # TODO: create validation function 93 | if self.trend not in TRENDS: 94 | raise ValueError( 95 | "The trend '%s' is not supported. Supported " 96 | "trends are %s." % (self.trend, list(sorted(TRENDS))) 97 | ) 98 | 99 | self.best_trend_params_ = minimize( 100 | lambda opt: self.loss( 101 | X.values, [TRENDS[self.trend](t, opt) for t in range(0, X.shape[0])] 102 | ), 103 | self.trend_x0, 104 | method=self.method, 105 | options={"disp": False}, 106 | )["x"] 107 | 108 | self.t0_ = X.index[0] 109 | freq = X.index.freq 110 | if freq is not None: 111 | self.period_ = freq 112 | else: 113 | self.period_ = X.index[1] - X.index[0] 114 | 115 | return self 116 | 117 | @add_class_name 118 | def transform(self, time_series: pd.DataFrame) -> pd.DataFrame: 119 | """Transform the ``time_series`` by removing the trend. 120 | 121 | Parameters 122 | ---------- 123 | time_series: pd.DataFrame, shape (n_samples, 1), required 124 | The time series to transform. 125 | 126 | Returns 127 | ------- 128 | time_series_t : pd.DataFrame, shape (n_samples, n_features) 129 | The transformed time series, without the trend. 
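        Notes
        -----
        The trend is evaluated at fractional time steps computed as
        ``(time_series.index - self.t0_) / self.period_``, so the index of the
        transformed series does not need to be the one seen during ``fit``, as
        long as it is compatible with the fitted frequency.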
130 | 131 | """ 132 | check_is_fitted(self) 133 | 134 | time_steps = (time_series.index - self.t0_) / self.period_ 135 | 136 | predictions = pd.Series( 137 | index=time_series.index, 138 | data=np.array( 139 | [TRENDS[self.trend](t, self.best_trend_params_) for t in time_steps] 140 | ).flatten(), 141 | ) 142 | 143 | return time_series.sub(predictions, axis=0) 144 | -------------------------------------------------------------------------------- /gtime/plotting/tests/test_preprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import re 4 | import pytest 5 | import hypothesis.strategies as st 6 | from hypothesis import given, settings, example 7 | from gtime.utils.hypothesis.time_indexes import giotto_time_series, period_indexes 8 | from gtime.plotting.preprocessing import ( 9 | seasonal_split, 10 | acf, 11 | pacf, 12 | _get_cycle_names, 13 | _get_season_names, 14 | _autocorrelation, 15 | _normalize, 16 | _solve_yw_equation, 17 | _week_of_year, 18 | yule_walker, 19 | ) 20 | 21 | 22 | class TestSplits: 23 | @given(t=period_indexes(min_length=1, max_length=1)) 24 | @example(t=pd.PeriodIndex(["1974-12-31"], freq="W")) 25 | @example(t=pd.PeriodIndex(["1972-01-01"], freq="W")) 26 | @settings(deadline=None) 27 | def test_week_of_year(self, t): 28 | period = t[0] 29 | week = _week_of_year(period) 30 | assert re.match(r"\d{4}_\d\d?$", week) 31 | 32 | @given( 33 | df=giotto_time_series(min_length=3, max_length=500), 34 | cycle=st.one_of( 35 | st.sampled_from(["year", "quarter", "month", "week"]), 36 | st.from_regex(r"[1-9][DWMQY]", fullmatch=True), 37 | ), 38 | ) 39 | @settings(deadline=None) 40 | def test__get_cycle_names_size(self, df, cycle): 41 | cycle = _get_cycle_names(df, cycle) 42 | assert len(cycle) == len(df) 43 | 44 | @given( 45 | df=giotto_time_series(min_length=3, max_length=500), 46 | cycle=st.one_of( 47 | st.sampled_from(["year", "quarter", "month", "week"]), 48 | st.from_regex(r"[1-9][DWMQY]", fullmatch=True), 49 | ), 50 | freq=st.from_regex(r"[1-9]?[DWMQ]", fullmatch=True), 51 | ) 52 | @settings(deadline=None) 53 | def test__get_season_names_size(self, df, cycle, freq): 54 | seasons = _get_season_names(df, cycle, freq) 55 | assert len(seasons) == len(df) 56 | 57 | @given( 58 | df=giotto_time_series(min_length=3, max_length=500), 59 | cycle=st.one_of( 60 | st.sampled_from(["year", "quarter", "month", "week"]), 61 | st.from_regex(r"[1-9][DWMQY]", fullmatch=True), 62 | ), 63 | freq=st.one_of(st.from_regex(r"[1-9]?[DWMQ]", fullmatch=True), st.none()), 64 | agg=st.sampled_from(["mean", "sum", "last"]), 65 | ) 66 | @settings(deadline=None) 67 | def test_seasonal_split_shape_named(self, df, cycle, freq, agg): 68 | split = seasonal_split(df, cycle=cycle, freq=freq, agg=agg) 69 | if freq is None: 70 | freq = df.index.freqstr 71 | assert split.stack().shape == df.resample(freq).agg(agg).dropna().shape 72 | 73 | 74 | class TestAcf: 75 | @given(x=st.lists(st.floats(allow_nan=False), min_size=1)) 76 | def test_autocorrelation(self, x): 77 | autocorr = _autocorrelation(np.array(x)) 78 | expected = np.correlate(x, x, mode="full")[-len(x) :] / len(x) 79 | np.testing.assert_array_equal(autocorr, expected) 80 | 81 | @given( 82 | x=st.lists( 83 | st.floats( 84 | allow_nan=False, allow_infinity=False, max_value=1e20, min_value=1e20 85 | ), 86 | min_size=1, 87 | ) 88 | ) 89 | def test_scale(self, x): 90 | scaled_x = _normalize(np.array(x)) 91 | assert scaled_x.mean() == pytest.approx(0.0) 92 | assert 
scaled_x.std() == pytest.approx(1.0) or scaled_x.std() == pytest.approx(
93 |             0.0
94 |         )
95 | 
96 |     @given(x=st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=2))
97 |     def test_solve_yw(self, x):
98 |         rho = _solve_yw_equation(np.array(x))
99 |         if not np.isnan(np.sum(rho)):
100 |             assert len(rho) == len(x) - 1
101 | 
102 |     @given(
103 |         x=st.lists(st.floats(allow_nan=False, allow_infinity=False), min_size=2),
104 |         order=st.integers(min_value=1),
105 |     )
106 |     def test_yule_walker_abs(self, x, order):
107 |         pacf = yule_walker(np.array(x), order)
108 |         if not (np.isnan(np.sum(pacf)) or len(pacf) == 0):
109 |             assert all(abs(pacf) <= 2)
110 | 
111 |     @given(
112 |         df=giotto_time_series(min_length=1, allow_nan=False, allow_infinity=False),
113 |         max_lag=st.one_of(st.integers(min_value=1, max_value=100), st.none()),
114 |     )
115 |     def test_acf_len(self, df, max_lag):
116 |         df_array = np.ravel(df.values)
117 |         res = acf(df_array, max_lag)
118 |         if max_lag is None:
119 |             max_lag = len(df)
120 |         assert len(res) == min(max_lag, len(df))
121 | 
122 |     @given(
123 |         df=giotto_time_series(
124 |             min_length=1, allow_nan=False, allow_infinity=False, max_length=50
125 |         ),
126 |         max_lag=st.one_of(st.integers(min_value=1, max_value=100), st.none()),
127 |     )
128 |     def test_pacf_len(self, df, max_lag):
129 |         df_array = np.ravel(df.values)
130 |         res = pacf(df_array, max_lag)
131 |         if max_lag is None:
132 |             max_lag = len(df)
133 |         assert len(res) == min(max_lag, len(df))
134 | 
--------------------------------------------------------------------------------
/gtime/forecasting/online.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from sklearn.base import BaseEstimator
4 | from sklearn.utils.validation import check_array, check_X_y, check_random_state
5 | 
6 | 
7 | def l1(a, b):
8 |     return np.abs(np.subtract(a, b))
9 | 
10 | 
11 | class HedgeForecaster(BaseEstimator):
12 |     """Regressor model using the Hedge algorithm.
13 | 
14 |     This algorithm is based on a multiplicative weight update method to create a dynamic combination of regressive
15 |     models. In theory there is no common training phase on the data; only the loss is needed to update the model.
16 | 
17 |     Parameters
18 |     ----------
19 | 
20 |     learning_rate : float, (default=0.001)
21 |         The factor to use for the weight update.
22 | 
23 |     loss : callable, optional (default=`gtime.forecasting.online.l1`)
24 |         Loss function used to compute the loss matrix.
25 | 
26 |     random_state : int, RandomState instance or None, optional (default=None)
27 |         Controls the randomness of the algorithm's decisions: at each time
28 |         step, the expert to follow is drawn at random with probability
29 |         proportional to its current weight, and fixing the seed makes the
30 |         sequence of decisions reproducible.
31 |         Pass an int for reproducible output across multiple function calls.
32 |         See the scikit-learn glossary entry for ``random_state``.
33 | 
34 |     Attributes
35 |     ----------
36 |     loss_matrix_ : array, (n_samples, n_experts)
37 |         Loss matrix between X and y.
38 | 
39 |     total_loss_ : int or float
40 |         Sum of losses based on the Hedge algorithm's decisions.
41 | 
42 |     weights_ : array, (n_experts)
43 |         Last weight of each expert.
44 | 
45 |     decisions_ : array, (n_samples)
46 |         Indices of the expert chosen at each time step.
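    Notes
    -----
    At each time step ``t`` an expert ``i`` is drawn at random with
    probability proportional to its current weight, and every weight is then
    updated multiplicatively:

    .. math:: w_i \leftarrow w_i \, e^{-\varepsilon \, \ell_i(t)}

    where :math:`\varepsilon` is the learning rate and :math:`\ell_i(t)` is
    the loss of expert ``i`` at time ``t``. This is the update
    ``weights *= np.exp(-eps * loss[t])`` performed in ``hedge`` below.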
47 | 
48 |     Examples
49 |     --------
50 |     >>> import pandas as pd
51 |     >>> import numpy as np
52 |     >>> from gtime.forecasting.online import HedgeForecaster
53 |     >>> time_index = pd.date_range("2020-01-01", "2020-01-20")
54 |     >>> X = pd.DataFrame(np.random.randint(4, size=(20, 3)), index=time_index)
55 |     >>> y = pd.DataFrame(np.random.randint(4, size=(20, 1)), index=time_index, columns=["y_1"])
56 |     >>> hr = HedgeForecaster(random_state=42)
57 |     >>> hr.fit_predict(X, y).head()
58 |                 0
59 |     2020-01-01  2
60 |     2020-01-02  0
61 |     2020-01-03  3
62 |     2020-01-04  3
63 |     2020-01-05  2
64 |     >>> print(f"Estimator weights: {hr.weights_}")
65 |     Estimator weights: [0.97713925 0.97723619 0.97980439]
66 |     >>> print(f"Decisions: {hr.decisions_}")
67 |     Decisions: [1 2 2 1 0 0 0 2 1 2 0 2 2 0 0 0 0 1 1 0]
68 |     >>> print(f"Total loss: {hr.total_loss_}")
69 |     Total loss: 30
70 | 
71 |     """
72 | 
73 |     def __init__(
74 |         self, learning_rate: float = 0.001, loss: callable = l1, random_state=None
75 |     ):
76 |         self.learning_rate = learning_rate
77 |         self.loss = loss
78 |         self.random_state = random_state
79 | 
80 |     def hedge(self, timestamps, n_experts, loss, eps, random_state):
81 |         """Run the multiplicative-weights update and return the total loss and final weights."""
82 |         weights = np.ones(n_experts)
83 |         self.decisions_ = np.zeros(timestamps, dtype=int)
84 | 
85 |         total_loss = 0
86 |         for t in range(timestamps):
87 |             self.decisions_[t] = random_state.choice(
88 |                 n_experts, p=weights / np.sum(weights)
89 |             )
90 |             total_loss += loss[t][int(self.decisions_[t])]
91 |             weights *= np.exp(-eps * loss[t])
92 |         return total_loss, weights
93 | 
94 |     def fit(self, X, y):
95 |         """ Fit the model to the data: compute the loss matrix, the weights and the decisions iteratively.
96 | 
97 |         Parameters
98 |         ----------
99 |         X : array-like, shape (n_samples, n_experts)
100 |             Data: the forecasts of the individual experts, one column per expert.
101 | 
102 |         y : array-like, shape (n_samples, n_outputs)
103 |             True values, used to compute the loss matrix.
104 | 
105 |         Returns
106 |         -------
107 |         self : object
108 |         """
109 | 
110 |         random_state = check_random_state(self.random_state)
111 | 
112 |         #  FIXME: multi_output is not currently supported but mono-column dataframe is 2D (n, 1) so multi_output=True
113 |         # makes it easier to handle
114 |         X, y = check_X_y(X, y, multi_output=True, y_numeric=True)
115 | 
116 |         self.loss_matrix_ = self.loss(X, y)
117 | 
118 |         timestamps = len(X)
119 |         n_experts = X.shape[1]
120 | 
121 |         self.total_loss_, self.weights_ = self.hedge(
122 |             timestamps=timestamps,
123 |             n_experts=n_experts,
124 |             loss=self.loss_matrix_,
125 |             eps=self.learning_rate,
126 |             random_state=random_state,
127 |         )
128 | 
129 |         return self
130 | 
131 |     def fit_predict(self, X, y):
132 |         """Fit the model and predict the target using the Hedge algorithm.
133 | 
134 |         Parameters
135 |         ----------
136 |         X : (sparse) array-like, shape (n_samples, n_experts)
137 |             Data: the forecasts of the individual experts.
138 | 
139 |         y : (sparse) array-like, shape (n_samples, n_outputs)
140 |             True values.
141 | 
142 |         Returns
143 |         -------
144 |         predictions : pd.DataFrame
145 |             The prediction of the chosen expert at each time step.
146 |         """
147 |         self.fit(X, y)
148 | 
149 |         predictions = pd.DataFrame(
150 |             np.take_along_axis(check_array(X), self.decisions_.reshape(-1, 1), axis=1),
151 |             index=X.index,
152 |         )
153 | 
154 |         return predictions
155 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.rst:
--------------------------------------------------------------------------------
1 | Contributing guidelines
2 | =======================
3 | 
4 | Pull Request Checklist
5 | ----------------------
6 | 
7 | Before sending your pull requests, make sure you have followed this checklist:
8 | - Read the `contributing guidelines `_.
9 | - Read the `code of conduct `_.
10 | - Ensure you have signed the `contributor license agreement (CLA) `_.
11 | - Check that the changes are consistent with the guidelines.
12 | - Check that the changes are consistent with the coding style.
13 | - Run the unit tests.
14 | 
15 | How to become a contributor and submit your own code
16 | ----------------------------------------------------
17 | 
18 | Contributor License Agreements
19 | ------------------------------
20 | 
21 | In order to become a contributor of Giotto, the first step is to sign the
22 | `contributor license agreement (CLA) `_.
23 | **NOTE**: Only original source code from you and other people that have signed
24 | the CLA can be accepted into the main repository.
25 | 
26 | Contributing code
27 | -----------------
28 | 
29 | If you have improvements to Giotto, do not hesitate to send us pull requests!
30 | Please follow the GitHub how-to (https://help.github.com/articles/using-pull-requests/).
31 | The Giotto team will review your pull requests. Once the pull requests are approved and pass continuous integration checks, the
32 | Giotto team will work on getting your pull request submitted to our GitHub
33 | repository. Eventually, your pull request will be merged automatically on GitHub.
34 | If you want to contribute, start working through the Giotto codebase,
35 | navigate to the `GitHub issue tab `_
36 | and start looking through interesting issues. These are issues that we believe
37 | are particularly well suited for outside contributions, often because we
38 | probably won't get to them right now. If you decide to start on an issue, leave
39 | a comment so that other people know that you're working on it. If you want to
40 | help out but would rather not work alone, use the issue comment thread to coordinate.
41 | 
42 | Contribution guidelines and standards
43 | -------------------------------------
44 | 
45 | Before sending your pull request for review, make sure your changes are
46 | consistent with the guidelines and follow the coding style below.
47 | 
48 | General guidelines and philosophy for contribution
49 | --------------------------------------------------
50 | 
51 | * Include unit tests when you contribute new features, as they help to
52 |   a) prove that your code works correctly, and
53 |   b) guard against future breaking changes to lower the maintenance cost.
54 | * Bug fixes also generally require unit tests, because the presence of bugs
55 |   usually indicates insufficient test coverage.
56 | * Keep API compatibility in mind when you change code in core Giotto.
57 | * Clearly define your exceptions using the utils functions and test the exceptions.
58 | * When you contribute a new feature to Giotto, the maintenance burden is
59 |   (by default) transferred to the Giotto team. This means that the benefit
60 |   of the contribution must be compared against the cost of maintaining the
61 |   feature.
62 | 
63 | C++ coding style
64 | ----------------
65 | 
66 | Changes to Giotto C/C++ code should conform to the `Google C++ Style Guide `_.
67 | Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on
68 | ubuntu:16.04, do:
69 | 
70 | 
71 | .. code-block:: bash
72 | 
73 |    apt-get install -y clang-tidy
74 | 
75 | You can check a C/C++ file by doing:
76 | 
77 | .. code-block:: bash
78 | 
79 |    clang-format <my_cc_file> --style=google > /tmp/my_cc_file.cc
80 |    diff <my_cc_file> /tmp/my_cc_file.cc
81 | 
82 | Python coding style
83 | -------------------
84 | 
85 | Changes to Giotto Python code should conform to PEP8 directives.
86 | Use `flake8` to check your Python changes. To install `flake8`, just do:
87 | 
88 | .. code-block:: bash
89 | 
90 |    pip install flake8
91 | 
92 | You can use `flake8` on your Python code via the following instructions:
93 | 
94 | .. code-block:: bash
95 | 
96 |    flake8 name_of_your_script.py
97 | 
98 | Git pre-commit hook
99 | -------------------
100 | We provide a pre-commit git hook to prevent accidental commits to the master branch and automatically format the code
101 | using `black`. To activate, install the `pre-commit` library.
102 | 
103 | Development requirements
104 | ------------------------
105 | In order to contribute to giotto-time, some additional Python packages are required beyond the standard
106 | requirements. To install them, do:
107 | 
108 | .. code-block:: bash
109 | 
110 |    pip install -r dev-requirements.txt
111 | 
112 | Running unit tests
113 | ------------------
114 | 
115 | There are two ways to run Giotto unit tests.
116 | 
117 | 1. Using tools and libraries installed directly on your system. The tool of choice is `pytest`. To install `pytest`, just do:
118 | 
119 | .. code-block:: bash
120 | 
121 |    pip install pytest
122 | 
123 | You can use `pytest` on your Python code via the following instructions:
124 | 
125 | .. code-block:: bash
126 | 
127 |    pytest name_of_your_script.py
128 | 
129 | 2. Using Azure (azure-pipelines.yml) and Giotto's CI scripts.
130 | 
--------------------------------------------------------------------------------
/gtime/hierarchical/tests/test_bottom_up.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pytest
4 | import sklearn
5 | from hypothesis import given
6 | import networkx as nx
7 | import random
8 | import hypothesis.strategies as st
9 | from hypothesis.extra.numpy import arrays
10 | from pytest import fixture
11 | 
12 | from gtime.hierarchical import HierarchicalBottomUp
13 | from gtime.utils.fixtures import (
14 |     time_series_forecasting_model1_no_cache,
15 |     features1,
16 |     model1,
17 | )
18 | from gtime.utils.hypothesis.time_indexes import giotto_time_series, period_indexes
19 | 
20 | 
21 | @st.composite
22 | def n_time_series_with_same_index(
23 |     draw, min_length: int = 5, min_n: int = 1, max_n: int = 5,
24 | ):
25 |     n = draw(st.integers(min_value=min_n, max_value=max_n))
26 |     index = draw(period_indexes(min_length=min_length))
27 |     dictionary = {}
28 |     for i in range(n):
29 |         key = str(i)
30 |         df_values = draw(
31 |             arrays(
32 |                 dtype=np.float64,
33 |                 shape=index.shape[0],
34 |                 elements=st.floats(allow_nan=False, allow_infinity=False, width=32),
35 |             )
36 |         )
37 |         value = pd.DataFrame(index=index, data=df_values)
38 |         dictionary[key] = value
39 |     return dictionary
40 | 
41 | 
42 | @st.composite
43 | def tree_construction(draw, dictionary):
44 |     tree_nodes = list(dictionary.keys())
45 |     tree = nx.DiGraph()
46 |     n = len(tree_nodes)
47 |     for i in range(n):
48 |         selected_key = random.choice(tree_nodes)
49 |         if len(tree) > 0:
50 |             selected_node = random.choice(list(tree.nodes))
51 |             tree.add_edge(selected_node, selected_key)
52 |         tree.add_node(selected_key)
53 |         tree_nodes.remove(selected_key)
54 |     return tree
55 | 
56 | 
57 | @st.composite
58 | def hierarchical_bottom_up_model(draw, time_series_forecasting_model1_no_cache):
59 |     dataframes = draw(n_time_series_with_same_index(min_n=5))
60 |     tree = draw(tree_construction(dataframes))
61 |     return HierarchicalBottomUp(time_series_forecasting_model1_no_cache, tree)
62 | 
63 | 
64 | @fixture(scope="function")
65 | def 
hierarchical_basic_bottom_up_model(time_series_forecasting_model1_no_cache): 66 | return HierarchicalBottomUp(time_series_forecasting_model1_no_cache, "infer") 67 | 68 | 69 | class TestHierarchicalBottomUp: 70 | def test_basic_constructor(self, time_series_forecasting_model1_no_cache): 71 | HierarchicalBottomUp( 72 | model=time_series_forecasting_model1_no_cache, hierarchy_tree="infer" 73 | ) 74 | 75 | @given(dataframes=n_time_series_with_same_index(min_n=5)) 76 | def test_fit_predict_basic_bottom_up_on_different_data( 77 | self, dataframes, hierarchical_basic_bottom_up_model 78 | ): 79 | hierarchical_basic_bottom_up_model.fit(dataframes).predict(dataframes) 80 | 81 | @given(dataframes=n_time_series_with_same_index(min_n=5)) 82 | def test_fit_predict_basic_bottom_up( 83 | self, dataframes, hierarchical_basic_bottom_up_model 84 | ): 85 | hierarchical_basic_bottom_up_model.fit(dataframes).predict() 86 | 87 | @given(dataframes=n_time_series_with_same_index()) 88 | def test_constructor(self, time_series_forecasting_model1_no_cache, dataframes): 89 | tree = tree_construction(dataframes) 90 | HierarchicalBottomUp(time_series_forecasting_model1_no_cache, tree) 91 | 92 | @given(data=st.data(), dataframes=n_time_series_with_same_index(min_n=5)) 93 | def test_fit_predict_bottom_up( 94 | self, data, dataframes, time_series_forecasting_model1_no_cache 95 | ): 96 | model = data.draw( 97 | hierarchical_bottom_up_model(time_series_forecasting_model1_no_cache) 98 | ) 99 | prediction = model.fit(dataframes).predict() 100 | for key in dataframes.keys(): 101 | if key not in prediction.keys(): 102 | raise ValueError 103 | 104 | @given(dataframes=n_time_series_with_same_index(min_n=5)) 105 | def test_fit_predict_on_subset_of_time_series( 106 | self, dataframes, hierarchical_basic_bottom_up_model 107 | ): 108 | key = np.random.choice(list(dataframes.keys()), 1)[0] 109 | hierarchical_basic_bottom_up_model.fit(dataframes) 110 | hierarchical_basic_bottom_up_model.predict({key: dataframes[key]}) 111 | 112 | def test_error_predict_not_fitted(self, hierarchical_basic_bottom_up_model): 113 | with pytest.raises(sklearn.exceptions.NotFittedError): 114 | hierarchical_basic_bottom_up_model.predict() 115 | 116 | @given(dataframes=n_time_series_with_same_index()) 117 | def test_error_with_bad_predict_key( 118 | self, dataframes, hierarchical_basic_bottom_up_model 119 | ): 120 | correct_key = np.random.choice(list(dataframes.keys()), 1)[0] 121 | bad_key = "".join(dataframes.keys()) + "bad_key" 122 | hierarchical_basic_bottom_up_model.fit(dataframes) 123 | with pytest.raises(KeyError): 124 | hierarchical_basic_bottom_up_model.predict( 125 | {bad_key: dataframes[correct_key]} 126 | ) 127 | 128 | @given(time_series=giotto_time_series(min_length=5)) 129 | def test_error_fit_dataframe(self, time_series, hierarchical_basic_bottom_up_model): 130 | with pytest.raises(ValueError): 131 | hierarchical_basic_bottom_up_model.fit(time_series) 132 | 133 | @given(time_series=giotto_time_series(min_length=5)) 134 | def test_error_fit_key_not_string( 135 | self, time_series, hierarchical_basic_bottom_up_model 136 | ): 137 | with pytest.raises(ValueError): 138 | hierarchical_basic_bottom_up_model.fit({1: time_series}) 139 | 140 | def test_error_fit_value_not_dataframe(self, hierarchical_basic_bottom_up_model): 141 | with pytest.raises(ValueError): 142 | hierarchical_basic_bottom_up_model.fit({"wrong_field": 12}) 143 | -------------------------------------------------------------------------------- /gtime/utils/hypothesis/feature_matrices.py: 
--------------------------------------------------------------------------------
from typing import Optional

import hypothesis.strategies as st
import pandas as pd
from hypothesis.extra.numpy import arrays
from hypothesis.strategies import floats

from gtime.utils.hypothesis.general_strategies import shape_X_y_matrices, shape_matrix
from .time_indexes import giotto_time_series
from ...compose import FeatureCreation
from ...model_selection import horizon_shift


@st.composite
def X_y_matrices(
    draw,
    horizon: int,
    df_transformer: FeatureCreation,
    min_length: Optional[int] = None,
    allow_nan_infinity: bool = True,
):
    """ Returns a strategy that generates X and y feature matrices.

    Parameters
    ----------
    horizon : ``int``, required
        The number of steps to forecast in the future. It affects the y shape.

    df_transformer : ``FeatureCreation``, required
        The FeatureCreation transformer that is applied to the generated time
        series to build the X matrix.

    min_length : ``int``, optional, (default=``None``)
        Minimum length of the matrices.

    allow_nan_infinity : ``bool``, optional, (default=``True``)
        Allow nan and infinity in the starting time series.

    Returns
    -------
    X : pd.DataFrame
        X feature matrix

    y : pd.DataFrame
        y feature matrix
    """
    min_length = min_length if min_length is not None else 1
    period_index_series = draw(
        giotto_time_series(
            min_length=min_length,
            allow_nan=allow_nan_infinity,
            allow_infinity=allow_nan_infinity,
        )
    )
    X = df_transformer.fit_transform(period_index_series)
    y = horizon_shift(period_index_series, horizon=horizon)

    return X, y


@st.composite
def X_matrices(
    draw,
    df_transformer: FeatureCreation,
    min_length: Optional[int] = None,
    allow_nan_infinity: bool = True,
):
    """ Returns a strategy that generates the X feature matrix.

    Parameters
    ----------
    df_transformer : ``FeatureCreation``, required
        The FeatureCreation transformer that is applied to the generated time
        series to build the X matrix.

    min_length : ``int``, optional, (default=``None``)
        Minimum length of the matrices.

    allow_nan_infinity : ``bool``, optional, (default=``True``)
        Allow nan and infinity in the starting time series.

    Returns
    -------
    X : ``pd.DataFrame``
        X feature matrix
    """
    min_length = min_length if min_length is not None else 1
    period_index_series = draw(
        giotto_time_series(
            min_length=min_length,
            allow_nan=allow_nan_infinity,
            allow_infinity=allow_nan_infinity,
        )
    )

    X = df_transformer.fit_transform(period_index_series)
    return X


@st.composite
def y_matrices(
    draw,
    horizon: int = 3,
    min_length: Optional[int] = None,
    allow_nan_infinity: bool = True,
):
    """ Returns a strategy that generates the y feature matrix.

    Parameters
    ----------
    horizon : ``int``, optional, (default=3)
        The number of steps to forecast in the future. It affects the y shape.

    min_length : ``int``, optional, (default=``None``)
        Minimum length of the matrices.

    allow_nan_infinity : ``bool``, optional, (default=``True``)
        Allow nan and infinity in the starting time series.

    Returns
    -------
    y : ``pd.DataFrame``
        y feature matrix
    """
    min_length = min_length if min_length is not None else 1
    period_index_series = draw(
        giotto_time_series(
            min_length=min_length,
            allow_nan=allow_nan_infinity,
            allow_infinity=allow_nan_infinity,
        )
    )

    y = horizon_shift(period_index_series, horizon=horizon)

    return y
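

# Usage sketch (hypothetical test, not part of this module): the strategies
# above plug directly into hypothesis' ``@given``. Since ``horizon_shift``
# yields one column per forecast step, a drawn y matrix has exactly
# ``horizon`` columns:
#
#     from hypothesis import given
#
#     @given(y=y_matrices(horizon=3))
#     def test_y_has_one_column_per_step(y):
#         assert y.shape[1] == 3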


@st.composite
def numpy_X_y_matrices(
    draw,
    X_y_shapes=shape_X_y_matrices(),
    min_value: float = None,
    max_value: float = None,
    allow_nan: bool = False,
    allow_infinity: bool = False,
):
    if isinstance(X_y_shapes, tuple) or isinstance(X_y_shapes, list):
        X_shape, y_shape = X_y_shapes
    else:
        X_shape, y_shape = draw(X_y_shapes)
    if X_shape[0] != y_shape[0]:
        raise ValueError(f"X.shape[0] must be == y.shape[0]: {X_shape}, {y_shape}")
    if X_shape[0] <= X_shape[1]:
        # The message must match the check: more rows than columns are needed.
        raise ValueError(f"X.shape[0] must be > X.shape[1]: {X_shape}")

    elements = floats(
        min_value=min_value,
        max_value=max_value,
        allow_nan=allow_nan,
        allow_infinity=allow_infinity,
    )
    X = draw(arrays(dtype=float, shape=X_shape, elements=elements))
    y = draw(arrays(dtype=float, shape=y_shape, elements=elements))
    return X, y


@st.composite
def numpy_X_matrices(
    draw,
    shape=shape_matrix(),
    min_value: float = None,
    max_value: float = None,
    allow_nan: bool = False,
    allow_infinity: bool = False,
):
    if not isinstance(shape, tuple) and not isinstance(shape, list):
        shape = draw(shape)
    if shape[0] <= shape[1]:
        raise ValueError(f"X.shape[0] must be > X.shape[1]: {shape}")

    elements = floats(
        min_value=min_value,
        max_value=max_value,
        allow_nan=allow_nan,
        allow_infinity=allow_infinity,
    )

    X = draw(arrays(dtype=float, shape=shape, elements=elements))
    return X
--------------------------------------------------------------------------------
/gtime/causality/base.py:
--------------------------------------------------------------------------------
import warnings
from itertools import product

import numpy as np
import pandas as pd
from scipy import stats
from sklearn.utils.validation import check_is_fitted


class CausalityMixin:
    """ Base class for causality tests. """

    def __init__(self, bootstrap_iterations, permutation_iterations):
        self.bootstrap_iterations = bootstrap_iterations
        self.permutation_iterations = permutation_iterations
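
    # Note: both iteration counts parameterize the optional significance tests
    # in ``_compute_p_values`` below -- the bootstrap test resamples rows with
    # replacement, the permutation test reshuffles x against y; a falsy value
    # disables the corresponding test.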

    def transform(self, data: pd.DataFrame) -> pd.DataFrame:
        """Shifts each input time series by the amount which optimizes
        correlation with the selected 'target_col' column. If no target column
        is specified, the first column of the DataFrame is taken as the target.

        Parameters
        ----------
        data : pd.DataFrame, shape (n_samples, n_time_series), required
            The DataFrame containing the time series on which to perform the
            transformation.

        Returns
        -------
        data_t : pd.DataFrame, shape (n_samples, n_time_series)
            The DataFrame with each non-target column shifted by the amount
            which maximizes its correlation with the target column.

        """
        check_is_fitted(self)
        data_t = data.copy()

        if self.target_col is None:
            self.target_col = data_t.columns[0]
            warnings.warn(
                "The target column was not specified. Therefore, the first "
                f"column {self.target_col} of the DataFrame was taken as "
                "the target column. If you want to transform with respect to "
                "another column, please specify it as the target column."
            )

        for col in data_t:
            if col != self.target_col:
                data_t[col] = data_t[col].shift(
                    self.best_shifts_[self.target_col][col]
                )
        if self.dropna:
            data_t = data_t.dropna()

        return data_t

    def _initialize_table(self):
        best_shifts = pd.DataFrame(columns=["x", "y", "shift", "max_corr"])
        column_types = {
            "x": np.float64,
            "y": np.float64,
            "shift": np.int64,
            "max_corr": np.float64,  # correlations are floats in [-1, 1]
        }

        if self.bootstrap_iterations:
            best_shifts = best_shifts.reindex(
                best_shifts.columns.tolist() + ["p_values"], axis=1
            )
            column_types["p_values"] = np.float64

        best_shifts = best_shifts.astype(column_types)
        return best_shifts

    def _compute_best_shifts(self, data, shift_func):
        best_shifts = self._initialize_table()

        if self.target_col is None:
            columns_to_shift = [(x, y) for x, y in product(data.columns, repeat=2)]
        else:
            columns_to_shift = [(col, self.target_col) for col in data.columns]

        for (x, y) in columns_to_shift:
            res = shift_func(data, x=x, y=y)
            best_shift = res[1]
            max_corr = res[0]
            tables = {
                "x": x,
                "y": y,
                "shift": best_shift,
                "max_corr": max_corr,
            }
            if self.bootstrap_iterations:
                bootstrap_p_value = self._compute_p_values(
                    data=data, x=x, y=y, shift=best_shift, test_type="bootstrap"
                )
                tables["bootstrap_p_values"] = bootstrap_p_value

            if self.permutation_iterations:
                permutation_p_value = self._compute_p_values(
                    data=data, x=x, y=y, shift=best_shift, test_type="permutation"
                )
                tables["permutation_p_values"] = permutation_p_value

            best_shifts = best_shifts.append(tables, ignore_index=True)

        return best_shifts

    def _compute_p_values(self, data, x, y, shift, test_type):
        data_t = data.copy()
        data_t[x] = data_t.shift(shift)[x]
        data_t.dropna(axis=0, inplace=True)
        rhos = []
        n_iterations = (
            self.permutation_iterations
            if test_type == "permutation"
            else self.bootstrap_iterations
        )

        for _ in range(n_iterations):
            if test_type == "permutation":
                # Shuffle x against y: resampling whole rows would keep the
                # (x, y) pairing intact and leave the correlation unchanged.
                x_values = data_t[x].sample(frac=1).values
                y_values = data_t[y].values
            else:
                samples = data_t.sample(n=len(data_t), replace=True)
                x_values, y_values = samples[x].values, samples[y].values
            rhos.append(stats.pearsonr(x_values, y_values)[0])

        # Two-sided p-value: twice the smaller tail of the resampled
        # correlation distribution around zero.
        percentile = stats.percentileofscore(rhos, 0) / 100
        p_value = 2 * min(percentile, 1 - percentile)

        return p_value

    def _create_pivot_tables(self, best_shifts):
        pivot_best_shifts = pd.pivot_table(
            best_shifts, index=["x"], columns=["y"], values="shift"
        )
        max_corrs = pd.pivot_table(
            best_shifts, index=["x"],
columns=["y"], values="max_corr" 139 | ) 140 | 141 | pivot_tables = {"best_shifts": pivot_best_shifts, "max_corrs": max_corrs} 142 | 143 | if self.bootstrap_iterations: 144 | bootstrap_p_values = pd.pivot_table( 145 | best_shifts, index=["x"], columns=["y"], values="bootstrap_p_values" 146 | ) 147 | pivot_tables["bootstrap_p_values"] = bootstrap_p_values 148 | 149 | if self.permutation_iterations: 150 | permutation_p_values = pd.pivot_table( 151 | best_shifts, index=["x"], columns=["y"], values="permutation_p_values" 152 | ) 153 | pivot_tables["permutation_p_values"] = permutation_p_values 154 | 155 | return pivot_tables 156 | -------------------------------------------------------------------------------- /gtime/preprocessing/tests/utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import List, Union, Optional, Tuple 3 | 4 | import numpy as np 5 | import pandas as pd 6 | from pandas.testing import assert_series_equal 7 | 8 | from gtime.preprocessing.time_series_conversion import ( 9 | _SequenceToTimeIndexSeries, 10 | _PandasSeriesToTimeIndexSeries, 11 | _TimeIndexSeriesToPeriodIndexSeries, 12 | count_not_none, 13 | ) 14 | from gtime.utils.testing_constants import DEFAULT_START, DEFAULT_FREQ 15 | 16 | PandasDate = Union[datetime, pd.Timestamp, str] 17 | 18 | 19 | def compare_output_of_input_sequence_to_expected_one( 20 | input_sequence, start, end, freq, 21 | ): 22 | computed_pandas_series = transform_sequence_into_time_index_series( 23 | input_sequence, start, end, freq 24 | ) 25 | expected_pandas_series = pandas_series_with_period_index( 26 | input_sequence, start, end, freq 27 | ) 28 | assert_series_equal(computed_pandas_series, expected_pandas_series) 29 | 30 | 31 | def compare_output_of_input_series_to_expected_one( 32 | input_sequence, start, end, freq, 33 | ): 34 | computed_pandas_series = transform_series_into_time_index_series( 35 | input_sequence, start, end, freq 36 | ) 37 | expected_pandas_series = pandas_series_with_period_index( 38 | input_sequence.values, start, end, freq 39 | ) 40 | assert_series_equal(computed_pandas_series, expected_pandas_series) 41 | 42 | 43 | def transform_sequence_into_time_index_series( 44 | array_like_object: Union[np.array, list, pd.Series], 45 | start: Optional[str] = None, 46 | end: Optional[str] = None, 47 | freq: Optional[str] = None, 48 | ) -> pd.Series: 49 | time_series_conversion = _SequenceToTimeIndexSeries(start, end, freq) 50 | return time_series_conversion.transform(array_like_object) 51 | 52 | 53 | def transform_series_into_time_index_series( 54 | array_like_object: Union[np.array, list, pd.Series], 55 | start: Optional[str] = None, 56 | end: Optional[str] = None, 57 | freq: Optional[str] = None, 58 | ) -> pd.Series: 59 | time_series_conversion = _PandasSeriesToTimeIndexSeries(start, end, freq) 60 | return time_series_conversion.transform(array_like_object) 61 | 62 | 63 | def transform_time_index_series_into_period_index_series( 64 | series: pd.Series, freq: pd.Timedelta = None, 65 | ) -> pd.Series: 66 | to_period_conversion = _TimeIndexSeriesToPeriodIndexSeries(freq=freq) 67 | return to_period_conversion.transform(series) 68 | 69 | 70 | def pandas_series_with_period_index( 71 | values: Union[np.array, List[float]], 72 | start: Optional[datetime] = None, 73 | end: Optional[datetime] = None, 74 | freq: Optional[pd.Timedelta] = None, 75 | ) -> pd.Series: 76 | start, end, freq = _initialize_start_end_freq(start, end, freq) 77 | index = 
pd.period_range(start=start, end=end, periods=len(values), freq=freq)
    return pd.Series(index=index, data=values, dtype=np.float64)


def _initialize_start_end_freq(
    start: PandasDate, end: PandasDate, freq: pd.Timedelta
) -> Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]:
    not_none_params = count_not_none(start, end, freq)
    if not_none_params == 0:
        start, end, freq = _default_params_initialization()
    elif not_none_params == 1:
        start, end, freq = _one_not_none_param_initialization(start, end, freq)
    elif not_none_params == 2:
        start, end, freq = _two_not_none_params_initialization(start, end, freq)
    else:
        raise ValueError(
            "Of the three parameters: start, end, and "
            "freq, at most two can be specified at once"
        )
    return start, end, freq


def _default_params_initialization() -> Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]:
    start = DEFAULT_START
    end = None
    freq = DEFAULT_FREQ
    return start, end, freq


def _one_not_none_param_initialization(
    start, end, freq
) -> Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]:
    if start is not None:
        end = None
        freq = DEFAULT_FREQ
    elif end is not None:
        start = None
        freq = DEFAULT_FREQ
    else:
        start = DEFAULT_START
        end = None
    return start, end, freq


def _two_not_none_params_initialization(
    start, end, freq
) -> Tuple[pd.Timestamp, pd.Timestamp, pd.Timedelta]:
    # With exactly two of the three parameters given, they are passed through
    # unchanged and the third stays None.
    return start, end, freq


def datetime_index_series_to_period_index_series(
    datetime_index_series: pd.Series, freq: Optional[pd.Timedelta] = None
) -> pd.Series:
    if datetime_index_series.index.freq is not None:
        try:
            return pd.Series(
                index=pd.PeriodIndex(datetime_index_series.index),
                data=datetime_index_series.values,
            )
        except Exception as e:
            print(freq, datetime_index_series.index.freq)
            raise e
    else:
        freq = "1D" if freq is None else freq
        return pd.Series(
            index=pd.PeriodIndex(datetime_index_series.index, freq=freq),
            data=datetime_index_series.values,
        )


def timedelta_index_to_datetime(
    index: pd.TimedeltaIndex, start: datetime = datetime(year=1970, month=1, day=1),
) -> pd.DatetimeIndex:
    return start + index


def timedelta_index_series_to_period_index_series(
    timedelta_index_series: pd.Series, freq: Optional[pd.Timedelta] = None
) -> pd.Series:
    datetime_index = timedelta_index_to_datetime(timedelta_index_series.index)
    if datetime_index.freq is None:
        freq = "1D" if freq is None else freq
        period_index = pd.PeriodIndex(datetime_index, freq=freq)
    else:
        period_index = pd.PeriodIndex(datetime_index)
    return pd.Series(index=period_index, data=timedelta_index_series.values)
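

# A minimal usage sketch (not part of the original helpers), assuming the
# defaults defined in gtime.utils.testing_constants: with no start/end/freq
# given, the generated series gets a PeriodIndex anchored at DEFAULT_START
# with DEFAULT_FREQ.
if __name__ == "__main__":
    demo = pandas_series_with_period_index([1.0, 2.0, 3.0])
    assert isinstance(demo.index, pd.PeriodIndex)
    assert len(demo) == 3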
--------------------------------------------------------------------------------
/gtime/time_series_models/simple_models.py:
--------------------------------------------------------------------------------
from gtime.compose import FeatureCreation
from sklearn.compose import make_column_selector
from gtime.feature_extraction import Shift, MovingAverage, MovingCustomFunction
from gtime.time_series_models import TimeSeriesForecastingModel
from gtime.forecasting import (
    NaiveForecaster,
    SeasonalNaiveForecaster,
    DriftForecaster,
    AverageForecaster,
)


class Naive(TimeSeriesForecastingModel):
    """ Naive model pipeline: no feature creation and ``NaiveForecaster()`` as a model.

    Parameters
    ----------
    horizon: int - prediction horizon, in time series periods

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from gtime.time_series_models import Naive
    >>> idx = pd.period_range(start='2011-01-01', end='2012-01-01')
    >>> np.random.seed(0)
    >>> df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=['1'])
    >>> model = Naive(horizon=4)
    >>> model.fit(df)
    >>> model.predict()
                     y_1       y_2       y_3       y_4
    2011-12-29  0.543806  0.543806  0.543806  0.543806
    2011-12-30  0.456911  0.456911  0.456911  0.456911
    2011-12-31  0.882041  0.882041  0.882041  0.882041
    2012-01-01  0.458604  0.458604  0.458604  0.458604
    """

    def __init__(self, horizon: int):
        features = [
            ("s1", Shift(0), make_column_selector()),
        ]
        super().__init__(features=features, horizon=horizon, model=NaiveForecaster())


class Average(TimeSeriesForecastingModel):
    """ Average model pipeline: no feature creation and ``AverageForecaster()`` as a model.

    Parameters
    ----------
    horizon: int - prediction horizon, in time series periods

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> from gtime.time_series_models import Average
    >>> idx = pd.period_range(start='2011-01-01', end='2012-01-01')
    >>> np.random.seed(0)
    >>> df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=['1'])
    >>> model = Average(horizon=5)
    >>> model.fit(df)
    >>> model.predict()
                     y_1       y_2       y_3       y_4       y_5
    2011-12-28  0.558475  0.558475  0.558475  0.558475  0.558475
    2011-12-29  0.556379  0.556379  0.556379  0.556379  0.556379
    2011-12-30  0.543946  0.543946  0.543946  0.543946  0.543946
    2011-12-31  0.581512  0.581512  0.581512  0.581512  0.581512
    2012-01-01  0.569221  0.569221  0.569221  0.569221  0.569221

    """

    def __init__(self, horizon: int):
        features = [
            ("s1", Shift(0), make_column_selector()),
        ]
        super().__init__(features=features, horizon=horizon, model=AverageForecaster())
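

# All pipelines in this module share one pattern: a Shift(0) "identity"
# feature, so the forecaster receives the raw series unchanged, plus a simple
# forecaster from gtime.forecasting. A hypothetical custom variant could be
# assembled the same way, e.g.:
#
#     features = [("s1", Shift(0), make_column_selector())]
#     model = TimeSeriesForecastingModel(
#         features=features, horizon=3, model=DriftForecaster()
#     )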


class SeasonalNaive(TimeSeriesForecastingModel):
    """ Seasonal naive model pipeline: no feature creation and ``SeasonalNaiveForecaster()`` as a model.

    Parameters
    ----------
    horizon: int - prediction horizon, in time series periods
    seasonal_length: int - full season cycle length, in time series periods

    Examples
    --------

    >>> import pandas as pd
    >>> import numpy as np
    >>> from gtime.time_series_models import SeasonalNaive
    >>> idx = pd.period_range(start='2011-01-01', end='2012-01-01')
    >>> np.random.seed(0)
    >>> df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=['1'])
    >>> model = SeasonalNaive(horizon=5, seasonal_length=4)
    >>> model.fit(df)
    >>> model.predict()
                     y_1       y_2       y_3       y_4       y_5
    2011-12-28  0.392676  0.956406  0.187131  0.128861  0.392676
    2011-12-29  0.956406  0.187131  0.128861  0.392676  0.956406
    2011-12-30  0.187131  0.128861  0.392676  0.956406  0.187131
    2011-12-31  0.128861  0.392676  0.956406  0.187131  0.128861
    2012-01-01  0.392676  0.956406  0.187131  0.128861  0.392676
    """

    def __init__(self, horizon: int, seasonal_length: int):
        features = [
            ("s1", Shift(0), make_column_selector()),
        ]
        self.seasonal_length = seasonal_length
        self.horizon = horizon
        super().__init__(
            features=features,
            horizon=horizon,
            model=SeasonalNaiveForecaster(seasonal_length),
        )


class Drift(TimeSeriesForecastingModel):
    """ Simple drift model pipeline: no feature creation and ``DriftForecaster()`` as a model.

    Parameters
    ----------
    horizon: int - prediction horizon, in time series periods

    Examples
    --------

    >>> import pandas as pd
    >>> import numpy as np
    >>> from gtime.time_series_models import Drift
    >>> idx = pd.period_range(start='2011-01-01', end='2012-01-01')
    >>> np.random.seed(0)
    >>> df = pd.DataFrame(np.random.random((len(idx), 1)), index=idx, columns=['1'])
    >>> model = Drift(horizon=5)
    >>> model.fit(df)
    >>> model.predict()
                     y_1       y_2       y_3       y_4       y_5
    2011-12-28  0.903984  0.902982  0.901980  0.900978  0.899976
    2011-12-29  0.543806  0.542804  0.541802  0.540800  0.539798
    2011-12-30  0.456911  0.455910  0.454908  0.453906  0.452904
    2011-12-31  0.882041  0.881040  0.880038  0.879036  0.878034
    2012-01-01  0.458604  0.457602  0.456600  0.455598  0.454596

    """

    def __init__(self, horizon: int):
        features = [
            ("s1", Shift(0), make_column_selector()),
        ]
        super().__init__(features=features, horizon=horizon, model=DriftForecaster())
--------------------------------------------------------------------------------
/gtime/time_series_models/tests/test_cv_pipeline.py:
--------------------------------------------------------------------------------
import pytest
import pandas as pd
import numpy as np
from hypothesis import given
import hypothesis.strategies as st
from gtime.time_series_models import CVPipeline
from gtime.metrics import max_error, mae, rmse, log_mse
from gtime.time_series_models import (
    AR,
    Naive,
    SeasonalNaive,
    TimeSeriesForecastingModel,
)
from gtime.feature_extraction import MovingAverage, Shift
from gtime.forecasting import NaiveForecaster, DriftForecaster


@st.composite
def draw_unique_subset(draw, lst):
    return draw(st.lists(st.sampled_from(lst), min_size=1, max_size=len(lst)))


@st.composite
def naive_model(draw):
    horizon = draw(
        st.lists(
            st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True
        )
    )
    return (Naive, {"horizon": horizon})


@st.composite
def seasonal_naive_model(draw):
    horizon = draw(
        st.lists(
            st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True
        )
    )
    seasonal_length = draw(
        st.lists(
            st.integers(min_value=1, max_value=10), min_size=1, max_size=4, unique=True
        )
    )
    return (SeasonalNaive, {"horizon": horizon, "seasonal_length": seasonal_length})


@st.composite
def ar_model(draw):
    horizon = draw(
        st.lists(
            st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True
        )
    )
    p = draw(
        st.lists(
            st.integers(min_value=1, max_value=20), min_size=1, max_size=4, unique=True
        )
    )
    explainer = draw(st.sampled_from([None, "lime", "shap"]))
    return (AR, {"horizon": horizon, "p": p, 
"explainer_type": [explainer]}) 62 | 63 | 64 | @st.composite 65 | def models_grid(draw): 66 | model_list = [draw(ar_model()), draw(seasonal_naive_model()), draw(naive_model())] 67 | return dict(draw(draw_unique_subset(model_list))) 68 | 69 | 70 | @st.composite 71 | def metrics(draw): 72 | metric_list = [max_error, mae, rmse, log_mse] 73 | metrics = draw(draw_unique_subset(metric_list)) 74 | metrics_dict = dict(zip([x.__name__ for x in metrics], metrics)) 75 | return metrics_dict 76 | 77 | 78 | class TestCVPipeline: 79 | @given( 80 | models=models_grid(), 81 | n_splits=st.integers(min_value=2, max_value=10), 82 | blocking=st.booleans(), 83 | metrics=metrics(), 84 | ) 85 | def test_constructor(self, models, n_splits, blocking, metrics): 86 | cv_pipeline = CVPipeline( 87 | models_sets=models, n_splits=n_splits, blocking=blocking, metrics=metrics 88 | ) 89 | list_len = np.sum( 90 | [np.prod([len(y) for y in x.values()]) for x in models.values()] 91 | ) 92 | assert list_len == len(cv_pipeline.model_list) 93 | assert len(metrics) == len(cv_pipeline.metrics) 94 | 95 | @pytest.mark.parametrize( 96 | "models", [{Naive: {"horizon": [3]}, AR: {"horizon": [3], "p": [2, 3]}}] 97 | ) 98 | @pytest.mark.parametrize("metrics", [{"RMSE": rmse, "MAE": mae}]) 99 | @pytest.mark.parametrize("n_splits", [3, 5]) 100 | @pytest.mark.parametrize("blocking", [True, False]) 101 | @pytest.mark.parametrize("seed", [5, 1000]) 102 | def test_fit_predict(self, models, n_splits, blocking, metrics, seed): 103 | cv_pipeline = CVPipeline( 104 | models_sets=models, n_splits=n_splits, blocking=blocking, metrics=metrics 105 | ) 106 | np.random.seed(seed) 107 | idx = pd.period_range(start="2011-01-01", end="2012-01-01") 108 | df = pd.DataFrame( 109 | np.random.standard_normal((len(idx), 1)), index=idx, columns=["1"] 110 | ) 111 | cv_pipeline.fit(df) 112 | assert cv_pipeline.cv_results_.shape == ( 113 | len(cv_pipeline.model_list) * len(metrics), 114 | 4, 115 | ) 116 | y_pred = cv_pipeline.predict() 117 | horizon = cv_pipeline.best_model_.horizon 118 | assert y_pred.shape == (horizon, horizon) 119 | 120 | @pytest.mark.parametrize( 121 | "models", 122 | [ 123 | { 124 | TimeSeriesForecastingModel: { 125 | "features": [ 126 | [("s3", Shift(1), ["1"])], 127 | [("ma10", MovingAverage(10), ["1"])], 128 | ], 129 | "horizon": [4], 130 | "model": [NaiveForecaster(), DriftForecaster()], 131 | } 132 | } 133 | ], 134 | ) 135 | @pytest.mark.parametrize("metrics", [{"RMSE": rmse, "MAE": mae}]) 136 | @pytest.mark.parametrize("n_splits", [5]) 137 | def test_model_assembly(self, models, n_splits, metrics): 138 | cv_pipeline = CVPipeline(models_sets=models, n_splits=n_splits, metrics=metrics) 139 | idx = pd.period_range(start="2011-01-01", end="2012-01-01") 140 | df = pd.DataFrame( 141 | np.random.standard_normal((len(idx), 1)), index=idx, columns=["1"] 142 | ) 143 | cv_pipeline.fit(df) 144 | assert cv_pipeline.cv_results_.shape == ( 145 | len(cv_pipeline.model_list) * len(metrics), 146 | 4, 147 | ) 148 | y_pred = cv_pipeline.predict() 149 | horizon = cv_pipeline.best_model_.horizon 150 | assert y_pred.shape == (horizon, horizon) 151 | 152 | @pytest.mark.parametrize( 153 | "models", [{Naive: {"horizon": [3]}, AR: {"horizon": [3], "p": [2, 3]}}] 154 | ) 155 | @pytest.mark.parametrize("refit", ["all", "best", ["Naive: {'horizon': 3}"]]) 156 | def test_models_refit(self, models, refit): 157 | cv_pipeline = CVPipeline(models_sets=models) 158 | idx = pd.period_range(start="2011-01-01", end="2012-01-01") 159 | df = pd.DataFrame( 160 | 
np.random.standard_normal((len(idx), 1)), index=idx, columns=["1"]
        )
        cv_pipeline.fit(df, refit=refit)
        assert cv_pipeline.cv_results_.shape == (len(cv_pipeline.model_list), 4)
        y_pred = cv_pipeline.predict()
        horizon = cv_pipeline.best_model_.horizon
        assert y_pred.shape == (horizon, horizon)
--------------------------------------------------------------------------------
/gtime/feature_generation/tests/test_external.py:
--------------------------------------------------------------------------------
import pandas as pd
import pytest
from hypothesis import given, strategies as st

if pd.__version__ >= "1.0.0":
    import pandas._testing as testing
else:
    import pandas.util.testing as testing

from gtime.feature_generation import Constant, PeriodicSeasonal
from gtime.utils.hypothesis.time_indexes import giotto_time_series


class TestPeriodicSeasonalFeature:
    def test_missing_start_date_or_period(self):
        periodic_feature = PeriodicSeasonal()
        with pytest.raises(ValueError):
            periodic_feature.transform()

        periodic_feature = PeriodicSeasonal(index_period=1)
        with pytest.raises(ValueError):
            periodic_feature.transform()

        periodic_feature = PeriodicSeasonal(start_date="2010-01-01")
        with pytest.raises(ValueError):
            periodic_feature.transform()

    def test_string_period(self):
        testing.N, testing.K = 20, 1
        ts = testing.makeTimeDataFrame(freq="s")
        periodic_feature = PeriodicSeasonal(period="1 days")
        periodic_feature.transform(ts)

        assert type(periodic_feature.period) == pd.Timedelta
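
    # The date-handling tests below pin down the precedence rules: when a time
    # series is passed to transform(), its first index value wins over any
    # start_date given to the constructor; start_date is only honoured when
    # the output is generated from index_period alone.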

    def test_correct_start_date(self):
        testing.N, testing.K = 20, 1
        ts = testing.makeTimeDataFrame(freq="s")
        start_date = "2018-01-01"
        periodic_feature = PeriodicSeasonal(period="1 days", start_date=start_date)
        periodic_feature.transform(ts)

        assert periodic_feature.start_date == ts.index.values[0]

        periodic_feature = PeriodicSeasonal(
            period="3 days", index_period=10, start_date=start_date
        )
        periodic_feature.transform()
        assert periodic_feature.start_date == pd.to_datetime(start_date)

        start_date = pd.to_datetime("2018-01-01")
        periodic_feature = PeriodicSeasonal(
            period="3 days", index_period=10, start_date=start_date
        )
        periodic_feature.transform()
        assert periodic_feature.start_date == start_date

    def test_too_high_sampling_frequency(self):
        start_date = "2018-01-01"
        periodic_feature = PeriodicSeasonal(
            period="2 days",
            start_date=start_date,
            index_period=pd.date_range(start=start_date, end="2020-01-01", freq="W"),
        )
        with pytest.raises(ValueError):
            periodic_feature.transform()

    def test_correct_sinusoide(self):
        testing.N, testing.K = 30, 1
        ts = testing.makeTimeDataFrame(freq="MS")
        start_date = "2018-01-01"
        periodic_feature = PeriodicSeasonal(
            period="365 days",
            start_date=start_date,
            index_period=pd.date_range(start=start_date, end="2020-01-01", freq="W"),
        )
        output_sin = periodic_feature.transform(ts)
        expected_index = pd.DatetimeIndex(
            [
                "2000-01-01", "2000-02-01", "2000-03-01", "2000-04-01",
                "2000-05-01", "2000-06-01", "2000-07-01", "2000-08-01",
                "2000-09-01", "2000-10-01", "2000-11-01", "2000-12-01",
                "2001-01-01", "2001-02-01", "2001-03-01", "2001-04-01",
                "2001-05-01", "2001-06-01", "2001-07-01", "2001-08-01",
                "2001-09-01", "2001-10-01", "2001-11-01", "2001-12-01",
                "2002-01-01", "2002-02-01", "2002-03-01", "2002-04-01",
                "2002-05-01", "2002-06-01",
            ],
            dtype="datetime64[ns]",
            freq="MS",
        )
        expected_df = pd.DataFrame.from_dict(
            {
                f"0__{periodic_feature.__class__.__name__}": [
                    0.0, 0.25433547, 0.42938198, 0.49999537, 0.43585316,
                    0.25062091, 0.0043035, -0.25062091, -0.43585316,
                    -0.49999537, -0.42938198, -0.24688778, 0.00860668,
                    0.2617078, 0.42938198, 0.49999537, 0.43585316, 0.25062091,
                    0.0043035, -0.25062091, -0.43585316, -0.49999537,
                    -0.42938198, -0.24688778, 0.00860668, 0.2617078,
                    0.42938198, 0.49999537, 0.43585316, 0.25062091,
                ]
            }
        )
        expected_df.index = expected_index
        pd.testing.assert_frame_equal(output_sin, expected_df)


class TestConstantFeature:
    def test_correct_constant_feature(self):
        constant = 12
        df = pd.DataFrame.from_dict({"old_name": [0, 1, 2, 3, 4, 5]})

        constant_feature = Constant(constant=constant)

        df_constant = constant_feature.fit_transform(df)
        expected_df_constant = pd.DataFrame.from_dict(
            {f"0__{constant_feature.__class__.__name__}": [constant] * 6}
        )

        testing.assert_frame_equal(expected_df_constant, df_constant, check_dtype=False)

    @given(
        giotto_time_series(
            min_length=1,
            start_date=pd.Timestamp(2000, 1, 1),
            end_date=pd.Timestamp(2010, 1, 1),
        ),
        st.integers(0, 100),
    )
    def test_random_ts_and_constant(self, df: pd.DataFrame, constant: int):
        constant_feature = Constant(constant=constant)
        df_constant = constant_feature.fit_transform(df)

        # The original assertion was left commented out and referenced an
        # undefined frame; check the essential properties instead.
        assert df_constant.shape == (len(df), 1)
        assert (df_constant.values == constant).all()
--------------------------------------------------------------------------------
/gtime/feature_extraction/custom.py:
--------------------------------------------------------------------------------
import numpy as np
import pandas as pd
from sklearn.utils.validation import check_is_fitted

from ..base import add_class_name
from gtime.feature_extraction import MovingCustomFunction


class CrestFactorDetrending(MovingCustomFunction):
    """Crest factor detrending model.

    This class removes the trend from the data by using the crest factor
    definition: each sample is normalized by its weighted surroundings.
    Generalized detrending is defined in (eq. 1) of: H. P. Tukuljac, V. Pulkki,
    H. Gamper, K. Godin, I. J. Tashev and N. Raghuvanshi, "A Sparsity Measure
    for Echo Density Growth in General Environments," ICASSP 2019 - 2019 IEEE
    International Conference on Acoustics, Speech and Signal Processing
    (ICASSP), Brighton, United Kingdom, 2019, pp. 1-5.

    Parameters
    ----------
    window_size : int, optional, default: ``1``
        The number of previous points on which to compute the crest factor
        detrending.

    is_causal : bool, optional, default: ``True``
        Whether the current sample is computed based only on the past or also
        on the future.

    Examples
    --------
    >>> import pandas as pd
    >>> from gtime.feature_extraction import CrestFactorDetrending
    >>> ts = pd.DataFrame([0, 1, 2, 3, 4, 5])
    >>> gnrl_dtr = CrestFactorDetrending(window_size=2)
    >>> gnrl_dtr.fit_transform(ts)
       0__CrestFactorDetrending
    0                       NaN
    1                  1.000000
    2                  0.800000
    3                  0.692308
    4                  0.640000
    5                  0.609756
    """

    def __init__(self, window_size: int = 1, is_causal: bool = True):
        def detrend(signal):
            N = 2
            signal = np.array(signal)
            large_signal_segment = signal ** N
            large_segment_mean = np.sum(large_signal_segment)
            if self.is_causal:
                ref_index = -1
            else:
                ref_index = int(len(signal) / 2)
            small_signal_segment = signal[ref_index] ** N
            return small_signal_segment / large_segment_mean  # (eq. 1)

        super().__init__(detrend)
        self.window_size = window_size
        self.is_causal = is_causal

    @add_class_name
    def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
        """For every row of ``time_series``, compute the moving crest factor
        detrending function of the previous ``window_size`` elements.

        Parameters
        ----------
        time_series : pd.DataFrame, shape (n_samples, 1), required
            The DataFrame on which to compute the rolling moving custom
            function.

        Returns
        -------
        time_series_t : pd.DataFrame, shape (n_samples, 1)
            A DataFrame, with the same length as ``time_series``, containing
            the rolling moving custom function for each element.
        """
        check_is_fitted(self)

        if self.is_causal:
            time_series_mvg_dtr = time_series.rolling(self.window_size).apply(
                self.custom_feature_function, raw=self.raw
            )
        else:
            time_series_mvg_dtr = time_series.rolling(
                self.window_size, min_periods=int(self.window_size / 2)
            ).apply(self.custom_feature_function, raw=self.raw)
            time_series_mvg_dtr = time_series_mvg_dtr.dropna()

        time_series_t = time_series_mvg_dtr
        return time_series_t
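

# Sanity check of the docstring example above (window_size=2, is_causal=True),
# worked by hand: at index 2 the rolling window is [1, 2], so the statistic is
# 2**2 / (1**2 + 2**2) = 4 / 5 = 0.8, matching the 0.800000 shown.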


class SortedDensity(MovingCustomFunction):
    """For each row in ``time_series``, compute the sorted density function of
    the previous ``window_size`` rows. If there are not enough rows, the value
    is ``Nan``. The sorted density measure is defined in (eq. 2) of:
    H. P. Tukuljac, V. Pulkki, H. Gamper, K. Godin, I. J. Tashev and
    N. Raghuvanshi, "A Sparsity Measure for Echo Density Growth in General
    Environments," ICASSP 2019 - 2019 IEEE International Conference on
    Acoustics, Speech and Signal Processing (ICASSP), Brighton, United
    Kingdom, 2019, pp. 1-5.

    Parameters
    ----------
    window_size : int, optional, default: ``1``
        The number of previous points on which to compute the sorted density.

    is_causal : bool, optional, default: ``True``
        Whether the current sample is computed based only on the past or also
        on the future.

    Examples
    --------
    >>> import pandas as pd
    >>> from gtime.feature_extraction import SortedDensity
    >>> ts = pd.DataFrame([0, 1, 2, 3, 4, 5])
    >>> mv_avg = SortedDensity(window_size=2)
    >>> mv_avg.fit_transform(ts)
       0__SortedDensity
    0               NaN
    1          0.500000
    2          0.666667
    3          0.700000
    4          0.714286
    5          0.722222
    """

    def __init__(self, window_size: int = 1, is_causal: bool = True):
        def sorted_density(signal):
            t = np.array(range(len(signal))) + 1
            signal = signal[signal.argsort()[::-1]]
            t = np.reshape(t, signal.shape)
            SD = np.sum(np.multiply(t, signal)) / np.sum(signal)  # (eq. 2)
            SD = SD / (len(signal))
            return SD

        super().__init__(sorted_density)
        self.window_size = window_size
        self.is_causal = is_causal

    @add_class_name
    def transform(self, time_series: pd.DataFrame) -> pd.DataFrame:
        """For every row of ``time_series``, compute the moving sorted density
        function of the previous ``window_size`` elements.

        Parameters
        ----------
        time_series : pd.DataFrame, shape (n_samples, 1), required
            The DataFrame on which to compute the rolling moving custom
            function.

        Returns
        -------
        time_series_t : pd.DataFrame, shape (n_samples, 1)
            A DataFrame, with the same length as ``time_series``, containing
            the rolling moving custom function for each element.
        """
        check_is_fitted(self)

        if self.is_causal:
            time_series_mvg_sd = time_series.rolling(self.window_size).apply(
                self.custom_feature_function, raw=self.raw
            )
        else:
            time_series_mvg_sd = time_series.rolling(
                self.window_size, min_periods=int(self.window_size / 2)
            ).apply(self.custom_feature_function, raw=self.raw)
            time_series_mvg_sd = time_series_mvg_sd.dropna()

        time_series_t = time_series_mvg_sd
        return time_series_t
--------------------------------------------------------------------------------