├── docs
│   ├── source
│   │   ├── getting_started
│   │   │   └── index.rst
│   │   ├── api_reference
│   │   │   ├── index.rst
│   │   │   ├── main_classes.rst
│   │   │   └── validators.rst
│   │   ├── mff_documentation.rst
│   │   ├── contributing.rst
│   │   ├── conf.py
│   │   ├── index.rst
│   │   └── examples.rst
│   ├── Makefile
│   └── make.bat
├── src
│   └── macroframe_forecast
│       ├── __init__.py
│       ├── MFF.py
│       ├── MFF_mixed_frequency.py
│       ├── examples.py
│       └── utils.py
├── .gitignore
├── .github
│   └── workflows
│       ├── tests.yaml
│       ├── documentation.yml
│       └── release.yml
├── .pre-commit-config.yaml
├── pyproject.toml
├── README.md
├── tests
│   ├── test_utils.py
│   └── test_MFF.py
└── LICENSE

/docs/source/getting_started/index.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | .. _getting_started:
 3 | 
 4 | Getting started
 5 | ===============
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    demo_notebook
--------------------------------------------------------------------------------
/docs/source/api_reference/index.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | .. _api_reference:
 3 | 
 4 | API Reference
 5 | =============
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    main_classes
11 |    validators
--------------------------------------------------------------------------------
/src/macroframe_forecast/__init__.py:
--------------------------------------------------------------------------------
1 | from importlib.metadata import version
2 | 
3 | __version__ = version("macroframe_forecast")
4 | 
5 | from .MFF import MFF  # noqa: F401
6 | from .MFF_mixed_frequency import MFF_mixed_freqency  # noqa: F401
--------------------------------------------------------------------------------
/docs/source/api_reference/main_classes.rst:
--------------------------------------------------------------------------------
1 | Main classes
2 | ------------
3 | 
4 | The main entry point of the package is the ``MFF`` class.
5 | 
6 | .. autoclass:: macroframe_forecast.MFF.MFF
7 | 
8 | .. autofunction:: macroframe_forecast.utils.find_strings_to_replace_wildcard
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | dist/
 3 | .git_local_repo_credentials
 4 | 
 5 | # setuptools files
 6 | *.egg-info
 7 | 
 8 | # temporary csv files
 9 | tests/*.csv
10 | 
11 | # Sphinx documentation
12 | docs/build/
13 | 
14 | # MacOS
15 | .DS_Store
16 | 
17 | # pixi environments
18 | .pixi
19 | 
20 | # build files
21 | build/
--------------------------------------------------------------------------------
/docs/source/api_reference/validators.rst:
--------------------------------------------------------------------------------
 1 | Validators
 2 | ----------
 3 | 
 4 | Validators check that the data and constraints have the appropriate shape and content.
 5 | 
 6 | .. autofunction:: mff.validators.can_forecast
 7 | 
 8 | .. autofunction:: mff.validators.is_consistent_shape
 9 | 
10 | .. autofunction:: mff.validators.is_consistent_intercept
--------------------------------------------------------------------------------
/docs/source/mff_documentation.rst:
--------------------------------------------------------------------------------
1 | Macroframework Forecasting Package API
2 | --------------------------------------
3 | 
4 | .. automodule:: macroframe_forecast.MFF
5 |    :members:
6 |    :member-order: groupwise
7 |    :show-inheritance:
8 | 
9 | 
10 | .. automodule:: macroframe_forecast.MFF_mixed_frequency
11 |    :members:
12 |    :member-order: groupwise
13 |    :show-inheritance:
14 | 
15 | 
16 | .. automodule:: macroframe_forecast.utils
17 |    :members:
18 |    :member-order: groupwise
19 |    :show-inheritance:
--------------------------------------------------------------------------------
/.github/workflows/tests.yaml:
--------------------------------------------------------------------------------
 1 | name: tests
 2 | 
 3 | on: [pull_request, workflow_dispatch]
 4 | 
 5 | jobs:
 6 |   tests:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - uses: actions/checkout@v4
10 |       - uses: actions/setup-python@v5
11 |         with:
12 |           python-version: '3.12'
13 |       - name: Install dependencies
14 |         run: |
15 |           pip install pytest
16 |       - name: Install the package
17 |         run: pip install .
18 |       - name: Run tests
19 |         run: |
20 |           # the empty marker expression overrides the default "-m not slow", so slow tests also run in CI
21 |           pytest -m ""
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v2.3.0
 4 |     hooks:
 5 |       - id: check-yaml
 6 |       - id: end-of-file-fixer
 7 |       - id: trailing-whitespace
 8 |   - repo: https://github.com/astral-sh/ruff-pre-commit
 9 |     # Ruff version.
10 |     rev: v0.11.13
11 |     hooks:
12 |       # Run the linter.
13 |       - id: ruff-check
14 |       # Run the formatter.
15 |       - id: ruff-format
16 |   - repo: local
17 |     hooks:
18 |       - id: pytest-check
19 |         name: pytest-check
20 |         entry: pytest
21 |         language: system
22 |         pass_filenames: false
23 |         always_run: true
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/source/contributing.rst:
--------------------------------------------------------------------------------
 1 | Contributing
 2 | ============
 3 | 
 4 | Contributions to the code are welcome!
 5 | 
 6 | For development, it's recommended to install the package in editable mode, so that edits are immediately reflected when testing:
 7 | 
 8 | .. code-block:: shell
 9 | 
10 |    python -m pip install -e .
11 | 
12 | Make sure to install the dependencies in the ``dev`` dependency group of ``pyproject.toml``.
13 | 
14 | It's also recommended to install ``pre-commit``. To set up the git hooks, run the following once:
15 | 
16 | .. code-block:: shell
17 | 
18 |    pre-commit install
19 | 
20 | Note that the pre-commit hook runs the tests, skipping the slow tests.
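
The ``slow`` marker is defined in ``pyproject.toml``, where ``-m not slow`` is applied by default via ``addopts``. A quick sketch of how to include the slow tests when invoking ``pytest`` manually (the second command mirrors what the tests workflow does in CI):

.. code-block:: shell

   pytest -m slow   # run only the tests marked as slow
   pytest -m ""     # run everything, including slow tests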
21 | 
22 | Building documentation
23 | ======================
24 | 
25 | To build/update the documentation, run:
26 | 
27 | .. code-block:: shell
28 | 
29 |    sphinx-build -M html docs/source/ docs/build/
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
--------------------------------------------------------------------------------
/.github/workflows/documentation.yml:
--------------------------------------------------------------------------------
 1 | name: documentation
 2 | 
 3 | on: [push, pull_request, workflow_dispatch]
 4 | 
 5 | permissions:
 6 |   contents: write
 7 | 
 8 | jobs:
 9 |   docs:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - uses: actions/setup-python@v5
14 |         with:
15 |           python-version: '3.12'
16 |       - name: Install dependencies
17 |         run: |
18 |           pip install sphinx sphinx_rtd_theme myst_parser numpydoc pydata_sphinx_theme furo
19 |       - name: Install the package
20 |         run: pip install .
21 |       - name: Sphinx build
22 |         run: |
23 |           sphinx-build docs/source docs_ready
24 |       - name: Deploy to GitHub Pages
25 |         uses: peaceiris/actions-gh-pages@v3
26 |         if: ${{ github.ref == 'refs/heads/main' }}
27 |         with:
28 |           publish_branch: gh-pages
29 |           github_token: ${{ secrets.GITHUB_TOKEN }}
30 |           publish_dir: docs_ready/
31 |           force_orphan: true
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: release
 2 | on: [release]
 3 | 
 4 | jobs:
 5 |   release-build:
 6 |     runs-on: ubuntu-latest
 7 | 
 8 |     steps:
 9 |       - uses: actions/checkout@v4
10 | 
11 |       - uses: actions/setup-python@v5
12 |         with:
13 |           python-version: "3.x"
14 | 
15 |       - name: build release distributions
16 |         run: |
17 |           python -m pip install build
18 |           python -m build
19 | 
20 |       - name: upload release dists
21 |         uses: actions/upload-artifact@v4
22 |         with:
23 |           name: release-dists
24 |           path: dist/
25 | 
26 |   pypi-publish:
27 |     runs-on: ubuntu-latest
28 |     needs:
29 |       - release-build
30 |     permissions:
31 |       id-token: write
32 | 
33 |     steps:
34 |       - name: Retrieve release distributions
35 |         uses: actions/download-artifact@v4
36 |         with:
37 |           name: release-dists
38 |           path: dist/
39 | 
40 |       - name: Publish release distributions to PyPI
41 |         uses: pypa/gh-action-pypi-publish@release/v1
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # For the full list of built-in configuration values, see the documentation:
 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 5 | 
 6 | # -- Project information -----------------------------------------------------
 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 8 | 
 9 | project = "macroframe-forecast"
10 | copyright = "2024-2025, IMF"
11 | author = "Ando Sakai, Shuvam Das, Sultan Orazbayev"
12 | release = "0.1.6"
13 | 
14 | 
15 | # -- General configuration ---------------------------------------------------
16 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
17 | extensions = [
18 |     "sphinx.ext.autodoc",
19 |     "numpydoc",
20 |     "sphinx.ext.mathjax",
21 |     "sphinx.ext.napoleon",
22 |     "sphinx.ext.autosummary",
23 |     "sphinx.ext.autosectionlabel",
24 |     "sphinx.ext.intersphinx",
25 |     "sphinx.ext.doctest",
26 | ]
27 | 
28 | templates_path = ["_templates"]
29 | exclude_patterns = ["_build", "**.ipynb_checkpoints", "**.ipynb"]
30 | 
31 | # generate autosummary even if no references
32 | autosummary_generate = True
33 | 
34 | # "members" and "inherited-members" document the methods and attributes defined
35 | # on a class, as well as those it inherits.
36 | # "member-order: bysource" lists members in the order in which they are defined
37 | # in the source code.
38 | autodoc_default_options = {
39 |     "members": True,
40 |     "inherited-members": True,
41 |     "member-order": "bysource",
42 | }
43 | 
44 | # -- Options for HTML output -------------------------------------------------
45 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
46 | 
47 | html_theme = "furo"
48 | html_static_path = []
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "macroframe-forecast"
 3 | version = "0.1.6"
 4 | description = "Macroframework forecasting with accounting identities"
 5 | readme = "README.md"
 6 | requires-python = ">=3.11"
 7 | maintainers = [
 8 |     { name = "Sakai Ando", email = "sando@imf.org" },
 9 |     { name = "Sultan Orazbayev", email = "sorazbayev@imf.org" },
10 | ]
11 | keywords = [
12 |     "macroframework",
13 |     "forecasting",
14 |     "macroeconomic identities",
15 |     "high-dimensional forecasting",
16 |     "econometrics",
17 | ]
18 | # https://pypi.org/classifiers/
19 | classifiers = [
20 |     "Programming Language :: Python :: 3.11",
21 |     "Programming Language :: Python :: 3.12",
22 |     "Programming Language :: Python :: 3 :: Only",
23 |     "Intended Audience :: Developers",
24 |     "Intended Audience :: Other Audience",
25 |     "Intended Audience :: Science/Research",
26 |     "Intended Audience :: Education",
27 |     "Topic :: Scientific/Engineering",
28 |     "Topic :: Scientific/Engineering :: Information Analysis",
29 |     "Topic :: Scientific/Engineering :: Mathematics",
30 | ]
31 | dependencies = [
32 |     "pandas >= 2.2.0",
33 |     "numpy >= 1.26.3",
34 |     "scipy >= 1.12.0",
35 |     "scikit-learn >= 1.4.0",
36 |     "dask[dataframe] >= 2024.8.1",
37 |     "distributed >= 2024.2.0",
38 |     "sktime >= 0.27.0",
39 |     "sympy >= 1.12",
40 |     "cvxpy >= 1.5.3",
41 |     "statsmodels>=0.14.4",
42 |     "matplotlib>=3.10.3",
43 | ]
44 | [project.urls]
45 | homepage = "https://github.com/sakaiando/macroframe-forecast"
46 | repository = "https://github.com/sakaiando/macroframe-forecast"
47 | 
48 | [build-system]
49 | requires = ["hatchling"]
50 | build-backend = "hatchling.build"
51 | 
52 | [dependency-groups]
53 | dev = [
54 |     "furo>=2024.8.6",
55 |     "numpydoc>=1.9.0",
56 |     "pre-commit>=4.2.0",
57 |     "pytest>=8.4.0",
58 |     "ruff>=0.11.13",
59 |     "sphinx>=8.2.3",
60 | ]
61 | 
62 | [tool.pytest.ini_options]
63 | addopts = [
64 |     "--strict-config",  # Force error if config is misspelled
65 |     "--strict-markers",  # Force error if marker is misspelled (must be defined in config)
66 |     "-ra",  # Print summary of all fails/errors
67 |     "-m not slow",  # Skip slow tests
68 | ]
69 | markers = ["slow: Skipped unless '-m slow' passed"]
70 | 
71 | [tool.ruff]
72 | line-length = 120
73 | 
74 | [tool.ruff.lint]
75 | pydocstyle = { convention = "numpy" }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # `macroframe-forecast`: a Python package to assist with macroframework forecasting
2 | 
3 | 
4 | [![!pypi](https://img.shields.io/pypi/v/macroframe-forecast?color=green)](https://pypi.org/project/macroframe-forecast/) [![Downloads](https://static.pepy.tech/personalized-badge/macroframe-forecast?period=total&units=international_system&left_color=grey&right_color=blue&left_text=cumulative%20(pypi))](https://pepy.tech/project/macroframe-forecast)
5 | 
6 | This package is based on the following papers:
 7 | * [A Python Package to Assist Macroframework Forecasting: Concepts and Examples (2025)](https://www.imf.org/en/Publications/WP/Issues/2025/08/29/A-Python-Package-to-Assist-Macroframework-Forecasting-Concepts-and-Examples-570041)
 8 | * [Smooth Forecast Reconciliation (2024)](https://www.imf.org/en/Publications/WP/Issues/2024/03/22/Smooth-Forecast-Reconciliation-546654)
 9 | * [Systematizing Macroframework Forecasting: High-Dimensional Conditional Forecasting with Accounting Identities (2023)](https://link.springer.com/article/10.1057/s41308-023-00225-8)
10 | 
11 | # Documentation
12 | 
13 | Please refer to [this link](https://sakaiando.github.io/macroframe-forecast/) for documentation.
14 | 
15 | # Installation
16 | 
17 | To install the `macroframe-forecast` package, run the following in a terminal:
18 | 
19 | ```shell
20 | pip install macroframe-forecast
21 | ```
22 | 
23 | # Quick start
24 | 
25 | ```python
26 | import numpy as np
27 | import pandas as pd
28 | import matplotlib.pyplot as plt
29 | from macroframe_forecast import MFF
30 | 
31 | # true data
32 | df_true = pd.DataFrame({
33 |     'var1': np.random.randn(30),  # 30 random values from a standard normal distribution
34 |     'var2': np.random.randn(30)
35 | })
36 | df_true['sum'] = df_true['var1'] + df_true['var2']
37 | 
38 | # input dataframe
39 | df = df_true.copy()
40 | fh = 5
41 | df.iloc[-fh:, 1:] = np.nan
42 | 
43 | # apply MFF
44 | m = MFF(df, equality_constraints=['var1_? + var2_? - sum_?'])
45 | df2 = m.fit()
46 | 
47 | # plot results
48 | fig, axes = plt.subplots(3, 1, sharey=True, figsize=(9, 9))
49 | 
50 | axes[0].plot(df2['var2'], label='forecasted var2')
51 | axes[0].plot(df_true['var2'], label='true var2')
52 | axes[0].legend()
53 | 
54 | axes[1].plot(df2['sum'], label='forecasted sum')
55 | axes[1].plot(df_true['sum'], label='true sum')
56 | axes[1].legend()
57 | 
58 | axes[2].plot(df2['var1'] + df2['var2'] - df2['sum'], label='summation error')
59 | axes[2].legend()
60 | ```
61 | 
62 | # Disclaimer
63 | 
64 | Reuse of this tool and IMF information does not imply any endorsement of the research and/or product. Any research presented should not be reported as representing the views of the IMF, its Executive Board, or member governments.
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. module:: macroframe_forecast
 2 | 
 3 | .. Systematic Macroframework Forecasting documentation master file, created by
 4 |    sphinx-quickstart on Mon Feb 12 12:29:05 2024.
 5 |    You can adapt this file completely to your liking, but it should at least
 6 |    contain the root `toctree` directive.
 7 | 
 8 | Documentation for the Macroframework Forecasting package
 9 | ====================================================================
10 | 
11 | This repository contains the Python code for the forecasting method described in:
12 | 
13 | `A Python Package to Assist Macroframework Forecasting: Concepts and Examples (2025) <https://www.imf.org/en/Publications/WP/Issues/2025/08/29/A-Python-Package-to-Assist-Macroframework-Forecasting-Concepts-and-Examples-570041>`_.
14 | 
15 | `Smooth Forecast Reconciliation (2024) <https://www.imf.org/en/Publications/WP/Issues/2024/03/22/Smooth-Forecast-Reconciliation-546654>`_.
16 | 
17 | `Systematizing Macroframework Forecasting: High-Dimensional Conditional Forecasting with Accounting Identities (2023) <https://link.springer.com/article/10.1057/s41308-023-00225-8>`_.
18 | 
19 | Installation
20 | ------------
21 | 
22 | To install the `macroframe-forecast` package, run the following in the terminal/shell:
23 | 
24 | .. code-block:: console
25 | 
26 |    pip install macroframe-forecast
27 | 
28 | 
29 | 
30 | Quick start
31 | -----------
32 | 
33 | The relevant import from `macroframe-forecast` is `MFF`:
34 | 
35 | .. code-block:: python
36 | 
37 |    import numpy as np
38 |    import pandas as pd
39 |    import matplotlib.pyplot as plt
40 |    from macroframe_forecast import MFF
41 | 
42 |    # true data
43 |    df_true = pd.DataFrame({
44 |        'var1': np.random.randn(30),  # 30 random values from a standard normal distribution
45 |        'var2': np.random.randn(30)
46 |    })
47 |    df_true['sum'] = df_true['var1'] + df_true['var2']
48 | 
49 |    # input dataframe
50 |    df = df_true.copy()
51 |    fh = 5
52 |    df.iloc[-fh:, 1:] = np.nan
53 | 
54 |    # apply MFF
55 |    m = MFF(df, equality_constraints=['var1_? + var2_? - sum_?'])
56 |    df2 = m.fit()
57 | 
58 |    # plot results
59 |    fig, axes = plt.subplots(3, 1, sharey=True, figsize=(9, 9))
60 | 
61 |    axes[0].plot(df2['var2'], label='forecasted var2')
62 |    axes[0].plot(df_true['var2'], label='true var2')
63 |    axes[0].legend()
64 | 
65 |    axes[1].plot(df2['sum'], label='forecasted sum')
66 |    axes[1].plot(df_true['sum'], label='true sum')
67 |    axes[1].legend()
68 | 
69 |    axes[2].plot(df2['var1'] + df2['var2'] - df2['sum'], label='summation error')
70 |    axes[2].legend()
71 | 
72 | 
73 | .. toctree::
74 |    :maxdepth: 2
75 |    :caption: Contents:
76 | 
77 |    examples
78 |    mff_documentation
79 |    contributing
80 | 
81 | * :ref:`genindex`
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | from numpy.random import sample
 2 | from pandas import DataFrame, Index, PeriodIndex, Series
 3 | 
 4 | from macroframe_forecast.utils import (
 5 |     expand_wildcard,
 6 |     find_permissible_wildcard,
 7 |     find_strings_to_replace_wildcard,
 8 |     get_freq_of_freq,
 9 | )
10 | 
11 | 
12 | def test_find_permissible_wildcard():
13 |     assert find_permissible_wildcard(["a", "b", "c"], _seed=0) == "m"
14 |     assert find_permissible_wildcard(["a", "b", "c"], _seed=10) == "s"
15 | 
16 | 
17 | def test_find_strings_to_replace_wildcard():
18 |     n = 30
19 |     p = 2
20 |     years = [str(y) for y in range(2000, 2000 + n)]
21 |     df = DataFrame(sample([n, p]), columns=["a", "b"], index=years)
22 |     df0_stacked = df.T.stack()
23 |     all_cells_index = df0_stacked.index
24 |     var_list = Series([f"{a}_{b}" for a, b in all_cells_index], index=all_cells_index)
25 |     constraint = "ax + bx"
26 |     wildcard = "x"
27 |     missing_string_list = find_strings_to_replace_wildcard(constraint, var_list, wildcard)
28 |     assert missing_string_list == [f"_{y}" for y in years]
29 | 
30 | 
31 | def test_expand_wildcard():
32 |     import numpy as np
33 |     import pandas as pd
34 | 
35 |     n = 30
36 |     p = 2
37 |     years = [str(y) for y in range(2000, 2000 + n)]
38 |     df = pd.DataFrame(np.random.sample([n, p]), columns=["a", "b"], index=years)
39 |     df0_stacked = df.T.stack()
40 |     all_cells_index = df0_stacked.index
41 |     var_list = pd.Series([f"{a}_{b}" for a, b in all_cells_index], index=all_cells_index)
42 |     constraints_with_alphabet_wildcard = ["ax + bx"]
43 |     alphabet_wildcard = "x"
44 |     constraints = expand_wildcard(constraints_with_alphabet_wildcard, var_list=var_list, wildcard=alphabet_wildcard)
45 |     assert constraints == [f"a_{y} + b_{y}" for y in years]
46 | 
47 | 
48 | def test_get_freq_of_freq_quarterly():
49 |     years = [2000, 2000, 2001]
50 |     quarters = [1, 2, 4]
51 | 
52 |     test_index = PeriodIndex.from_fields(year=years, quarter=quarters)
53 | 
54 |     assert get_freq_of_freq(test_index, "Y").equals(Index(years, dtype="int64"))
55 |     assert get_freq_of_freq(test_index, "Q").equals(Index(quarters, dtype="int64"))
56 | 
57 | 
58 | def test_get_freq_of_freq_datetime():
59 |     years = [2000, 2000, 2001]
60 | months = [1, 6, 11] 61 | days = [3, 10, 10] 62 | hours = [4, 6, 10] 63 | minutes = [5, 10, 30] 64 | seconds = [1, 4, 5] 65 | 66 | test_index_2 = PeriodIndex.from_fields( 67 | year=years, month=months, day=days, hour=hours, minute=minutes, second=seconds, freq="s" 68 | ) 69 | assert get_freq_of_freq(test_index_2, "M").equals(Index(months, dtype="int64")) 70 | assert get_freq_of_freq(test_index_2, "W").equals(Index([1, 23, 45], dtype="int32")) 71 | assert get_freq_of_freq(test_index_2, "D").equals(Index(days, dtype="int32")) 72 | assert get_freq_of_freq(test_index_2, "H").equals(Index(hours, dtype="int32")) 73 | assert get_freq_of_freq(test_index_2, "T").equals(Index(minutes, dtype="int32")) 74 | assert get_freq_of_freq(test_index_2, "S").equals(Index(seconds, dtype="int32")) 75 | -------------------------------------------------------------------------------- /tests/test_MFF.py: -------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 5 | 6 | 7 | from string import ascii_uppercase 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from pytest import mark 12 | 13 | from macroframe_forecast import MFF, MFF_mixed_freqency 14 | 15 | # %% 16 | 17 | 18 | @mark.slow 19 | def test_MFF_non_parallel(): 20 | n = 30 21 | p = 3 22 | fh = 1 23 | df_true = pd.DataFrame( 24 | np.random.rand(n, p), 25 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 26 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 27 | ) 28 | df_true.iloc[:, -1] = df_true.iloc[:, :-1].sum(axis=1) 29 | df = df_true.copy() 30 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 31 | df.iloc[-1, 0] = df_true.iloc[-1, 0] # island 32 | equality_constraints = ["A0?+B0?-C0?"] 33 | 34 | m = MFF(df, equality_constraints=equality_constraints, parallelize=False) 35 | df2 = m.fit() 36 | 37 | assert df2.iloc[-1, 0] == df_true.iloc[-1, 0] 38 | 39 | 40 | @mark.slow 41 | def test_MFF_parallel(): 42 | n = 30 43 | p = 3 44 | fh = 1 45 | df_true = pd.DataFrame( 46 | np.random.rand(n, p), 47 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 48 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 49 | ) 50 | df_true.iloc[:, -1] = df_true.iloc[:, :-1].sum(axis=1) 51 | df = df_true.copy() 52 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 53 | df.iloc[-1, 0] = df_true.iloc[-1, 0] # island 54 | 55 | equality_constraints = ["A0?+B0?-C0?"] 56 | 57 | m = MFF(df, equality_constraints=equality_constraints, parallelize=True) 58 | df2 = m.fit() 59 | 60 | assert df2.iloc[-1, 0] == df_true.iloc[-1, 0] 61 | 62 | 63 | @mark.slow 64 | def test_MFF_mixed_frequency(): 65 | import warnings 66 | 67 | warnings.filterwarnings("ignore", category=UserWarning) 68 | 69 | n = 120 70 | p = 3 71 | fhA = 5 72 | fhQ = 7 73 | dfQ_true = pd.DataFrame( 74 | np.random.rand(n, p), 75 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 76 | index=pd.period_range(start="2000-1-1", periods=n, freq="Q"), 77 | ) 78 | dfQ_true.iloc[:, -1] = dfQ_true.iloc[:, :-1].sum(axis=1) 79 | dfA_true = dfQ_true.groupby(dfQ_true.index.year).sum() 80 | dfA_true.index = pd.PeriodIndex(dfA_true.index, freq="Y") 81 | 82 | dfA = dfA_true.copy() 83 | dfA.iloc[-fhA:, : np.ceil(p 
/ 2).astype(int)] = np.nan 84 | 85 | dfQ = dfQ_true.iloc[:-12, :].copy() 86 | dfQ.iloc[-fhQ:, : np.ceil(p / 2).astype(int)] = np.nan 87 | 88 | # inputs 89 | df_dict = {"Y": dfA, "Q": dfQ} 90 | constraints_with_wildcard = ["A0?+B0?-C0?", "?Q1+?Q2+?Q3+?Q4-?"] 91 | 92 | mff = MFF_mixed_freqency(df_dict, constraints_with_wildcard=constraints_with_wildcard) 93 | df2_list = mff.fit() 94 | assert ~np.isnan(df2_list[0].iloc[-1, 0]) 95 | 96 | 97 | @mark.slow 98 | def test_small_sample_MFF(): 99 | n = 20 100 | p = 2 101 | fh = 5 102 | df_true = pd.DataFrame( 103 | np.random.rand(n, p), 104 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 105 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 106 | ) 107 | # df_true.iloc[:,-1] = df_true.iloc[:,:-1].sum(axis=1) 108 | df = df_true.copy() 109 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 110 | # df.iloc[-1,0] = df_true.iloc[-1,0] # island 111 | equality_constraints = [] 112 | 113 | m = MFF(df, equality_constraints=equality_constraints, parallelize=False) 114 | df2 = m.fit() 115 | 116 | assert ~np.isnan(df2.iloc[-1, 0]) 117 | 118 | 119 | @mark.slow 120 | def test_inequality_constraints(): 121 | 122 | n = 20 123 | p = 2 124 | fh = 5 125 | df_true = pd.DataFrame( 126 | np.random.rand(n, p), 127 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 128 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 129 | ) 130 | # df_true.iloc[:,-1] = df_true.iloc[:,:-1].sum(axis=1) 131 | df = df_true.copy() 132 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 133 | # df.iloc[-1,0] = df_true.iloc[-1,0] # island 134 | 135 | equality_constraints = [] 136 | 137 | inequality_constraints = [ df.columns[0] + '_' + str(df_true.index[-1]) + ' + 1'] 138 | 139 | m = MFF(df, equality_constraints=equality_constraints, 140 | inequality_constraints = inequality_constraints, 141 | parallelize=False) 142 | df2 = m.fit() 143 | df2.iloc[-1, 0] 144 | 145 | assert (df2.iloc[-1, 0] <= -1) or np.isclose(df2.iloc[-1, 0], -1, atol=1e-12) 146 | 147 | 148 | @mark.slow 149 | def test_equality_constraints(): 150 | 151 | n = 20 152 | p = 2 153 | fh = 5 154 | df_true = pd.DataFrame( 155 | np.random.rand(n, p), 156 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 157 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 158 | ) 159 | # df_true.iloc[:,-1] = df_true.iloc[:,:-1].sum(axis=1) 160 | df = df_true.copy() 161 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 162 | # df.iloc[-1,0] = df_true.iloc[-1,0] # island 163 | 164 | equality_constraints = [df.columns[0] + '_' + str(df_true.index[-1]) + ' + 1'] 165 | 166 | inequality_constraints = [] 167 | 168 | m = MFF(df, equality_constraints=equality_constraints, 169 | inequality_constraints = inequality_constraints, 170 | parallelize=False) 171 | df2 = m.fit() 172 | df2.iloc[-1, 0] 173 | 174 | assert round(df2.iloc[-1, 0],2) == -1 175 | -------------------------------------------------------------------------------- /src/macroframe_forecast/MFF.py: -------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 
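# This module implements the two-step MFF workflow summarized in the MFF class
# docstring below: (1) fill every missing cell with an unconstrained
# first-stage forecast, then (2) reconcile the forecasts so that the
# user-supplied equality/inequality constraints hold while preserving
# smoothness of the forecast paths.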
 5 | 
 6 | 
 7 | import pandas as pd
 8 | from sktime.forecasting.base import BaseForecaster
 9 | 
10 | from macroframe_forecast.utils import (
11 |     AddIslandsToConstraints,
12 |     BreakDataFrameIntoTimeSeriesList,
13 |     CheckTrainingSampleSize,
14 |     DefaultForecaster,
15 |     FillAllEmptyCells,
16 |     GenLamstar,
17 |     GenPredTrueData,
18 |     GenSmoothingMatrix,
19 |     GenVecForecastWithIslands,
20 |     GenWeightMatrix,
21 |     OrganizeCells,
22 |     Reconciliation,
23 |     StringToMatrixConstraints,
24 | )
25 | 
26 | # %% MFF
27 | 
28 | 
29 | class MFF:
30 |     """A class for Macro-Framework Forecasting (MFF).
31 | 
32 |     This class facilitates forecasting of single-frequency time series data
33 |     using a two-step process. The first step of the forecasting procedure
34 |     generates unconstrained forecasts using the specified forecaster. In the
35 |     second step, these forecasts are reconciled so that they satisfy the
36 |     supplied constraints, and smoothness of the forecasts is maintained.
37 | 
38 |     Parameters
39 |     ----------
40 |     df : pd.DataFrame
41 |         Input dataframe containing time series data. Data should be in wide
42 |         format, with each row containing data for one period, and each
43 |         column containing data for one variable.
44 | 
45 |     forecaster : BaseForecaster, optional (default: None)
46 |         sktime BaseForecaster descendant. If not defined, then
47 |         DefaultForecaster is used.
48 | 
49 |     equality_constraints : list[str], optional (default: [])
50 |         Constraints that hold with equality. Constraints may include a
51 |         wildcard, in which case they are applied across all horizons, or
52 |         may be defined for specific time periods.
53 | 
54 |     inequality_constraints : list[str], optional (default: [])
55 |         Inequality constraints, comparable to ``equality_constraints``.
56 |         Constraints may include a wildcard, in which case they are applied
57 |         across all horizons, or may be defined for specific time periods.
58 |         Constraints should be written in the form 'C_ineq*y - d_ineq <= 0'.
59 | 
60 |     parallelize : bool, optional (default: True)
61 |         Indicates whether parallelization should be employed for generating
62 |         the first-step forecasts.
63 | 
64 |     n_forecast_error : int, optional (default: 5)
65 |         Number of windows to split data into training and testing sets for
66 |         generating the matrix of forecast errors.
67 | 
68 |     shrinkage_method : str, optional (default: 'oas')
69 |         Method to be used for shrinking the sample covariance matrix. The
70 |         default is the Oracle Approximating Shrinkage estimator ('oas');
71 |         other options are 'identity' and 'monotone_diagonal'.
72 | 
73 |     default_lam : float, optional (default: -1)
74 |         The value of lambda to be used for calculating the smoothing parameter
75 |         if the frequency of observations cannot be determined from the index
76 |         names. If this is set to -1, lambda is calculated empirically.
77 | 
78 |     max_lam : float, optional (default: 129600)
79 |         Maximum value of lamstar to be used for smoothing forecasts when it
80 |         is estimated empirically.
81 | 
82 |     Returns
83 |     -------
84 |     df2 : pd.DataFrame
85 |         Output dataframe with all reconciled forecasts filled into the
86 |         original input.
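
    Examples
    --------
    A minimal usage sketch, adapted from the README quick start (the variable
    names and the equality constraint are illustrative):

    >>> import numpy as np
    >>> import pandas as pd
    >>> from macroframe_forecast import MFF
    >>> df = pd.DataFrame({"var1": np.random.randn(30), "var2": np.random.randn(30)})
    >>> df["sum"] = df["var1"] + df["var2"]
    >>> df.iloc[-5:, 1:] = np.nan  # last five periods of var2 and sum are to be forecast
    >>> m = MFF(df, equality_constraints=["var1_? + var2_? - sum_?"])
    >>> df2 = m.fit()  # doctest: +SKIP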
87 | 88 | 89 | """ 90 | 91 | def __init__( 92 | self, 93 | df: pd.DataFrame, 94 | forecaster: BaseForecaster | None = None, 95 | equality_constraints: list[str] = [], 96 | inequality_constraints: list[str] = [], 97 | parallelize: bool = True, 98 | n_forecast_error: int = 5, 99 | shrinkage_method: str = "oas", 100 | default_lam: float = -1, 101 | max_lam: float = 129600 102 | ): 103 | self.df = df 104 | self.forecaster = forecaster 105 | self.equality_constraints = equality_constraints 106 | self.inequality_constraints = inequality_constraints 107 | self.parallelize = parallelize 108 | self.n_forecast_error = n_forecast_error 109 | self.shrinkage_method = shrinkage_method 110 | self.default_lam = default_lam 111 | self.max_lam = max_lam 112 | 113 | def fit( 114 | self, 115 | ) -> pd.DataFrame: 116 | """ 117 | Fits the model and generates reconciled forecasts for the input 118 | dataframe subject to defined constraints. 119 | """ 120 | 121 | df = self.df 122 | forecaster = self.forecaster 123 | equality_constraints = self.equality_constraints 124 | inequality_constraints = self.inequality_constraints 125 | parallelize = self.parallelize 126 | n_forecast_error = self.n_forecast_error 127 | shrinkage_method = self.shrinkage_method 128 | default_lam = self.default_lam 129 | max_lam = self.max_lam 130 | 131 | # modify inputs into machine-friendly shape 132 | df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df) 133 | 134 | # get constraint matrices 135 | C, d = StringToMatrixConstraints(df0.T.stack(), all_cells, unknown_cells, known_cells, equality_constraints) 136 | C, d = AddIslandsToConstraints(C, d, islands) 137 | C_ineq, d_ineq = StringToMatrixConstraints( 138 | df0.T.stack(), all_cells, unknown_cells, known_cells, inequality_constraints 139 | ) 140 | 141 | # Initiate DefaultForecaster only if a forecaster has not already been defined by the user. 142 | # Use OLS PCA if small_sample is True, and Grid Search if false. 
143 | small_sample: bool = CheckTrainingSampleSize(df0, n_forecast_error) 144 | if forecaster is None: 145 | forecaster = DefaultForecaster(small_sample) 146 | 147 | # 1st stage forecast and its model 148 | df1, df1_model = FillAllEmptyCells(df0, forecaster, parallelize=parallelize) 149 | 150 | # get pseudo out-of-sample prediction, true values, and prediction models 151 | pred, true, model = GenPredTrueData(df0, forecaster, n_forecast_error=n_forecast_error, parallelize=parallelize) 152 | 153 | # break dataframe into list of time series 154 | ts_list, pred_list, true_list = BreakDataFrameIntoTimeSeriesList(df0, df1, pred, true) 155 | 156 | # get parts for reconciliation 157 | y1 = GenVecForecastWithIslands(ts_list, islands) 158 | W, shrinkage = GenWeightMatrix(pred_list, true_list, shrinkage_method=shrinkage_method) 159 | smoothness = GenLamstar(pred_list, true_list, default_lam=default_lam, max_lam=max_lam) 160 | Phi = GenSmoothingMatrix(W, smoothness) 161 | 162 | # 2nd stage reconciled forecast 163 | y2 = Reconciliation(y1, W, Phi, C, d, C_ineq, d_ineq) 164 | 165 | # reshape vector y2 into df2 166 | y2 = y2.T.stack(future_stack=True) 167 | y2.index = y2.index.droplevel(level=0) 168 | df2 = df0.copy() 169 | df2.update(y2, overwrite=False) # fill only nan cells of df0 170 | 171 | self.df0 = df0 172 | self.C = C 173 | self.d = d 174 | self.C_ineq = C_ineq 175 | self.d_ineq = d_ineq 176 | self.islands = islands 177 | 178 | self.df1 = df1 179 | self.df1_model = df1_model 180 | 181 | self.pred = pred 182 | self.true = true 183 | self.model = model 184 | self.ts_list = ts_list 185 | self.pred_list = pred_list 186 | self.true_list = true_list 187 | self.y1 = y1 188 | self.W = W 189 | self.Phi = Phi 190 | self.shrinkage = shrinkage 191 | self.smoothness = smoothness 192 | 193 | self.y2 = y2 194 | self.df2 = df2 195 | 196 | return self.df2 197 | -------------------------------------------------------------------------------- /src/macroframe_forecast/MFF_mixed_frequency.py: -------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 
5 | 6 | # Mix-frequency is not working properly yet, waiting for Pandas to fix error: https://github.com/pandas-dev/pandas/issues/59775 7 | 8 | import pandas as pd 9 | 10 | from .utils import ( 11 | BreakDataFrameIntoTimeSeriesList, 12 | ConcatMixFreqMultiIndexSeries, 13 | DefaultForecaster, 14 | FillAllEmptyCells, 15 | GenLamstar, 16 | GenPredTrueData, 17 | GenSmoothingMatrix, 18 | GenVecForecastWithIslands, 19 | GenWeightMatrix, 20 | OrganizeCells, 21 | Reconciliation, 22 | StringToMatrixConstraints, 23 | get_freq_of_freq, 24 | ) 25 | 26 | class MFF_mixed_freqency: 27 | def __init__( 28 | self, df_dict, forecaster=DefaultForecaster(), constraints_with_wildcard=[], ineq_constraints_with_wildcard=[] 29 | ): 30 | self.df_dict = df_dict 31 | self.forecaster = forecaster 32 | self.constraints_with_wildcard = constraints_with_wildcard 33 | self.ineq_constraints_with_wildcard = ineq_constraints_with_wildcard 34 | 35 | def fit(self): 36 | df_dict = self.df_dict 37 | forecaster = self.forecaster 38 | constraints_with_wildcard = self.constraints_with_wildcard 39 | # TODO: delete, the assignment below, if not needed 40 | ineq_constraints_with_wildcard = self.ineq_constraints_with_wildcard # noqa: F841 41 | 42 | # create constraints 43 | freq_order = ["Y", "Q", "M", "W", "D", "H", "T", "S"] 44 | lowest_freq = freq_order[min([freq_order.index(k) for k in df_dict.keys()])] 45 | 46 | df0_list = [] 47 | all_cells_list = [] 48 | unknown_cells_list = [] 49 | known_cells_list = [] 50 | islands_list = [] 51 | for k in df_dict.keys(): 52 | df0_k, all_cells_k, unknown_cells_k, known_cells_k, islands_k = OrganizeCells(df_dict[k]) 53 | df0_list.append(df0_k) 54 | all_cells_list.append(all_cells_k) 55 | unknown_cells_list.append(unknown_cells_k) 56 | known_cells_list.append(known_cells_k) 57 | islands_list.append(islands_k) 58 | 59 | df0_stacked = ConcatMixFreqMultiIndexSeries([df0.T.stack() for df0 in df0_list], axis=0) 60 | all_cells = pd.concat(all_cells_list, axis=0) 61 | unknown_cells = pd.concat(unknown_cells_list, axis=0) 62 | known_cells = pd.concat(known_cells_list, axis=0) 63 | islands = pd.concat(islands_list, axis=0) 64 | 65 | C, d = StringToMatrixConstraints(df0_stacked, all_cells, unknown_cells, known_cells, constraints_with_wildcard) 66 | 67 | # combine all frequncies into the lowest frequency dataframe 68 | df0wide_list = [] 69 | df0wide_colflat_list = [] 70 | for df in df0_list: 71 | df0 = df.copy() # don't want to change df0_list 72 | df0_freq = df0.index.freqstr[0] 73 | 74 | if df0_freq == lowest_freq: 75 | df0wide_freq = df0.copy() 76 | df0wide_colfat_freq = pd.Series(df0wide_freq.columns, index=df0wide_freq.columns) 77 | 78 | else: 79 | index_freq = df0.index.asfreq(lowest_freq) 80 | col_freq = df0_freq + get_freq_of_freq(df0.index, df0_freq).astype(str) 81 | df0.index = pd.MultiIndex.from_arrays([index_freq, col_freq]) 82 | df0wide_freq = df0.unstack() 83 | df0wide_colfat_freq = pd.Series(df0wide_freq.columns.map("_".join), index=df0wide_freq.columns) 84 | 85 | df0wide_list.append(df0wide_freq) 86 | df0wide_colflat_list.append(df0wide_colfat_freq) 87 | 88 | df0wide = pd.concat(df0wide_list, axis=1) 89 | # TODO: delete, the assignment below, if not needed 90 | df0wide_col = df0wide.columns # noqa: F841 91 | df0wide_colflat = pd.concat(df0wide_colflat_list) 92 | 93 | # 1st step forecast 94 | df0wide.columns = df0wide_colflat.values.tolist() # colname has to be single index 95 | df1wide, df1wide_model = FillAllEmptyCells(df0wide, forecaster) 96 | predwide, truewide, modelwide = 
GenPredTrueData(df0wide, forecaster) 97 | 98 | # get df1_list by breaking wide dataframe into different frequencies 99 | df1_list = [] 100 | for df0i, df0 in enumerate(df0_list): 101 | if df0.index.freqstr[0] == lowest_freq: 102 | df1_freq = df0.copy() 103 | df1_freq.update(df1wide.loc[:, df0wide_colflat_list[df0i].values]) 104 | else: 105 | df1wide_freq = df1wide.loc[:, df0wide_colflat_list[df0i].values] 106 | df1wide_freq.columns = pd.MultiIndex.from_tuples(df0wide_colflat_list[df0i].index) 107 | df1_freq = df0wide_list[df0i].copy().stack(future_stack=True) # storage 108 | df1_freq.update(df1wide_freq.stack(future_stack=True)) 109 | df1_freq.index = df0_list[df0i].index 110 | 111 | df1_list.append(df1_freq) 112 | 113 | # get pred_list, true_list by breaking dataframes into different frequencies 114 | pred_allfreq = [] 115 | true_allfreq = [] 116 | for df0i, df0 in enumerate(df0_list): 117 | # get nan cells 118 | df0wide_freq = df0wide_list[df0i].copy() 119 | df0wide_freq.columns = df0wide_colflat_list[df0i].values 120 | na_cells = df0wide_freq.isna()[df0wide_freq.isna()].T.stack().index 121 | 122 | # slice predwide 123 | pred_freq = predwide.loc[:, na_cells] 124 | true_freq = truewide.loc[:, na_cells] 125 | 126 | if df0.index.freqstr[0] != lowest_freq: 127 | # reshape colname multiindex of (var_freq,lowestfreq) to var_lowestfreqfreq 128 | colflat = pred_freq.columns 129 | var_list = [v[: v.rfind("_")] for v in colflat.get_level_values(0)] 130 | freq_list = [v[v.rfind("_") + 1 :] for v in colflat.get_level_values(0)] 131 | lowest_freq_list = colflat.get_level_values(-1).astype(str) 132 | original_time = pd.PeriodIndex( 133 | [lowest_freq_list[i] + freq_list[i] for i in range(len(colflat))], freq=df0.index.freq 134 | ) 135 | pred_freq_colname = pd.MultiIndex.from_arrays([var_list, original_time]) 136 | pred_freq.columns = pred_freq_colname 137 | true_freq.columns = pred_freq_colname 138 | 139 | # change col order 140 | pred_freq = pred_freq.loc[:, df0.isna()[df0.isna()].T.stack().index] 141 | true_freq = true_freq.loc[:, pred_freq.columns] 142 | 143 | # append pred, true for each frequency 144 | pred_allfreq.append(pred_freq) 145 | true_allfreq.append(true_freq) 146 | 147 | # break dataframes in to lists 148 | ts_list = [] 149 | pred_list = [] 150 | true_list = [] 151 | for df0i, df0 in enumerate(df0_list): 152 | ts_list_freq, pred_list_freq, true_list_freq = BreakDataFrameIntoTimeSeriesList( 153 | df0, df1_list[df0i], pred_allfreq[df0i], true_allfreq[df0i] 154 | ) 155 | 156 | ts_list += ts_list_freq 157 | pred_list += pred_list_freq 158 | true_list += true_list_freq 159 | 160 | # get parts for reconciliation 161 | # islands_list_all_freq = pd.concat(islands_list) 162 | 163 | y1 = GenVecForecastWithIslands(ts_list, islands) 164 | W, shrinkage = GenWeightMatrix(pred_list, true_list) 165 | smoothness = GenLamstar(pred_list, true_list) 166 | Phi = GenSmoothingMatrix(W, smoothness) 167 | 168 | y2 = Reconciliation(y1, W, Phi, C, d) 169 | 170 | # reshape vector y2 into df2 171 | y2 = y2.T.stack(future_stack=True) 172 | y2.index = y2.index.droplevel(level=0) 173 | df2_list = [] 174 | for df0 in df0_list: 175 | df2_freq = df0.copy() 176 | df2_freq.update(y2, overwrite=False) # fill only nan cells of df0 177 | df2_list.append(df2_freq) 178 | 179 | self.df0_list = df0_list 180 | self.df1_list = df1_list 181 | self.df2_list = df2_list 182 | return self.df2_list 183 | -------------------------------------------------------------------------------- /src/macroframe_forecast/examples.py: 
-------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 5 | 6 | 7 | from string import ascii_uppercase 8 | from pandas import DataFrame 9 | import numpy as np 10 | import pandas as pd 11 | from sktime.datasets import load_macroeconomic 12 | 13 | from macroframe_forecast import MFF, MFF_mixed_freqency 14 | 15 | # %% 16 | 17 | 18 | def example1(): # no constraints 19 | # load data 20 | # from sktime.datasets import load_macroeconomic 21 | df_true = load_macroeconomic().iloc[:, :5] 22 | 23 | # input dataframe 24 | df = df_true.copy() 25 | fh = 5 26 | df.iloc[-fh:, 0] = np.nan 27 | 28 | # apply MFF 29 | m = MFF(df, equality_constraints=[]) 30 | df2 = m.fit() 31 | df0 = m.df0 32 | df1 = m.df1 33 | df1_model = m.df1_model 34 | smoothness = m.smoothness 35 | shrinkage = m.shrinkage 36 | 37 | # plot results 38 | t0 = -30 39 | ax = df0.iloc[t0:, 0].plot(label="df0") 40 | df1.iloc[t0:, 0].plot(ax=ax, label="df1") 41 | df2.iloc[t0:, 0].plot(ax=ax, label="df2") 42 | df_true.iloc[t0:, 0].plot(ax=ax, label="df_true") 43 | ax.axvline(x=df0.index[-fh]) 44 | ax.legend() 45 | 46 | print("smoothness", smoothness.values) 47 | print("shrinkage", np.round(shrinkage, 3)) 48 | for ri, ci in np.argwhere(df.isna()): 49 | print(df1_model.index[ri], df1_model.columns[ci], df1_model.iloc[ri, ci].best_params_) 50 | 51 | 52 | # example 2: with constraints 53 | def example2(): 54 | # create data 55 | n = 30 56 | p = 3 57 | fh = 5 58 | df_true = pd.DataFrame( 59 | np.random.rand(n, p), 60 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 61 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 62 | ) 63 | df_true.iloc[:, -1] = df_true.iloc[:, :-1].sum(axis=1) 64 | df = df_true.copy() 65 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 66 | df.iloc[-1, 0] = df_true.iloc[-1, 0] # island 67 | # df.iloc[-fh,-1] = df.iloc[:,-1].mean() 68 | # df.iloc[-3,1] = df_true.iloc[-3,1] # island 69 | equality_constraints = ["A0?+B0?-C0?"] 70 | # ineq_constraints_with_wildcard = ['A0?-0.5'] # A0 <=0.5 for all years 71 | 72 | # fit data 73 | m = MFF(df, equality_constraints=equality_constraints) 74 | df2 = m.fit() 75 | df0 = m.df0 76 | df1 = m.df1 77 | df1_model = m.df1_model 78 | shrinkage = m.shrinkage 79 | smoothness = m.smoothness 80 | # TODO: delete, the assignment below, if not needed 81 | W = m.W # noqa: F841 82 | for ri, ci in np.argwhere(df.isna()): 83 | print(df1_model.index[ri], df1_model.columns[ci], df1_model.iloc[ri, ci].best_params_) 84 | 85 | import matplotlib.pyplot as plt 86 | 87 | plt.figure() 88 | t0 = -20 89 | plt.subplot(2, 1, 1) 90 | ax = df0.iloc[t0:, 0].plot(label="df0") 91 | df1.iloc[t0:, 0].plot(ax=ax, label="df1") 92 | df2.iloc[t0:, 0].plot(ax=ax, label="df2") 93 | df_true.iloc[t0:, 0].plot(ax=ax, label="df_true") 94 | ax.axvline(x=df0.index[-fh]) 95 | 96 | plt.subplot(2, 1, 2) 97 | ax = df0.iloc[t0:, 1].plot(label="df0") 98 | df1.iloc[t0:, 1].plot(ax=ax, label="df1") 99 | df2.iloc[t0:, 1].plot(ax=ax, label="df2") 100 | df_true.iloc[t0:, 1].plot(ax=ax, label="df_true") 101 | ax.axvline(x=df0.index[-fh], label="fh=1") 102 | ax.legend(loc="lower left") 103 | 104 | print("smoothness", smoothness.values) 105 | print("shrinkage", np.round(shrinkage, 3)) 106 | 107 | # confirm 
constraints 108 | assert np.isclose(df2["A0"] + df2["B0"] - df2["C0"], 0).all() 109 | 110 | 111 | # example, mixed-frequency intra-inter-temporal constraints 112 | def example3(): 113 | import warnings 114 | 115 | warnings.filterwarnings("ignore", category=UserWarning) 116 | 117 | n = 120 118 | p = 3 119 | fhA = 5 120 | fhQ = 7 121 | dfQ_true = pd.DataFrame( 122 | np.random.rand(n, p), 123 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 124 | index=pd.period_range(start="2000-1-1", periods=n, freq="Q"), 125 | ) 126 | dfQ_true.iloc[:, -1] = dfQ_true.iloc[:, :-1].sum(axis=1) 127 | dfA_true = dfQ_true.groupby(dfQ_true.index.year).sum() 128 | dfA_true.index = pd.PeriodIndex(dfA_true.index, freq="Y") 129 | 130 | dfA = dfA_true.copy() 131 | dfA.iloc[-fhA:, : np.ceil(p / 2).astype(int)] = np.nan 132 | 133 | dfQ = dfQ_true.iloc[:-12, :].copy() 134 | dfQ.iloc[-fhQ:, : np.ceil(p / 2).astype(int)] = np.nan 135 | 136 | # inputs 137 | df_dict = {"Y": dfA, "Q": dfQ} 138 | constraints_with_wildcard = ["A0?+B0?-C0?", "?Q1+?Q2+?Q3+?Q4-?"] 139 | 140 | mff = MFF_mixed_freqency(df_dict, constraints_with_wildcard=constraints_with_wildcard) 141 | df2_list = mff.fit() 142 | df1_list = mff.df1_list 143 | df0_list = mff.df0_list 144 | 145 | # plot results 146 | import matplotlib.pyplot as plt 147 | 148 | t0 = -30 149 | plt.subplot(2, 1, 1) 150 | ax = df0_list[1].iloc[t0:, 0].plot(label="df0") 151 | df1_list[1].iloc[t0:, 0].plot(ax=ax, label="df1") 152 | df2_list[1].iloc[t0:, 0].plot(ax=ax, label="df2") 153 | dfQ_true.iloc[t0:, 0].plot(ax=ax, label="df_true") 154 | ax.axvline(x=df0_list[1].index[-fhQ], label="fh=1") 155 | ax.legend(loc="lower left") 156 | 157 | plt.subplot(2, 1, 2) 158 | ax = df0_list[0].iloc[t0:, 0].plot(label="df0") 159 | df1_list[0].iloc[t0:, 0].plot(ax=ax, label="df1") 160 | df2_list[0].iloc[t0:, 0].plot(ax=ax, label="df2") 161 | dfA_true.iloc[t0:, 0].plot(ax=ax, label="df_true") 162 | ax.axvline(x=df0_list[0].index[-fhQ], label="fh=1") 163 | ax.legend(loc="lower left") 164 | 165 | # check constraints 166 | df2A = df2_list[0] 167 | df2Q = df2_list[1] 168 | df2A.eval("A0+B0-C0") 169 | (df2Q.resample("Y").sum() - df2A).dropna() 170 | 171 | 172 | def generate_example_GDP_df() -> DataFrame: 173 | """Utility function to generate example GDP data for quick demonstration purposes. 
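    The returned frame has an annual integer index named "year" and a single
    "GDP" column (US nominal GDP, rescaled to trillions of US dollars), with
    the last six years set to NaN so that they can be forecast.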
174 | 175 | Example: 176 | 177 | ```python 178 | from macroframe_forecast import MFF 179 | from macroframe_forecast.examples import generate_example_GDP_df 180 | 181 | df0 = generate_example_GDP_df() 182 | m = MFF(df0, equality_constraints=["GDP_2030 - 1.04 * GDP_2029"]) 183 | m.fit() 184 | ``` 185 | 186 | """ 187 | GDP_data_true = DataFrame( 188 | { 189 | "year": [ 190 | 1950, 191 | 1951, 192 | 1952, 193 | 1953, 194 | 1954, 195 | 1955, 196 | 1956, 197 | 1957, 198 | 1958, 199 | 1959, 200 | 1960, 201 | 1961, 202 | 1962, 203 | 1963, 204 | 1964, 205 | 1965, 206 | 1966, 207 | 1967, 208 | 1968, 209 | 1969, 210 | 1970, 211 | 1971, 212 | 1972, 213 | 1973, 214 | 1974, 215 | 1975, 216 | 1976, 217 | 1977, 218 | 1978, 219 | 1979, 220 | 1980, 221 | 1981, 222 | 1982, 223 | 1983, 224 | 1984, 225 | 1985, 226 | 1986, 227 | 1987, 228 | 1988, 229 | 1989, 230 | 1990, 231 | 1991, 232 | 1992, 233 | 1993, 234 | 1994, 235 | 1995, 236 | 1996, 237 | 1997, 238 | 1998, 239 | 1999, 240 | 2000, 241 | 2001, 242 | 2002, 243 | 2003, 244 | 2004, 245 | 2005, 246 | 2006, 247 | 2007, 248 | 2008, 249 | 2009, 250 | 2010, 251 | 2011, 252 | 2012, 253 | 2013, 254 | 2014, 255 | 2015, 256 | 2016, 257 | 2017, 258 | 2018, 259 | 2019, 260 | 2020, 261 | 2021, 262 | 2022, 263 | 2023, 264 | 2024, 265 | 2025, 266 | 2026, 267 | 2027, 268 | 2028, 269 | 2029, 270 | 2030, 271 | ], 272 | "GDP": [ 273 | 301782704906.154, 274 | 348993057004.926, 275 | 368027835977.609, 276 | 389147698401.843, 277 | 390276672099.46, 278 | 424868331217.657, 279 | 448388356231.708, 280 | 471707274214.225, 281 | 478166880805.205, 282 | 519476064642.104, 283 | 539899866168.654, 284 | 558583293630.287, 285 | 600454646133.34, 286 | 633368190949.311, 287 | 680153540812.135, 288 | 737201978910.734, 289 | 808045440847.441, 290 | 853883822469.0601, 291 | 933096436159.1281, 292 | 1008751520510.61, 293 | 1064366709379.28, 294 | 1155403629216.3, 295 | 1269884411457.22, 296 | 1418456050381.57, 297 | 1536647924378.57, 298 | 1674009506825.93, 299 | 1867242215504.46, 300 | 2079644632633.34, 301 | 2350400768409.49, 302 | 2627325000000.0, 303 | 2857325000000.0, 304 | 3207025000000.0, 305 | 3343800000000.0, 306 | 3634025000000.0, 307 | 4037650000000.0, 308 | 4339000000000.0, 309 | 4579625000000.0, 310 | 4855250000000.0, 311 | 5236425000000.0, 312 | 5641600000000.0, 313 | 5963125000000.0, 314 | 6158125000000.0, 315 | 6520325000000.0, 316 | 6858550000000.0, 317 | 7287250000000.0, 318 | 7639750000000.0, 319 | 8073125000000.0, 320 | 8577550000000.0, 321 | 9062825000000.0, 322 | 9631175000000.0, 323 | 10250950000000.0, 324 | 10581925000000.0, 325 | 10929100000000.0, 326 | 11456450000000.0, 327 | 12217175000000.0, 328 | 13039200000000.0, 329 | 13815600000000.0, 330 | 14474250000000.0, 331 | 14769850000000.0, 332 | 14478050000000.0, 333 | 15048975000000.0, 334 | 15599725000000.0, 335 | 16253950000000.0, 336 | 16880675000000.0, 337 | 17608125000000.0, 338 | 18295000000000.0, 339 | 18804900000000.0, 340 | 19612100000000.0, 341 | 20656525000000.0, 342 | 21539975000000.0, 343 | 21354125000000.0, 344 | 23681175000000.0, 345 | 26006900000000.0, 346 | 27720725000000.0, 347 | 29184900000000.0, 348 | 30507217002511.25, 349 | 31717641479090.75, 350 | 32941710359665.25, 351 | 34342131994149.0, 352 | 35712823521822.0, 353 | 37153089058192.75, 354 | ], 355 | } 356 | ) 357 | 358 | # The original GDP data is in dollar numbers, but changing this to billions 359 | # going forward in order to deal with problem of matrix invertibility. 
360 |     GDP_data_true["GDP"] = GDP_data_true["GDP"] / 1e12
361 | 
362 |     # Time period has to be set as the index. Here year is the time identifier,
363 |     # therefore setting this as the index.
364 |     GDP_data_true.set_index(GDP_data_true["year"], inplace=True)
365 |     GDP_data_true.drop(columns="year", inplace=True)
366 | 
367 |     # Creating a copy which is used for generating the forecasts.
369 |     GDP_data = GDP_data_true.copy()
370 |     # Removing the last six years of data so that they are forecasted by the
371 |     # function.
372 |     GDP_data.iloc[-6:, 0] = np.nan
373 |     return GDP_data
374 | 
--------------------------------------------------------------------------------
/docs/source/examples.rst:
--------------------------------------------------------------------------------
 1 | Examples
 2 | --------
 3 | 
 4 | Single-variable example
 5 | ~~~~~~~~~~~~~~~~~~~~~~~
 6 | 
 7 | 
 8 | .. code-block:: python
 9 | 
10 |     import pandas as pd
11 |     import numpy as np
12 |     from macroframe_forecast import MFF
13 |     import macroframe_forecast
14 |     from string import ascii_uppercase, ascii_lowercase
15 |     from sktime.datasets import load_macroeconomic
16 |     import matplotlib.pyplot as plt
17 | 
18 |     #%% Reading the data and generating forecasts.
19 | 
20 |     # Reading GDP data as a pandas dataframe.
21 |     # This dataframe has two columns: year and GDP. Data from 2025 to 2030 are WEO forecasts.
22 |     from pandas import DataFrame
23 | 
24 |     GDP_data_true = DataFrame({
25 |         "year": [
26 |             1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959,
27 |             1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969,
28 |             1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979,
29 |             1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
30 |             1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
31 |             2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
32 |             2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019,
33 |             2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029,
34 |             2030
35 |         ],
36 |         "GDP": [
37 |             301782704906.154, 348993057004.926, 368027835977.609, 389147698401.843,
38 |             390276672099.46, 424868331217.657, 448388356231.708, 471707274214.225,
39 |             478166880805.205, 519476064642.104, 539899866168.654, 558583293630.287,
40 |             600454646133.34, 633368190949.311, 680153540812.135, 737201978910.734,
41 |             808045440847.441, 853883822469.0601, 933096436159.1281, 1008751520510.61,
42 |             1064366709379.28, 1155403629216.3, 1269884411457.22, 1418456050381.57,
43 |             1536647924378.57, 1674009506825.93, 1867242215504.46, 2079644632633.34,
44 |             2350400768409.49, 2627325000000.0, 2857325000000.0, 3207025000000.0,
45 |             3343800000000.0, 3634025000000.0, 4037650000000.0, 4339000000000.0,
46 |             4579625000000.0, 4855250000000.0, 5236425000000.0, 5641600000000.0,
47 |             5963125000000.0, 6158125000000.0, 6520325000000.0, 6858550000000.0,
48 |             7287250000000.0, 7639750000000.0, 8073125000000.0, 8577550000000.0,
49 |             9062825000000.0, 9631175000000.0, 10250950000000.0, 10581925000000.0,
50 |             10929100000000.0, 11456450000000.0, 12217175000000.0, 13039200000000.0,
51 |             13815600000000.0, 14474250000000.0, 14769850000000.0, 14478050000000.0,
52 |             15048975000000.0, 15599725000000.0, 16253950000000.0, 16880675000000.0,
53 |             17608125000000.0, 18295000000000.0, 18804900000000.0, 19612100000000.0,
54 |             20656525000000.0, 21539975000000.0, 21354125000000.0, 23681175000000.0,
55 |             26006900000000.0, 27720725000000.0, 29184900000000.0, 30507217002511.25,
56 |             31717641479090.75, 32941710359665.25, 34342131994149.0, 35712823521822.0,
57 |             37153089058192.75
58 |         ]
59 |     })
60 | 
61 | 
62 |     # Forecasted GDP growth in 2030 (the last year) is given below.
63 |     final_year_growth = 100 * (GDP_data_true.iloc[-1, 1] / GDP_data_true.iloc[-2, 1] - 1)
64 | 
65 |     # The original GDP data is in dollar terms, but changing this to trillions
66 |     # going forward in order to deal with the problem of matrix invertibility.
67 |     GDP_data_true['GDP'] = GDP_data_true['GDP'] / 1e12
68 | 
69 |     # Time period has to be set as the index. Here year is the time identifier,
70 |     # therefore setting this as the index.
71 |     GDP_data_true.set_index(GDP_data_true['year'], inplace=True)
72 |     GDP_data_true.drop(columns='year', inplace=True)
73 | 
74 |     # Creating a copy which is used for generating the forecasts.
76 |     GDP_data = GDP_data_true.copy()
77 |     # Removing the last six years of data so that they are forecasted by the
78 |     # function.
79 |     GDP_data.iloc[-6:, 0] = np.nan
80 | 
81 |     # Now we assume that US GDP grows by 4% from 2029 to 2030, which is given by the
82 |     # WEO forecast. This therefore works as a constraint for the forecasts.
83 |     # The dataframe has GDP in levels terms, therefore the constraint has to be
84 |     # specified in levels terms as well. The constraint can be rewritten in the
85 |     # following steps:
86 |     # GDP_2030/GDP_2029 - 1 = 0.04
87 |     # GDP_2030 = 1.04*GDP_2029
88 |     # GDP_2030 - 1.04*GDP_2029 = 0
89 | 
90 |     # Constraints are to be provided in the form of a list, even when there is
91 |     # only one constraint.
92 |     GDP_constraint = ['GDP_2030 - 1.04*GDP_2029']
93 | 
94 |     m = MFF(df=GDP_data,
95 |             equality_constraints=GDP_constraint,
96 |             parallelize=False)
97 | 
98 |     # Using the fit method generates first as well as second step forecasts.
99 |     m.fit()
100 | 
101 |     # First step forecasts are stored as df1 in the fitted object.
102 |     firststep_GDP = m.df1
103 | 
104 |     # The reconciled second-step forecasts are stored as df2 in the fitted object.
105 |     reconciled_GDP = m.df2
106 | 
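    # Quick sanity check (an illustrative addition, not required for the workflow):
    # reconciliation enforces the equality constraint, so the reconciled path
    # should grow by approximately 4% from 2029 to 2030.
    implied_growth = 100 * (reconciled_GDP.loc[2030, 'GDP'] / reconciled_GDP.loc[2029, 'GDP'] - 1)
    print(f"Reconciled 2029-2030 growth: {implied_growth:.2f}%")  # expected ~4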
107 |     # Models used for each cell are stored in a dataframe in the fitted object.
108 | 
109 |     models_used = m.df1_model
110 |     models_used.iloc[-1, 0]  # inspect the model used for the last forecasted cell
111 |     #%% Plotting first and second step forecasts
112 |     fig, ax = plt.subplots(figsize=(8, 4.8))
113 | 
114 |     firststep_GDP['GDP'].plot(ax=ax, label='First-step forecasts', linestyle='--')
115 |     reconciled_GDP['GDP'].plot(ax=ax, label='Final forecasts', linestyle='-.')
116 |     GDP_data['GDP'].plot(ax=ax, label='Known values', color='red')
117 | 
118 |     ax.set_xlabel('Year')
119 |     ax.set_ylabel('US Nominal GDP (in US$ trn)')
120 |     ax.set_title('US GDP in levels')
121 |     ax.legend(loc='lower left')
122 | 
124 |     ax.set_xlim([2020, 2030])
125 |     ax.set_ylim([15, 40])
126 | 
127 |     plt.xticks(np.arange(2020, 2031, 2))
128 | 
129 |     plt.show()
130 | 
131 |     # %%
132 | 
133 |     firststep_GDP['GDP_growth'] = (firststep_GDP['GDP']/firststep_GDP['GDP'].shift(1) - 1)*100
134 |     reconciled_GDP['GDP_growth'] = (reconciled_GDP['GDP']/reconciled_GDP['GDP'].shift(1) - 1)*100
135 |     GDP_data['GDP_growth'] = (GDP_data['GDP']/GDP_data['GDP'].shift(1) - 1)*100
136 | 
137 |     fig, ax = plt.subplots(figsize=(8, 4.8))
138 | 
139 |     firststep_GDP['GDP_growth'].plot(ax=ax, label='First-step forecasts', linestyle='--')
140 |     reconciled_GDP['GDP_growth'].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
141 |     GDP_data['GDP_growth'].plot(ax=ax, label='Known values', color='red')
142 | 
143 |     ax.set_xlabel('Year')
144 |     ax.set_ylabel('Nominal GDP growth (annual, %)')
145 |     ax.set_title('US GDP growth rates')
146 |     ax.legend(loc='upper left')
147 | 
148 |     # Add triangle marker at (2030, 4)
149 |     ax.plot(2030, 4, marker='v', color='black', markersize=8, label='_nolegend_')
150 | 
151 |     # Add text annotation
152 |     ax.annotate('2030 growth constraint', xy=(2030, 4), xytext=(2030-2, 2.5),
153 |                 arrowprops=dict(arrowstyle='->', color='black'), color='black')
154 | 
156 |     ax.set_xlim([2019, 2031])
157 | 
158 |     plt.xticks(np.arange(2020, 2031, 2))
159 | 
160 |     plt.show()
161 | 
162 |     # %% Using externally generated first-stage forecasts
163 | 
164 |     GDP_forecasts_external = pd.DataFrame({"GDP": [29.0, 31.5, 33, 34.1, 36.8, 39]},
165 |                                           index=[2025, 2026, 2027, 2028, 2029, 2030])
166 | 
167 |     # Build MultiIndex using column name
168 |     multi_index = pd.MultiIndex.from_product([[GDP_forecasts_external.columns[0]], GDP_forecasts_external.index],
169 |                                              names=[None, 'year'])
170 | 
171 |     # Flatten the 2D values array to 1D so it aligns with the MultiIndex
172 |     GDP_multiindex_series = pd.Series(GDP_forecasts_external.values.ravel(), index=multi_index)
173 | 
174 | 
175 | 
176 |     W_alt = pd.DataFrame(np.eye(len(multi_index)), index=multi_index, columns=multi_index)  # Create identity matrix with shape (n x n)
177 | 
178 |     smoothness_alt = pd.Series(np.ones(1) * 100, index=[multi_index])
179 | 
180 |     Phi_alt = macroframe_forecast.utils.GenSmoothingMatrix(W_alt, smoothness_alt)
181 | 
182 | 
183 |     final_forecasts = macroframe_forecast.utils.Reconciliation(y1=GDP_multiindex_series,
184 |                                                                W=m.W, Phi=m.Phi,
185 |                                                                C=m.C, d=m.d,
186 |                                                                C_ineq=m.C_ineq,
187 |                                                                d_ineq=m.d_ineq)
188 |     # %%
189 | 
190 |     # Convert MultiIndex Series to regular Series with year index
191 | 
194 |     gdp_series = GDP_multiindex_series.xs('GDP', level=0)
195 |     second_stage_series = final_forecasts.xs('GDP', level=0)
196 | 
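    # Illustrative check (assumes Reconciliation returns the reconciled values so
    # that the .xs() above yields a year-ordered frame): the external forecasts
    # should be adjusted to satisfy the 2030 growth constraint carried in m.C and m.d.
    print(second_stage_series.iloc[-1, 0] / second_stage_series.iloc[-2, 0] - 1)  # ~0.04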
197 |     # Now plot it
198 |     fig, ax = plt.subplots(figsize=(8, 4.8))
199 |     gdp_series.plot(ax=ax, label='Externally generated first-step forecasts', linestyle='--')
200 |     second_stage_series.iloc[:, 0].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
201 | 
202 | 
203 | 
204 |     # Add labels and formatting
205 |     ax.set_xlabel('Year')
206 |     ax.set_ylabel('US Nominal GDP (in US$ trn)')
207 |     ax.set_title('US GDP in levels')
208 |     ax.legend(loc='upper left')
209 |     ax.set_xlim([2024, 2030])
210 |     ax.set_ylim([15, 40])
211 | 
212 | 
213 | Multi-variable example
214 | ~~~~~~~~~~~~~~~~~~~~~~
215 | 
216 | .. code-block:: python
217 | 
218 |     import pandas as pd
219 |     import numpy as np
220 |     from macroframe_forecast import MFF
221 |     import matplotlib.pyplot as plt
222 |     from sktime.forecasting.compose import DirectReductionForecaster
223 |     from sktime.forecasting.compose import ForecastingPipeline
224 |     from sklearn.linear_model import LinearRegression
225 |     from pandas import DataFrame
226 | 
227 |     fiscal_data_true = DataFrame({
228 |         "year": [
229 |             2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
230 |             2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
231 |             2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030
232 |         ],
233 |         "exp": [
234 |             32.801, 33.698, 34.037, 33.719, 33.928, 33.692, 34.562, 37.144, 41.399, 39.763,
235 |             38.796, 37.223, 35.782, 35.324, 35.031, 35.333, 35.194, 35.349, 35.819, 44.779,
236 |             43.218, 36.829, 37.113, 37.593, 37.848, 38.004, 38.107, 38.024, 37.711, 37.862
237 |         ],
238 |         "rev": [
239 |             32.257, 29.877, 29.266, 29.476, 30.853, 31.656, 31.649, 30.532, 28.222, 28.770,
240 |             29.080, 29.109, 31.222, 31.298, 31.501, 30.977, 30.400, 30.014, 30.014, 30.631,
241 |             31.827, 33.130, 29.949, 30.331, 31.389, 32.514, 32.754, 32.409, 32.222, 32.248
242 |         ],
243 |         "int_payments": [
244 |             3.255, 2.892, 2.658, 2.563, 2.704, 2.775, 2.933, 2.776, 2.574, 2.678,
245 |             2.880, 2.726, 2.485, 2.474, 2.341, 2.490, 2.522, 2.769, 2.817, 2.537,
246 |             2.669, 3.137, 3.600, 4.195, 4.301, 4.427, 4.451, 4.370, 4.353, 4.290
247 |         ],
248 |         "pb": [
249 |             2.711, -0.929, -2.113, -1.681, -0.371, 0.739, 0.020, -3.836, -10.603, -8.315,
250 |             -6.836, -5.387, -2.076, -1.552, -1.189, -1.867, -2.272, -2.566, -2.988, -11.610,
251 |             -8.721, -0.561, -3.564, -3.067, -2.158, -1.063, -0.902, -1.246, -1.136, -1.324
252 |         ]
253 |     })
254 | 
255 | 
256 |     # Data up to 2024 are known for all variables; from 2025 onwards the values
257 |     # are WEO forecasts. Let us assume that the path for Primary Balance/GDP is
258 |     # known to the forecaster, which is given by the WEO forecasts, while the
259 |     # other three variables are to be forecasted. These unknown values are
260 |     # therefore replaced by NaNs.
261 | 
262 |     fiscal_data = fiscal_data_true.copy()
263 |     fiscal_data.iloc[-6:, :3] = np.nan
264 | 
265 | 
266 |     # The basic accounting identity can be written as:
267 |     # Primary Balance/GDP = Revenue/GDP - Expenditure/GDP + Interest Payments/GDP
268 |     # We know that this identity has to bind throughout the forecasting horizon, and
269 |     # therefore we can specify this using the wildcard feature.
270 | 
271 |     fiscal_constraint = ['pb? - rev? + exp? - int_payments?',
272 |                          'exp_2030 - 37']
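    # For reference: the '?' wildcard is expanded over the forecast horizon, so the
    # identity above becomes one equality per forecasted year, e.g. for 2025:
    #     pb_2025 - rev_2025 + exp_2025 - int_payments_2025 = 0
    # The second constraint pins expenditure in the final year at 37% of GDP.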
273 | 
274 |     # Defining the OLS forecasting pipeline for the example
275 | 
276 |     ols = ForecastingPipeline(steps=[
277 |         ('ols', DirectReductionForecaster(LinearRegression()))
278 |     ])
279 | 
280 |     m = MFF(df=fiscal_data,
281 |             equality_constraints=fiscal_constraint,
282 |             forecaster=ols,
283 |             parallelize=False)
284 | 
285 |     m.fit()
286 | 
287 |     first_step_forecasts = m.df1
288 |     second_step_forecasts = m.df2
289 | 
291 |     # %% Expenditure forecasts
292 |     fig, ax = plt.subplots(figsize=(8, 4.8))
293 | 
294 |     first_step_forecasts['exp'].plot(ax=ax, label='First-step forecasts', linestyle='--')
295 |     second_step_forecasts['exp'].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
296 |     fiscal_data['exp'].plot(ax=ax, label='WEO values', color='red')
297 | 
298 |     ax.set_xlabel('Year')
299 |     ax.set_ylabel('US Government Expenditure to GDP ratio (%)')
300 |     ax.set_title('Government Expenditure')
301 |     ax.legend(loc='lower left')
302 | 
303 |     ax.plot(2030, 37, marker='v', color='black', markersize=8, label='_nolegend_')
304 | 
305 |     # Add text annotation at the 2030 constraint value (exp_2030 = 37)
306 |     ax.annotate('2030 expenditure constraint value', xy=(2030, 37), xytext=(2027, 40),
307 |                 arrowprops=dict(arrowstyle='->', color='black'), color='black')
308 | 
310 |     ax.set_xlim([2021, 2030])
311 | 
312 |     plt.xticks(np.arange(2021, 2031, 2))
313 | 
314 |     plt.show()
315 |     # %% Revenue forecasts
316 |     fig, ax = plt.subplots(figsize=(8, 4.8))
317 | 
318 |     first_step_forecasts['rev'].plot(ax=ax, label='First-step forecasts', linestyle='--')
319 |     second_step_forecasts['rev'].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
320 |     fiscal_data['rev'].plot(ax=ax, label='WEO values', color='red')
321 | 
322 |     ax.set_xlabel('Year')
323 |     ax.set_ylabel('US Government Revenue to GDP ratio (%)')
324 |     ax.set_title('Government Revenue')
325 |     ax.legend(loc='lower left')
326 | 
328 |     ax.set_xlim([2021, 2030])
329 | 
330 |     plt.xticks(np.arange(2021, 2031, 2))
331 | 
332 |     plt.show()
333 | 
334 |     # %% Interest Payment forecasts
335 |     fig, ax = plt.subplots(figsize=(8, 4.8))
336 | 
337 |     first_step_forecasts['int_payments'].plot(ax=ax, label='First-step forecasts', linestyle='--')
338 |     second_step_forecasts['int_payments'].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
339 |     fiscal_data['int_payments'].plot(ax=ax, label='WEO values', color='red')
340 | 
341 |     ax.set_xlabel('Year')
342 |     ax.set_ylabel('US Government Interest Payments to GDP ratio (%)')
343 |     ax.set_title('Interest Payments')
344 |     ax.legend(loc='lower left')
345 | 
347 |     ax.set_xlim([2021, 2030])
348 | 
349 |     plt.xticks(np.arange(2021, 2031, 2))
350 | 
351 |     plt.show()
352 | 
353 | 
354 | 
355 |     # %% First step primary balance vs.
the constraints 356 | 357 | first_step_forecasts['pb_calculated'] = first_step_forecasts['rev'] - first_step_forecasts['exp'] + first_step_forecasts['int_payments'] 358 | second_step_forecasts['pb_calculated'] = second_step_forecasts['rev'] - second_step_forecasts['exp'] + second_step_forecasts['int_payments'] 359 | 360 | fig, ax = plt.subplots(figsize=(8, 4.8)) 361 | 362 | first_step_forecasts['pb_calculated'].plot(ax=ax, label='First-step forecast', linestyle = '--') 363 | second_step_forecasts['pb_calculated'].plot(ax=ax, label='Second-step forecast', linestyle = '-.') 364 | 365 | # fiscal_data[fiscal_data]['pb'].plot(ax=ax, label='WEO values', linestyle = '-.') 366 | fiscal_data[fiscal_data.index<2024]['pb'].plot(ax = ax, label = 'WEO values', color = 'red') 367 | fiscal_data[fiscal_data.index>=2024]['pb'].plot(ax = ax, label = 'Constraint values', color = 'green', marker = 'o', linestyle = 'None') 368 | 369 | 370 | ax.set_xlabel('Year') 371 | ax.set_ylabel('Primary Balance to GDP ratio (%)') 372 | ax.set_title('Primary Balance') 373 | ax.legend(loc = 'lower left') 374 | 375 | # max_xlastvalue = reconciled_GDP.index.max() 376 | ax.set_xlim([2021, 2030]) 377 | 378 | plt.xticks(np.arange(2021, 2031,2)) 379 | 380 | plt.show() 381 | # %% 382 | 383 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 3.0 IGO 2 | 3 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. THE LICENSOR IS NOT NECESSARILY AN INTERGOVERNMENTAL ORGANIZATION (IGO), AS DEFINED IN THE LICENSE BELOW. 4 | License 5 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE (“LICENSE”). THE LICENSOR (DEFINED BELOW) HOLDS COPYRIGHT AND OTHER RIGHTS IN THE WORK. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE IS PROHIBITED. 6 | 7 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION FOR YOUR ACCEPTANCE AND AGREEMENT TO THE TERMS OF THE LICENSE. 8 | 9 | 1. Definitions 10 | 11 | “IGO” means, solely and exclusively for purposes of this License, an organization established by a treaty or other instrument governed by international law and possessing its own international legal personality. Other organizations established to carry out activities across national borders and that accordingly enjoy immunity from legal process are also IGOs for the sole and exclusive purposes of this License. IGOs may include as members, in addition to states, other entities. 12 | "Work" means the literary and/or artistic work eligible for copyright protection, whatever may be the mode or form of its expression including digital form, and offered under the terms of this License. It is understood that a database, which by reason of the selection and arrangement of its contents constitutes an intellectual creation, is considered a Work. 13 | "Licensor" means the individual, individuals, entity or entities that offer(s) the Work under the terms of this License and may be, but is not necessarily, an IGO. 
14 | "You" means an individual or entity exercising rights under this License. 15 | "License Elements" means the following high-level license attributes as selected by the Licensor and indicated in the title of this License: Attribution, Noncommercial, ShareAlike. 16 | "Reproduce" means to make a copy of the Work in any manner or form, and by any means. 17 | "Distribute" means the activity of making publicly available the Work or Adaptation (or copies of the Work or Adaptation), as applicable, by sale, rental, public lending or any other known form of transfer of ownership or possession of the Work or copy of the Work. 18 | "Publicly Perform" means to perform public recitations of the Work and to communicate to the public those public recitations, by any means or process, including by wire or wireless means or public digital performances; to make available to the public Works in such a way that members of the public may access these Works from a place and at a place individually chosen by them; to perform the Work to the public by any means or process and the communication to the public of the performances of the Work, including by public digital performance; to broadcast and rebroadcast the Work by any means including signs, sounds or images. 19 | "Adaptation" means a work derived from or based upon the Work, or upon the Work and other pre-existing works. Adaptations may include works such as translations, derivative works, or any alterations and arrangements of any kind involving the Work. For purposes of this License, where the Work is a musical work, performance, or phonogram, the synchronization of the Work in timed-relation with a moving image is an Adaptation. For the avoidance of doubt, including the Work in a Collection is not an Adaptation. 20 | "Collection" means a collection of literary or artistic works or other works or subject matter other than works listed in Section 1(b) which by reason of the selection and arrangement of their contents, constitute intellectual creations, in which the Work is included in its entirety in unmodified form along with one or more other contributions, each constituting separate and independent works in themselves, which together are assembled into a collective whole. For the avoidance of doubt, a Collection will not be considered as an Adaptation. 21 | 2. Scope of this License. Nothing in this License is intended to reduce, limit, or restrict any uses free from copyright protection. 22 | 23 | 3. License Grant. Subject to the terms and conditions of this License, the Licensor hereby grants You a worldwide, royalty-free, non-exclusive license to exercise the rights in the Work as follows: 24 | 25 | to Reproduce, Distribute and Publicly Perform the Work, to incorporate the Work into one or more Collections, and to Reproduce, Distribute and Publicly Perform the Work as incorporated in the Collections; and, 26 | to create, Reproduce, Distribute and Publicly Perform Adaptations, provided that You clearly label, demarcate or otherwise identify that changes were made to the original Work. 27 | This License lasts for the duration of the term of the copyright in the Work licensed by the Licensor. The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. 
All rights not expressly granted by the Licensor are hereby reserved, including but not limited to the rights set forth in Section 4(e). 28 | 29 | 4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions: 30 | 31 | You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work (see section 8(a)). You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Work, You may not impose any effective technological measures on the Work that restrict the ability of a recipient of the Work from You to exercise the rights granted to that recipient under the terms of the License. This Section 4(a) applies to the Work as incorporated in a Collection, but this does not require the Collection apart from the Work itself to be made subject to the terms of this License. If You create a Collection, upon notice from a Licensor You must, to the extent practicable, remove from the Collection any credit (inclusive of any logo, trademark, official mark or official emblem) as required by Section 4(d), as requested. If You create an Adaptation, upon notice from a Licensor You must, to the extent practicable, remove from the Adaptation any credit (inclusive of any logo, trademark, official mark or official emblem) as required by Section 4(d), as requested. 32 | You may Distribute or Publicly Perform an Adaptation only under the terms of: (i) this License; (ii) a later version of this License with the same License Elements as this License; or (iii) either the unported Creative Commons license or a ported Creative Commons license (either this or a later license version) containing the same License Elements (the “Applicable License”). (I) You must include a copy of, or the URI for, the Applicable License with every copy of each Adaptation You Distribute or Publicly Perform. (II) You may not offer or impose any terms on the Adaptation that restrict the terms of the Applicable License or the ability of the recipient of the Adaptation to exercise the rights granted to that recipient under the terms of the Applicable License. (III) You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work as included in the Adaptation You Distribute or Publicly Perform. (IV) When You Distribute or Publicly Perform the Adaptation, You may not impose any effective technological measures on the Adaptation that restrict the ability of a recipient of the Adaptation from You to exercise the rights granted to that recipient under the terms of the Applicable License. This Section 4(b) applies to the Adaptation as incorporated in a Collection, but this does not require the Collection apart from the Adaptation itself to be made subject to the terms of the Applicable License. 
33 | You may not exercise any of the rights granted to You in Section 3 above in any manner that is primarily intended for or directed toward commercial advantage or private monetary compensation. The exchange of the Work for other copyrighted works by means of digital file-sharing or otherwise shall not be considered to be primarily intended for or directed toward commercial advantage or private monetary compensation, provided there is no payment of any monetary compensation in connection with the exchange of copyrighted works. 34 | If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) any attributions that the Licensor indicates be associated with the Work as indicated in a copyright notice, (ii) the title of the Work if supplied; (iii) to the extent reasonably practicable, the URI, if any, that the Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and, (iv) consistent with Section 3(b), in the case of an Adaptation, a credit identifying the use of the Work in the Adaptation. The credit required by this Section 4(d) may be implemented in any reasonable manner; provided, however, that in the case of an Adaptation or Collection, at a minimum such credit will appear, if a credit for all contributors to the Adaptation or Collection appears, then as part of these credits and in a manner at least as prominent as the credits for the other contributors. For the avoidance of doubt, You may only use the credit required by this Section for the purpose of attribution in the manner set out above and, by exercising Your rights under this License, You may not implicitly or explicitly assert or imply any connection with, sponsorship or endorsement by the Licensor or others designated for attribution, of You or Your use of the Work, without the separate, express prior written permission of the Licensor or such others. 35 | For the avoidance of doubt: 36 | 37 | Non-waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; 38 | Waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License if Your exercise of such rights is for a purpose or use which is otherwise than noncommercial as permitted under Section 4(c) and otherwise waives the right to collect royalties through any statutory or compulsory licensing scheme; and, 39 | Voluntary License Schemes. To the extent possible, the Licensor waives the right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary licensing scheme. In all other cases the Licensor expressly reserves the right to collect such royalties. 
40 | Except as otherwise agreed in writing by the Licensor, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the honor or reputation of the Licensor where moral rights apply. 41 | 5. Representations, Warranties and Disclaimer 42 | 43 | THE LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. 44 | 45 | 6. Limitation on Liability 46 | 47 | IN NO EVENT WILL THE LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 48 | 49 | 7. Termination 50 | 51 | Subject to the terms and conditions set forth in this License, the license granted here lasts for the duration of the term of the copyright in the Work licensed by the Licensor as stated in Section 3. Notwithstanding the above, the Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated below. 52 | If You fail to comply with this License, then this License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License. Notwithstanding the foregoing, this License reinstates automatically as of the date the violation is cured, provided it is cured within 30 days of You discovering the violation, or upon express reinstatement by the Licensor. For the avoidance of doubt, this Section 7(b) does not affect any rights the Licensor may have to seek remedies for violations of this License by You. 53 | 8. Miscellaneous 54 | 55 | Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License. 56 | Each time You Distribute or Publicly Perform an Adaptation, the Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License. 57 | If any provision of this License is invalid or unenforceable, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. 
58 | No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the Licensor. 59 | This License constitutes the entire agreement between You and the Licensor with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. The Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You. 60 | The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). Interpretation of the scope of the rights granted by the Licensor and the conditions imposed on You under this License, this License, and the rights and conditions set forth herein shall be made with reference to copyright as determined in accordance with general principles of international law, including the above mentioned conventions. 61 | Nothing in this License constitutes or may be interpreted as a limitation upon or waiver of any privileges and immunities that may apply to the Licensor or You, including immunity from the legal processes of any jurisdiction, national court or other authority. 62 | Where the Licensor is an IGO, any and all disputes arising under this License that cannot be settled amicably shall be resolved in accordance with the following procedure: 63 | 64 | Pursuant to a notice of mediation communicated by reasonable means by either You or the Licensor to the other, the dispute shall be submitted to non-binding mediation conducted in accordance with rules designated by the Licensor in the copyright notice published with the Work, or if none then in accordance with those communicated in the notice of mediation. The language used in the mediation proceedings shall be English unless otherwise agreed. 65 | If any such dispute has not been settled within 45 days following the date on which the notice of mediation is provided, either You or the Licensor may, pursuant to a notice of arbitration communicated by reasonable means to the other, elect to have the dispute referred to and finally determined by arbitration. The arbitration shall be conducted in accordance with the rules designated by the Licensor in the copyright notice published with the Work, or if none then in accordance with the UNCITRAL Arbitration Rules as then in force. The arbitral tribunal shall consist of a sole arbitrator and the language of the proceedings shall be English unless otherwise agreed. The place of arbitration shall be where the Licensor has its headquarters. The arbitral proceedings shall be conducted remotely (e.g., via telephone conference or written submissions) whenever practicable. 66 | Interpretation of this License in any dispute submitted to mediation or arbitration shall be as set forth in Section 8(f), above. 67 | Creative Commons Notice 68 | Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. 
Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of the Licensor. 69 | 70 | Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, Creative Commons does not authorize the use by either party of the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time. For the avoidance of doubt, this trademark restriction does not form part of this License. 71 | 72 | Creative Commons may be contacted at https://creativecommons.org/ . 73 | -------------------------------------------------------------------------------- /src/macroframe_forecast/utils.py: -------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 5 | 6 | import copy 7 | import re 8 | import warnings 9 | from random import sample, seed 10 | from string import ascii_lowercase 11 | from time import time 12 | from typing import Literal 13 | 14 | import cvxpy as cp 15 | import numpy as np 16 | import pandas as pd 17 | import scipy 18 | import sympy as sp 19 | from dask.distributed import Client 20 | from numpy import ndarray 21 | from numpy.linalg import inv 22 | from pandas import DataFrame, Index, PeriodIndex, Series 23 | from scipy.linalg import block_diag 24 | from sklearn.decomposition import PCA 25 | from sklearn.linear_model import ElasticNetCV, LinearRegression 26 | from sklearn.model_selection import TimeSeriesSplit 27 | from sklearn.preprocessing import StandardScaler 28 | from sktime.forecasting.base import BaseForecaster 29 | from sktime.forecasting.compose import ( 30 | DirectReductionForecaster, 31 | ForecastingPipeline, 32 | MultiplexForecaster, 33 | TransformedTargetForecaster, 34 | ) 35 | from sktime.forecasting.model_selection import ForecastingGridSearchCV 36 | from sktime.forecasting.naive import NaiveForecaster 37 | from sktime.split import ExpandingGreedySplitter 38 | from sktime.transformations.series.adapt import TabularToSeriesAdaptor 39 | from sktime.transformations.series.feature_selection import FeatureSelection 40 | 41 | # %% 42 | 43 | 44 | def CheckTrainingSampleSize(df0: DataFrame, n_forecast_error: int = 5) -> bool: 45 | """ 46 | Check sample size available for training window. Raise an exception if the 47 | number of observations available is too low. 48 | 49 | Parameters 50 | ---------- 51 | 52 | df0 : pd.DataFrame 53 | Input dataframe with island values replaced by nan. 54 | 55 | n_forecast_error : int 56 | Number of training and testing sets to split data into for generating 57 | matrix of forecast errors. 58 | 59 | Returns 60 | ------- 61 | 62 | small_sample : bool 63 | Indicator for whether the sample of observations available for training 64 | is small. 
65 | 
66 |     """
67 | 
68 |     forecast_horizon = max(np.argwhere(df0.isna())[:, 0]) - min(np.argwhere(df0.isna())[:, 0]) + 1
69 | 
70 |     minimum_training_obs = min(np.argwhere(df0.isna())[:, 0]) - forecast_horizon - n_forecast_error
71 | 
72 |     if minimum_training_obs <= 0:
73 |         raise ValueError(
74 |             "Number of observations too low for given forecast horizon "
75 |             "and n_sample_splits; consider reducing forecast horizon and/or "
76 |             "n_sample_splits"
77 |         )
78 | 
79 |     elif minimum_training_obs <= 15:
80 |         return True
81 | 
82 |     else:
83 |         return False
84 | 
85 | 
86 | def DefaultForecaster(small_sample: bool = False) -> BaseForecaster:
87 |     """
88 |     Set up the forecasting pipeline, specifying the scaling (transformation) to be
89 |     applied and the forecasting model to be used.
90 | 
91 |     Parameters
92 |     ----------
93 |     small_sample : boolean
94 |         Indicator for whether the sample of observations available for training
95 |         is small. By default this is set to False.
96 | 
97 |     Returns
98 |     -------
99 |     gscv : BaseForecaster
100 |         Instance of sktime's grid search forecaster, derived from BaseForecaster,
101 |         which is configured for hyperparameter tuning and model selection. If
102 |         ``small_sample`` is True, a naive last-value forecaster is returned instead.
103 |     """
104 | 
105 |     pipe_y_elasticnet = TransformedTargetForecaster(
106 |         steps=[
107 |             ("scaler", TabularToSeriesAdaptor(StandardScaler())),
108 |             ("forecaster", DirectReductionForecaster(ElasticNetCV(max_iter=5000,
109 |                                                                   cv=TimeSeriesSplit(n_splits=5)),
110 |                                                      window_length=5)),
111 |         ]
112 |     )
113 | 
114 |     pipe_yX_elasticnet = ForecastingPipeline(
115 |         steps=[
116 |             ("scaler", TabularToSeriesAdaptor(StandardScaler())),
117 |             ("pipe_y", pipe_y_elasticnet),
118 |         ]
119 |     )
120 | 
121 |     ols_1feature = ForecastingPipeline(
122 |         steps=[
123 |             ("feature_selection", FeatureSelection(n_columns=1)),
124 |             ("ols", DirectReductionForecaster(LinearRegression())),
125 |         ]
126 |     )
127 | 
128 |     ols_pca = ForecastingPipeline(
129 |         steps=[
130 |             ("pca", TabularToSeriesAdaptor(PCA(n_components=0.9))),
131 |             ("ols", DirectReductionForecaster(LinearRegression())),
132 |         ]
133 |     )
134 | 
135 |     # forecaster representation for selection among the listed models
136 |     forecaster = MultiplexForecaster(
137 |         forecasters=[
138 |             ("naive_drift", NaiveForecaster(strategy="drift", window_length=2)),
139 |             ("naive_last", NaiveForecaster(strategy="last")),
140 |             ("naive_mean", NaiveForecaster(strategy="mean", window_length=5)),
141 |             ("elasticnetcv", pipe_yX_elasticnet),
142 |             ("ols_1feature", ols_1feature),
143 |             ("ols_pca", ols_pca),
144 |         ]
145 |     )
146 | 
147 |     cv = ExpandingGreedySplitter(test_size=1, folds=5)
148 | 
149 |     # If the number of observations is small, grid search is not used for model
150 |     # selection. Instead, a naive last-value forecaster is used.
151 | 
152 |     if not small_sample:
153 |         gscv = ForecastingGridSearchCV(
154 |             forecaster=forecaster,
155 |             cv=cv,
156 |             param_grid={
157 |                 "selected_forecaster": [
158 |                     "naive_drift",
159 |                     "naive_last",
160 |                     "naive_mean",
161 |                     "elasticnetcv",
162 |                     "ols_1feature",
163 |                     "ols_pca",
164 |                 ]
165 |             },
166 |             backend=None,
167 |         )
168 | 
169 |     else:
170 |         gscv = NaiveForecaster(strategy="last")
171 | 
172 |     return gscv
173 | 
174 | 
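# A minimal usage sketch of the two helpers above (illustrative; it assumes an
# annual DataFrame `df0` whose trailing NaNs mark the forecast horizon):
#
#     small_sample = CheckTrainingSampleSize(df0)
#     forecaster = DefaultForecaster(small_sample=small_sample)
#     df1, df1_models = FillAllEmptyCells(df0, forecaster, parallelize=False)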
175 | def CleanIslands(df: DataFrame) -> tuple[DataFrame, Series]:
176 |     """
177 |     Separate island values from input dataframe, replacing them with nan.
178 |     Called by ``OrganizeCells``.
179 | 
180 |     Parameters
181 |     ----------
182 |     df : pd.DataFrame
183 |         Input dataframe with raw data.
184 | 
185 |     Returns
186 |     -------
187 |     df_no_islands : pd.DataFrame
188 |         Dataframe with island values replaced by nan.
189 | 
190 |     islands : pd.Series
191 |         Series containing island values.
192 | 
193 |     Examples
194 |     --------
195 |     >>> import numpy as np
196 |     >>> import pandas as pd
197 |     >>> n = 30
198 |     >>> p = 2
199 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
200 |     ...                   columns=['a','b'],
201 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
202 |     >>> df.iloc[-5:-1,:1] = np.nan
203 |     >>> df0, islands = CleanIslands(df)
204 | 
205 |     """
206 |     df_no_islands = df.copy()  # to keep original df as it is
207 |     col_with_islands = df.columns[df.isna().any()]
208 |     coli_list = [df_no_islands.columns.get_loc(col) for col in col_with_islands]
209 |     for coli in coli_list:  # for col with na
210 |         first_na_index = np.argwhere(df.iloc[:, coli].isna()).min()
211 |         df_no_islands.iloc[first_na_index:, coli] = np.nan
212 | 
213 |     islands: Series = df[df_no_islands.isna()].T.stack()
214 |     return df_no_islands, islands
215 | 
216 | 
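# Example of the "island" notion used above (illustrative): if column 'a' is NaN
# from 2025 onwards except for a known value in 2029, that 2029 value is an
# island. CleanIslands returns it separately and replaces it with NaN, so the
# first-step forecasts ignore it; AddIslandsToConstraints reimposes it later as
# an equality constraint.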
217 | def OrganizeCells(df: DataFrame) -> tuple[DataFrame, Series, Series, Series, Series]:
218 |     """
219 |     Extract island values (if existing) from input dataframe, replacing them
220 |     with nan values. This is useful for generating first step forecasts, which
221 |     disregard known island values for the prediction. Also returns separate
222 |     pandas Series containing the cell names of known and unknown values in the
223 |     input dataframe.
224 | 
225 |     Parameters
226 |     ----------
227 |     df : pd.DataFrame
228 |         Input dataframe with raw data.
229 | 
230 |     Returns
231 |     -------
232 |     df0 : pd.DataFrame
233 |         Dataframe with island values replaced by nan.
234 | 
235 |     all_cells : pd.Series
236 |         Series containing cell names of all cells in the input dataframe.
237 | 
238 |     unknown_cells : pd.Series
239 |         Series containing cell names of cells whose values are to be forecasted.
240 | 
241 |     known_cells : pd.Series
242 |         Series containing cell names of cells whose values are known.
243 | 
244 |     islands : pd.Series
245 |         Series containing island values.
246 | 
247 |     Examples
248 |     --------
249 |     >>> import numpy as np
250 |     >>> import pandas as pd
251 |     >>> n = 30
252 |     >>> p = 2
253 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
254 |     ...                   columns=['a','b'],
255 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
256 |     >>> df.iloc[-5:-1,:1] = np.nan
257 |     >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
258 |     """
259 | 
260 |     # clean islands
261 |     df0, islands = CleanIslands(df)
262 | 
263 |     # all cells in forecast horizon
264 |     all_cells_index = df0.T.stack(future_stack=True).index
265 |     all_cells = pd.Series([f"{a}_{b}" for a, b in all_cells_index], index=all_cells_index)
266 | 
267 |     # unknown cells with nan
268 |     unknown_cells_index = df0.isna()[df0.isna()].T.stack().index
269 |     unknown_cells = pd.Series([f"{a}_{b}" for a, b in unknown_cells_index], index=unknown_cells_index)
270 | 
271 |     # known cells
272 |     known_cells_index = all_cells_index.difference(unknown_cells_index)
273 |     known_cells = pd.Series([f"{a}_{b}" for a, b in known_cells_index], index=known_cells_index)
274 | 
275 |     return df0, all_cells, unknown_cells, known_cells, islands
276 | 
277 | 
278 | def find_permissible_wildcard(constraints_with_wildcard: list[str], _seed: int = 0) -> str:
279 |     """Generate a random letter, not present in the constraints, to be used as the wildcard."""
280 |     wild_card_length = 1
281 |     seed(_seed)
282 |     candidate = "".join(sample(ascii_lowercase, wild_card_length))
283 |     while candidate in "".join(constraints_with_wildcard):
284 |         wild_card_length = wild_card_length + 1
285 |         candidate = "".join(sample(ascii_lowercase, wild_card_length))
286 |     alphabet_wildcard = candidate
287 |     return alphabet_wildcard
288 | 
289 | 
290 | def find_strings_to_replace_wildcard(constraint: str, var_list: Series, wildcard: str) -> list[str]:
291 |     """Identify the list of strings that can be substituted for the wildcard character."""
292 | 
293 |     varlist_regex = ["^" + str(v).replace(wildcard, "(.*)") + "$" for v in sp.sympify(constraint).free_symbols]
294 |     missing_string_set_list = []
295 |     for w in varlist_regex:
296 |         missing_string = []
297 |         for v in var_list:
298 |             match = re.compile(w).search(v)
299 |             if match:
300 |                 missing_string.append(match.group(1))
301 |         missing_string_set_list.append(set(missing_string))
302 |     missing_string_list = list(set.intersection(*missing_string_set_list))
303 |     missing_string_list.sort()
304 | 
305 |     return missing_string_list
306 | 
307 | 
308 | def expand_wildcard(constraints_with_alphabet_wildcard: list[str], var_list: Series, wildcard: str):
309 |     """
310 |     Expand constraints with wildcard to all possible time periods. This is
311 |     called within ``StringToMatrixConstraints``, and the wildcard character
312 |     has already been replaced by a random letter before this function is
313 |     called.
314 | 
315 |     Parameters
316 |     ----------
317 |     constraints_with_alphabet_wildcard : list of str
318 |         Linear equality constraints with the wildcard string replaced
319 |         by letters.
320 |     var_list : list or pd.Series
321 |         Names of all cells (known and unknown) in the raw dataframe.
322 |     wildcard : str
323 |         Letter which has replaced the wildcard string in the constraints.
324 | 
325 |     Returns
326 |     -------
327 |     expanded_constraints : list
328 |         Expanded list of constraints over all time periods.
329 | 
330 |     Examples
331 |     --------
332 |     >>> import numpy as np
333 |     >>> import pandas as pd
334 |     >>> n = 30
335 |     >>> p = 2
336 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
337 |     ...                   columns=['a','b'],
338 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
339 |     >>> df0_stacked = df.T.stack()
340 |     >>> all_cells_index = df0_stacked.index
341 |     >>> var_list = pd.Series([f'{a}_{b}' for a, b in all_cells_index],
342 |     ...                      index=all_cells_index)
343 |     >>> constraints_with_alphabet_wildcard = ['ax + bx']
344 |     >>> alphabet_wildcard = 'x'
345 |     >>> constraints = expand_wildcard(constraints_with_alphabet_wildcard,
346 |     ...                               var_list=var_list,
347 |     ...                               wildcard=alphabet_wildcard)
348 | 
349 |     """
350 |     expanded_constraints = []
351 |     for constraint in constraints_with_alphabet_wildcard:
352 |         if wildcard not in constraint:
353 |             expanded_constraints.append(constraint)
354 |         else:
355 |             missing_string_list = find_strings_to_replace_wildcard(constraint, var_list, wildcard)
356 |             expanded_constraints += [constraint.replace(f"{wildcard}", m) for m in missing_string_list]
357 |     return expanded_constraints
358 | 
359 | 
360 | def StringToMatrixConstraints(
361 |     df0_stacked: Series,  # stacked df0 to accommodate mixed frequency
362 |     all_cells: Series,
363 |     unknown_cells: Series,
364 |     known_cells: Series,
365 |     constraints_with_wildcard: list[str] | None = None,
366 |     wildcard_string: str = "?",
367 | ) -> tuple[DataFrame, DataFrame]:
368 |     """
369 |     Convert equality constraints from list to matrix form for horizons to
370 |     be forecasted (Cy = d, where C and d are dataframes containing the
371 |     linear constraints). The input data should not be in standard wide
372 |     format; instead, all columns should be stacked on one another. This is
373 |     needed to handle the case of mixed frequencies among observations. All
374 |     island values in the input dataframe should be replaced by nan prior
375 |     to this step.
376 | 
377 |     Parameters
378 |     ----------
379 |     df0_stacked : pd.Series
380 |         Stacked version of df0 (input dataframe with islands removed).
381 |     all_cells : pd.Series
382 |         Series containing cell names of all cells in the input dataframe.
383 |     unknown_cells : pd.Series
384 |         Series containing cell names of cells whose values are to be forecasted.
385 |     known_cells : pd.Series
386 |         Series containing cell names of cells whose values are known.
387 |     constraints_with_wildcard : list of str, optional
388 |         Strings specifying equality constraints that have to hold.
389 |         The default is None.
390 |     wildcard_string : str, optional
391 |         String that is used as wildcard identifier in constraints.
392 |         The default is '?'.
393 | 
394 |     Returns
395 |     -------
396 |     C: pd.DataFrame
397 |         Dataframe containing matrix of the linear constraints on the left side of
398 |         equation Cy=d.
399 |     d: pd.DataFrame
400 |         Dataframe containing matrix of the linear constraints on the right side of
401 |         equation Cy=d.
402 | 
403 |     Examples
404 |     --------
405 |     >>> import numpy as np
406 |     >>> import pandas as pd
407 |     >>> n = 30
408 |     >>> p = 2
409 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
410 |     ...                   columns=['a','b'],
411 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
412 |     >>> df.iloc[-5:-1,:1] = np.nan
413 |     >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
414 |     >>> df0_stacked = df0.T.stack()
415 |     >>> constraints_with_wildcard = ['a?+b?']
416 |     >>> C,d = StringToMatrixConstraints(df0_stacked,
417 |     ...                                 all_cells,
418 |     ...                                 unknown_cells,
419 |     ...                                 known_cells,
420 |     ...                                 constraints_with_wildcard)
421 |     """
422 | 
423 |     if constraints_with_wildcard is None:
424 |         constraints_with_wildcard = list()
425 | 
426 |     # replace wildcard with alphabet to utilize sympy
427 |     alphabet_wildcard = find_permissible_wildcard(constraints_with_wildcard)
428 |     constraints_with_alphabet_wildcard = [
429 |         c.replace(wildcard_string, alphabet_wildcard) for c in constraints_with_wildcard
430 |     ]
431 | 
432 |     # expand constraints using all cells at forecast horizon
433 |     constraints = expand_wildcard(
434 |         constraints_with_alphabet_wildcard, var_list=all_cells.tolist(), wildcard=alphabet_wildcard
435 |     )
436 | 
437 |     # obtain C_unknown by differentiating constraints wrt unknown cells with nan
438 |     A, b = sp.linear_eq_to_matrix(constraints, sp.sympify(unknown_cells.tolist()))
439 |     C = pd.DataFrame(np.array(A).astype(float), index=constraints, columns=unknown_cells.index)
440 |     nonzero_rows = (C != 0).any(axis=1)
441 |     C = C.loc[nonzero_rows]  # drop rows with all zeros
442 | 
443 |     # obtain d_unknown by substituting known cells
444 |     known_cell_dict = pd.Series(
445 |         [df0_stacked.loc[idx] for idx in known_cells.index], index=known_cells.tolist()
446 |     ).to_dict()
447 |     d = pd.DataFrame(np.array(b.subs(known_cell_dict)).astype(float), index=constraints)
448 |     d = d.loc[nonzero_rows]  # drop rows with all zeros in C
449 | 
450 |     return C, d
451 | 
452 | 
453 | def AddIslandsToConstraints(C: DataFrame, d: DataFrame, islands: Series) -> tuple[DataFrame, DataFrame]:
454 |     """
455 |     Add island values into the matrix form equality constraints which have been
456 |     constructed by ``StringToMatrixConstraints``.
457 | 
458 |     Parameters
459 |     ----------
460 |     C : pd.DataFrame
461 |         Dataframe containing matrix of the linear constraints on the left side of
462 |         equation Cy=d.
463 |     d : pd.DataFrame
464 |         Dataframe containing matrix of the linear constraints on the right side of
465 |         equation Cy=d.
466 |     islands : pd.Series
467 |         Series containing island values to be introduced into the linear equation.
468 | 
469 |     Returns
470 |     -------
471 |     C_aug : pd.DataFrame
472 |         Dataframe containing the augmented C matrix, with island values incorporated.
473 |     d_aug : pd.DataFrame
474 |         Dataframe containing the augmented d vector, with island values incorporated.
475 | 
476 |     Examples
477 |     --------
478 |     >>> import numpy as np
479 |     >>> import pandas as pd
480 |     >>> n = 30
481 |     >>> p = 2
482 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
483 |     ...                   columns=['a','b'],
484 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
485 |     >>> df.iloc[-5:-1,:1] = np.nan
486 |     >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
487 |     >>> df0_stacked = df0.T.stack()
488 |     >>> constraints_with_wildcard = ['a?+b?']
489 |     >>> C,d = StringToMatrixConstraints(df0_stacked,
490 |     ...                                 all_cells,
491 |     ...                                 unknown_cells,
492 |     ...                                 known_cells,
493 |     ...                                 constraints_with_wildcard)
494 |     >>> C,d = AddIslandsToConstraints(C,d,islands)
495 |     """
496 |     C_aug_index = islands.index.union(C.index, sort=False)  # singleton constraints prioritize over islands
497 |     C_aug = pd.DataFrame(np.zeros([len(C_aug_index), len(C.columns)]), index=C_aug_index, columns=C.columns)
498 |     d_aug = pd.DataFrame(np.zeros([len(C_aug_index), 1]), index=C_aug_index)
499 |     for idx in islands.index:
500 |         C_aug.loc[C_aug.index == idx, idx] = 1
501 |         d_aug.loc[d_aug.index == idx] = islands.loc[idx]
502 |     C_aug.update(C)
503 |     d_aug.update(d)
504 | 
505 |     return C_aug, d_aug
506 | 
507 | 
508 | def FillAnEmptyCell(
509 |     df: DataFrame, row: int | str, col: int | str, forecaster: BaseForecaster
510 | ) -> tuple[float, BaseForecaster]:
511 |     """
512 |     Generate a forecast for a given cell based on the latest known value
513 |     for the given column (variable) and using the predefined forecasting pipeline.
514 |     Called by ``FillAllEmptyCells``.
515 | 
516 |     Parameters
517 |     ----------
518 |     df : pd.DataFrame
519 |         Dataframe containing known values of all variables and nan for
520 |         unknown values.
521 |     row : int or str
522 |         Row index of cell to be forecasted.
523 |     col : int or str
524 |         Column index of cell to be forecasted.
525 |     forecaster : BaseForecaster
526 |         sktime BaseForecaster descendant used to generate the forecast.
527 | 
528 |     Returns
529 |     -------
530 |     y_pred : float
531 |         Forecasted value of the variable for the given horizon.
532 |     forecaster : BaseForecaster
533 |         Fitted clone of the supplied sktime forecaster.
534 | 
535 |     Examples
536 |     --------
537 |     >>> from string import ascii_lowercase
538 |     >>> import numpy as np
539 |     >>> import pandas as pd
540 |     >>> from sklearn.linear_model import ElasticNetCV
541 |     >>> from sktime.forecasting.compose import YfromX
542 |     >>> n = 30
543 |     >>> p = 2
544 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
545 |     ...                   columns=list(ascii_lowercase[:p]),
546 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
547 |     >>> df.iloc[-5:,:1] = np.nan
548 |     >>> row = df.index[-1]
549 |     >>> col = df.columns[0]
550 |     >>> forecaster = YfromX(ElasticNetCV())
551 |     >>> y_pred, forecaster = FillAnEmptyCell(df, row, col, forecaster)
552 |     """
553 |     warnings.filterwarnings("ignore", category=UserWarning)
554 | 
555 |     # clone a forecaster
556 |     f = forecaster.clone()
557 | 
558 |     # last historical data and forecast horizon in num
559 |     T = np.argwhere(df.loc[:, col].isna()).min() - 1
560 |     h = np.where(df.index == row)[0][0] - T
561 | 
562 |     y = df.iloc[:T, :].loc[:, col]
563 | 
564 |     X = df.iloc[: T + h].drop(columns=[col]).dropna(axis=1)
565 |     X_train = X.iloc[:T, :]
566 |     X_pred = X.iloc[T:, :]
567 | 
568 |     y_pred = f.fit(y=y, X=X_train, fh=h).predict(X=X_pred)
569 | 
570 |     return y_pred, f
571 | 
572 | def FillAllEmptyCells(
573 |     df: DataFrame, forecaster: BaseForecaster, parallelize: bool = True
574 | ) -> tuple[DataFrame, DataFrame]:
575 |     """
576 |     Generate forecasts for all unknown cells in the supplied dataframe.
577 |     All forecasts are made independently of each other.
578 | 
579 |     Parameters
580 |     ----------
581 |     df : pd.DataFrame
582 |         Dataframe containing known values of all variables and nan for
583 |         unknown values.
584 | 
585 |     forecaster : BaseForecaster
586 |         sktime BaseForecaster descendant
587 | 
588 |     parallelize : boolean
589 |         Indicate whether parallelization should be employed for generating the
590 |         first step forecasts. Default value is `True`.
591 | 
592 |     Returns
593 |     -------
594 |     df1 : pd.DataFrame
595 |         Dataframe with all known cells, as well as unknown cells filled in by
596 |         one-step forecasts.
597 |     df1_model : pd.DataFrame
598 |         Dataframe with all known cells, with unknown cells containing details
599 |         of the forecaster used for generating the forecast of that cell.
600 | 
601 |     Examples
602 |     --------
603 |     >>> from string import ascii_lowercase
604 |     >>> import numpy as np
605 |     >>> import pandas as pd
606 |     >>> from sklearn.linear_model import ElasticNetCV
607 |     >>> from sktime.forecasting.compose import YfromX
608 |     >>> from macroframe_forecast.utils import FillAllEmptyCells
609 |     >>> n = 30
610 |     >>> p = 2
611 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
612 |     ...                   columns=list(ascii_lowercase[:p]),
613 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
614 |     >>> df.iloc[-5:,:1] = np.nan
615 |     >>> def DefaultForecaster():
616 |     ...     return YfromX(ElasticNetCV(max_iter=5000))
617 |     >>> df1, df1_models = FillAllEmptyCells(df, DefaultForecaster())
618 | 
619 |     """
620 | 
621 |     # get indices of all np.nan cells
622 |     na_cells = [(df.index[rowi], df.columns[coli]) for rowi, coli in np.argwhere(df.isna())]
623 | 
624 |     # apply dask
625 |     if parallelize:
626 |         start = time()
627 |         client = Client()
628 |         df_future = client.scatter(df, broadcast=True)
629 |         forecaster_future = client.scatter(forecaster, broadcast=True)
630 |         futures = [client.submit(FillAnEmptyCell, df_future, row, col, forecaster_future)
631 |                    for (row, col) in na_cells]
632 |         results = client.gather(futures)
633 |         client.close()
634 |         print("Dask filled", len(results), "out-of-sample cells:", round(time() - start, 3), "seconds")
635 | 
636 |     else:
637 |         start = time()
638 |         results = [FillAnEmptyCell(df, row, col, forecaster) for row, col in na_cells]
639 |         print("Forecast", len(results), "cells:", round(time() - start, 3), "seconds")
640 | 
641 |     # fill empty cells
642 |     df1 = df.copy()
643 |     df1_models = df.copy().astype(object)
644 |     for idx, rowcol in enumerate(na_cells):
645 |         df1.loc[rowcol] = results[idx][0].iloc[0]
646 |         df1_models.loc[rowcol] = results[idx][1]
647 | 
648 |     return df1, df1_models
649 | 
650 | 
651 | def GenPredTrueData(
652 |     df: DataFrame, forecaster: BaseForecaster, n_forecast_error: int = 5, parallelize: bool = True
653 | ) -> tuple[DataFrame, DataFrame, DataFrame]:
654 |     """
655 |     Generate in-sample forecasts from existing data by constructing
656 |     pseudo-historical datasets.
657 | 
658 |     Parameters
659 |     ----------
660 |     df : pd.DataFrame
661 |         Dataframe with all known as well as unknown values.
662 |     forecaster : BaseForecaster
663 |         sktime BaseForecaster descendant.
664 |     n_forecast_error : int, optional
665 |         Number of horizons for which in-sample forecasts are generated.
666 |         The default is 5.
667 |     parallelize : boolean, optional
668 |         Indicate whether parallelization should be used. The default is True.
669 | 
670 |     Returns
671 |     -------
672 |     pred : pd.DataFrame
673 |         Dataframe with in-sample predictions generated using pseudo-historical
674 |         datasets.
    true : pd.DataFrame
        Dataframe with actual values of the variable corresponding to the
        predicted values contained in pred.
    model : pd.DataFrame
        Dataframe with information on the models used for generating each
        forecast.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sktime.forecasting.compose import YfromX
    >>> from sklearn.linear_model import ElasticNetCV
    >>> n = 30
    >>> p = 2
    >>> df = pd.DataFrame(np.random.sample([n, p]),
    ...                   columns=['a', 'b'],
    ...                   index=pd.date_range(start='2000', periods=n, freq='YE').year)
    >>> df.iloc[-5:, :1] = np.nan
    >>> def DefaultForecaster():
    ...     return YfromX(ElasticNetCV(max_iter=5000))
    >>> pred, true, model = GenPredTrueData(df, DefaultForecaster(), parallelize=False)
    """

    # position of the last fully observed row (T) and the maximum forecast horizon (h)
    T = min(np.argwhere(df.isna())[:, 0]) - 1
    h = max(np.argwhere(df.isna())[:, 0]) - T

    # create pseudo-historical dataframes: impose the nan pattern, shifted back
    # by h + n periods, onto known data so the "future" values are observable
    df_list = [df.shift(-h - n).mask(df.shift(-h - n).notna(), df).iloc[: -h - n, :] for n in range(n_forecast_error)]

    # unpack all the na cells of the pseudo-historical dataframes into dask tasks
    tasks = [
        (dfi, df_n.index[rowi], df_n.columns[coli])
        for dfi, df_n in enumerate(df_list)
        for (rowi, coli) in np.argwhere(df_n.isna())
    ]

    if parallelize:
        start = time()
        client = Client()
        df_futures = client.scatter(df_list, broadcast=True)
        forecaster_future = client.scatter(forecaster, broadcast=True)
        futures = [client.submit(FillAnEmptyCell, df_futures[dfi], row, col, forecaster_future) for (dfi, row, col) in tasks]
        results = client.gather(futures)
        client.close()
        print("Dask filled", len(results), "in-sample cells:", round(time() - start, 3), "seconds")
    else:
        start = time()
        results = [FillAnEmptyCell(df_list[dfi], row, col, forecaster) for (dfi, row, col) in tasks]
        print("Fill", len(results), "in-sample cells:", round(time() - start, 3), "seconds")

    # repackage the results by filling the na cells of df_list
    filled_list = copy.deepcopy(df_list)
    model_list = [dfl.astype(object) for dfl in copy.deepcopy(df_list)]
    for task_idx, task in enumerate(tasks):
        dfi, row, col = task
        filled_list[dfi].loc[row, col] = results[task_idx][0].iloc[0]
        model_list[dfi].loc[row, col] = results[task_idx][1]

    # reduce the n_forecast_error samples into dataframes: columns index the
    # forecasted cells, rows index the last pseudo-historical period
    colname = df.isna()[df.isna()].T.stack().index
    idxname = pd.Index(
        [df_list[n].index[np.argwhere(df_list[n].isna())[:, 0].min()] for n in range(n_forecast_error)], name="LastData"
    )
    pred = pd.DataFrame(
        [filled_list[n][df_list[n].isna()].T.stack().values for n in range(n_forecast_error)],
        index=idxname,
        columns=colname,
    )
    model = pd.DataFrame(
        [model_list[n][df_list[n].isna()].T.stack().values for n in range(n_forecast_error)],
        index=idxname,
        columns=colname,
    )
    true = pd.DataFrame(
        [df[df_list[n].isna()].T.stack().values for n in range(n_forecast_error)], index=idxname, columns=colname
    )

    return pred, true, model


def BreakDataFrameIntoTimeSeriesList(
    df0: DataFrame, df1: DataFrame, pred: DataFrame, true: DataFrame
) -> tuple[list[Series], list[DataFrame], list[DataFrame]]:
    """Transform relevant dataframes into lists for the ensuing reconciliation step.

    Parameters
    ----------
    df0 : pd.DataFrame
        Dataframe with all known and unknown values, without any islands.
    df1 : pd.DataFrame
        Dataframe with unknown values as well as islands filled in with
        first-step forecasts.
    pred : pd.DataFrame
        Dataframe with in-sample predictions generated using pseudo-historical
        datasets, output from ``GenPredTrueData``.
    true : pd.DataFrame
        Dataframe with actual values of the variable corresponding to the
        predicted values contained in pred.

    Returns
    -------
    ts_list : list
        List containing all first-step out-of-sample forecasts.
    pred_list : list
        List of dataframes, with each dataframe containing in-sample forecasts
        for one variable.
    true_list : list
        List of dataframes, with each dataframe containing the actual values
        for a variable corresponding to the in-sample predictions stored in
        pred_list.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sktime.forecasting.compose import YfromX
    >>> from sklearn.linear_model import ElasticNetCV
    >>> n = 30
    >>> p = 2
    >>> df = pd.DataFrame(np.random.sample([n, p]),
    ...                   columns=['a', 'b'],
    ...                   index=pd.date_range(start='2000', periods=n, freq='YE').year)
    >>> df.iloc[-5:, :1] = np.nan
    >>> def DefaultForecaster():
    ...     return YfromX(ElasticNetCV(max_iter=5000))
    >>> df1, df1_models = FillAllEmptyCells(df, DefaultForecaster())
    >>> pred, true, model = GenPredTrueData(df, DefaultForecaster(), parallelize=False)
    >>> ts_list, pred_list, true_list = BreakDataFrameIntoTimeSeriesList(df, df1, pred, true)
    """
    ts_list = [df1[df0.isna()].loc[:, col:col].dropna().T.stack() for col in df0.columns[df0.isna().any()]]
    pred_list = [pred.loc[:, ts.index] for ts in ts_list]
    true_list = [true.loc[:, ts.index] for ts in ts_list]

    return ts_list, pred_list, true_list


def HP_matrix(size: int) -> ndarray:
    """
    Create the degenerate penta-diagonal matrix used in the HP filter,
    with dimensions (size x size).

    Parameters
    ----------
    size : int
        Number of rows (and columns) of the square matrix.

    Returns
    -------
    F : np.ndarray
        Array containing the F matrix.

    """
    if size >= 2:
        # D is the (size - 2) x size second-difference operator; F = D'D
        D = np.zeros((size - 2, size))
        for i in range(size - 2):
            D[i, i] = 1
            D[i, i + 1] = -2
            D[i, i + 2] = 1
        F = D.T @ D
    elif size == 1:
        F = np.zeros([1, 1])
    else:
        raise ValueError("size must be a positive integer")
    return F


def GenVecForecastWithIslands(ts_list: list[Series], islands: Series) -> Series:
    """Overwrite forecasted values for islands with the known island values.

    Parameters
    ----------
    ts_list : list
        List of all first-step forecasted values.
    islands : pd.Series
        Series containing island values.

    Returns
    -------
    y1 : pd.Series
        Series of forecasted values with island values incorporated.
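
    Notes
    -----
    Island handling relies purely on pandas index alignment. A minimal
    sketch of the behaviour (plain pandas, not package API):

    >>> import pandas as pd
    >>> base = pd.Series([1.0, 2.0],
    ...                  index=pd.MultiIndex.from_tuples([('a', 2024), ('a', 2025)]))
    >>> fixed = pd.Series([9.0], index=pd.MultiIndex.from_tuples([('a', 2025)]))
    >>> base.update(fixed)  # only the ('a', 2025) cell is overwritten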

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sktime.forecasting.compose import YfromX
    >>> from sklearn.linear_model import ElasticNetCV
    >>> n = 30
    >>> p = 2
    >>> df = pd.DataFrame(np.random.sample([n, p]),
    ...                   columns=['a', 'b'],
    ...                   index=pd.date_range(start='2000', periods=n, freq='YE').year)
    >>> df.iloc[-5:-1, :1] = np.nan
    >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
    >>> def DefaultForecaster():
    ...     return YfromX(ElasticNetCV(max_iter=5000))
    >>> df1, df1_models = FillAllEmptyCells(df, DefaultForecaster(), parallelize=False)
    >>> ts_list = [df1[df0.isna()].loc[:, col:col].dropna().T.stack()
    ...            for col in df0.columns[df0.isna().any()]]
    >>> y1 = GenVecForecastWithIslands(ts_list, islands)
    """
    try:
        y1 = pd.concat(ts_list, axis=0)

    except Exception:  # mixed-frequency case: pd.concat can't handle several mixed-frequency series
        y1 = ConcatMixFreqMultiIndexSeries(ts_list, axis=0)

    y1.update(islands)

    return y1


def GenWeightMatrix(
    pred_list: list[DataFrame],
    true_list: list[DataFrame],
    shrinkage_method: Literal["identity", "oas", "oasd", "monotone diagonal"] = "oas",
) -> tuple[DataFrame, float]:
    """
    Generate the weighting matrix based on in-sample forecasts and actual
    values for the corresponding periods.

    Parameters
    ----------
    pred_list : list
        List of dataframes, with each dataframe containing in-sample forecasts
        for one variable.
    true_list : list
        List of dataframes, with each dataframe containing the actual values
        for a variable corresponding to the in-sample predictions stored in
        pred_list.
    shrinkage_method : str, optional
        Algorithm used for shrinking the covariance matrix; one of
        'identity', 'oas', 'oasd' and 'monotone diagonal'. The default is
        'oas'.

    Returns
    -------
    W : pd.DataFrame
        Weighting matrix to be used for reconciliation.
    shrinkage : float
        Shrinkage parameter associated with the weight. NaN in case
        'identity' or 'monotone diagonal' is selected as method.
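
    Notes
    -----
    For ``shrinkage_method='oasd'`` the sample covariance :math:`S` of the
    in-sample forecast errors is shrunk toward its diagonal
    :math:`D = \operatorname{diag}(S)`:

    .. math::

       \phi = \frac{\operatorname{tr}(S^2) - \operatorname{tr}(D^2)}
                   {\operatorname{tr}(S^2) + \operatorname{tr}(S)^2
                    - 2\operatorname{tr}(D^2)},
       \qquad
       \rho = \min\!\Big(\frac{1}{n\,\phi},\, 1\Big),
       \qquad
       W = (1 - \rho)\,S + \rho\,D,

    where :math:`n` is the number of in-sample error observations. The
    ``'oas'`` option delegates to ``sklearn.covariance.OAS``.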

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> pred_list = [pd.DataFrame(np.random.rand(5, 5), columns=[f'Col{i+1}' for i in range(5)]) for _ in range(2)]
    >>> true_list = [pd.DataFrame(np.random.rand(5, 5), columns=[f'Col{i+1}' for i in range(5)]) for _ in range(2)]
    >>> W, shrinkage = GenWeightMatrix(pred_list, true_list)

    """
    fe_list = [pred_list[i] - true_list[i] for i in range(len(pred_list))]

    try:  # fe: sample size x variables
        fe = pd.concat(fe_list, axis=1)

    except Exception:  # mixed-frequency case: pd.concat can't handle several mixed-frequency series
        fe = ConcatMixFreqMultiIndexSeries(fe_list, axis=1)

    # sample covariance of the forecast errors
    n_samp = fe.shape[0]
    n_vars = fe.shape[1]
    sample_cov = fe.cov()

    if shrinkage_method == "identity":
        W = pd.DataFrame(np.eye(sample_cov.shape[0]), index=sample_cov.index, columns=sample_cov.columns)
        return W, np.nan

    if shrinkage_method == "oas":
        from sklearn.covariance import OAS

        oas = OAS().fit(fe.values)
        W = pd.DataFrame(oas.covariance_, index=sample_cov.index, columns=sample_cov.columns)
        rho = oas.shrinkage_
        return W, rho

    if shrinkage_method == "oasd":
        if n_vars >= 2:
            # shrinkage target: the diagonal of the sample covariance
            diag = np.diag(np.diag(sample_cov))

            # shrinkage parameter
            numerator = np.trace(sample_cov @ sample_cov) - np.trace(diag @ diag)
            denominator = np.trace(sample_cov @ sample_cov) + np.trace(sample_cov) ** 2 - 2 * np.trace(diag @ diag)
            phi = numerator / denominator
            rho = min([1 / (n_samp * phi), 1])

            # shrink the covariance matrix
            W = (1 - rho) * sample_cov + rho * diag
        elif n_vars == 1:
            W = sample_cov
            rho = np.nan
        return W, rho

    if shrinkage_method == "monotone diagonal":
        if n_vars >= 2:
            # diagonal weights, forced to be non-decreasing within each variable
            diag = pd.Series(np.diag(sample_cov), index=sample_cov.index)
            W = pd.DataFrame(
                np.diag(diag.groupby(level=0).cummax()), index=sample_cov.index, columns=sample_cov.columns
            )
        elif n_vars == 1:
            W = sample_cov
        return W, np.nan

    raise ValueError(f"Unknown shrinkage_method: {shrinkage_method}")


def GenLamstar(pred_list: list, true_list: list, default_lam: float = -1, max_lam: float = 129600) -> pd.Series:
    """
    Calculate the smoothness parameter (lambda) associated with each variable
    being forecasted.

    Parameters
    ----------
    pred_list : list
        List of dataframes, with each dataframe containing in-sample forecasts
        for one variable.
    true_list : list
        List of dataframes, with each dataframe containing the actual values
        for a variable corresponding to the in-sample predictions stored in
        pred_list.
    default_lam : float, optional
        The value of lambda used when the frequency of observations cannot be
        determined from the index. If set to -1, lambda is calculated
        empirically. The default is -1.
    max_lam : float, optional
        The upper bound of the HP-filter penalty term (lambda) searched by
        the scipy minimizer. The default is 129600.

    Returns
    -------
    lamstar : pd.Series
        Series containing smoothing parameters to be used for each variable.
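
    Notes
    -----
    With the default ``default_lam=-1``, lambda is chosen empirically for
    each variable by numerically minimizing the mean squared distance
    between the actual values and the HP-smoothed predictions,

    .. math::

       \min_{0 \le \lambda \le \bar{\lambda}}\;
       \big\lVert y^{true} - (I + \lambda F)^{-1} y^{pred} \big\rVert^2,

    where :math:`F` is the penalty matrix returned by ``HP_matrix`` and
    :math:`\bar{\lambda}` is ``max_lam``. Otherwise, when the index carries
    frequency information, lambda follows the usual HP-filter scaling rule
    :math:`\lambda = 100\,k^2` with :math:`k` periods per year (100 for
    annual, 1600 for quarterly data), falling back to ``default_lam`` when
    no frequency can be read off the index.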

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> pred_list = [pd.DataFrame(np.random.rand(5, 5), columns=[f'Col{i+1}' for i in range(5)]) for _ in range(2)]
    >>> true_list = [pd.DataFrame(np.random.rand(5, 5), columns=[f'Col{i+1}' for i in range(5)]) for _ in range(2)]
    >>> lamstar = GenLamstar(pred_list, true_list)
    """
    # index of each time series, to deal with mixed frequencies
    tsidx_list = [df.columns for df in pred_list]

    # container for lamstar, labelled by the index of each time series
    try:  # extract frequency info if available
        freq_list = [tsidx.get_level_values(1).freqstr[0] for tsidx in tsidx_list]
        # lambda = 100 for annual data, scaled by (periods per year)^2 otherwise
        ly = 100
        lambda_dict = {
            "Y": ly,
            "Q": ly * (4**2),
            "M": ly * (12**2),
            "W": ly * (52**2),
            "D": ly * (365**2),
            "H": ly * ((365 * 24) ** 2),
            "T": ly * ((365 * 24 * 60) ** 2),
            "S": ly * ((365 * 24 * 60 * 60) ** 2),
        }
        lamstar = pd.Series([float(lambda_dict[item]) for item in freq_list], index=tsidx_list)
    except Exception:
        lamstar = pd.Series(np.ones(len(tsidx_list)) * default_lam, index=tsidx_list)

    # optimal lambda
    if default_lam == -1:

        def loss_fn(x, T, yt, yp):
            return (yt - inv(np.eye(T) + x * HP_matrix(T)) @ yp).T @ (yt - inv(np.eye(T) + x * HP_matrix(T)) @ yp)

        for tsidxi, tsidx in enumerate(tsidx_list):
            y_pred = pred_list[tsidxi]
            y_true = true_list[tsidxi]
            T = len(tsidx)

            # mean of the HP-smoothing loss across the in-sample forecast rounds
            def mean_loss(x):
                return np.mean(
                    [
                        loss_fn(x, T, y_true.iloc[i : i + 1, :].T.values, y_pred.iloc[i : i + 1, :].T.values)
                        for i in range(y_pred.shape[0])
                    ]
                )

            constraint_lb = {"type": "ineq", "fun": lambda lam: lam}  # lambda >= 0

            # lambda <= max_lam; without this, I + xF may be too close to F to invert
            constraint_ub = {"type": "ineq", "fun": lambda lam: -lam + max_lam}
            result = scipy.optimize.minimize(mean_loss, 0, constraints=[constraint_lb, constraint_ub])
            lamstar.iloc[tsidxi] = result.x[0]
    return lamstar


def GenSmoothingMatrix(W: DataFrame, lamstar: Series) -> DataFrame:
    """
    Generate the symmetric smoothing matrix from the optimal lambda and the
    weighting matrix.

    Parameters
    ----------
    W : pd.DataFrame
        Dataframe containing the weighting matrix.
    lamstar : pd.Series
        Series containing smoothing parameters to be used for each variable.

    Returns
    -------
    Phi : pd.DataFrame
        Dataframe containing the smoothing matrix.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> pred_list_1 = [pd.DataFrame(np.random.rand(5, 5),
    ...                             columns=pd.MultiIndex.from_product([['A'], [f'Col{i+1}' for i in range(5)]])) if i == 0 else
    ...                pd.DataFrame(np.random.rand(5, 5),
    ...                             columns=pd.MultiIndex.from_product([['B'], [f'Col{i+1}' for i in range(5)]]))
    ...                for i in range(2)]
    >>> true_list_1 = [pd.DataFrame(np.random.rand(5, 5),
    ...                             columns=pd.MultiIndex.from_product([['A'], [f'Col{i+1}' for i in range(5)]])) if i == 0 else
    ...                pd.DataFrame(np.random.rand(5, 5),
    ...                             columns=pd.MultiIndex.from_product([['B'], [f'Col{i+1}' for i in range(5)]]))
    ...                for i in range(2)]
    >>> smoothness = GenLamstar(pred_list_1, true_list_1)
    >>> full_index = pred_list_1[0].columns.append(pred_list_1[1].columns)
    >>> W = pd.DataFrame(np.eye(10), index=full_index, columns=full_index)
    >>> Phi = GenSmoothingMatrix(W, smoothness)

    """
    lam = lamstar / [np.diag(W.loc[tsidx, tsidx]).min() for tsidx in lamstar.index]
    Phi_np = block_diag(*[lam.iloc[tsidxi] * HP_matrix(len(tsidx)) for tsidxi, tsidx in enumerate(lam.index)])
    Phi = pd.DataFrame(Phi_np, index=W.index, columns=W.columns)
    return Phi


def Reconciliation(
    y1: Series,
    W: DataFrame,
    Phi: DataFrame,
    C: DataFrame,
    d: DataFrame,
    C_ineq: DataFrame | None = None,
    d_ineq: DataFrame | None = None,
) -> DataFrame:
    """
    Reconcile the first-step forecasts so that they satisfy the equality and
    inequality constraints, subject to smoothing.

    Parameters
    ----------
    y1 : pd.Series
        Series of all forecasted and island values.
    W : pd.DataFrame
        Dataframe containing the weighting matrix.
    Phi : pd.DataFrame
        Dataframe containing the smoothing matrix.
    C : pd.DataFrame
        Left-hand-side matrix of the linear equality constraints Cy = d.
    d : pd.DataFrame
        Right-hand side of the linear equality constraints Cy = d.
    C_ineq : pd.DataFrame, optional
        Left-hand-side matrix of the linear inequality constraints
        C_ineq · y - d_ineq ≤ 0. The default is None.
    d_ineq : pd.DataFrame, optional
        Right-hand side of the linear inequality constraints
        C_ineq · y - d_ineq ≤ 0. The default is None.

    Returns
    -------
    y2 : pd.DataFrame
        Dataframe containing the final reconciled forecasts for all variables.
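
    Notes
    -----
    The reconciled forecast solves the smoothed generalized least squares
    problem

    .. math::

       \min_{y}\; (y - y_1)' W^{-1} (y - y_1) + y' \Phi\, y
       \quad \text{s.t.} \quad C y = d,

    whose closed-form solution is evaluated directly when only equality
    constraints are present. If inequality constraints are supplied, the
    same quadratic program is solved numerically with CVXPY.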

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sktime.forecasting.compose import YfromX
    >>> from sklearn.linear_model import ElasticNetCV
    >>> n = 30
    >>> p = 2
    >>> df = pd.DataFrame(np.random.sample([n, p]),
    ...                   columns=['a', 'b'],
    ...                   index=pd.date_range(start='2000', periods=n, freq='YE').year)
    >>> df.iloc[-5:, :1] = np.nan
    >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
    >>> def DefaultForecaster():
    ...     return YfromX(ElasticNetCV(max_iter=5000))
    >>> df1, df1_models = FillAllEmptyCells(df0, DefaultForecaster(), parallelize=False)
    >>> pred, true, model = GenPredTrueData(df0, DefaultForecaster(), parallelize=False)
    >>> ts_list, pred_list, true_list = BreakDataFrameIntoTimeSeriesList(df0, df1, pred, true)
    >>> y1 = pd.concat(ts_list)
    >>> C = pd.DataFrame(columns=y1.index).astype(float)
    >>> d = pd.DataFrame().astype(float)
    >>> W = pd.DataFrame(np.eye(5), index=y1.index, columns=y1.index)
    >>> smoothness = GenLamstar(pred_list, true_list)
    >>> Phi = GenSmoothingMatrix(W, smoothness)
    >>> y2 = Reconciliation(y1, W, Phi, C, d)

    """
    assert (y1.index == W.index).all()
    assert (y1.index == Phi.index).all()
    assert (y1.index == C.columns).all()
    assert (C.index == d.index).all()

    def DropLinDepRows(C_aug, d_aug):
        C = C_aug.values

        # Convert the matrix to a SymPy Matrix
        sympy_matrix = sp.Matrix(C)

        # Compute the RREF of the transpose; its pivot columns are the
        # linearly independent rows of C
        rref_matrix, independent_rows = sympy_matrix.T.rref()

        # Extract the independent rows
        independent_rows = list(independent_rows)

        # dependent rows
        all_rows = set(range(C.shape[0]))
        dependent_rows = list(all_rows - set(independent_rows))

        C = C_aug.iloc[independent_rows, :]
        d = d_aug.iloc[independent_rows, :]

        if dependent_rows != []:
            print(
                "Constraints are linearly dependent."
                " The following constraints are dropped:",
                C_aug.index[dependent_rows],
            )
        return C, d

    # keep only linearly independent constraint rows
    C, d = DropLinDepRows(C, d)

    # reconcile with np.array
    W_inv = inv(W)
    denom = inv(W_inv + Phi)
    Cn = C.values
    dn = d.values
    CdC_inv = inv(Cn @ denom @ Cn.T)  # dropping linearly dependent rows beforehand makes inv safe to use here

    In = np.eye(len(y1))
    y1n = y1.values.reshape(-1, 1)
    y2n = (In - denom @ Cn.T @ CdC_inv @ Cn) @ denom @ W_inv @ y1n + denom @ Cn.T @ CdC_inv @ dn

    if C_ineq is not None and C_ineq.shape[0] > 0:
        C_ineq, d_ineq = DropLinDepRows(C_ineq, d_ineq)

        # augment C_ineq, d_ineq to be compatible with y1
        C_ineq_aug = pd.DataFrame(np.zeros([len(C_ineq.index), len(y1)]), index=C_ineq.index, columns=y1.index)
        C_ineq_aug.update(C_ineq)
        d_ineq_aug = pd.DataFrame(np.zeros([len(d_ineq.index), 1]), index=d_ineq.index)
        d_ineq_aug.update(d_ineq)
        Cn_ineq = C_ineq_aug.values
        dn_ineq = d_ineq_aug.values

        # use CVXPY to solve the quadratic program numerically
        P = W_inv + Phi
        q = -2 * W_inv @ y1n
        x = cp.Variable([len(y1), 1])
        objective = cp.Minimize(cp.quad_form(x, P, assume_PSD=True) + q.T @ x)

        # if no equality constraints exist, drop the C matrix from the solver
        if C.shape[0] > 0:
            constraints = [Cn @ x == dn, Cn_ineq @ x <= dn_ineq]
        else:
            constraints = [Cn_ineq @ x <= dn_ineq]
        prob = cp.Problem(objective, constraints)
        prob.solve()
        y2n = x.value

        if y2n is None:
            warnings.warn("Reconciliation failed. The feasible set might be empty.")

    # put the reconciled y2 back into a dataframe
    y2 = pd.DataFrame(y2n, index=y1.index)

    return y2


def get_freq_of_freq(periodindex: PeriodIndex, freqstr: Literal["Y", "Q", "M", "W", "D", "H", "T", "S"]) -> Index:
    """Return the requested calendar component (year, quarter, month, ...) of a PeriodIndex."""
    if freqstr == "Y":
        return periodindex.year
    if freqstr == "Q":
        return periodindex.quarter
    if freqstr == "M":
        return periodindex.month
    if freqstr == "W":
        return periodindex.week
    if freqstr == "D":
        return periodindex.day
    if freqstr == "H":
        return periodindex.hour
    if freqstr == "T":
        return periodindex.minute
    if freqstr == "S":
        return periodindex.second


def ConcatMixFreqMultiIndexSeries(df_list: list[DataFrame], axis: int) -> DataFrame:
    # Used only in the mixed-frequency case: pd.concat fails for more than 4 mixed-frequency series.
    # NOTE: still fails when there are more than 3 distinct frequencies!
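    # Strategy: attempt a plain concat first; if pandas refuses, concatenate
    # the inputs frequency by frequency and then concatenate the per-frequency
    # results, so fewer mixed-frequency objects meet in any single pd.concat call.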
    try:
        return pd.concat(df_list, axis=axis)
    except Exception:
        if axis == 0:
            # concatenate frequency by frequency
            freqs = [df.index.get_level_values(1).freqstr[0] for df in df_list]
            # deduplicate while preserving order
            seen = set()
            freq_unique = [x for x in freqs if not (x in seen or seen.add(x))]
            dflong_list = []
            for k in freq_unique:
                df_list_k = [df for df in df_list if df.index.get_level_values(1).freqstr[0] == k]
                dflong_k = pd.concat(df_list_k, axis=0)
                dflong_list.append(dflong_k)

            dflong = pd.concat(dflong_list, axis=0)
            return dflong

        if axis == 1:
            # concatenate frequency by frequency
            freqs = [df.columns.get_level_values(1).freqstr[0] for df in df_list]
            # deduplicate while preserving order
            seen = set()
            freq_unique = [x for x in freqs if not (x in seen or seen.add(x))]
            dfwide_list = []
            for k in freq_unique:
                df_list_k = [df for df in df_list if df.columns.get_level_values(1).freqstr[0] == k]
                dfwide_k = pd.concat(df_list_k, axis=1)
                dfwide_list.append(dfwide_k)

            dfwide = pd.concat(dfwide_list, axis=1)
            return dfwide
--------------------------------------------------------------------------------