├── docs
│   ├── source
│   │   ├── getting_started
│   │   │   └── index.rst
│   │   ├── api_reference
│   │   │   ├── index.rst
│   │   │   ├── main_classes.rst
│   │   │   └── validators.rst
│   │   ├── mff_documentation.rst
│   │   ├── contributing.rst
│   │   ├── conf.py
│   │   ├── index.rst
│   │   └── examples.rst
│   ├── Makefile
│   └── make.bat
├── src
│   └── macroframe_forecast
│       ├── __init__.py
│       ├── MFF.py
│       ├── MFF_mixed_frequency.py
│       ├── examples.py
│       └── utils.py
├── .gitignore
├── .github
│   └── workflows
│       ├── tests.yaml
│       ├── documentation.yml
│       └── release.yml
├── .pre-commit-config.yaml
├── pyproject.toml
├── README.md
├── tests
│   ├── test_utils.py
│   └── test_MFF.py
└── LICENSE

/docs/source/getting_started/index.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | .. _getting_started:
 3 | 
 4 | Getting started
 5 | ===============
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    demo_notebook
--------------------------------------------------------------------------------
/docs/source/api_reference/index.rst:
--------------------------------------------------------------------------------
 1 | 
 2 | .. _api_reference:
 3 | 
 4 | API Reference
 5 | =============
 6 | 
 7 | .. toctree::
 8 |    :maxdepth: 2
 9 | 
10 |    main_classes
11 |    validators
--------------------------------------------------------------------------------
/src/macroframe_forecast/__init__.py:
--------------------------------------------------------------------------------
1 | from importlib.metadata import version
2 | 
3 | __version__ = version("macroframe_forecast")
4 | 
5 | from .MFF import MFF  # noqa: F401
6 | from .MFF_mixed_frequency import MFF_mixed_freqency  # noqa: F401
--------------------------------------------------------------------------------
/docs/source/api_reference/main_classes.rst:
--------------------------------------------------------------------------------
1 | Main classes
2 | ------------
3 | 
4 | The main entry point of the package is the ``MFF`` class.
5 | 
6 | .. autoclass:: macroframe_forecast.MFF.MFF
7 | 
8 | .. autofunction:: macroframe_forecast.utils.find_strings_to_replace_wildcard
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__
 2 | dist/
 3 | .git_local_repo_credentials
 4 | 
 5 | # setuptools files
 6 | *.egg-info
 7 | 
 8 | # temporary csv files
 9 | tests/*.csv
10 | 
11 | # Sphinx documentation
12 | docs/build/
13 | 
14 | # MacOS
15 | .DS_Store
16 | 
17 | # pixi environments
18 | .pixi
19 | 
20 | # build files
21 | build/
--------------------------------------------------------------------------------
/docs/source/api_reference/validators.rst:
--------------------------------------------------------------------------------
 1 | Validators
 2 | ----------
 3 | 
 4 | Validators check that the data and constraints have the appropriate shape and content.
 5 | 
 6 | .. autofunction:: mff.validators.can_forecast
 7 | 
 8 | .. autofunction:: mff.validators.is_consistent_shape
 9 | 
10 | .. autofunction:: mff.validators.is_consistent_intercept
--------------------------------------------------------------------------------
/docs/source/mff_documentation.rst:
--------------------------------------------------------------------------------
1 | Macroframework Forecasting Package API
2 | --------------------------------------
3 | 
4 | .. automodule:: macroframe_forecast.MFF
5 |    :members:
6 |    :member-order: groupwise
7 |    :show-inheritance:
8 | 
9 | 
10 | .. automodule:: macroframe_forecast.MFF_mixed_frequency
11 |    :members:
12 |    :member-order: groupwise
13 |    :show-inheritance:
14 | 
15 | 
16 | .. automodule:: macroframe_forecast.utils
17 |    :members:
18 |    :member-order: groupwise
19 |    :show-inheritance:
--------------------------------------------------------------------------------
/.github/workflows/tests.yaml:
--------------------------------------------------------------------------------
 1 | name: tests
 2 | 
 3 | on: [pull_request, workflow_dispatch]
 4 | 
 5 | jobs:
 6 |   tests:
 7 |     runs-on: ubuntu-latest
 8 |     steps:
 9 |       - uses: actions/checkout@v4
10 |       - uses: actions/setup-python@v5
11 |         with:
12 |           python-version: '3.12'
13 |       - name: Install dependencies
14 |         run: |
15 |           pip install pytest
16 |       - name: Install the package
17 |         run: pip install .
18 |       - name: Run tests
19 |         run: |
20 |           # the empty marker expression overrides the default "-m not slow", so slow tests also run in CI
21 |           pytest -m ""
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v2.3.0
 4 |     hooks:
 5 |       - id: check-yaml
 6 |       - id: end-of-file-fixer
 7 |       - id: trailing-whitespace
 8 |   - repo: https://github.com/astral-sh/ruff-pre-commit
 9 |     # Ruff version.
10 |     rev: v0.11.13
11 |     hooks:
12 |       # Run the linter.
13 |       - id: ruff-check
14 |       # Run the formatter.
15 |       - id: ruff-format
16 |   - repo: local
17 |     hooks:
18 |       - id: pytest-check
19 |         name: pytest-check
20 |         entry: pytest
21 |         language: system
22 |         pass_filenames: false
23 |         always_run: true
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = source
 9 | BUILDDIR      = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/source/contributing.rst:
--------------------------------------------------------------------------------
 1 | Contributing
 2 | ============
 3 | 
 4 | Contributions to the code are welcome!
 5 | 
 6 | For development, it's recommended to install the package in editable mode, so that edits are immediately reflected when testing:
 7 | 
 8 | .. code-block:: shell
 9 | 
10 |    python -m pip install -e .
11 | 
12 | Make sure to install the dependencies in the ``dev`` dependency group of ``pyproject.toml``.
13 | 
14 | It's also recommended to install ``pre-commit``. To set up the git hooks, run the following once:
15 | 
16 | .. code-block:: shell
17 | 
18 |    pre-commit install
19 | 
20 | Note that the pre-commit hook runs the tests, skipping the slow tests.
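
The ``slow`` marker is defined in ``pyproject.toml``, where ``-m not slow`` is applied by default via ``addopts``. A quick sketch of how to include the slow tests when invoking ``pytest`` manually (the second command mirrors what the tests workflow does in CI):

.. code-block:: shell

   pytest -m slow   # run only the tests marked as slow
   pytest -m ""     # run everything, including slow tests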
21 | 
22 | Building documentation
23 | ======================
24 | 
25 | To build/update the documentation, run:
26 | 
27 | .. code-block:: shell
28 | 
29 |    sphinx-build -M html docs/source/ docs/build/
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 | 
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | 	echo.
16 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | 	echo.installed, then set the SPHINXBUILD environment variable to point
18 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | 	echo.may add the Sphinx directory to PATH.
20 | 	echo.
21 | 	echo.If you don't have Sphinx installed, grab it from
22 | 	echo.https://www.sphinx-doc.org/
23 | 	exit /b 1
24 | )
25 | 
26 | if "%1" == "" goto help
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
--------------------------------------------------------------------------------
/.github/workflows/documentation.yml:
--------------------------------------------------------------------------------
 1 | name: documentation
 2 | 
 3 | on: [push, pull_request, workflow_dispatch]
 4 | 
 5 | permissions:
 6 |   contents: write
 7 | 
 8 | jobs:
 9 |   docs:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - uses: actions/setup-python@v5
14 |         with:
15 |           python-version: '3.12'
16 |       - name: Install dependencies
17 |         run: |
18 |           pip install sphinx sphinx_rtd_theme myst_parser numpydoc pydata_sphinx_theme furo
19 |       - name: Install the package
20 |         run: pip install .
21 |       - name: Sphinx build
22 |         run: |
23 |           sphinx-build docs/source docs_ready
24 |       - name: Deploy to GitHub Pages
25 |         uses: peaceiris/actions-gh-pages@v3
26 |         if: ${{ github.ref == 'refs/heads/main' }}
27 |         with:
28 |           publish_branch: gh-pages
29 |           github_token: ${{ secrets.GITHUB_TOKEN }}
30 |           publish_dir: docs_ready/
31 |           force_orphan: true
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: release
 2 | on: [release]
 3 | 
 4 | jobs:
 5 |   release-build:
 6 |     runs-on: ubuntu-latest
 7 | 
 8 |     steps:
 9 |       - uses: actions/checkout@v4
10 | 
11 |       - uses: actions/setup-python@v5
12 |         with:
13 |           python-version: "3.x"
14 | 
15 |       - name: build release distributions
16 |         run: |
17 |           python -m pip install build
18 |           python -m build
19 | 
20 |       - name: upload release dists
21 |         uses: actions/upload-artifact@v4
22 |         with:
23 |           name: release-dists
24 |           path: dist/
25 | 
26 |   pypi-publish:
27 |     runs-on: ubuntu-latest
28 |     needs:
29 |       - release-build
30 |     permissions:
31 |       id-token: write
32 | 
33 |     steps:
34 |       - name: Retrieve release distributions
35 |         uses: actions/download-artifact@v4
36 |         with:
37 |           name: release-dists
38 |           path: dist/
39 | 
40 |       - name: Publish release distributions to PyPI
41 |         uses: pypa/gh-action-pypi-publish@release/v1
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
 1 | # Configuration file for the Sphinx documentation builder.
 2 | #
 3 | # For the full list of built-in configuration values, see the documentation:
 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
 5 | 
 6 | # -- Project information -----------------------------------------------------
 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 8 | 
 9 | project = "macroframe-forecast"
10 | copyright = "2024-2025, IMF"
11 | author = "Ando Sakai, Shuvam Das, Sultan Orazbayev"
12 | release = "0.1.6"
13 | 
14 | 
15 | # -- General configuration ---------------------------------------------------
16 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
17 | extensions = [
18 |     "sphinx.ext.autodoc",
19 |     "numpydoc",
20 |     "sphinx.ext.mathjax",
21 |     "sphinx.ext.napoleon",
22 |     "sphinx.ext.autosummary",
23 |     "sphinx.ext.autosectionlabel",
24 |     "sphinx.ext.intersphinx",
25 |     "sphinx.ext.doctest",
26 | ]
27 | 
28 | templates_path = ["_templates"]
29 | exclude_patterns = ["_build", "**.ipynb_checkpoints", "**.ipynb"]
30 | 
31 | # generate autosummary even if no references
32 | autosummary_generate = True
33 | 
34 | # "members" and "inherited-members" document the methods and attributes defined
35 | # on a class, as well as those it inherits.
36 | # "member-order: bysource" lists members in the order in which they are defined
37 | # in the source code.
38 | autodoc_default_options = {
39 |     "members": True,
40 |     "inherited-members": True,
41 |     "member-order": "bysource",
42 | }
43 | 
44 | # -- Options for HTML output -------------------------------------------------
45 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
46 | 
47 | html_theme = "furo"
48 | html_static_path = []
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "macroframe-forecast"
 3 | version = "0.1.6"
 4 | description = "Macroframework forecasting with accounting identities"
 5 | readme = "README.md"
 6 | requires-python = ">=3.11"
 7 | maintainers = [
 8 |     { name = "Sakai Ando", email = "sando@imf.org" },
 9 |     { name = "Sultan Orazbayev", email = "sorazbayev@imf.org" },
10 | ]
11 | keywords = [
12 |     "macroframework",
13 |     "forecasting",
14 |     "macroeconomic identities",
15 |     "high-dimensional forecasting",
16 |     "econometrics",
17 | ]
18 | # https://pypi.org/classifiers/
19 | classifiers = [
20 |     "Programming Language :: Python :: 3.11",
21 |     "Programming Language :: Python :: 3.12",
22 |     "Programming Language :: Python :: 3 :: Only",
23 |     "Intended Audience :: Developers",
24 |     "Intended Audience :: Other Audience",
25 |     "Intended Audience :: Science/Research",
26 |     "Intended Audience :: Education",
27 |     "Topic :: Scientific/Engineering",
28 |     "Topic :: Scientific/Engineering :: Information Analysis",
29 |     "Topic :: Scientific/Engineering :: Mathematics",
30 | ]
31 | dependencies = [
32 |     "pandas >= 2.2.0",
33 |     "numpy >= 1.26.3",
34 |     "scipy >= 1.12.0",
35 |     "scikit-learn >= 1.4.0",
36 |     "dask[dataframe] >= 2024.8.1",
37 |     "distributed >= 2024.2.0",
38 |     "sktime >= 0.27.0",
39 |     "sympy >= 1.12",
40 |     "cvxpy >= 1.5.3",
41 |     "statsmodels>=0.14.4",
42 |     "matplotlib>=3.10.3",
43 | ]
44 | [project.urls]
45 | homepage = "https://github.com/sakaiando/macroframe-forecast"
46 | repository = "https://github.com/sakaiando/macroframe-forecast"
47 | 
48 | [build-system]
49 | requires = ["hatchling"]
50 | build-backend = "hatchling.build"
51 | 
52 | [dependency-groups]
53 | dev = [
54 |     "furo>=2024.8.6",
55 |     "numpydoc>=1.9.0",
56 |     "pre-commit>=4.2.0",
57 |     "pytest>=8.4.0",
58 |     "ruff>=0.11.13",
59 |     "sphinx>=8.2.3",
60 | ]
61 | 
62 | [tool.pytest.ini_options]
63 | addopts = [
64 |     "--strict-config",  # Force error if config is misspelled
65 |     "--strict-markers",  # Force error if marker is misspelled (must be defined in config)
66 |     "-ra",  # Print summary of all fails/errors
67 |     "-m not slow",  # Skip slow tests
68 | ]
69 | markers = ["slow: Skipped unless '-m slow' passed"]
70 | 
71 | [tool.ruff]
72 | line-length = 120
73 | 
74 | [tool.ruff.lint]
75 | pydocstyle = { convention = "numpy" }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # `macroframe-forecast`: a Python package to assist with macroframework forecasting
2 | 
3 | 
4 | [![!pypi](https://img.shields.io/pypi/v/macroframe-forecast?color=green)](https://pypi.org/project/macroframe-forecast/) [![Downloads](https://static.pepy.tech/personalized-badge/macroframe-forecast?period=total&units=international_system&left_color=grey&right_color=blue&left_text=cumulative%20(pypi))](https://pepy.tech/project/macroframe-forecast)
5 | 
6 | This package is based on the following papers:
 7 | * [A Python Package to Assist Macroframework Forecasting: Concepts and Examples (2025)](https://www.imf.org/en/Publications/WP/Issues/2025/08/29/A-Python-Package-to-Assist-Macroframework-Forecasting-Concepts-and-Examples-570041)
 8 | * [Smooth Forecast Reconciliation (2024)](https://www.imf.org/en/Publications/WP/Issues/2024/03/22/Smooth-Forecast-Reconciliation-546654)
 9 | * [Systematizing Macroframework Forecasting: High-Dimensional Conditional Forecasting with Accounting Identities (2023)](https://link.springer.com/article/10.1057/s41308-023-00225-8)
10 | 
11 | # Documentation
12 | 
13 | Please refer to [this link](https://sakaiando.github.io/macroframe-forecast/) for documentation.
14 | 
15 | # Installation
16 | 
17 | To install the `macroframe-forecast` package, run the following in a terminal:
18 | 
19 | ```shell
20 | pip install macroframe-forecast
21 | ```
22 | 
23 | # Quick start
24 | 
25 | ```python
26 | import numpy as np
27 | import pandas as pd
28 | import matplotlib.pyplot as plt
29 | from macroframe_forecast import MFF
30 | 
31 | # true data
32 | df_true = pd.DataFrame({
33 |     'var1': np.random.randn(30),  # 30 random values from a standard normal distribution
34 |     'var2': np.random.randn(30)
35 | })
36 | df_true['sum'] = df_true['var1'] + df_true['var2']
37 | 
38 | # input dataframe
39 | df = df_true.copy()
40 | fh = 5
41 | df.iloc[-fh:, 1:] = np.nan
42 | 
43 | # apply MFF
44 | m = MFF(df, equality_constraints=['var1_? + var2_? - sum_?'])
45 | df2 = m.fit()
46 | 
47 | # plot results
48 | fig, axes = plt.subplots(3, 1, sharey=True, figsize=(9, 9))
49 | 
50 | axes[0].plot(df2['var2'], label='forecasted var2')
51 | axes[0].plot(df_true['var2'], label='true var2')
52 | axes[0].legend()
53 | 
54 | axes[1].plot(df2['sum'], label='forecasted sum')
55 | axes[1].plot(df_true['sum'], label='true sum')
56 | axes[1].legend()
57 | 
58 | axes[2].plot(df2['var1'] + df2['var2'] - df2['sum'], label='summation error')
59 | axes[2].legend()
60 | ```
61 | 
62 | # Disclaimer
63 | 
64 | Reuse of this tool and IMF information does not imply any endorsement of the research and/or product. Any research presented should not be reported as representing the views of the IMF, its Executive Board, or member governments.
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
 1 | .. module:: macroframe_forecast
 2 | 
 3 | .. Systematic Macroframework Forecasting documentation master file, created by
 4 |    sphinx-quickstart on Mon Feb 12 12:29:05 2024.
 5 |    You can adapt this file completely to your liking, but it should at least
 6 |    contain the root `toctree` directive.
 7 | 
 8 | Documentation for the Macroframework Forecasting package
 9 | ====================================================================
10 | 
11 | This repository contains the Python code for the forecasting method described in:
12 | 
13 | `A Python Package to Assist Macroframework Forecasting: Concepts and Examples (2025) <https://www.imf.org/en/Publications/WP/Issues/2025/08/29/A-Python-Package-to-Assist-Macroframework-Forecasting-Concepts-and-Examples-570041>`_.
14 | 
15 | `Smooth Forecast Reconciliation (2024) <https://www.imf.org/en/Publications/WP/Issues/2024/03/22/Smooth-Forecast-Reconciliation-546654>`_.
16 | 
17 | `Systematizing Macroframework Forecasting: High-Dimensional Conditional Forecasting with Accounting Identities (2023) <https://link.springer.com/article/10.1057/s41308-023-00225-8>`_.
18 | 
19 | Installation
20 | ------------
21 | 
22 | To install the `macroframe-forecast` package, run the following in the terminal/shell:
23 | 
24 | .. code-block:: console
25 | 
26 |    pip install macroframe-forecast
27 | 
28 | 
29 | 
30 | Quick start
31 | -----------
32 | 
33 | The relevant import from `macroframe-forecast` is `MFF`:
34 | 
35 | .. code-block:: python
36 | 
37 |    import numpy as np
38 |    import pandas as pd
39 |    import matplotlib.pyplot as plt
40 |    from macroframe_forecast import MFF
41 | 
42 |    # true data
43 |    df_true = pd.DataFrame({
44 |        'var1': np.random.randn(30),  # 30 random values from a standard normal distribution
45 |        'var2': np.random.randn(30)
46 |    })
47 |    df_true['sum'] = df_true['var1'] + df_true['var2']
48 | 
49 |    # input dataframe
50 |    df = df_true.copy()
51 |    fh = 5
52 |    df.iloc[-fh:, 1:] = np.nan
53 | 
54 |    # apply MFF
55 |    m = MFF(df, equality_constraints=['var1_? + var2_? - sum_?'])
56 |    df2 = m.fit()
57 | 
58 |    # plot results
59 |    fig, axes = plt.subplots(3, 1, sharey=True, figsize=(9, 9))
60 | 
61 |    axes[0].plot(df2['var2'], label='forecasted var2')
62 |    axes[0].plot(df_true['var2'], label='true var2')
63 |    axes[0].legend()
64 | 
65 |    axes[1].plot(df2['sum'], label='forecasted sum')
66 |    axes[1].plot(df_true['sum'], label='true sum')
67 |    axes[1].legend()
68 | 
69 |    axes[2].plot(df2['var1'] + df2['var2'] - df2['sum'], label='summation error')
70 |    axes[2].legend()
71 | 
72 | 
73 | .. toctree::
74 |    :maxdepth: 2
75 |    :caption: Contents:
76 | 
77 |    examples
78 |    mff_documentation
79 |    contributing
80 | 
81 | * :ref:`genindex`
--------------------------------------------------------------------------------
/tests/test_utils.py:
--------------------------------------------------------------------------------
 1 | from numpy.random import sample
 2 | from pandas import DataFrame, Index, PeriodIndex, Series
 3 | 
 4 | from macroframe_forecast.utils import (
 5 |     expand_wildcard,
 6 |     find_permissible_wildcard,
 7 |     find_strings_to_replace_wildcard,
 8 |     get_freq_of_freq,
 9 | )
10 | 
11 | 
12 | def test_find_permissible_wildcard():
13 |     assert find_permissible_wildcard(["a", "b", "c"], _seed=0) == "m"
14 |     assert find_permissible_wildcard(["a", "b", "c"], _seed=10) == "s"
15 | 
16 | 
17 | def test_find_strings_to_replace_wildcard():
18 |     n = 30
19 |     p = 2
20 |     years = [str(y) for y in range(2000, 2000 + n)]
21 |     df = DataFrame(sample([n, p]), columns=["a", "b"], index=years)
22 |     df0_stacked = df.T.stack()
23 |     all_cells_index = df0_stacked.index
24 |     var_list = Series([f"{a}_{b}" for a, b in all_cells_index], index=all_cells_index)
25 |     constraint = "ax + bx"
26 |     wildcard = "x"
27 |     missing_string_list = find_strings_to_replace_wildcard(constraint, var_list, wildcard)
28 |     assert missing_string_list == [f"_{y}" for y in years]
29 | 
30 | 
31 | def test_expand_wildcard():
32 |     import numpy as np
33 |     import pandas as pd
34 | 
35 |     n = 30
36 |     p = 2
37 |     years = [str(y) for y in range(2000, 2000 + n)]
38 |     df = pd.DataFrame(np.random.sample([n, p]), columns=["a", "b"], index=years)
39 |     df0_stacked = df.T.stack()
40 |     all_cells_index = df0_stacked.index
41 |     var_list = pd.Series([f"{a}_{b}" for a, b in all_cells_index], index=all_cells_index)
42 |     constraints_with_alphabet_wildcard = ["ax + bx"]
43 |     alphabet_wildcard = "x"
44 |     constraints = expand_wildcard(constraints_with_alphabet_wildcard, var_list=var_list, wildcard=alphabet_wildcard)
45 |     assert constraints == [f"a_{y} + b_{y}" for y in years]
46 | 
47 | 
48 | def test_get_freq_of_freq_quarterly():
49 |     years = [2000, 2000, 2001]
50 |     quarters = [1, 2, 4]
51 | 
52 |     test_index = PeriodIndex.from_fields(year=years, quarter=quarters)
53 | 
54 |     assert get_freq_of_freq(test_index, "Y").equals(Index(years, dtype="int64"))
55 |     assert get_freq_of_freq(test_index, "Q").equals(Index(quarters, dtype="int64"))
56 | 
57 | 
58 | def test_get_freq_of_freq_datetime():
59 |     years = [2000, 2000, 2001]
60 | months = [1, 6, 11] 61 | days = [3, 10, 10] 62 | hours = [4, 6, 10] 63 | minutes = [5, 10, 30] 64 | seconds = [1, 4, 5] 65 | 66 | test_index_2 = PeriodIndex.from_fields( 67 | year=years, month=months, day=days, hour=hours, minute=minutes, second=seconds, freq="s" 68 | ) 69 | assert get_freq_of_freq(test_index_2, "M").equals(Index(months, dtype="int64")) 70 | assert get_freq_of_freq(test_index_2, "W").equals(Index([1, 23, 45], dtype="int32")) 71 | assert get_freq_of_freq(test_index_2, "D").equals(Index(days, dtype="int32")) 72 | assert get_freq_of_freq(test_index_2, "H").equals(Index(hours, dtype="int32")) 73 | assert get_freq_of_freq(test_index_2, "T").equals(Index(minutes, dtype="int32")) 74 | assert get_freq_of_freq(test_index_2, "S").equals(Index(seconds, dtype="int32")) 75 | -------------------------------------------------------------------------------- /tests/test_MFF.py: -------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 5 | 6 | 7 | from string import ascii_uppercase 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from pytest import mark 12 | 13 | from macroframe_forecast import MFF, MFF_mixed_freqency 14 | 15 | # %% 16 | 17 | 18 | @mark.slow 19 | def test_MFF_non_parallel(): 20 | n = 30 21 | p = 3 22 | fh = 1 23 | df_true = pd.DataFrame( 24 | np.random.rand(n, p), 25 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 26 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 27 | ) 28 | df_true.iloc[:, -1] = df_true.iloc[:, :-1].sum(axis=1) 29 | df = df_true.copy() 30 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 31 | df.iloc[-1, 0] = df_true.iloc[-1, 0] # island 32 | equality_constraints = ["A0?+B0?-C0?"] 33 | 34 | m = MFF(df, equality_constraints=equality_constraints, parallelize=False) 35 | df2 = m.fit() 36 | 37 | assert df2.iloc[-1, 0] == df_true.iloc[-1, 0] 38 | 39 | 40 | @mark.slow 41 | def test_MFF_parallel(): 42 | n = 30 43 | p = 3 44 | fh = 1 45 | df_true = pd.DataFrame( 46 | np.random.rand(n, p), 47 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 48 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 49 | ) 50 | df_true.iloc[:, -1] = df_true.iloc[:, :-1].sum(axis=1) 51 | df = df_true.copy() 52 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 53 | df.iloc[-1, 0] = df_true.iloc[-1, 0] # island 54 | 55 | equality_constraints = ["A0?+B0?-C0?"] 56 | 57 | m = MFF(df, equality_constraints=equality_constraints, parallelize=True) 58 | df2 = m.fit() 59 | 60 | assert df2.iloc[-1, 0] == df_true.iloc[-1, 0] 61 | 62 | 63 | @mark.slow 64 | def test_MFF_mixed_frequency(): 65 | import warnings 66 | 67 | warnings.filterwarnings("ignore", category=UserWarning) 68 | 69 | n = 120 70 | p = 3 71 | fhA = 5 72 | fhQ = 7 73 | dfQ_true = pd.DataFrame( 74 | np.random.rand(n, p), 75 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 76 | index=pd.period_range(start="2000-1-1", periods=n, freq="Q"), 77 | ) 78 | dfQ_true.iloc[:, -1] = dfQ_true.iloc[:, :-1].sum(axis=1) 79 | dfA_true = dfQ_true.groupby(dfQ_true.index.year).sum() 80 | dfA_true.index = pd.PeriodIndex(dfA_true.index, freq="Y") 81 | 82 | dfA = dfA_true.copy() 83 | dfA.iloc[-fhA:, : np.ceil(p 
/ 2).astype(int)] = np.nan 84 | 85 | dfQ = dfQ_true.iloc[:-12, :].copy() 86 | dfQ.iloc[-fhQ:, : np.ceil(p / 2).astype(int)] = np.nan 87 | 88 | # inputs 89 | df_dict = {"Y": dfA, "Q": dfQ} 90 | constraints_with_wildcard = ["A0?+B0?-C0?", "?Q1+?Q2+?Q3+?Q4-?"] 91 | 92 | mff = MFF_mixed_freqency(df_dict, constraints_with_wildcard=constraints_with_wildcard) 93 | df2_list = mff.fit() 94 | assert ~np.isnan(df2_list[0].iloc[-1, 0]) 95 | 96 | 97 | @mark.slow 98 | def test_small_sample_MFF(): 99 | n = 20 100 | p = 2 101 | fh = 5 102 | df_true = pd.DataFrame( 103 | np.random.rand(n, p), 104 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 105 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 106 | ) 107 | # df_true.iloc[:,-1] = df_true.iloc[:,:-1].sum(axis=1) 108 | df = df_true.copy() 109 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 110 | # df.iloc[-1,0] = df_true.iloc[-1,0] # island 111 | equality_constraints = [] 112 | 113 | m = MFF(df, equality_constraints=equality_constraints, parallelize=False) 114 | df2 = m.fit() 115 | 116 | assert ~np.isnan(df2.iloc[-1, 0]) 117 | 118 | 119 | @mark.slow 120 | def test_inequality_constraints(): 121 | 122 | n = 20 123 | p = 2 124 | fh = 5 125 | df_true = pd.DataFrame( 126 | np.random.rand(n, p), 127 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 128 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 129 | ) 130 | # df_true.iloc[:,-1] = df_true.iloc[:,:-1].sum(axis=1) 131 | df = df_true.copy() 132 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 133 | # df.iloc[-1,0] = df_true.iloc[-1,0] # island 134 | 135 | equality_constraints = [] 136 | 137 | inequality_constraints = [ df.columns[0] + '_' + str(df_true.index[-1]) + ' + 1'] 138 | 139 | m = MFF(df, equality_constraints=equality_constraints, 140 | inequality_constraints = inequality_constraints, 141 | parallelize=False) 142 | df2 = m.fit() 143 | df2.iloc[-1, 0] 144 | 145 | assert (df2.iloc[-1, 0] <= -1) or np.isclose(df2.iloc[-1, 0], -1, atol=1e-12) 146 | 147 | 148 | @mark.slow 149 | def test_equality_constraints(): 150 | 151 | n = 20 152 | p = 2 153 | fh = 5 154 | df_true = pd.DataFrame( 155 | np.random.rand(n, p), 156 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 157 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 158 | ) 159 | # df_true.iloc[:,-1] = df_true.iloc[:,:-1].sum(axis=1) 160 | df = df_true.copy() 161 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 162 | # df.iloc[-1,0] = df_true.iloc[-1,0] # island 163 | 164 | equality_constraints = [df.columns[0] + '_' + str(df_true.index[-1]) + ' + 1'] 165 | 166 | inequality_constraints = [] 167 | 168 | m = MFF(df, equality_constraints=equality_constraints, 169 | inequality_constraints = inequality_constraints, 170 | parallelize=False) 171 | df2 = m.fit() 172 | df2.iloc[-1, 0] 173 | 174 | assert round(df2.iloc[-1, 0],2) == -1 175 | -------------------------------------------------------------------------------- /src/macroframe_forecast/MFF.py: -------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 
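# This module implements the two-step MFF workflow summarized in the MFF class
# docstring below: (1) fill every missing cell with an unconstrained
# first-stage forecast, then (2) reconcile the forecasts so that the
# user-supplied equality/inequality constraints hold while preserving
# smoothness of the forecast paths.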
 5 | 
 6 | 
 7 | import pandas as pd
 8 | from sktime.forecasting.base import BaseForecaster
 9 | 
10 | from macroframe_forecast.utils import (
11 |     AddIslandsToConstraints,
12 |     BreakDataFrameIntoTimeSeriesList,
13 |     CheckTrainingSampleSize,
14 |     DefaultForecaster,
15 |     FillAllEmptyCells,
16 |     GenLamstar,
17 |     GenPredTrueData,
18 |     GenSmoothingMatrix,
19 |     GenVecForecastWithIslands,
20 |     GenWeightMatrix,
21 |     OrganizeCells,
22 |     Reconciliation,
23 |     StringToMatrixConstraints,
24 | )
25 | 
26 | # %% MFF
27 | 
28 | 
29 | class MFF:
30 |     """A class for Macro-Framework Forecasting (MFF).
31 | 
32 |     This class facilitates forecasting of single-frequency time series data
33 |     using a two-step process. The first step of the forecasting procedure
34 |     generates unconstrained forecasts using the specified forecaster. In the
35 |     second step, these forecasts are reconciled so that they satisfy the
36 |     supplied constraints, and smoothness of the forecasts is maintained.
37 | 
38 |     Parameters
39 |     ----------
40 |     df : pd.DataFrame
41 |         Input dataframe containing time series data. Data should be in wide
42 |         format, with each row containing data for one period, and each
43 |         column containing data for one variable.
44 | 
45 |     forecaster : BaseForecaster, optional (default: None)
46 |         sktime BaseForecaster descendant. If not defined, then
47 |         DefaultForecaster is used.
48 | 
49 |     equality_constraints : list[str], optional (default: [])
50 |         Constraints that hold with equality. Constraints may include a
51 |         wildcard, in which case they are applied across all horizons, or
52 |         may be defined for specific time periods.
53 | 
54 |     inequality_constraints : list[str], optional (default: [])
55 |         Inequality constraints, comparable to ``equality_constraints``.
56 |         Constraints may include a wildcard, in which case they are applied
57 |         across all horizons, or may be defined for specific time periods.
58 |         Constraints should be written in the form 'C_ineq*y - d_ineq <= 0'.
59 | 
60 |     parallelize : bool, optional (default: True)
61 |         Indicates whether parallelization should be employed for generating
62 |         the first-step forecasts.
63 | 
64 |     n_forecast_error : int, optional (default: 5)
65 |         Number of windows to split data into training and testing sets for
66 |         generating the matrix of forecast errors.
67 | 
68 |     shrinkage_method : str, optional (default: 'oas')
69 |         Method to be used for shrinking the sample covariance matrix. The
70 |         default is the Oracle Approximating Shrinkage estimator ('oas');
71 |         other options are 'identity' and 'monotone_diagonal'.
72 | 
73 |     default_lam : float, optional (default: -1)
74 |         The value of lambda to be used for calculating the smoothing parameter
75 |         if the frequency of observations cannot be determined from the index
76 |         names. If this is set to -1, lambda is calculated empirically.
77 | 
78 |     max_lam : float, optional (default: 129600)
79 |         Maximum value of lamstar to be used for smoothing forecasts when it
80 |         is estimated empirically.
81 | 
82 |     Returns
83 |     -------
84 |     df2 : pd.DataFrame
85 |         Output dataframe with all reconciled forecasts filled into the
86 |         original input.
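
    Examples
    --------
    A minimal usage sketch, adapted from the README quick start (the variable
    names and the equality constraint are illustrative):

    >>> import numpy as np
    >>> import pandas as pd
    >>> from macroframe_forecast import MFF
    >>> df = pd.DataFrame({"var1": np.random.randn(30), "var2": np.random.randn(30)})
    >>> df["sum"] = df["var1"] + df["var2"]
    >>> df.iloc[-5:, 1:] = np.nan  # last five periods of var2 and sum are to be forecast
    >>> m = MFF(df, equality_constraints=["var1_? + var2_? - sum_?"])
    >>> df2 = m.fit()  # doctest: +SKIP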
87 | 88 | 89 | """ 90 | 91 | def __init__( 92 | self, 93 | df: pd.DataFrame, 94 | forecaster: BaseForecaster | None = None, 95 | equality_constraints: list[str] = [], 96 | inequality_constraints: list[str] = [], 97 | parallelize: bool = True, 98 | n_forecast_error: int = 5, 99 | shrinkage_method: str = "oas", 100 | default_lam: float = -1, 101 | max_lam: float = 129600 102 | ): 103 | self.df = df 104 | self.forecaster = forecaster 105 | self.equality_constraints = equality_constraints 106 | self.inequality_constraints = inequality_constraints 107 | self.parallelize = parallelize 108 | self.n_forecast_error = n_forecast_error 109 | self.shrinkage_method = shrinkage_method 110 | self.default_lam = default_lam 111 | self.max_lam = max_lam 112 | 113 | def fit( 114 | self, 115 | ) -> pd.DataFrame: 116 | """ 117 | Fits the model and generates reconciled forecasts for the input 118 | dataframe subject to defined constraints. 119 | """ 120 | 121 | df = self.df 122 | forecaster = self.forecaster 123 | equality_constraints = self.equality_constraints 124 | inequality_constraints = self.inequality_constraints 125 | parallelize = self.parallelize 126 | n_forecast_error = self.n_forecast_error 127 | shrinkage_method = self.shrinkage_method 128 | default_lam = self.default_lam 129 | max_lam = self.max_lam 130 | 131 | # modify inputs into machine-friendly shape 132 | df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df) 133 | 134 | # get constraint matrices 135 | C, d = StringToMatrixConstraints(df0.T.stack(), all_cells, unknown_cells, known_cells, equality_constraints) 136 | C, d = AddIslandsToConstraints(C, d, islands) 137 | C_ineq, d_ineq = StringToMatrixConstraints( 138 | df0.T.stack(), all_cells, unknown_cells, known_cells, inequality_constraints 139 | ) 140 | 141 | # Initiate DefaultForecaster only if a forecaster has not already been defined by the user. 142 | # Use OLS PCA if small_sample is True, and Grid Search if false. 
143 | small_sample: bool = CheckTrainingSampleSize(df0, n_forecast_error) 144 | if forecaster is None: 145 | forecaster = DefaultForecaster(small_sample) 146 | 147 | # 1st stage forecast and its model 148 | df1, df1_model = FillAllEmptyCells(df0, forecaster, parallelize=parallelize) 149 | 150 | # get pseudo out-of-sample prediction, true values, and prediction models 151 | pred, true, model = GenPredTrueData(df0, forecaster, n_forecast_error=n_forecast_error, parallelize=parallelize) 152 | 153 | # break dataframe into list of time series 154 | ts_list, pred_list, true_list = BreakDataFrameIntoTimeSeriesList(df0, df1, pred, true) 155 | 156 | # get parts for reconciliation 157 | y1 = GenVecForecastWithIslands(ts_list, islands) 158 | W, shrinkage = GenWeightMatrix(pred_list, true_list, shrinkage_method=shrinkage_method) 159 | smoothness = GenLamstar(pred_list, true_list, default_lam=default_lam, max_lam=max_lam) 160 | Phi = GenSmoothingMatrix(W, smoothness) 161 | 162 | # 2nd stage reconciled forecast 163 | y2 = Reconciliation(y1, W, Phi, C, d, C_ineq, d_ineq) 164 | 165 | # reshape vector y2 into df2 166 | y2 = y2.T.stack(future_stack=True) 167 | y2.index = y2.index.droplevel(level=0) 168 | df2 = df0.copy() 169 | df2.update(y2, overwrite=False) # fill only nan cells of df0 170 | 171 | self.df0 = df0 172 | self.C = C 173 | self.d = d 174 | self.C_ineq = C_ineq 175 | self.d_ineq = d_ineq 176 | self.islands = islands 177 | 178 | self.df1 = df1 179 | self.df1_model = df1_model 180 | 181 | self.pred = pred 182 | self.true = true 183 | self.model = model 184 | self.ts_list = ts_list 185 | self.pred_list = pred_list 186 | self.true_list = true_list 187 | self.y1 = y1 188 | self.W = W 189 | self.Phi = Phi 190 | self.shrinkage = shrinkage 191 | self.smoothness = smoothness 192 | 193 | self.y2 = y2 194 | self.df2 = df2 195 | 196 | return self.df2 197 | -------------------------------------------------------------------------------- /src/macroframe_forecast/MFF_mixed_frequency.py: -------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 
5 | 6 | # Mix-frequency is not working properly yet, waiting for Pandas to fix error: https://github.com/pandas-dev/pandas/issues/59775 7 | 8 | import pandas as pd 9 | 10 | from .utils import ( 11 | BreakDataFrameIntoTimeSeriesList, 12 | ConcatMixFreqMultiIndexSeries, 13 | DefaultForecaster, 14 | FillAllEmptyCells, 15 | GenLamstar, 16 | GenPredTrueData, 17 | GenSmoothingMatrix, 18 | GenVecForecastWithIslands, 19 | GenWeightMatrix, 20 | OrganizeCells, 21 | Reconciliation, 22 | StringToMatrixConstraints, 23 | get_freq_of_freq, 24 | ) 25 | 26 | class MFF_mixed_freqency: 27 | def __init__( 28 | self, df_dict, forecaster=DefaultForecaster(), constraints_with_wildcard=[], ineq_constraints_with_wildcard=[] 29 | ): 30 | self.df_dict = df_dict 31 | self.forecaster = forecaster 32 | self.constraints_with_wildcard = constraints_with_wildcard 33 | self.ineq_constraints_with_wildcard = ineq_constraints_with_wildcard 34 | 35 | def fit(self): 36 | df_dict = self.df_dict 37 | forecaster = self.forecaster 38 | constraints_with_wildcard = self.constraints_with_wildcard 39 | # TODO: delete, the assignment below, if not needed 40 | ineq_constraints_with_wildcard = self.ineq_constraints_with_wildcard # noqa: F841 41 | 42 | # create constraints 43 | freq_order = ["Y", "Q", "M", "W", "D", "H", "T", "S"] 44 | lowest_freq = freq_order[min([freq_order.index(k) for k in df_dict.keys()])] 45 | 46 | df0_list = [] 47 | all_cells_list = [] 48 | unknown_cells_list = [] 49 | known_cells_list = [] 50 | islands_list = [] 51 | for k in df_dict.keys(): 52 | df0_k, all_cells_k, unknown_cells_k, known_cells_k, islands_k = OrganizeCells(df_dict[k]) 53 | df0_list.append(df0_k) 54 | all_cells_list.append(all_cells_k) 55 | unknown_cells_list.append(unknown_cells_k) 56 | known_cells_list.append(known_cells_k) 57 | islands_list.append(islands_k) 58 | 59 | df0_stacked = ConcatMixFreqMultiIndexSeries([df0.T.stack() for df0 in df0_list], axis=0) 60 | all_cells = pd.concat(all_cells_list, axis=0) 61 | unknown_cells = pd.concat(unknown_cells_list, axis=0) 62 | known_cells = pd.concat(known_cells_list, axis=0) 63 | islands = pd.concat(islands_list, axis=0) 64 | 65 | C, d = StringToMatrixConstraints(df0_stacked, all_cells, unknown_cells, known_cells, constraints_with_wildcard) 66 | 67 | # combine all frequncies into the lowest frequency dataframe 68 | df0wide_list = [] 69 | df0wide_colflat_list = [] 70 | for df in df0_list: 71 | df0 = df.copy() # don't want to change df0_list 72 | df0_freq = df0.index.freqstr[0] 73 | 74 | if df0_freq == lowest_freq: 75 | df0wide_freq = df0.copy() 76 | df0wide_colfat_freq = pd.Series(df0wide_freq.columns, index=df0wide_freq.columns) 77 | 78 | else: 79 | index_freq = df0.index.asfreq(lowest_freq) 80 | col_freq = df0_freq + get_freq_of_freq(df0.index, df0_freq).astype(str) 81 | df0.index = pd.MultiIndex.from_arrays([index_freq, col_freq]) 82 | df0wide_freq = df0.unstack() 83 | df0wide_colfat_freq = pd.Series(df0wide_freq.columns.map("_".join), index=df0wide_freq.columns) 84 | 85 | df0wide_list.append(df0wide_freq) 86 | df0wide_colflat_list.append(df0wide_colfat_freq) 87 | 88 | df0wide = pd.concat(df0wide_list, axis=1) 89 | # TODO: delete, the assignment below, if not needed 90 | df0wide_col = df0wide.columns # noqa: F841 91 | df0wide_colflat = pd.concat(df0wide_colflat_list) 92 | 93 | # 1st step forecast 94 | df0wide.columns = df0wide_colflat.values.tolist() # colname has to be single index 95 | df1wide, df1wide_model = FillAllEmptyCells(df0wide, forecaster) 96 | predwide, truewide, modelwide = 
GenPredTrueData(df0wide, forecaster) 97 | 98 | # get df1_list by breaking wide dataframe into different frequencies 99 | df1_list = [] 100 | for df0i, df0 in enumerate(df0_list): 101 | if df0.index.freqstr[0] == lowest_freq: 102 | df1_freq = df0.copy() 103 | df1_freq.update(df1wide.loc[:, df0wide_colflat_list[df0i].values]) 104 | else: 105 | df1wide_freq = df1wide.loc[:, df0wide_colflat_list[df0i].values] 106 | df1wide_freq.columns = pd.MultiIndex.from_tuples(df0wide_colflat_list[df0i].index) 107 | df1_freq = df0wide_list[df0i].copy().stack(future_stack=True) # storage 108 | df1_freq.update(df1wide_freq.stack(future_stack=True)) 109 | df1_freq.index = df0_list[df0i].index 110 | 111 | df1_list.append(df1_freq) 112 | 113 | # get pred_list, true_list by breaking dataframes into different frequencies 114 | pred_allfreq = [] 115 | true_allfreq = [] 116 | for df0i, df0 in enumerate(df0_list): 117 | # get nan cells 118 | df0wide_freq = df0wide_list[df0i].copy() 119 | df0wide_freq.columns = df0wide_colflat_list[df0i].values 120 | na_cells = df0wide_freq.isna()[df0wide_freq.isna()].T.stack().index 121 | 122 | # slice predwide 123 | pred_freq = predwide.loc[:, na_cells] 124 | true_freq = truewide.loc[:, na_cells] 125 | 126 | if df0.index.freqstr[0] != lowest_freq: 127 | # reshape colname multiindex of (var_freq,lowestfreq) to var_lowestfreqfreq 128 | colflat = pred_freq.columns 129 | var_list = [v[: v.rfind("_")] for v in colflat.get_level_values(0)] 130 | freq_list = [v[v.rfind("_") + 1 :] for v in colflat.get_level_values(0)] 131 | lowest_freq_list = colflat.get_level_values(-1).astype(str) 132 | original_time = pd.PeriodIndex( 133 | [lowest_freq_list[i] + freq_list[i] for i in range(len(colflat))], freq=df0.index.freq 134 | ) 135 | pred_freq_colname = pd.MultiIndex.from_arrays([var_list, original_time]) 136 | pred_freq.columns = pred_freq_colname 137 | true_freq.columns = pred_freq_colname 138 | 139 | # change col order 140 | pred_freq = pred_freq.loc[:, df0.isna()[df0.isna()].T.stack().index] 141 | true_freq = true_freq.loc[:, pred_freq.columns] 142 | 143 | # append pred, true for each frequency 144 | pred_allfreq.append(pred_freq) 145 | true_allfreq.append(true_freq) 146 | 147 | # break dataframes in to lists 148 | ts_list = [] 149 | pred_list = [] 150 | true_list = [] 151 | for df0i, df0 in enumerate(df0_list): 152 | ts_list_freq, pred_list_freq, true_list_freq = BreakDataFrameIntoTimeSeriesList( 153 | df0, df1_list[df0i], pred_allfreq[df0i], true_allfreq[df0i] 154 | ) 155 | 156 | ts_list += ts_list_freq 157 | pred_list += pred_list_freq 158 | true_list += true_list_freq 159 | 160 | # get parts for reconciliation 161 | # islands_list_all_freq = pd.concat(islands_list) 162 | 163 | y1 = GenVecForecastWithIslands(ts_list, islands) 164 | W, shrinkage = GenWeightMatrix(pred_list, true_list) 165 | smoothness = GenLamstar(pred_list, true_list) 166 | Phi = GenSmoothingMatrix(W, smoothness) 167 | 168 | y2 = Reconciliation(y1, W, Phi, C, d) 169 | 170 | # reshape vector y2 into df2 171 | y2 = y2.T.stack(future_stack=True) 172 | y2.index = y2.index.droplevel(level=0) 173 | df2_list = [] 174 | for df0 in df0_list: 175 | df2_freq = df0.copy() 176 | df2_freq.update(y2, overwrite=False) # fill only nan cells of df0 177 | df2_list.append(df2_freq) 178 | 179 | self.df0_list = df0_list 180 | self.df1_list = df1_list 181 | self.df2_list = df2_list 182 | return self.df2_list 183 | -------------------------------------------------------------------------------- /src/macroframe_forecast/examples.py: 
-------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 5 | 6 | 7 | from string import ascii_uppercase 8 | from pandas import DataFrame 9 | import numpy as np 10 | import pandas as pd 11 | from sktime.datasets import load_macroeconomic 12 | 13 | from macroframe_forecast import MFF, MFF_mixed_freqency 14 | 15 | # %% 16 | 17 | 18 | def example1(): # no constraints 19 | # load data 20 | # from sktime.datasets import load_macroeconomic 21 | df_true = load_macroeconomic().iloc[:, :5] 22 | 23 | # input dataframe 24 | df = df_true.copy() 25 | fh = 5 26 | df.iloc[-fh:, 0] = np.nan 27 | 28 | # apply MFF 29 | m = MFF(df, equality_constraints=[]) 30 | df2 = m.fit() 31 | df0 = m.df0 32 | df1 = m.df1 33 | df1_model = m.df1_model 34 | smoothness = m.smoothness 35 | shrinkage = m.shrinkage 36 | 37 | # plot results 38 | t0 = -30 39 | ax = df0.iloc[t0:, 0].plot(label="df0") 40 | df1.iloc[t0:, 0].plot(ax=ax, label="df1") 41 | df2.iloc[t0:, 0].plot(ax=ax, label="df2") 42 | df_true.iloc[t0:, 0].plot(ax=ax, label="df_true") 43 | ax.axvline(x=df0.index[-fh]) 44 | ax.legend() 45 | 46 | print("smoothness", smoothness.values) 47 | print("shrinkage", np.round(shrinkage, 3)) 48 | for ri, ci in np.argwhere(df.isna()): 49 | print(df1_model.index[ri], df1_model.columns[ci], df1_model.iloc[ri, ci].best_params_) 50 | 51 | 52 | # example 2: with constraints 53 | def example2(): 54 | # create data 55 | n = 30 56 | p = 3 57 | fh = 5 58 | df_true = pd.DataFrame( 59 | np.random.rand(n, p), 60 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 61 | index=pd.date_range(start="2000", periods=n, freq="YE").year, 62 | ) 63 | df_true.iloc[:, -1] = df_true.iloc[:, :-1].sum(axis=1) 64 | df = df_true.copy() 65 | df.iloc[-fh:, : np.ceil(p / 2).astype(int)] = np.nan 66 | df.iloc[-1, 0] = df_true.iloc[-1, 0] # island 67 | # df.iloc[-fh,-1] = df.iloc[:,-1].mean() 68 | # df.iloc[-3,1] = df_true.iloc[-3,1] # island 69 | equality_constraints = ["A0?+B0?-C0?"] 70 | # ineq_constraints_with_wildcard = ['A0?-0.5'] # A0 <=0.5 for all years 71 | 72 | # fit data 73 | m = MFF(df, equality_constraints=equality_constraints) 74 | df2 = m.fit() 75 | df0 = m.df0 76 | df1 = m.df1 77 | df1_model = m.df1_model 78 | shrinkage = m.shrinkage 79 | smoothness = m.smoothness 80 | # TODO: delete, the assignment below, if not needed 81 | W = m.W # noqa: F841 82 | for ri, ci in np.argwhere(df.isna()): 83 | print(df1_model.index[ri], df1_model.columns[ci], df1_model.iloc[ri, ci].best_params_) 84 | 85 | import matplotlib.pyplot as plt 86 | 87 | plt.figure() 88 | t0 = -20 89 | plt.subplot(2, 1, 1) 90 | ax = df0.iloc[t0:, 0].plot(label="df0") 91 | df1.iloc[t0:, 0].plot(ax=ax, label="df1") 92 | df2.iloc[t0:, 0].plot(ax=ax, label="df2") 93 | df_true.iloc[t0:, 0].plot(ax=ax, label="df_true") 94 | ax.axvline(x=df0.index[-fh]) 95 | 96 | plt.subplot(2, 1, 2) 97 | ax = df0.iloc[t0:, 1].plot(label="df0") 98 | df1.iloc[t0:, 1].plot(ax=ax, label="df1") 99 | df2.iloc[t0:, 1].plot(ax=ax, label="df2") 100 | df_true.iloc[t0:, 1].plot(ax=ax, label="df_true") 101 | ax.axvline(x=df0.index[-fh], label="fh=1") 102 | ax.legend(loc="lower left") 103 | 104 | print("smoothness", smoothness.values) 105 | print("shrinkage", np.round(shrinkage, 3)) 106 | 107 | # confirm 
constraints 108 | assert np.isclose(df2["A0"] + df2["B0"] - df2["C0"], 0).all() 109 | 110 | 111 | # example, mixed-frequency intra-inter-temporal constraints 112 | def example3(): 113 | import warnings 114 | 115 | warnings.filterwarnings("ignore", category=UserWarning) 116 | 117 | n = 120 118 | p = 3 119 | fhA = 5 120 | fhQ = 7 121 | dfQ_true = pd.DataFrame( 122 | np.random.rand(n, p), 123 | columns=[f"{L}{i}" for i in range(int(np.ceil(p / 26))) for L in ascii_uppercase][:p], 124 | index=pd.period_range(start="2000-1-1", periods=n, freq="Q"), 125 | ) 126 | dfQ_true.iloc[:, -1] = dfQ_true.iloc[:, :-1].sum(axis=1) 127 | dfA_true = dfQ_true.groupby(dfQ_true.index.year).sum() 128 | dfA_true.index = pd.PeriodIndex(dfA_true.index, freq="Y") 129 | 130 | dfA = dfA_true.copy() 131 | dfA.iloc[-fhA:, : np.ceil(p / 2).astype(int)] = np.nan 132 | 133 | dfQ = dfQ_true.iloc[:-12, :].copy() 134 | dfQ.iloc[-fhQ:, : np.ceil(p / 2).astype(int)] = np.nan 135 | 136 | # inputs 137 | df_dict = {"Y": dfA, "Q": dfQ} 138 | constraints_with_wildcard = ["A0?+B0?-C0?", "?Q1+?Q2+?Q3+?Q4-?"] 139 | 140 | mff = MFF_mixed_freqency(df_dict, constraints_with_wildcard=constraints_with_wildcard) 141 | df2_list = mff.fit() 142 | df1_list = mff.df1_list 143 | df0_list = mff.df0_list 144 | 145 | # plot results 146 | import matplotlib.pyplot as plt 147 | 148 | t0 = -30 149 | plt.subplot(2, 1, 1) 150 | ax = df0_list[1].iloc[t0:, 0].plot(label="df0") 151 | df1_list[1].iloc[t0:, 0].plot(ax=ax, label="df1") 152 | df2_list[1].iloc[t0:, 0].plot(ax=ax, label="df2") 153 | dfQ_true.iloc[t0:, 0].plot(ax=ax, label="df_true") 154 | ax.axvline(x=df0_list[1].index[-fhQ], label="fh=1") 155 | ax.legend(loc="lower left") 156 | 157 | plt.subplot(2, 1, 2) 158 | ax = df0_list[0].iloc[t0:, 0].plot(label="df0") 159 | df1_list[0].iloc[t0:, 0].plot(ax=ax, label="df1") 160 | df2_list[0].iloc[t0:, 0].plot(ax=ax, label="df2") 161 | dfA_true.iloc[t0:, 0].plot(ax=ax, label="df_true") 162 | ax.axvline(x=df0_list[0].index[-fhQ], label="fh=1") 163 | ax.legend(loc="lower left") 164 | 165 | # check constraints 166 | df2A = df2_list[0] 167 | df2Q = df2_list[1] 168 | df2A.eval("A0+B0-C0") 169 | (df2Q.resample("Y").sum() - df2A).dropna() 170 | 171 | 172 | def generate_example_GDP_df() -> DataFrame: 173 | """Utility function to generate example GDP data for quick demonstration purposes. 
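    The returned frame has an annual integer index named "year" and a single
    "GDP" column (US nominal GDP, rescaled to trillions of US dollars), with
    the last six years set to NaN so that they can be forecast.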
174 | 175 | Example: 176 | 177 | ```python 178 | from macroframe_forecast import MFF 179 | from macroframe_forecast.examples import generate_example_GDP_df 180 | 181 | df0 = generate_example_GDP_df() 182 | m = MFF(df0, equality_constraints=["GDP_2030 - 1.04 * GDP_2029"]) 183 | m.fit() 184 | ``` 185 | 186 | """ 187 | GDP_data_true = DataFrame( 188 | { 189 | "year": [ 190 | 1950, 191 | 1951, 192 | 1952, 193 | 1953, 194 | 1954, 195 | 1955, 196 | 1956, 197 | 1957, 198 | 1958, 199 | 1959, 200 | 1960, 201 | 1961, 202 | 1962, 203 | 1963, 204 | 1964, 205 | 1965, 206 | 1966, 207 | 1967, 208 | 1968, 209 | 1969, 210 | 1970, 211 | 1971, 212 | 1972, 213 | 1973, 214 | 1974, 215 | 1975, 216 | 1976, 217 | 1977, 218 | 1978, 219 | 1979, 220 | 1980, 221 | 1981, 222 | 1982, 223 | 1983, 224 | 1984, 225 | 1985, 226 | 1986, 227 | 1987, 228 | 1988, 229 | 1989, 230 | 1990, 231 | 1991, 232 | 1992, 233 | 1993, 234 | 1994, 235 | 1995, 236 | 1996, 237 | 1997, 238 | 1998, 239 | 1999, 240 | 2000, 241 | 2001, 242 | 2002, 243 | 2003, 244 | 2004, 245 | 2005, 246 | 2006, 247 | 2007, 248 | 2008, 249 | 2009, 250 | 2010, 251 | 2011, 252 | 2012, 253 | 2013, 254 | 2014, 255 | 2015, 256 | 2016, 257 | 2017, 258 | 2018, 259 | 2019, 260 | 2020, 261 | 2021, 262 | 2022, 263 | 2023, 264 | 2024, 265 | 2025, 266 | 2026, 267 | 2027, 268 | 2028, 269 | 2029, 270 | 2030, 271 | ], 272 | "GDP": [ 273 | 301782704906.154, 274 | 348993057004.926, 275 | 368027835977.609, 276 | 389147698401.843, 277 | 390276672099.46, 278 | 424868331217.657, 279 | 448388356231.708, 280 | 471707274214.225, 281 | 478166880805.205, 282 | 519476064642.104, 283 | 539899866168.654, 284 | 558583293630.287, 285 | 600454646133.34, 286 | 633368190949.311, 287 | 680153540812.135, 288 | 737201978910.734, 289 | 808045440847.441, 290 | 853883822469.0601, 291 | 933096436159.1281, 292 | 1008751520510.61, 293 | 1064366709379.28, 294 | 1155403629216.3, 295 | 1269884411457.22, 296 | 1418456050381.57, 297 | 1536647924378.57, 298 | 1674009506825.93, 299 | 1867242215504.46, 300 | 2079644632633.34, 301 | 2350400768409.49, 302 | 2627325000000.0, 303 | 2857325000000.0, 304 | 3207025000000.0, 305 | 3343800000000.0, 306 | 3634025000000.0, 307 | 4037650000000.0, 308 | 4339000000000.0, 309 | 4579625000000.0, 310 | 4855250000000.0, 311 | 5236425000000.0, 312 | 5641600000000.0, 313 | 5963125000000.0, 314 | 6158125000000.0, 315 | 6520325000000.0, 316 | 6858550000000.0, 317 | 7287250000000.0, 318 | 7639750000000.0, 319 | 8073125000000.0, 320 | 8577550000000.0, 321 | 9062825000000.0, 322 | 9631175000000.0, 323 | 10250950000000.0, 324 | 10581925000000.0, 325 | 10929100000000.0, 326 | 11456450000000.0, 327 | 12217175000000.0, 328 | 13039200000000.0, 329 | 13815600000000.0, 330 | 14474250000000.0, 331 | 14769850000000.0, 332 | 14478050000000.0, 333 | 15048975000000.0, 334 | 15599725000000.0, 335 | 16253950000000.0, 336 | 16880675000000.0, 337 | 17608125000000.0, 338 | 18295000000000.0, 339 | 18804900000000.0, 340 | 19612100000000.0, 341 | 20656525000000.0, 342 | 21539975000000.0, 343 | 21354125000000.0, 344 | 23681175000000.0, 345 | 26006900000000.0, 346 | 27720725000000.0, 347 | 29184900000000.0, 348 | 30507217002511.25, 349 | 31717641479090.75, 350 | 32941710359665.25, 351 | 34342131994149.0, 352 | 35712823521822.0, 353 | 37153089058192.75, 354 | ], 355 | } 356 | ) 357 | 358 | # The original GDP data is in dollar numbers, but changing this to billions 359 | # going forward in order to deal with problem of matrix invertibility. 
360 |     GDP_data_true["GDP"] = GDP_data_true["GDP"] / 1e12
361 | 
362 |     # Time period has to be set as the index. Here year is the time identifier,
363 |     # therefore setting this as the index.
364 |     GDP_data_true.set_index(GDP_data_true["year"], inplace=True)
365 |     GDP_data_true.drop(columns="year", inplace=True)
366 | 
367 |     # Creating a copy which is used for generating the forecasts.
369 |     GDP_data = GDP_data_true.copy()
370 |     # Removing the last six years of data so that they are forecasted by the
371 |     # function.
372 |     GDP_data.iloc[-6:, 0] = np.nan
373 |     return GDP_data
374 | 
--------------------------------------------------------------------------------
/docs/source/examples.rst:
--------------------------------------------------------------------------------
 1 | Examples
 2 | --------
 3 | 
 4 | Single-variable example
 5 | ~~~~~~~~~~~~~~~~~~~~~~~
 6 | 
 7 | 
 8 | .. code-block:: python
 9 | 
10 |     import pandas as pd
11 |     import numpy as np
12 |     from macroframe_forecast import MFF
13 |     import macroframe_forecast
14 |     from string import ascii_uppercase, ascii_lowercase
15 |     from sktime.datasets import load_macroeconomic
16 |     import matplotlib.pyplot as plt
17 | 
18 |     #%% Reading the data and generating forecasts.
19 | 
20 |     # Reading GDP data as a pandas dataframe.
21 |     # This dataframe has two columns: year and GDP. Data from 2025 to 2030 are WEO forecasts.
22 |     from pandas import DataFrame
23 | 
24 |     GDP_data_true = DataFrame({
25 |         "year": [
26 |             1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959,
27 |             1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969,
28 |             1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979,
29 |             1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989,
30 |             1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
31 |             2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009,
32 |             2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019,
33 |             2020, 2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029,
34 |             2030
35 |         ],
36 |         "GDP": [
37 |             301782704906.154, 348993057004.926, 368027835977.609, 389147698401.843,
38 |             390276672099.46, 424868331217.657, 448388356231.708, 471707274214.225,
39 |             478166880805.205, 519476064642.104, 539899866168.654, 558583293630.287,
40 |             600454646133.34, 633368190949.311, 680153540812.135, 737201978910.734,
41 |             808045440847.441, 853883822469.0601, 933096436159.1281, 1008751520510.61,
42 |             1064366709379.28, 1155403629216.3, 1269884411457.22, 1418456050381.57,
43 |             1536647924378.57, 1674009506825.93, 1867242215504.46, 2079644632633.34,
44 |             2350400768409.49, 2627325000000.0, 2857325000000.0, 3207025000000.0,
45 |             3343800000000.0, 3634025000000.0, 4037650000000.0, 4339000000000.0,
46 |             4579625000000.0, 4855250000000.0, 5236425000000.0, 5641600000000.0,
47 |             5963125000000.0, 6158125000000.0, 6520325000000.0, 6858550000000.0,
48 |             7287250000000.0, 7639750000000.0, 8073125000000.0, 8577550000000.0,
49 |             9062825000000.0, 9631175000000.0, 10250950000000.0, 10581925000000.0,
50 |             10929100000000.0, 11456450000000.0, 12217175000000.0, 13039200000000.0,
51 |             13815600000000.0, 14474250000000.0, 14769850000000.0, 14478050000000.0,
52 |             15048975000000.0, 15599725000000.0, 16253950000000.0, 16880675000000.0,
53 |             17608125000000.0, 18295000000000.0, 18804900000000.0, 19612100000000.0,
54 |             20656525000000.0, 21539975000000.0, 21354125000000.0, 23681175000000.0,
55 |             26006900000000.0, 27720725000000.0, 29184900000000.0, 30507217002511.25,
56 |             31717641479090.75, 32941710359665.25, 34342131994149.0, 35712823521822.0,
57 |             37153089058192.75
58 |         ]
59 |     })
60 | 
61 | 
62 |     # Forecasted GDP growth in 2030 (the last year) is given below.
63 |     final_year_growth = 100 * (GDP_data_true.iloc[-1, 1] / GDP_data_true.iloc[-2, 1] - 1)
64 | 
65 |     # The original GDP data is in dollar terms, but changing this to trillions
66 |     # going forward in order to deal with the problem of matrix invertibility.
67 |     GDP_data_true['GDP'] = GDP_data_true['GDP'] / 1e12
68 | 
69 |     # Time period has to be set as the index. Here year is the time identifier,
70 |     # therefore setting this as the index.
71 |     GDP_data_true.set_index(GDP_data_true['year'], inplace=True)
72 |     GDP_data_true.drop(columns='year', inplace=True)
73 | 
74 |     # Creating a copy which is used for generating the forecasts.
76 |     GDP_data = GDP_data_true.copy()
77 |     # Removing the last six years of data so that they are forecasted by the
78 |     # function.
79 |     GDP_data.iloc[-6:, 0] = np.nan
80 | 
81 |     # Now we assume that US GDP grows by 4% from 2029 to 2030, which is given by the
82 |     # WEO forecast. This therefore works as a constraint for the forecasts.
83 |     # The dataframe has GDP in levels terms, therefore the constraint has to be
84 |     # specified in levels terms as well. The constraint can be rewritten in the
85 |     # following steps:
86 |     # GDP_2030/GDP_2029 - 1 = 0.04
87 |     # GDP_2030 = 1.04*GDP_2029
88 |     # GDP_2030 - 1.04*GDP_2029 = 0
89 | 
90 |     # Constraints are to be provided in the form of a list, even when there is
91 |     # only one constraint.
92 |     GDP_constraint = ['GDP_2030 - 1.04*GDP_2029']
93 | 
94 |     m = MFF(df=GDP_data,
95 |             equality_constraints=GDP_constraint,
96 |             parallelize=False)
97 | 
98 |     # Using the fit method generates first as well as second step forecasts.
99 |     m.fit()
100 | 
101 |     # First step forecasts are stored as df1 in the fitted object.
102 |     firststep_GDP = m.df1
103 | 
104 |     # The reconciled second-step forecasts are stored as df2 in the fitted object.
105 |     reconciled_GDP = m.df2
106 | 
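    # Quick sanity check (an illustrative addition, not required for the workflow):
    # reconciliation enforces the equality constraint, so the reconciled path
    # should grow by approximately 4% from 2029 to 2030.
    implied_growth = 100 * (reconciled_GDP.loc[2030, 'GDP'] / reconciled_GDP.loc[2029, 'GDP'] - 1)
    print(f"Reconciled 2029-2030 growth: {implied_growth:.2f}%")  # expected ~4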
107 |     # Models used for each cell are stored in a dataframe in the fitted object.
108 | 
109 |     models_used = m.df1_model
110 |     models_used.iloc[-1, 0]  # inspect the model used for the last forecasted cell
111 |     #%% Plotting first and second step forecasts
112 |     fig, ax = plt.subplots(figsize=(8, 4.8))
113 | 
114 |     firststep_GDP['GDP'].plot(ax=ax, label='First-step forecasts', linestyle='--')
115 |     reconciled_GDP['GDP'].plot(ax=ax, label='Final forecasts', linestyle='-.')
116 |     GDP_data['GDP'].plot(ax=ax, label='Known values', color='red')
117 | 
118 |     ax.set_xlabel('Year')
119 |     ax.set_ylabel('US Nominal GDP (in US$ trn)')
120 |     ax.set_title('US GDP in levels')
121 |     ax.legend(loc='lower left')
122 | 
124 |     ax.set_xlim([2020, 2030])
125 |     ax.set_ylim([15, 40])
126 | 
127 |     plt.xticks(np.arange(2020, 2031, 2))
128 | 
129 |     plt.show()
130 | 
131 |     # %%
132 | 
133 |     firststep_GDP['GDP_growth'] = (firststep_GDP['GDP']/firststep_GDP['GDP'].shift(1) - 1)*100
134 |     reconciled_GDP['GDP_growth'] = (reconciled_GDP['GDP']/reconciled_GDP['GDP'].shift(1) - 1)*100
135 |     GDP_data['GDP_growth'] = (GDP_data['GDP']/GDP_data['GDP'].shift(1) - 1)*100
136 | 
137 |     fig, ax = plt.subplots(figsize=(8, 4.8))
138 | 
139 |     firststep_GDP['GDP_growth'].plot(ax=ax, label='First-step forecasts', linestyle='--')
140 |     reconciled_GDP['GDP_growth'].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
141 |     GDP_data['GDP_growth'].plot(ax=ax, label='Known values', color='red')
142 | 
143 |     ax.set_xlabel('Year')
144 |     ax.set_ylabel('Nominal GDP growth (annual, %)')
145 |     ax.set_title('US GDP growth rates')
146 |     ax.legend(loc='upper left')
147 | 
148 |     # Add triangle marker at (2030, 4)
149 |     ax.plot(2030, 4, marker='v', color='black', markersize=8, label='_nolegend_')
150 | 
151 |     # Add text annotation
152 |     ax.annotate('2030 growth constraint', xy=(2030, 4), xytext=(2030-2, 2.5),
153 |                 arrowprops=dict(arrowstyle='->', color='black'), color='black')
154 | 
156 |     ax.set_xlim([2019, 2031])
157 | 
158 |     plt.xticks(np.arange(2020, 2031, 2))
159 | 
160 |     plt.show()
161 | 
162 |     # %% Using externally generated first-stage forecasts
163 | 
164 |     GDP_forecasts_external = pd.DataFrame({"GDP": [29.0, 31.5, 33, 34.1, 36.8, 39]},
165 |                                           index=[2025, 2026, 2027, 2028, 2029, 2030])
166 | 
167 |     # Build MultiIndex using column name
168 |     multi_index = pd.MultiIndex.from_product([[GDP_forecasts_external.columns[0]], GDP_forecasts_external.index],
169 |                                              names=[None, 'year'])
170 | 
171 |     # Flatten the 2D values array to 1D so it aligns with the MultiIndex
172 |     GDP_multiindex_series = pd.Series(GDP_forecasts_external.values.ravel(), index=multi_index)
173 | 
174 | 
175 | 
176 |     W_alt = pd.DataFrame(np.eye(len(multi_index)), index=multi_index, columns=multi_index)  # Create identity matrix with shape (n x n)
177 | 
178 |     smoothness_alt = pd.Series(np.ones(1) * 100, index=[multi_index])
179 | 
180 |     Phi_alt = macroframe_forecast.utils.GenSmoothingMatrix(W_alt, smoothness_alt)
181 | 
182 | 
183 |     final_forecasts = macroframe_forecast.utils.Reconciliation(y1=GDP_multiindex_series,
184 |                                                                W=m.W, Phi=m.Phi,
185 |                                                                C=m.C, d=m.d,
186 |                                                                C_ineq=m.C_ineq,
187 |                                                                d_ineq=m.d_ineq)
188 |     # %%
189 | 
190 |     # Convert MultiIndex Series to regular Series with year index
191 | 
194 |     gdp_series = GDP_multiindex_series.xs('GDP', level=0)
195 |     second_stage_series = final_forecasts.xs('GDP', level=0)
196 | 
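    # Illustrative check (assumes Reconciliation returns the reconciled values so
    # that the .xs() above yields a year-ordered frame): the external forecasts
    # should be adjusted to satisfy the 2030 growth constraint carried in m.C and m.d.
    print(second_stage_series.iloc[-1, 0] / second_stage_series.iloc[-2, 0] - 1)  # ~0.04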
197 |     # Now plot it
198 |     fig, ax = plt.subplots(figsize=(8, 4.8))
199 |     gdp_series.plot(ax=ax, label='Externally generated first-step forecasts', linestyle='--')
200 |     second_stage_series.iloc[:, 0].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
201 | 
202 | 
203 | 
204 |     # Add labels and formatting
205 |     ax.set_xlabel('Year')
206 |     ax.set_ylabel('US Nominal GDP (in US$ trn)')
207 |     ax.set_title('US GDP in levels')
208 |     ax.legend(loc='upper left')
209 |     ax.set_xlim([2024, 2030])
210 |     ax.set_ylim([15, 40])
211 | 
212 | 
213 | Multi-variable example
214 | ~~~~~~~~~~~~~~~~~~~~~~
215 | 
216 | .. code-block:: python
217 | 
218 |     import pandas as pd
219 |     import numpy as np
220 |     from macroframe_forecast import MFF
221 |     import matplotlib.pyplot as plt
222 |     from sktime.forecasting.compose import DirectReductionForecaster
223 |     from sktime.forecasting.compose import ForecastingPipeline
224 |     from sklearn.linear_model import LinearRegression
225 |     from pandas import DataFrame
226 | 
227 |     fiscal_data_true = DataFrame({
228 |         "year": [
229 |             2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
230 |             2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
231 |             2021, 2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030
232 |         ],
233 |         "exp": [
234 |             32.801, 33.698, 34.037, 33.719, 33.928, 33.692, 34.562, 37.144, 41.399, 39.763,
235 |             38.796, 37.223, 35.782, 35.324, 35.031, 35.333, 35.194, 35.349, 35.819, 44.779,
236 |             43.218, 36.829, 37.113, 37.593, 37.848, 38.004, 38.107, 38.024, 37.711, 37.862
237 |         ],
238 |         "rev": [
239 |             32.257, 29.877, 29.266, 29.476, 30.853, 31.656, 31.649, 30.532, 28.222, 28.770,
240 |             29.080, 29.109, 31.222, 31.298, 31.501, 30.977, 30.400, 30.014, 30.014, 30.631,
241 |             31.827, 33.130, 29.949, 30.331, 31.389, 32.514, 32.754, 32.409, 32.222, 32.248
242 |         ],
243 |         "int_payments": [
244 |             3.255, 2.892, 2.658, 2.563, 2.704, 2.775, 2.933, 2.776, 2.574, 2.678,
245 |             2.880, 2.726, 2.485, 2.474, 2.341, 2.490, 2.522, 2.769, 2.817, 2.537,
246 |             2.669, 3.137, 3.600, 4.195, 4.301, 4.427, 4.451, 4.370, 4.353, 4.290
247 |         ],
248 |         "pb": [
249 |             2.711, -0.929, -2.113, -1.681, -0.371, 0.739, 0.020, -3.836, -10.603, -8.315,
250 |             -6.836, -5.387, -2.076, -1.552, -1.189, -1.867, -2.272, -2.566, -2.988, -11.610,
251 |             -8.721, -0.561, -3.564, -3.067, -2.158, -1.063, -0.902, -1.246, -1.136, -1.324
252 |         ]
253 |     })
254 | 
255 | 
256 |     # Data up to 2024 are known for all variables; from 2025 onwards the values
257 |     # are WEO forecasts. Let us assume that the path for Primary Balance/GDP is
258 |     # known to the forecaster, which is given by the WEO forecasts, while the
259 |     # other three variables are to be forecasted. These unknown values are
260 |     # therefore replaced by NaNs.
261 | 
262 |     fiscal_data = fiscal_data_true.copy()
263 |     fiscal_data.iloc[-6:, :3] = np.nan
264 | 
265 | 
266 |     # The basic accounting identity can be written as:
267 |     # Primary Balance/GDP = Revenue/GDP - Expenditure/GDP + Interest Payments/GDP
268 |     # We know that this identity has to bind throughout the forecasting horizon, and
269 |     # therefore we can specify this using the wildcard feature.
270 | 
271 |     fiscal_constraint = ['pb? - rev? + exp? - int_payments?',
272 |                          'exp_2030 - 37']
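    # For reference: the '?' wildcard is expanded over the forecast horizon, so the
    # identity above becomes one equality per forecasted year, e.g. for 2025:
    #     pb_2025 - rev_2025 + exp_2025 - int_payments_2025 = 0
    # The second constraint pins expenditure in the final year at 37% of GDP.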
273 | 
274 |     # Defining the OLS forecasting pipeline for the example
275 | 
276 |     ols = ForecastingPipeline(steps=[
277 |         ('ols', DirectReductionForecaster(LinearRegression()))
278 |     ])
279 | 
280 |     m = MFF(df=fiscal_data,
281 |             equality_constraints=fiscal_constraint,
282 |             forecaster=ols,
283 |             parallelize=False)
284 | 
285 |     m.fit()
286 | 
287 |     first_step_forecasts = m.df1
288 |     second_step_forecasts = m.df2
289 | 
291 |     # %% Expenditure forecasts
292 |     fig, ax = plt.subplots(figsize=(8, 4.8))
293 | 
294 |     first_step_forecasts['exp'].plot(ax=ax, label='First-step forecasts', linestyle='--')
295 |     second_step_forecasts['exp'].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
296 |     fiscal_data['exp'].plot(ax=ax, label='WEO values', color='red')
297 | 
298 |     ax.set_xlabel('Year')
299 |     ax.set_ylabel('US Government Expenditure to GDP ratio (%)')
300 |     ax.set_title('Government Expenditure')
301 |     ax.legend(loc='lower left')
302 | 
303 |     ax.plot(2030, 37, marker='v', color='black', markersize=8, label='_nolegend_')
304 | 
305 |     # Add text annotation at the 2030 constraint value (exp_2030 = 37)
306 |     ax.annotate('2030 expenditure constraint value', xy=(2030, 37), xytext=(2027, 40),
307 |                 arrowprops=dict(arrowstyle='->', color='black'), color='black')
308 | 
310 |     ax.set_xlim([2021, 2030])
311 | 
312 |     plt.xticks(np.arange(2021, 2031, 2))
313 | 
314 |     plt.show()
315 |     # %% Revenue forecasts
316 |     fig, ax = plt.subplots(figsize=(8, 4.8))
317 | 
318 |     first_step_forecasts['rev'].plot(ax=ax, label='First-step forecasts', linestyle='--')
319 |     second_step_forecasts['rev'].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
320 |     fiscal_data['rev'].plot(ax=ax, label='WEO values', color='red')
321 | 
322 |     ax.set_xlabel('Year')
323 |     ax.set_ylabel('US Government Revenue to GDP ratio (%)')
324 |     ax.set_title('Government Revenue')
325 |     ax.legend(loc='lower left')
326 | 
328 |     ax.set_xlim([2021, 2030])
329 | 
330 |     plt.xticks(np.arange(2021, 2031, 2))
331 | 
332 |     plt.show()
333 | 
334 |     # %% Interest Payment forecasts
335 |     fig, ax = plt.subplots(figsize=(8, 4.8))
336 | 
337 |     first_step_forecasts['int_payments'].plot(ax=ax, label='First-step forecasts', linestyle='--')
338 |     second_step_forecasts['int_payments'].plot(ax=ax, label='Second-step forecasts', linestyle='-.')
339 |     fiscal_data['int_payments'].plot(ax=ax, label='WEO values', color='red')
340 | 
341 |     ax.set_xlabel('Year')
342 |     ax.set_ylabel('US Government Interest Payments to GDP ratio (%)')
343 |     ax.set_title('Interest Payments')
344 |     ax.legend(loc='lower left')
345 | 
347 |     ax.set_xlim([2021, 2030])
348 | 
349 |     plt.xticks(np.arange(2021, 2031, 2))
350 | 
351 |     plt.show()
352 | 
353 | 
354 | 
355 |     # %% First step primary balance vs.
the constraints 356 | 357 | first_step_forecasts['pb_calculated'] = first_step_forecasts['rev'] - first_step_forecasts['exp'] + first_step_forecasts['int_payments'] 358 | second_step_forecasts['pb_calculated'] = second_step_forecasts['rev'] - second_step_forecasts['exp'] + second_step_forecasts['int_payments'] 359 | 360 | fig, ax = plt.subplots(figsize=(8, 4.8)) 361 | 362 | first_step_forecasts['pb_calculated'].plot(ax=ax, label='First-step forecast', linestyle = '--') 363 | second_step_forecasts['pb_calculated'].plot(ax=ax, label='Second-step forecast', linestyle = '-.') 364 | 365 | # fiscal_data[fiscal_data]['pb'].plot(ax=ax, label='WEO values', linestyle = '-.') 366 | fiscal_data[fiscal_data.index<2024]['pb'].plot(ax = ax, label = 'WEO values', color = 'red') 367 | fiscal_data[fiscal_data.index>=2024]['pb'].plot(ax = ax, label = 'Constraint values', color = 'green', marker = 'o', linestyle = 'None') 368 | 369 | 370 | ax.set_xlabel('Year') 371 | ax.set_ylabel('Primary Balance to GDP ratio (%)') 372 | ax.set_title('Primary Balance') 373 | ax.legend(loc = 'lower left') 374 | 375 | # max_xlastvalue = reconciled_GDP.index.max() 376 | ax.set_xlim([2021, 2030]) 377 | 378 | plt.xticks(np.arange(2021, 2031,2)) 379 | 380 | plt.show() 381 | # %% 382 | 383 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 3.0 IGO 2 | 3 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE LEGAL SERVICES. DISTRIBUTION OF THIS LICENSE DOES NOT CREATE AN ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES REGARDING THE INFORMATION PROVIDED, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM ITS USE. THE LICENSOR IS NOT NECESSARILY AN INTERGOVERNMENTAL ORGANIZATION (IGO), AS DEFINED IN THE LICENSE BELOW. 4 | License 5 | THE WORK (AS DEFINED BELOW) IS PROVIDED UNDER THE TERMS OF THIS CREATIVE COMMONS PUBLIC LICENSE (“LICENSE”). THE LICENSOR (DEFINED BELOW) HOLDS COPYRIGHT AND OTHER RIGHTS IN THE WORK. ANY USE OF THE WORK OTHER THAN AS AUTHORIZED UNDER THIS LICENSE IS PROHIBITED. 6 | 7 | BY EXERCISING ANY RIGHTS TO THE WORK PROVIDED HERE, YOU ACCEPT AND AGREE TO BE BOUND BY THE TERMS OF THIS LICENSE. THE LICENSOR GRANTS YOU THE RIGHTS CONTAINED HERE IN CONSIDERATION FOR YOUR ACCEPTANCE AND AGREEMENT TO THE TERMS OF THE LICENSE. 8 | 9 | 1. Definitions 10 | 11 | “IGO” means, solely and exclusively for purposes of this License, an organization established by a treaty or other instrument governed by international law and possessing its own international legal personality. Other organizations established to carry out activities across national borders and that accordingly enjoy immunity from legal process are also IGOs for the sole and exclusive purposes of this License. IGOs may include as members, in addition to states, other entities. 12 | "Work" means the literary and/or artistic work eligible for copyright protection, whatever may be the mode or form of its expression including digital form, and offered under the terms of this License. It is understood that a database, which by reason of the selection and arrangement of its contents constitutes an intellectual creation, is considered a Work. 13 | "Licensor" means the individual, individuals, entity or entities that offer(s) the Work under the terms of this License and may be, but is not necessarily, an IGO. 
14 | "You" means an individual or entity exercising rights under this License. 15 | "License Elements" means the following high-level license attributes as selected by the Licensor and indicated in the title of this License: Attribution, Noncommercial, ShareAlike. 16 | "Reproduce" means to make a copy of the Work in any manner or form, and by any means. 17 | "Distribute" means the activity of making publicly available the Work or Adaptation (or copies of the Work or Adaptation), as applicable, by sale, rental, public lending or any other known form of transfer of ownership or possession of the Work or copy of the Work. 18 | "Publicly Perform" means to perform public recitations of the Work and to communicate to the public those public recitations, by any means or process, including by wire or wireless means or public digital performances; to make available to the public Works in such a way that members of the public may access these Works from a place and at a place individually chosen by them; to perform the Work to the public by any means or process and the communication to the public of the performances of the Work, including by public digital performance; to broadcast and rebroadcast the Work by any means including signs, sounds or images. 19 | "Adaptation" means a work derived from or based upon the Work, or upon the Work and other pre-existing works. Adaptations may include works such as translations, derivative works, or any alterations and arrangements of any kind involving the Work. For purposes of this License, where the Work is a musical work, performance, or phonogram, the synchronization of the Work in timed-relation with a moving image is an Adaptation. For the avoidance of doubt, including the Work in a Collection is not an Adaptation. 20 | "Collection" means a collection of literary or artistic works or other works or subject matter other than works listed in Section 1(b) which by reason of the selection and arrangement of their contents, constitute intellectual creations, in which the Work is included in its entirety in unmodified form along with one or more other contributions, each constituting separate and independent works in themselves, which together are assembled into a collective whole. For the avoidance of doubt, a Collection will not be considered as an Adaptation. 21 | 2. Scope of this License. Nothing in this License is intended to reduce, limit, or restrict any uses free from copyright protection. 22 | 23 | 3. License Grant. Subject to the terms and conditions of this License, the Licensor hereby grants You a worldwide, royalty-free, non-exclusive license to exercise the rights in the Work as follows: 24 | 25 | to Reproduce, Distribute and Publicly Perform the Work, to incorporate the Work into one or more Collections, and to Reproduce, Distribute and Publicly Perform the Work as incorporated in the Collections; and, 26 | to create, Reproduce, Distribute and Publicly Perform Adaptations, provided that You clearly label, demarcate or otherwise identify that changes were made to the original Work. 27 | This License lasts for the duration of the term of the copyright in the Work licensed by the Licensor. The above rights may be exercised in all media and formats whether now known or hereafter devised. The above rights include the right to make such modifications as are technically necessary to exercise the rights in other media and formats. 
All rights not expressly granted by the Licensor are hereby reserved, including but not limited to the rights set forth in Section 4(e). 28 | 29 | 4. Restrictions. The license granted in Section 3 above is expressly made subject to and limited by the following restrictions: 30 | 31 | You may Distribute or Publicly Perform the Work only under the terms of this License. You must include a copy of, or the Uniform Resource Identifier (URI) for, this License with every copy of the Work You Distribute or Publicly Perform. You may not offer or impose any terms on the Work that restrict the terms of this License or the ability of the recipient of the Work to exercise the rights granted to that recipient under the terms of the License. You may not sublicense the Work (see section 8(a)). You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work You Distribute or Publicly Perform. When You Distribute or Publicly Perform the Work, You may not impose any effective technological measures on the Work that restrict the ability of a recipient of the Work from You to exercise the rights granted to that recipient under the terms of the License. This Section 4(a) applies to the Work as incorporated in a Collection, but this does not require the Collection apart from the Work itself to be made subject to the terms of this License. If You create a Collection, upon notice from a Licensor You must, to the extent practicable, remove from the Collection any credit (inclusive of any logo, trademark, official mark or official emblem) as required by Section 4(d), as requested. If You create an Adaptation, upon notice from a Licensor You must, to the extent practicable, remove from the Adaptation any credit (inclusive of any logo, trademark, official mark or official emblem) as required by Section 4(d), as requested. 32 | You may Distribute or Publicly Perform an Adaptation only under the terms of: (i) this License; (ii) a later version of this License with the same License Elements as this License; or (iii) either the unported Creative Commons license or a ported Creative Commons license (either this or a later license version) containing the same License Elements (the “Applicable License”). (I) You must include a copy of, or the URI for, the Applicable License with every copy of each Adaptation You Distribute or Publicly Perform. (II) You may not offer or impose any terms on the Adaptation that restrict the terms of the Applicable License or the ability of the recipient of the Adaptation to exercise the rights granted to that recipient under the terms of the Applicable License. (III) You must keep intact all notices that refer to this License and to the disclaimer of warranties with every copy of the Work as included in the Adaptation You Distribute or Publicly Perform. (IV) When You Distribute or Publicly Perform the Adaptation, You may not impose any effective technological measures on the Adaptation that restrict the ability of a recipient of the Adaptation from You to exercise the rights granted to that recipient under the terms of the Applicable License. This Section 4(b) applies to the Adaptation as incorporated in a Collection, but this does not require the Collection apart from the Adaptation itself to be made subject to the terms of the Applicable License. 
33 | You may not exercise any of the rights granted to You in Section 3 above in any manner that is primarily intended for or directed toward commercial advantage or private monetary compensation. The exchange of the Work for other copyrighted works by means of digital file-sharing or otherwise shall not be considered to be primarily intended for or directed toward commercial advantage or private monetary compensation, provided there is no payment of any monetary compensation in connection with the exchange of copyrighted works. 34 | If You Distribute, or Publicly Perform the Work or any Adaptations or Collections, You must, unless a request has been made pursuant to Section 4(a), keep intact all copyright notices for the Work and provide, reasonable to the medium or means You are utilizing: (i) any attributions that the Licensor indicates be associated with the Work as indicated in a copyright notice, (ii) the title of the Work if supplied; (iii) to the extent reasonably practicable, the URI, if any, that the Licensor specifies to be associated with the Work, unless such URI does not refer to the copyright notice or licensing information for the Work; and, (iv) consistent with Section 3(b), in the case of an Adaptation, a credit identifying the use of the Work in the Adaptation. The credit required by this Section 4(d) may be implemented in any reasonable manner; provided, however, that in the case of an Adaptation or Collection, at a minimum such credit will appear, if a credit for all contributors to the Adaptation or Collection appears, then as part of these credits and in a manner at least as prominent as the credits for the other contributors. For the avoidance of doubt, You may only use the credit required by this Section for the purpose of attribution in the manner set out above and, by exercising Your rights under this License, You may not implicitly or explicitly assert or imply any connection with, sponsorship or endorsement by the Licensor or others designated for attribution, of You or Your use of the Work, without the separate, express prior written permission of the Licensor or such others. 35 | For the avoidance of doubt: 36 | 37 | Non-waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme cannot be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License; 38 | Waivable Compulsory License Schemes. In those jurisdictions in which the right to collect royalties through any statutory or compulsory licensing scheme can be waived, the Licensor reserves the exclusive right to collect such royalties for any exercise by You of the rights granted under this License if Your exercise of such rights is for a purpose or use which is otherwise than noncommercial as permitted under Section 4(c) and otherwise waives the right to collect royalties through any statutory or compulsory licensing scheme; and, 39 | Voluntary License Schemes. To the extent possible, the Licensor waives the right to collect royalties from You for the exercise of the Licensed Rights, whether directly or through a collecting society under any voluntary licensing scheme. In all other cases the Licensor expressly reserves the right to collect such royalties. 
40 | Except as otherwise agreed in writing by the Licensor, if You Reproduce, Distribute or Publicly Perform the Work either by itself or as part of any Adaptations or Collections, You must not distort, mutilate, modify or take other derogatory action in relation to the Work which would be prejudicial to the honor or reputation of the Licensor where moral rights apply. 41 | 5. Representations, Warranties and Disclaimer 42 | 43 | THE LICENSOR OFFERS THE WORK AS-IS AND MAKES NO REPRESENTATIONS OR WARRANTIES OF ANY KIND CONCERNING THE WORK, EXPRESS, IMPLIED, STATUTORY OR OTHERWISE, INCLUDING, WITHOUT LIMITATION, WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR THE ABSENCE OF LATENT OR OTHER DEFECTS, ACCURACY, OR THE PRESENCE OF ERRORS, WHETHER OR NOT DISCOVERABLE. 44 | 45 | 6. Limitation on Liability 46 | 47 | IN NO EVENT WILL THE LICENSOR BE LIABLE TO YOU ON ANY LEGAL THEORY FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY DAMAGES ARISING OUT OF THIS LICENSE OR THE USE OF THE WORK, EVEN IF THE LICENSOR HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 48 | 49 | 7. Termination 50 | 51 | Subject to the terms and conditions set forth in this License, the license granted here lasts for the duration of the term of the copyright in the Work licensed by the Licensor as stated in Section 3. Notwithstanding the above, the Licensor reserves the right to release the Work under different license terms or to stop distributing the Work at any time; provided, however that any such election will not serve to withdraw this License (or any other license that has been, or is required to be, granted under the terms of this License), and this License will continue in full force and effect unless terminated as stated below. 52 | If You fail to comply with this License, then this License and the rights granted hereunder will terminate automatically upon any breach by You of the terms of this License. Individuals or entities who have received Adaptations or Collections from You under this License, however, will not have their licenses terminated provided such individuals or entities remain in full compliance with those licenses. Sections 1, 2, 5, 6, 7, and 8 will survive any termination of this License. Notwithstanding the foregoing, this License reinstates automatically as of the date the violation is cured, provided it is cured within 30 days of You discovering the violation, or upon express reinstatement by the Licensor. For the avoidance of doubt, this Section 7(b) does not affect any rights the Licensor may have to seek remedies for violations of this License by You. 53 | 8. Miscellaneous 54 | 55 | Each time You Distribute or Publicly Perform the Work or a Collection, the Licensor offers to the recipient a license to the Work on the same terms and conditions as the license granted to You under this License. 56 | Each time You Distribute or Publicly Perform an Adaptation, the Licensor offers to the recipient a license to the original Work on the same terms and conditions as the license granted to You under this License. 57 | If any provision of this License is invalid or unenforceable, it shall not affect the validity or enforceability of the remainder of the terms of this License, and without further action, such provision shall be reformed to the minimum extent necessary to make such provision valid and enforceable. 
58 | No term or provision of this License shall be deemed waived and no breach consented to unless such waiver or consent shall be in writing and signed by the Licensor. 59 | This License constitutes the entire agreement between You and the Licensor with respect to the Work licensed here. There are no understandings, agreements or representations with respect to the Work not specified here. The Licensor shall not be bound by any additional provisions that may appear in any communication from You. This License may not be modified without the mutual written agreement of the Licensor and You. 60 | The rights granted under, and the subject matter referenced, in this License were drafted utilizing the terminology of the Berne Convention for the Protection of Literary and Artistic Works (as amended on September 28, 1979), the Rome Convention of 1961, the WIPO Copyright Treaty of 1996, the WIPO Performances and Phonograms Treaty of 1996 and the Universal Copyright Convention (as revised on July 24, 1971). Interpretation of the scope of the rights granted by the Licensor and the conditions imposed on You under this License, this License, and the rights and conditions set forth herein shall be made with reference to copyright as determined in accordance with general principles of international law, including the above mentioned conventions. 61 | Nothing in this License constitutes or may be interpreted as a limitation upon or waiver of any privileges and immunities that may apply to the Licensor or You, including immunity from the legal processes of any jurisdiction, national court or other authority. 62 | Where the Licensor is an IGO, any and all disputes arising under this License that cannot be settled amicably shall be resolved in accordance with the following procedure: 63 | 64 | Pursuant to a notice of mediation communicated by reasonable means by either You or the Licensor to the other, the dispute shall be submitted to non-binding mediation conducted in accordance with rules designated by the Licensor in the copyright notice published with the Work, or if none then in accordance with those communicated in the notice of mediation. The language used in the mediation proceedings shall be English unless otherwise agreed. 65 | If any such dispute has not been settled within 45 days following the date on which the notice of mediation is provided, either You or the Licensor may, pursuant to a notice of arbitration communicated by reasonable means to the other, elect to have the dispute referred to and finally determined by arbitration. The arbitration shall be conducted in accordance with the rules designated by the Licensor in the copyright notice published with the Work, or if none then in accordance with the UNCITRAL Arbitration Rules as then in force. The arbitral tribunal shall consist of a sole arbitrator and the language of the proceedings shall be English unless otherwise agreed. The place of arbitration shall be where the Licensor has its headquarters. The arbitral proceedings shall be conducted remotely (e.g., via telephone conference or written submissions) whenever practicable. 66 | Interpretation of this License in any dispute submitted to mediation or arbitration shall be as set forth in Section 8(f), above. 67 | Creative Commons Notice 68 | Creative Commons is not a party to this License, and makes no warranty whatsoever in connection with the Work. 
Creative Commons will not be liable to You or any party on any legal theory for any damages whatsoever, including without limitation any general, special, incidental or consequential damages arising in connection to this license. Notwithstanding the foregoing two (2) sentences, if Creative Commons has expressly identified itself as the Licensor hereunder, it shall have all rights and obligations of the Licensor. 69 | 70 | Except for the limited purpose of indicating to the public that the Work is licensed under the CCPL, Creative Commons does not authorize the use by either party of the trademark "Creative Commons" or any related trademark or logo of Creative Commons without the prior written consent of Creative Commons. Any permitted use will be in compliance with Creative Commons' then-current trademark usage guidelines, as may be published on its website or otherwise made available upon request from time to time. For the avoidance of doubt, this trademark restriction does not form part of this License. 71 | 72 | Creative Commons may be contacted at https://creativecommons.org/ . 73 | -------------------------------------------------------------------------------- /src/macroframe_forecast/utils.py: -------------------------------------------------------------------------------- 1 | # Disclaimer: Reuse of this tool and IMF information does not imply 2 | # any endorsement of the research and/or product. Any research presented 3 | # should not be reported as representing the views of the IMF, 4 | # its Executive Board, member governments. 5 | 6 | import copy 7 | import re 8 | import warnings 9 | from random import sample, seed 10 | from string import ascii_lowercase 11 | from time import time 12 | from typing import Literal 13 | 14 | import cvxpy as cp 15 | import numpy as np 16 | import pandas as pd 17 | import scipy 18 | import sympy as sp 19 | from dask.distributed import Client 20 | from numpy import ndarray 21 | from numpy.linalg import inv 22 | from pandas import DataFrame, Index, PeriodIndex, Series 23 | from scipy.linalg import block_diag 24 | from sklearn.decomposition import PCA 25 | from sklearn.linear_model import ElasticNetCV, LinearRegression 26 | from sklearn.model_selection import TimeSeriesSplit 27 | from sklearn.preprocessing import StandardScaler 28 | from sktime.forecasting.base import BaseForecaster 29 | from sktime.forecasting.compose import ( 30 | DirectReductionForecaster, 31 | ForecastingPipeline, 32 | MultiplexForecaster, 33 | TransformedTargetForecaster, 34 | ) 35 | from sktime.forecasting.model_selection import ForecastingGridSearchCV 36 | from sktime.forecasting.naive import NaiveForecaster 37 | from sktime.split import ExpandingGreedySplitter 38 | from sktime.transformations.series.adapt import TabularToSeriesAdaptor 39 | from sktime.transformations.series.feature_selection import FeatureSelection 40 | 41 | # %% 42 | 43 | 44 | def CheckTrainingSampleSize(df0: DataFrame, n_forecast_error: int = 5) -> bool: 45 | """ 46 | Check sample size available for training window. Raise an exception if the 47 | number of observations available is too low. 48 | 49 | Parameters 50 | ---------- 51 | 52 | df0 : pd.DataFrame 53 | Input dataframe with island values replaced by nan. 54 | 55 | n_forecast_error : int 56 | Number of training and testing sets to split data into for generating 57 | matrix of forecast errors. 58 | 59 | Returns 60 | ------- 61 | 62 | small_sample : bool 63 | Indicator for whether the sample of observations available for training 64 | is small. 
65 | 
66 |     """
67 | 
68 |     forecast_horizon = max(np.argwhere(df0.isna())[:, 0]) - min(np.argwhere(df0.isna())[:, 0]) + 1
69 | 
70 |     minimum_training_obs = min(np.argwhere(df0.isna())[:, 0]) - forecast_horizon - n_forecast_error
71 | 
72 |     if minimum_training_obs <= 0:
73 |         raise ValueError(
74 |             "Number of observations too low for given forecast horizon "
75 |             "and n_sample_splits; consider reducing forecast horizon and/or "
76 |             "n_sample_splits"
77 |         )
78 | 
79 |     elif minimum_training_obs <= 15:
80 |         return True
81 | 
82 |     else:
83 |         return False
84 | 
85 | 
86 | def DefaultForecaster(small_sample: bool = False) -> BaseForecaster:
87 |     """
88 |     Set up the forecasting pipeline, specifying the scaling (transformation) to be
89 |     applied and the forecasting model to be used.
90 | 
91 |     Parameters
92 |     ----------
93 |     small_sample : boolean
94 |         Indicator for whether the sample of observations available for training
95 |         is small. By default this is set to False.
96 | 
97 |     Returns
98 |     -------
99 |     gscv : BaseForecaster
100 |         Instance of sktime's grid search forecaster, derived from BaseForecaster,
101 |         which is configured for hyperparameter tuning and model selection. If
102 |         ``small_sample`` is True, a naive last-value forecaster is returned instead.
103 |     """
104 | 
105 |     pipe_y_elasticnet = TransformedTargetForecaster(
106 |         steps=[
107 |             ("scaler", TabularToSeriesAdaptor(StandardScaler())),
108 |             ("forecaster", DirectReductionForecaster(ElasticNetCV(max_iter=5000,
109 |                                                                   cv=TimeSeriesSplit(n_splits=5)),
110 |                                                      window_length=5)),
111 |         ]
112 |     )
113 | 
114 |     pipe_yX_elasticnet = ForecastingPipeline(
115 |         steps=[
116 |             ("scaler", TabularToSeriesAdaptor(StandardScaler())),
117 |             ("pipe_y", pipe_y_elasticnet),
118 |         ]
119 |     )
120 | 
121 |     ols_1feature = ForecastingPipeline(
122 |         steps=[
123 |             ("feature_selection", FeatureSelection(n_columns=1)),
124 |             ("ols", DirectReductionForecaster(LinearRegression())),
125 |         ]
126 |     )
127 | 
128 |     ols_pca = ForecastingPipeline(
129 |         steps=[
130 |             ("pca", TabularToSeriesAdaptor(PCA(n_components=0.9))),
131 |             ("ols", DirectReductionForecaster(LinearRegression())),
132 |         ]
133 |     )
134 | 
135 |     # forecaster representation for selection among the listed models
136 |     forecaster = MultiplexForecaster(
137 |         forecasters=[
138 |             ("naive_drift", NaiveForecaster(strategy="drift", window_length=2)),
139 |             ("naive_last", NaiveForecaster(strategy="last")),
140 |             ("naive_mean", NaiveForecaster(strategy="mean", window_length=5)),
141 |             ("elasticnetcv", pipe_yX_elasticnet),
142 |             ("ols_1feature", ols_1feature),
143 |             ("ols_pca", ols_pca),
144 |         ]
145 |     )
146 | 
147 |     cv = ExpandingGreedySplitter(test_size=1, folds=5)
148 | 
149 |     # If the number of observations is small, grid search is not used for model
150 |     # selection. Instead, a naive last-value forecaster is used.
151 | 
152 |     if not small_sample:
153 |         gscv = ForecastingGridSearchCV(
154 |             forecaster=forecaster,
155 |             cv=cv,
156 |             param_grid={
157 |                 "selected_forecaster": [
158 |                     "naive_drift",
159 |                     "naive_last",
160 |                     "naive_mean",
161 |                     "elasticnetcv",
162 |                     "ols_1feature",
163 |                     "ols_pca",
164 |                 ]
165 |             },
166 |             backend=None,
167 |         )
168 | 
169 |     else:
170 |         gscv = NaiveForecaster(strategy="last")
171 | 
172 |     return gscv
173 | 
174 | 
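# A minimal usage sketch of the two helpers above (illustrative; it assumes an
# annual DataFrame `df0` whose trailing NaNs mark the forecast horizon):
#
#     small_sample = CheckTrainingSampleSize(df0)
#     forecaster = DefaultForecaster(small_sample=small_sample)
#     df1, df1_models = FillAllEmptyCells(df0, forecaster, parallelize=False)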
175 | def CleanIslands(df: DataFrame) -> tuple[DataFrame, Series]:
176 |     """
177 |     Separate island values from input dataframe, replacing them with nan.
178 |     Called by ``OrganizeCells``.
179 | 
180 |     Parameters
181 |     ----------
182 |     df : pd.DataFrame
183 |         Input dataframe with raw data.
184 | 
185 |     Returns
186 |     -------
187 |     df_no_islands : pd.DataFrame
188 |         Dataframe with island values replaced by nan.
189 | 
190 |     islands : pd.Series
191 |         Series containing island values.
192 | 
193 |     Examples
194 |     --------
195 |     >>> import numpy as np
196 |     >>> import pandas as pd
197 |     >>> n = 30
198 |     >>> p = 2
199 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
200 |     ...                   columns=['a','b'],
201 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
202 |     >>> df.iloc[-5:-1,:1] = np.nan
203 |     >>> df0, islands = CleanIslands(df)
204 | 
205 |     """
206 |     df_no_islands = df.copy()  # to keep original df as it is
207 |     col_with_islands = df.columns[df.isna().any()]
208 |     coli_list = [df_no_islands.columns.get_loc(col) for col in col_with_islands]
209 |     for coli in coli_list:  # for col with na
210 |         first_na_index = np.argwhere(df.iloc[:, coli].isna()).min()
211 |         df_no_islands.iloc[first_na_index:, coli] = np.nan
212 | 
213 |     islands: Series = df[df_no_islands.isna()].T.stack()
214 |     return df_no_islands, islands
215 | 
216 | 
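# Example of the "island" notion used above (illustrative): if column 'a' is NaN
# from 2025 onwards except for a known value in 2029, that 2029 value is an
# island. CleanIslands returns it separately and replaces it with NaN, so the
# first-step forecasts ignore it; AddIslandsToConstraints reimposes it later as
# an equality constraint.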
217 | def OrganizeCells(df: DataFrame) -> tuple[DataFrame, Series, Series, Series, Series]:
218 |     """
219 |     Extract island values (if existing) from input dataframe, replacing them
220 |     with nan values. This is useful for generating first step forecasts, which
221 |     disregard known island values for the prediction. Also returns separate
222 |     pandas Series containing the cell names of known and unknown values in the
223 |     input dataframe.
224 | 
225 |     Parameters
226 |     ----------
227 |     df : pd.DataFrame
228 |         Input dataframe with raw data.
229 | 
230 |     Returns
231 |     -------
232 |     df0 : pd.DataFrame
233 |         Dataframe with island values replaced by nan.
234 | 
235 |     all_cells : pd.Series
236 |         Series containing cell names of all cells in the input dataframe.
237 | 
238 |     unknown_cells : pd.Series
239 |         Series containing cell names of cells whose values are to be forecasted.
240 | 
241 |     known_cells : pd.Series
242 |         Series containing cell names of cells whose values are known.
243 | 
244 |     islands : pd.Series
245 |         Series containing island values.
246 | 
247 |     Examples
248 |     --------
249 |     >>> import numpy as np
250 |     >>> import pandas as pd
251 |     >>> n = 30
252 |     >>> p = 2
253 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
254 |     ...                   columns=['a','b'],
255 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
256 |     >>> df.iloc[-5:-1,:1] = np.nan
257 |     >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
258 |     """
259 | 
260 |     # clean islands
261 |     df0, islands = CleanIslands(df)
262 | 
263 |     # all cells in forecast horizon
264 |     all_cells_index = df0.T.stack(future_stack=True).index
265 |     all_cells = pd.Series([f"{a}_{b}" for a, b in all_cells_index], index=all_cells_index)
266 | 
267 |     # unknown cells with nan
268 |     unknown_cells_index = df0.isna()[df0.isna()].T.stack().index
269 |     unknown_cells = pd.Series([f"{a}_{b}" for a, b in unknown_cells_index], index=unknown_cells_index)
270 | 
271 |     # known cells
272 |     known_cells_index = all_cells_index.difference(unknown_cells_index)
273 |     known_cells = pd.Series([f"{a}_{b}" for a, b in known_cells_index], index=known_cells_index)
274 | 
275 |     return df0, all_cells, unknown_cells, known_cells, islands
276 | 
277 | 
278 | def find_permissible_wildcard(constraints_with_wildcard: list[str], _seed: int = 0) -> str:
279 |     """Generate a random letter, not present in the constraints, to be used as the wildcard."""
280 |     wild_card_length = 1
281 |     seed(_seed)
282 |     candidate = "".join(sample(ascii_lowercase, wild_card_length))
283 |     while candidate in "".join(constraints_with_wildcard):
284 |         wild_card_length = wild_card_length + 1
285 |         candidate = "".join(sample(ascii_lowercase, wild_card_length))
286 |     alphabet_wildcard = candidate
287 |     return alphabet_wildcard
288 | 
289 | 
290 | def find_strings_to_replace_wildcard(constraint: str, var_list: Series, wildcard: str) -> list[str]:
291 |     """Identify the list of strings that can be substituted for the wildcard character."""
292 | 
293 |     varlist_regex = ["^" + str(v).replace(wildcard, "(.*)") + "$" for v in sp.sympify(constraint).free_symbols]
294 |     missing_string_set_list = []
295 |     for w in varlist_regex:
296 |         missing_string = []
297 |         for v in var_list:
298 |             match = re.compile(w).search(v)
299 |             if match:
300 |                 missing_string.append(match.group(1))
301 |         missing_string_set_list.append(set(missing_string))
302 |     missing_string_list = list(set.intersection(*missing_string_set_list))
303 |     missing_string_list.sort()
304 | 
305 |     return missing_string_list
306 | 
307 | 
308 | def expand_wildcard(constraints_with_alphabet_wildcard: list[str], var_list: Series, wildcard: str):
309 |     """
310 |     Expand constraints with wildcard to all possible time periods. This is
311 |     called within ``StringToMatrixConstraints``, and the wildcard character
312 |     has already been replaced by a random letter before this function is
313 |     called.
314 | 
315 |     Parameters
316 |     ----------
317 |     constraints_with_alphabet_wildcard : list of str
318 |         Linear equality constraints with the wildcard string replaced
319 |         by letters.
320 |     var_list : list or pd.Series
321 |         Names of all cells (known and unknown) in the raw dataframe.
322 |     wildcard : str
323 |         Letter which has replaced the wildcard string in the constraints.
324 | 
325 |     Returns
326 |     -------
327 |     expanded_constraints : list
328 |         Expanded list of constraints over all time periods.
329 | 
330 |     Examples
331 |     --------
332 |     >>> import numpy as np
333 |     >>> import pandas as pd
334 |     >>> n = 30
335 |     >>> p = 2
336 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
337 |     ...                   columns=['a','b'],
338 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
339 |     >>> df0_stacked = df.T.stack()
340 |     >>> all_cells_index = df0_stacked.index
341 |     >>> var_list = pd.Series([f'{a}_{b}' for a, b in all_cells_index],
342 |     ...                      index=all_cells_index)
343 |     >>> constraints_with_alphabet_wildcard = ['ax + bx']
344 |     >>> alphabet_wildcard = 'x'
345 |     >>> constraints = expand_wildcard(constraints_with_alphabet_wildcard,
346 |     ...                               var_list=var_list,
347 |     ...                               wildcard=alphabet_wildcard)
348 | 
349 |     """
350 |     expanded_constraints = []
351 |     for constraint in constraints_with_alphabet_wildcard:
352 |         if wildcard not in constraint:
353 |             expanded_constraints.append(constraint)
354 |         else:
355 |             missing_string_list = find_strings_to_replace_wildcard(constraint, var_list, wildcard)
356 |             expanded_constraints += [constraint.replace(f"{wildcard}", m) for m in missing_string_list]
357 |     return expanded_constraints
358 | 
359 | 
360 | def StringToMatrixConstraints(
361 |     df0_stacked: Series,  # stacked df0 to accommodate mixed frequency
362 |     all_cells: Series,
363 |     unknown_cells: Series,
364 |     known_cells: Series,
365 |     constraints_with_wildcard: list[str] | None = None,
366 |     wildcard_string: str = "?",
367 | ) -> tuple[DataFrame, DataFrame]:
368 |     """
369 |     Convert equality constraints from list to matrix form for horizons to
370 |     be forecasted (Cy = d, where C and d are dataframes containing the
371 |     linear constraints). The input data should not be in standard wide
372 |     format; instead, all columns should be stacked on one another. This is
373 |     needed to handle the case of mixed frequencies among observations. All
374 |     island values in the input dataframe should be replaced by nan prior
375 |     to this step.
376 | 
377 |     Parameters
378 |     ----------
379 |     df0_stacked : pd.Series
380 |         Stacked version of df0 (input dataframe with islands removed).
381 |     all_cells : pd.Series
382 |         Series containing cell names of all cells in the input dataframe.
383 |     unknown_cells : pd.Series
384 |         Series containing cell names of cells whose values are to be forecasted.
385 |     known_cells : pd.Series
386 |         Series containing cell names of cells whose values are known.
387 |     constraints_with_wildcard : list of str, optional
388 |         Strings specifying equality constraints that have to hold.
389 |         The default is None.
390 |     wildcard_string : str, optional
391 |         String that is used as wildcard identifier in constraints.
392 |         The default is '?'.
393 | 
394 |     Returns
395 |     -------
396 |     C: pd.DataFrame
397 |         Dataframe containing matrix of the linear constraints on the left side of
398 |         equation Cy=d.
399 |     d: pd.DataFrame
400 |         Dataframe containing matrix of the linear constraints on the right side of
401 |         equation Cy=d.
402 | 
403 |     Examples
404 |     --------
405 |     >>> import numpy as np
406 |     >>> import pandas as pd
407 |     >>> n = 30
408 |     >>> p = 2
409 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
410 |     ...                   columns=['a','b'],
411 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
412 |     >>> df.iloc[-5:-1,:1] = np.nan
413 |     >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
414 |     >>> df0_stacked = df0.T.stack()
415 |     >>> constraints_with_wildcard = ['a?+b?']
416 |     >>> C,d = StringToMatrixConstraints(df0_stacked,
417 |     ...                                 all_cells,
418 |     ...                                 unknown_cells,
419 |     ...                                 known_cells,
420 |     ...                                 constraints_with_wildcard)
421 |     """
422 | 
423 |     if constraints_with_wildcard is None:
424 |         constraints_with_wildcard = list()
425 | 
426 |     # replace wildcard with alphabet to utilize sympy
427 |     alphabet_wildcard = find_permissible_wildcard(constraints_with_wildcard)
428 |     constraints_with_alphabet_wildcard = [
429 |         c.replace(wildcard_string, alphabet_wildcard) for c in constraints_with_wildcard
430 |     ]
431 | 
432 |     # expand constraints using all cells at forecast horizon
433 |     constraints = expand_wildcard(
434 |         constraints_with_alphabet_wildcard, var_list=all_cells.tolist(), wildcard=alphabet_wildcard
435 |     )
436 | 
437 |     # obtain C_unknown by differentiating constraints wrt unknown cells with nan
438 |     A, b = sp.linear_eq_to_matrix(constraints, sp.sympify(unknown_cells.tolist()))
439 |     C = pd.DataFrame(np.array(A).astype(float), index=constraints, columns=unknown_cells.index)
440 |     nonzero_rows = (C != 0).any(axis=1)
441 |     C = C.loc[nonzero_rows]  # drop rows with all zeros
442 | 
443 |     # obtain d_unknown by substituting known cells
444 |     known_cell_dict = pd.Series(
445 |         [df0_stacked.loc[idx] for idx in known_cells.index], index=known_cells.tolist()
446 |     ).to_dict()
447 |     d = pd.DataFrame(np.array(b.subs(known_cell_dict)).astype(float), index=constraints)
448 |     d = d.loc[nonzero_rows]  # drop rows with all zeros in C
449 | 
450 |     return C, d
451 | 
452 | 
453 | def AddIslandsToConstraints(C: DataFrame, d: DataFrame, islands: Series) -> tuple[DataFrame, DataFrame]:
454 |     """
455 |     Add island values into the matrix form equality constraints which have been
456 |     constructed by ``StringToMatrixConstraints``.
457 | 
458 |     Parameters
459 |     ----------
460 |     C : pd.DataFrame
461 |         Dataframe containing matrix of the linear constraints on the left side of
462 |         equation Cy=d.
463 |     d : pd.DataFrame
464 |         Dataframe containing matrix of the linear constraints on the right side of
465 |         equation Cy=d.
466 |     islands : pd.Series
467 |         Series containing island values to be introduced into the linear equation.
468 | 
469 |     Returns
470 |     -------
471 |     C_aug : pd.DataFrame
472 |         Dataframe containing the augmented C matrix, with island values incorporated.
473 |     d_aug : pd.DataFrame
474 |         Dataframe containing the augmented d vector, with island values incorporated.
475 | 
476 |     Examples
477 |     --------
478 |     >>> import numpy as np
479 |     >>> import pandas as pd
480 |     >>> n = 30
481 |     >>> p = 2
482 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
483 |     ...                   columns=['a','b'],
484 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
485 |     >>> df.iloc[-5:-1,:1] = np.nan
486 |     >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
487 |     >>> df0_stacked = df0.T.stack()
488 |     >>> constraints_with_wildcard = ['a?+b?']
489 |     >>> C,d = StringToMatrixConstraints(df0_stacked,
490 |     ...                                 all_cells,
491 |     ...                                 unknown_cells,
492 |     ...                                 known_cells,
493 |     ...                                 constraints_with_wildcard)
494 |     >>> C,d = AddIslandsToConstraints(C,d,islands)
495 |     """
496 |     C_aug_index = islands.index.union(C.index, sort=False)  # singleton constraints prioritize over islands
497 |     C_aug = pd.DataFrame(np.zeros([len(C_aug_index), len(C.columns)]), index=C_aug_index, columns=C.columns)
498 |     d_aug = pd.DataFrame(np.zeros([len(C_aug_index), 1]), index=C_aug_index)
499 |     for idx in islands.index:
500 |         C_aug.loc[C_aug.index == idx, idx] = 1
501 |         d_aug.loc[d_aug.index == idx] = islands.loc[idx]
502 |     C_aug.update(C)
503 |     d_aug.update(d)
504 | 
505 |     return C_aug, d_aug
506 | 
507 | 
508 | def FillAnEmptyCell(
509 |     df: DataFrame, row: int | str, col: int | str, forecaster: BaseForecaster
510 | ) -> tuple[float, BaseForecaster]:
511 |     """
512 |     Generate a forecast for a given cell based on the latest known value
513 |     for the given column (variable) and using the predefined forecasting pipeline.
514 |     Called by ``FillAllEmptyCells``.
515 | 
516 |     Parameters
517 |     ----------
518 |     df : pd.DataFrame
519 |         Dataframe containing known values of all variables and nan for
520 |         unknown values.
521 |     row : int or str
522 |         Row index of cell to be forecasted.
523 |     col : int or str
524 |         Column index of cell to be forecasted.
525 |     forecaster : BaseForecaster
526 |         sktime BaseForecaster descendant used to generate the forecast.
527 | 
528 |     Returns
529 |     -------
530 |     y_pred : float
531 |         Forecasted value of the variable for the given horizon.
532 |     forecaster : BaseForecaster
533 |         Fitted clone of the supplied sktime forecaster.
534 | 
535 |     Examples
536 |     --------
537 |     >>> from string import ascii_lowercase
538 |     >>> import numpy as np
539 |     >>> import pandas as pd
540 |     >>> from sklearn.linear_model import ElasticNetCV
541 |     >>> from sktime.forecasting.compose import YfromX
542 |     >>> n = 30
543 |     >>> p = 2
544 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
545 |     ...                   columns=list(ascii_lowercase[:p]),
546 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
547 |     >>> df.iloc[-5:,:1] = np.nan
548 |     >>> row = df.index[-1]
549 |     >>> col = df.columns[0]
550 |     >>> forecaster = YfromX(ElasticNetCV())
551 |     >>> y_pred, forecaster = FillAnEmptyCell(df, row, col, forecaster)
552 |     """
553 |     warnings.filterwarnings("ignore", category=UserWarning)
554 | 
555 |     # clone a forecaster
556 |     f = forecaster.clone()
557 | 
558 |     # last historical data and forecast horizon in num
559 |     T = np.argwhere(df.loc[:, col].isna()).min() - 1
560 |     h = np.where(df.index == row)[0][0] - T
561 | 
562 |     y = df.iloc[:T, :].loc[:, col]
563 | 
564 |     X = df.iloc[: T + h].drop(columns=[col]).dropna(axis=1)
565 |     X_train = X.iloc[:T, :]
566 |     X_pred = X.iloc[T:, :]
567 | 
568 |     y_pred = f.fit(y=y, X=X_train, fh=h).predict(X=X_pred)
569 | 
570 |     return y_pred, f
571 | 
572 | def FillAllEmptyCells(
573 |     df: DataFrame, forecaster: BaseForecaster, parallelize: bool = True
574 | ) -> tuple[DataFrame, DataFrame]:
575 |     """
576 |     Generate forecasts for all unknown cells in the supplied dataframe.
577 |     All forecasts are made independently of each other.
578 | 
579 |     Parameters
580 |     ----------
581 |     df : pd.DataFrame
582 |         Dataframe containing known values of all variables and nan for
583 |         unknown values.
584 | 
585 |     forecaster : BaseForecaster
586 |         sktime BaseForecaster descendant
587 | 
588 |     parallelize : boolean
589 |         Indicate whether parallelization should be employed for generating the
590 |         first step forecasts. Default value is `True`.
591 | 
592 |     Returns
593 |     -------
594 |     df1 : pd.DataFrame
595 |         Dataframe with all known cells, as well as unknown cells filled in by
596 |         one-step forecasts.
597 |     df1_model : pd.DataFrame
598 |         Dataframe with all known cells, with unknown cells containing details
599 |         of the forecaster used for generating the forecast of that cell.
600 | 
601 |     Examples
602 |     --------
603 |     >>> from string import ascii_lowercase
604 |     >>> import numpy as np
605 |     >>> import pandas as pd
606 |     >>> from sklearn.linear_model import ElasticNetCV
607 |     >>> from sktime.forecasting.compose import YfromX
608 |     >>> from macroframe_forecast.utils import FillAllEmptyCells
609 |     >>> n = 30
610 |     >>> p = 2
611 |     >>> df = pd.DataFrame(np.random.sample([n,p]),
612 |     ...                   columns=list(ascii_lowercase[:p]),
613 |     ...                   index=pd.date_range(start='2000',periods=n,freq='YE').year)
614 |     >>> df.iloc[-5:,:1] = np.nan
615 |     >>> def DefaultForecaster():
616 |     ...     return YfromX(ElasticNetCV(max_iter=5000))
617 |     >>> df1, df1_models = FillAllEmptyCells(df, DefaultForecaster())
618 | 
619 |     """
620 | 
621 |     # get indices of all np.nan cells
622 |     na_cells = [(df.index[rowi], df.columns[coli]) for rowi, coli in np.argwhere(df.isna())]
623 | 
624 |     # apply dask
625 |     if parallelize:
626 |         start = time()
627 |         client = Client()
628 |         df_future = client.scatter(df, broadcast=True)
629 |         forecaster_future = client.scatter(forecaster, broadcast=True)
630 |         futures = [client.submit(FillAnEmptyCell, df_future, row, col, forecaster_future)
631 |                    for (row, col) in na_cells]
632 |         results = client.gather(futures)
633 |         client.close()
634 |         print("Dask filled", len(results), "out-of-sample cells:", round(time() - start, 3), "seconds")
635 | 
636 |     else:
637 |         start = time()
638 |         results = [FillAnEmptyCell(df, row, col, forecaster) for row, col in na_cells]
639 |         print("Forecast", len(results), "cells:", round(time() - start, 3), "seconds")
640 | 
641 |     # fill empty cells
642 |     df1 = df.copy()
643 |     df1_models = df.copy().astype(object)
644 |     for idx, rowcol in enumerate(na_cells):
645 |         df1.loc[rowcol] = results[idx][0].iloc[0]
646 |         df1_models.loc[rowcol] = results[idx][1]
647 | 
648 |     return df1, df1_models
649 | 
650 | 
651 | def GenPredTrueData(
652 |     df: DataFrame, forecaster: BaseForecaster, n_forecast_error: int = 5, parallelize: bool = True
653 | ) -> tuple[DataFrame, DataFrame, DataFrame]:
654 |     """
655 |     Generate in-sample forecasts from existing data by constructing
656 |     pseudo-historical datasets.
657 | 
658 |     Parameters
659 |     ----------
660 |     df : pd.DataFrame
661 |         Dataframe with all known as well as unknown values.
662 |     forecaster : BaseForecaster
663 |         sktime BaseForecaster descendant.
664 |     n_forecast_error : int, optional
665 |         Number of horizons for which in-sample forecasts are generated.
666 |         The default is 5.
667 |     parallelize : boolean, optional
668 |         Indicate whether parallelization should be used. The default is True.
669 | 
670 |     Returns
671 |     -------
672 |     pred : pd.DataFrame
673 |         Dataframe with in-sample predictions generated using pseudo-historical
674 |         datasets.
    true : pd.DataFrame
        Dataframe with actual values of the variable corresponding to the
        predicted values contained in pred.
    model : pd.DataFrame
        Dataframe with information on the models used for generating each
        forecast.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sktime.forecasting.compose import YfromX
    >>> from sklearn.linear_model import ElasticNetCV
    >>> n = 30
    >>> p = 2
    >>> df = pd.DataFrame(np.random.sample([n, p]),
    ...                   columns=['a', 'b'],
    ...                   index=pd.date_range(start='2000', periods=n, freq='YE').year)
    >>> df.iloc[-5:, :1] = np.nan
    >>> def DefaultForecaster():
    ...     return YfromX(ElasticNetCV(max_iter=5000))
    >>> pred, true, model = GenPredTrueData(df, DefaultForecaster(), parallelize=False)
    """

    # position of the last fully observed row (T) and the maximum forecast horizon (h)
    T = min(np.argwhere(df.isna())[:, 0]) - 1
    h = max(np.argwhere(df.isna())[:, 0]) - T

    # create pseudo-historical dataframes: impose the nan pattern, shifted back
    # by h + n periods, onto known data so the "future" values are observable
    df_list = [df.shift(-h - n).mask(df.shift(-h - n).notna(), df).iloc[: -h - n, :] for n in range(n_forecast_error)]

    # unpack all the na cells of the pseudo-historical dataframes into dask tasks
    tasks = [
        (dfi, df_n.index[rowi], df_n.columns[coli])
        for dfi, df_n in enumerate(df_list)
        for (rowi, coli) in np.argwhere(df_n.isna())
    ]

    if parallelize:
        start = time()
        client = Client()
        df_futures = client.scatter(df_list, broadcast=True)
        forecaster_future = client.scatter(forecaster, broadcast=True)
        futures = [client.submit(FillAnEmptyCell, df_futures[dfi], row, col, forecaster_future) for (dfi, row, col) in tasks]
        results = client.gather(futures)
        client.close()
        print("Dask filled", len(results), "in-sample cells:", round(time() - start, 3), "seconds")
    else:
        start = time()
        results = [FillAnEmptyCell(df_list[dfi], row, col, forecaster) for (dfi, row, col) in tasks]
        print("Fill", len(results), "in-sample cells:", round(time() - start, 3), "seconds")

    # repackage the results by filling the na cells of df_list
    filled_list = copy.deepcopy(df_list)
    model_list = [dfl.astype(object) for dfl in copy.deepcopy(df_list)]
    for task_idx, task in enumerate(tasks):
        dfi, row, col = task
        filled_list[dfi].loc[row, col] = results[task_idx][0].iloc[0]
        model_list[dfi].loc[row, col] = results[task_idx][1]

    # reduce the n_forecast_error samples into dataframes: columns index the
    # forecasted cells, rows index the last pseudo-historical period
    colname = df.isna()[df.isna()].T.stack().index
    idxname = pd.Index(
        [df_list[n].index[np.argwhere(df_list[n].isna())[:, 0].min()] for n in range(n_forecast_error)], name="LastData"
    )
    pred = pd.DataFrame(
        [filled_list[n][df_list[n].isna()].T.stack().values for n in range(n_forecast_error)],
        index=idxname,
        columns=colname,
    )
    model = pd.DataFrame(
        [model_list[n][df_list[n].isna()].T.stack().values for n in range(n_forecast_error)],
        index=idxname,
        columns=colname,
    )
    true = pd.DataFrame(
        [df[df_list[n].isna()].T.stack().values for n in range(n_forecast_error)], index=idxname, columns=colname
    )

    return pred, true, model


def BreakDataFrameIntoTimeSeriesList(
    df0: DataFrame, df1: DataFrame, pred: DataFrame, true: DataFrame
) -> tuple[list[Series], list[DataFrame], list[DataFrame]]:
    """Transform relevant dataframes into lists for the ensuing reconciliation step.

    Parameters
    ----------
    df0 : pd.DataFrame
        Dataframe with all known and unknown values, without any islands.
    df1 : pd.DataFrame
        Dataframe with unknown values as well as islands filled in with
        first-step forecasts.
    pred : pd.DataFrame
        Dataframe with in-sample predictions generated using pseudo-historical
        datasets, output from ``GenPredTrueData``.
    true : pd.DataFrame
        Dataframe with actual values of the variable corresponding to the
        predicted values contained in pred.

    Returns
    -------
    ts_list : list
        List containing all first-step out-of-sample forecasts.
    pred_list : list
        List of dataframes, with each dataframe containing in-sample forecasts
        for one variable.
    true_list : list
        List of dataframes, with each dataframe containing the actual values
        for a variable corresponding to the in-sample predictions stored in
        pred_list.

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sktime.forecasting.compose import YfromX
    >>> from sklearn.linear_model import ElasticNetCV
    >>> n = 30
    >>> p = 2
    >>> df = pd.DataFrame(np.random.sample([n, p]),
    ...                   columns=['a', 'b'],
    ...                   index=pd.date_range(start='2000', periods=n, freq='YE').year)
    >>> df.iloc[-5:, :1] = np.nan
    >>> def DefaultForecaster():
    ...     return YfromX(ElasticNetCV(max_iter=5000))
    >>> df1, df1_models = FillAllEmptyCells(df, DefaultForecaster())
    >>> pred, true, model = GenPredTrueData(df, DefaultForecaster(), parallelize=False)
    >>> ts_list, pred_list, true_list = BreakDataFrameIntoTimeSeriesList(df, df1, pred, true)
    """
    ts_list = [df1[df0.isna()].loc[:, col:col].dropna().T.stack() for col in df0.columns[df0.isna().any()]]
    pred_list = [pred.loc[:, ts.index] for ts in ts_list]
    true_list = [true.loc[:, ts.index] for ts in ts_list]

    return ts_list, pred_list, true_list


def HP_matrix(size: int) -> ndarray:
    """
    Create the degenerate penta-diagonal matrix used in the HP filter,
    with dimensions (size x size).

    Parameters
    ----------
    size : int
        Number of rows (and columns) of the square matrix.

    Returns
    -------
    F : np.ndarray
        Array containing the F matrix.

    """
    if size >= 2:
        # D is the (size - 2) x size second-difference operator; F = D'D
        D = np.zeros((size - 2, size))
        for i in range(size - 2):
            D[i, i] = 1
            D[i, i + 1] = -2
            D[i, i + 2] = 1
        F = D.T @ D
    elif size == 1:
        F = np.zeros([1, 1])
    else:
        raise ValueError("size must be a positive integer")
    return F


def GenVecForecastWithIslands(ts_list: list[Series], islands: Series) -> Series:
    """Overwrite forecasted values for islands with the known island values.

    Parameters
    ----------
    ts_list : list
        List of all first-step forecasted values.
    islands : pd.Series
        Series containing island values.

    Returns
    -------
    y1 : pd.Series
        Series of forecasted values with island values incorporated.
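
    Notes
    -----
    Island handling relies purely on pandas index alignment. A minimal
    sketch of the behaviour (plain pandas, not package API):

    >>> import pandas as pd
    >>> base = pd.Series([1.0, 2.0],
    ...                  index=pd.MultiIndex.from_tuples([('a', 2024), ('a', 2025)]))
    >>> fixed = pd.Series([9.0], index=pd.MultiIndex.from_tuples([('a', 2025)]))
    >>> base.update(fixed)  # only the ('a', 2025) cell is overwritten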

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sktime.forecasting.compose import YfromX
    >>> from sklearn.linear_model import ElasticNetCV
    >>> n = 30
    >>> p = 2
    >>> df = pd.DataFrame(np.random.sample([n, p]),
    ...                   columns=['a', 'b'],
    ...                   index=pd.date_range(start='2000', periods=n, freq='YE').year)
    >>> df.iloc[-5:-1, :1] = np.nan
    >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
    >>> def DefaultForecaster():
    ...     return YfromX(ElasticNetCV(max_iter=5000))
    >>> df1, df1_models = FillAllEmptyCells(df, DefaultForecaster(), parallelize=False)
    >>> ts_list = [df1[df0.isna()].loc[:, col:col].dropna().T.stack()
    ...            for col in df0.columns[df0.isna().any()]]
    >>> y1 = GenVecForecastWithIslands(ts_list, islands)
    """
    try:
        y1 = pd.concat(ts_list, axis=0)

    except Exception:  # mixed-frequency case: pd.concat can't handle several mixed-frequency series
        y1 = ConcatMixFreqMultiIndexSeries(ts_list, axis=0)

    y1.update(islands)

    return y1


def GenWeightMatrix(
    pred_list: list[DataFrame],
    true_list: list[DataFrame],
    shrinkage_method: Literal["identity", "oas", "oasd", "monotone diagonal"] = "oas",
) -> tuple[DataFrame, float]:
    """
    Generate the weighting matrix based on in-sample forecasts and actual
    values for the corresponding periods.

    Parameters
    ----------
    pred_list : list
        List of dataframes, with each dataframe containing in-sample forecasts
        for one variable.
    true_list : list
        List of dataframes, with each dataframe containing the actual values
        for a variable corresponding to the in-sample predictions stored in
        pred_list.
    shrinkage_method : str, optional
        Algorithm used for shrinking the covariance matrix; one of
        'identity', 'oas', 'oasd' and 'monotone diagonal'. The default is
        'oas'.

    Returns
    -------
    W : pd.DataFrame
        Weighting matrix to be used for reconciliation.
    shrinkage : float
        Shrinkage parameter associated with the weight. NaN in case
        'identity' or 'monotone diagonal' is selected as method.
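
    Notes
    -----
    For ``shrinkage_method='oasd'`` the sample covariance :math:`S` of the
    in-sample forecast errors is shrunk toward its diagonal
    :math:`D = \operatorname{diag}(S)`:

    .. math::

       \phi = \frac{\operatorname{tr}(S^2) - \operatorname{tr}(D^2)}
                   {\operatorname{tr}(S^2) + \operatorname{tr}(S)^2
                    - 2\operatorname{tr}(D^2)},
       \qquad
       \rho = \min\!\Big(\frac{1}{n\,\phi},\, 1\Big),
       \qquad
       W = (1 - \rho)\,S + \rho\,D,

    where :math:`n` is the number of in-sample error observations. The
    ``'oas'`` option delegates to ``sklearn.covariance.OAS``.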

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> pred_list = [pd.DataFrame(np.random.rand(5, 5), columns=[f'Col{i+1}' for i in range(5)]) for _ in range(2)]
    >>> true_list = [pd.DataFrame(np.random.rand(5, 5), columns=[f'Col{i+1}' for i in range(5)]) for _ in range(2)]
    >>> W, shrinkage = GenWeightMatrix(pred_list, true_list)

    """
    fe_list = [pred_list[i] - true_list[i] for i in range(len(pred_list))]

    try:  # fe: sample size x variables
        fe = pd.concat(fe_list, axis=1)

    except Exception:  # mixed-frequency case: pd.concat can't handle several mixed-frequency series
        fe = ConcatMixFreqMultiIndexSeries(fe_list, axis=1)

    # sample covariance of the forecast errors
    n_samp = fe.shape[0]
    n_vars = fe.shape[1]
    sample_cov = fe.cov()

    if shrinkage_method == "identity":
        W = pd.DataFrame(np.eye(sample_cov.shape[0]), index=sample_cov.index, columns=sample_cov.columns)
        return W, np.nan

    if shrinkage_method == "oas":
        from sklearn.covariance import OAS

        oas = OAS().fit(fe.values)
        W = pd.DataFrame(oas.covariance_, index=sample_cov.index, columns=sample_cov.columns)
        rho = oas.shrinkage_
        return W, rho

    if shrinkage_method == "oasd":
        if n_vars >= 2:
            # shrinkage target: the diagonal of the sample covariance
            diag = np.diag(np.diag(sample_cov))

            # shrinkage parameter
            numerator = np.trace(sample_cov @ sample_cov) - np.trace(diag @ diag)
            denominator = np.trace(sample_cov @ sample_cov) + np.trace(sample_cov) ** 2 - 2 * np.trace(diag @ diag)
            phi = numerator / denominator
            rho = min([1 / (n_samp * phi), 1])

            # shrink the covariance matrix
            W = (1 - rho) * sample_cov + rho * diag
        elif n_vars == 1:
            W = sample_cov
            rho = np.nan
        return W, rho

    if shrinkage_method == "monotone diagonal":
        if n_vars >= 2:
            # diagonal weights, forced to be non-decreasing within each variable
            diag = pd.Series(np.diag(sample_cov), index=sample_cov.index)
            W = pd.DataFrame(
                np.diag(diag.groupby(level=0).cummax()), index=sample_cov.index, columns=sample_cov.columns
            )
        elif n_vars == 1:
            W = sample_cov
        return W, np.nan

    raise ValueError(f"Unknown shrinkage_method: {shrinkage_method}")


def GenLamstar(pred_list: list, true_list: list, default_lam: float = -1, max_lam: float = 129600) -> pd.Series:
    """
    Calculate the smoothness parameter (lambda) associated with each variable
    being forecasted.

    Parameters
    ----------
    pred_list : list
        List of dataframes, with each dataframe containing in-sample forecasts
        for one variable.
    true_list : list
        List of dataframes, with each dataframe containing the actual values
        for a variable corresponding to the in-sample predictions stored in
        pred_list.
    default_lam : float, optional
        The value of lambda used when the frequency of observations cannot be
        determined from the index. If set to -1, lambda is calculated
        empirically. The default is -1.
    max_lam : float, optional
        The upper bound of the HP-filter penalty term (lambda) searched by
        the scipy minimizer. The default is 129600.

    Returns
    -------
    lamstar : pd.Series
        Series containing smoothing parameters to be used for each variable.
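
    Notes
    -----
    With the default ``default_lam=-1``, lambda is chosen empirically for
    each variable by numerically minimizing the mean squared distance
    between the actual values and the HP-smoothed predictions,

    .. math::

       \min_{0 \le \lambda \le \bar{\lambda}}\;
       \big\lVert y^{true} - (I + \lambda F)^{-1} y^{pred} \big\rVert^2,

    where :math:`F` is the penalty matrix returned by ``HP_matrix`` and
    :math:`\bar{\lambda}` is ``max_lam``. Otherwise, when the index carries
    frequency information, lambda follows the usual HP-filter scaling rule
    :math:`\lambda = 100\,k^2` with :math:`k` periods per year (100 for
    annual, 1600 for quarterly data), falling back to ``default_lam`` when
    no frequency can be read off the index.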

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> pred_list = [pd.DataFrame(np.random.rand(5, 5), columns=[f'Col{i+1}' for i in range(5)]) for _ in range(2)]
    >>> true_list = [pd.DataFrame(np.random.rand(5, 5), columns=[f'Col{i+1}' for i in range(5)]) for _ in range(2)]
    >>> lamstar = GenLamstar(pred_list, true_list)
    """
    # index of each time series, to deal with mixed frequencies
    tsidx_list = [df.columns for df in pred_list]

    # container for lamstar, labelled by the index of each time series
    try:  # extract frequency info if available
        freq_list = [tsidx.get_level_values(1).freqstr[0] for tsidx in tsidx_list]
        # lambda = 100 for annual data, scaled by (periods per year)^2 otherwise
        ly = 100
        lambda_dict = {
            "Y": ly,
            "Q": ly * (4**2),
            "M": ly * (12**2),
            "W": ly * (52**2),
            "D": ly * (365**2),
            "H": ly * ((365 * 24) ** 2),
            "T": ly * ((365 * 24 * 60) ** 2),
            "S": ly * ((365 * 24 * 60 * 60) ** 2),
        }
        lamstar = pd.Series([float(lambda_dict[item]) for item in freq_list], index=tsidx_list)
    except Exception:
        lamstar = pd.Series(np.ones(len(tsidx_list)) * default_lam, index=tsidx_list)

    # optimal lambda
    if default_lam == -1:

        def loss_fn(x, T, yt, yp):
            return (yt - inv(np.eye(T) + x * HP_matrix(T)) @ yp).T @ (yt - inv(np.eye(T) + x * HP_matrix(T)) @ yp)

        for tsidxi, tsidx in enumerate(tsidx_list):
            y_pred = pred_list[tsidxi]
            y_true = true_list[tsidxi]
            T = len(tsidx)

            # mean of the HP-smoothing loss across the in-sample forecast rounds
            def mean_loss(x):
                return np.mean(
                    [
                        loss_fn(x, T, y_true.iloc[i : i + 1, :].T.values, y_pred.iloc[i : i + 1, :].T.values)
                        for i in range(y_pred.shape[0])
                    ]
                )

            constraint_lb = {"type": "ineq", "fun": lambda lam: lam}  # lambda >= 0

            # lambda <= max_lam; without this, I + xF may be too close to F to invert
            constraint_ub = {"type": "ineq", "fun": lambda lam: -lam + max_lam}
            result = scipy.optimize.minimize(mean_loss, 0, constraints=[constraint_lb, constraint_ub])
            lamstar.iloc[tsidxi] = result.x[0]
    return lamstar


def GenSmoothingMatrix(W: DataFrame, lamstar: Series) -> DataFrame:
    """
    Generate the symmetric smoothing matrix from the optimal lambda and the
    weighting matrix.

    Parameters
    ----------
    W : pd.DataFrame
        Dataframe containing the weighting matrix.
    lamstar : pd.Series
        Series containing smoothing parameters to be used for each variable.

    Returns
    -------
    Phi : pd.DataFrame
        Dataframe containing the smoothing matrix.

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> pred_list_1 = [pd.DataFrame(np.random.rand(5, 5),
    ...                             columns=pd.MultiIndex.from_product([['A'], [f'Col{i+1}' for i in range(5)]])) if i == 0 else
    ...                pd.DataFrame(np.random.rand(5, 5),
    ...                             columns=pd.MultiIndex.from_product([['B'], [f'Col{i+1}' for i in range(5)]]))
    ...                for i in range(2)]
    >>> true_list_1 = [pd.DataFrame(np.random.rand(5, 5),
    ...                             columns=pd.MultiIndex.from_product([['A'], [f'Col{i+1}' for i in range(5)]])) if i == 0 else
    ...                pd.DataFrame(np.random.rand(5, 5),
    ...                             columns=pd.MultiIndex.from_product([['B'], [f'Col{i+1}' for i in range(5)]]))
    ...                for i in range(2)]
    >>> smoothness = GenLamstar(pred_list_1, true_list_1)
    >>> full_index = pred_list_1[0].columns.append(pred_list_1[1].columns)
    >>> W = pd.DataFrame(np.eye(10), index=full_index, columns=full_index)
    >>> Phi = GenSmoothingMatrix(W, smoothness)

    """
    lam = lamstar / [np.diag(W.loc[tsidx, tsidx]).min() for tsidx in lamstar.index]
    Phi_np = block_diag(*[lam.iloc[tsidxi] * HP_matrix(len(tsidx)) for tsidxi, tsidx in enumerate(lam.index)])
    Phi = pd.DataFrame(Phi_np, index=W.index, columns=W.columns)
    return Phi


def Reconciliation(
    y1: Series,
    W: DataFrame,
    Phi: DataFrame,
    C: DataFrame,
    d: DataFrame,
    C_ineq: DataFrame | None = None,
    d_ineq: DataFrame | None = None,
) -> DataFrame:
    """
    Reconcile the first-step forecasts so that they satisfy the equality and
    inequality constraints, subject to smoothing.

    Parameters
    ----------
    y1 : pd.Series
        Series of all forecasted and island values.
    W : pd.DataFrame
        Dataframe containing the weighting matrix.
    Phi : pd.DataFrame
        Dataframe containing the smoothing matrix.
    C : pd.DataFrame
        Left-hand-side matrix of the linear equality constraints Cy = d.
    d : pd.DataFrame
        Right-hand side of the linear equality constraints Cy = d.
    C_ineq : pd.DataFrame, optional
        Left-hand-side matrix of the linear inequality constraints
        C_ineq · y - d_ineq ≤ 0. The default is None.
    d_ineq : pd.DataFrame, optional
        Right-hand side of the linear inequality constraints
        C_ineq · y - d_ineq ≤ 0. The default is None.

    Returns
    -------
    y2 : pd.DataFrame
        Dataframe containing the final reconciled forecasts for all variables.
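
    Notes
    -----
    The reconciled forecast solves the smoothed generalized least squares
    problem

    .. math::

       \min_{y}\; (y - y_1)' W^{-1} (y - y_1) + y' \Phi\, y
       \quad \text{s.t.} \quad C y = d,

    whose closed-form solution is evaluated directly when only equality
    constraints are present. If inequality constraints are supplied, the
    same quadratic program is solved numerically with CVXPY.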

    Examples
    --------
    >>> import numpy as np
    >>> import pandas as pd
    >>> from sktime.forecasting.compose import YfromX
    >>> from sklearn.linear_model import ElasticNetCV
    >>> n = 30
    >>> p = 2
    >>> df = pd.DataFrame(np.random.sample([n, p]),
    ...                   columns=['a', 'b'],
    ...                   index=pd.date_range(start='2000', periods=n, freq='YE').year)
    >>> df.iloc[-5:, :1] = np.nan
    >>> df0, all_cells, unknown_cells, known_cells, islands = OrganizeCells(df)
    >>> def DefaultForecaster():
    ...     return YfromX(ElasticNetCV(max_iter=5000))
    >>> df1, df1_models = FillAllEmptyCells(df0, DefaultForecaster(), parallelize=False)
    >>> pred, true, model = GenPredTrueData(df0, DefaultForecaster(), parallelize=False)
    >>> ts_list, pred_list, true_list = BreakDataFrameIntoTimeSeriesList(df0, df1, pred, true)
    >>> y1 = pd.concat(ts_list)
    >>> C = pd.DataFrame(columns=y1.index).astype(float)
    >>> d = pd.DataFrame().astype(float)
    >>> W = pd.DataFrame(np.eye(5), index=y1.index, columns=y1.index)
    >>> smoothness = GenLamstar(pred_list, true_list)
    >>> Phi = GenSmoothingMatrix(W, smoothness)
    >>> y2 = Reconciliation(y1, W, Phi, C, d)

    """
    assert (y1.index == W.index).all()
    assert (y1.index == Phi.index).all()
    assert (y1.index == C.columns).all()
    assert (C.index == d.index).all()

    def DropLinDepRows(C_aug, d_aug):
        C = C_aug.values

        # Convert the matrix to a SymPy Matrix
        sympy_matrix = sp.Matrix(C)

        # Compute the RREF of the transpose; its pivot columns are the
        # linearly independent rows of C
        rref_matrix, independent_rows = sympy_matrix.T.rref()

        # Extract the independent rows
        independent_rows = list(independent_rows)

        # dependent rows
        all_rows = set(range(C.shape[0]))
        dependent_rows = list(all_rows - set(independent_rows))

        C = C_aug.iloc[independent_rows, :]
        d = d_aug.iloc[independent_rows, :]

        if dependent_rows != []:
            print(
                "Constraints are linearly dependent."
                " The following constraints are dropped:",
                C_aug.index[dependent_rows],
            )
        return C, d

    # keep only linearly independent constraint rows
    C, d = DropLinDepRows(C, d)

    # reconcile with np.array
    W_inv = inv(W)
    denom = inv(W_inv + Phi)
    Cn = C.values
    dn = d.values
    CdC_inv = inv(Cn @ denom @ Cn.T)  # dropping linearly dependent rows beforehand makes inv safe to use here

    In = np.eye(len(y1))
    y1n = y1.values.reshape(-1, 1)
    y2n = (In - denom @ Cn.T @ CdC_inv @ Cn) @ denom @ W_inv @ y1n + denom @ Cn.T @ CdC_inv @ dn

    if C_ineq is not None and C_ineq.shape[0] > 0:
        C_ineq, d_ineq = DropLinDepRows(C_ineq, d_ineq)

        # augment C_ineq, d_ineq to be compatible with y1
        C_ineq_aug = pd.DataFrame(np.zeros([len(C_ineq.index), len(y1)]), index=C_ineq.index, columns=y1.index)
        C_ineq_aug.update(C_ineq)
        d_ineq_aug = pd.DataFrame(np.zeros([len(d_ineq.index), 1]), index=d_ineq.index)
        d_ineq_aug.update(d_ineq)
        Cn_ineq = C_ineq_aug.values
        dn_ineq = d_ineq_aug.values

        # use CVXPY to solve the quadratic program numerically
        P = W_inv + Phi
        q = -2 * W_inv @ y1n
        x = cp.Variable([len(y1), 1])
        objective = cp.Minimize(cp.quad_form(x, P, assume_PSD=True) + q.T @ x)

        # if no equality constraints exist, drop the C matrix from the solver
        if C.shape[0] > 0:
            constraints = [Cn @ x == dn, Cn_ineq @ x <= dn_ineq]
        else:
            constraints = [Cn_ineq @ x <= dn_ineq]
        prob = cp.Problem(objective, constraints)
        prob.solve()
        y2n = x.value

        if y2n is None:
            warnings.warn("Reconciliation failed. The feasible set might be empty.")

    # put the reconciled y2 back into a dataframe
    y2 = pd.DataFrame(y2n, index=y1.index)

    return y2


def get_freq_of_freq(periodindex: PeriodIndex, freqstr: Literal["Y", "Q", "M", "W", "D", "H", "T", "S"]) -> Index:
    """Return the requested calendar component (year, quarter, month, ...) of a PeriodIndex."""
    if freqstr == "Y":
        return periodindex.year
    if freqstr == "Q":
        return periodindex.quarter
    if freqstr == "M":
        return periodindex.month
    if freqstr == "W":
        return periodindex.week
    if freqstr == "D":
        return periodindex.day
    if freqstr == "H":
        return periodindex.hour
    if freqstr == "T":
        return periodindex.minute
    if freqstr == "S":
        return periodindex.second


def ConcatMixFreqMultiIndexSeries(df_list: list[DataFrame], axis: int) -> DataFrame:
    # Used only in the mixed-frequency case: pd.concat fails for more than 4 mixed-frequency series.
    # NOTE: still fails when there are more than 3 distinct frequencies!
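    # Strategy: attempt a plain concat first; if pandas refuses, concatenate
    # the inputs frequency by frequency and then concatenate the per-frequency
    # results, so fewer mixed-frequency objects meet in any single pd.concat call.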
    try:
        return pd.concat(df_list, axis=axis)
    except Exception:
        if axis == 0:
            # concatenate frequency by frequency
            freqs = [df.index.get_level_values(1).freqstr[0] for df in df_list]
            # deduplicate while preserving order
            seen = set()
            freq_unique = [x for x in freqs if not (x in seen or seen.add(x))]
            dflong_list = []
            for k in freq_unique:
                df_list_k = [df for df in df_list if df.index.get_level_values(1).freqstr[0] == k]
                dflong_k = pd.concat(df_list_k, axis=0)
                dflong_list.append(dflong_k)

            dflong = pd.concat(dflong_list, axis=0)
            return dflong

        if axis == 1:
            # concatenate frequency by frequency
            freqs = [df.columns.get_level_values(1).freqstr[0] for df in df_list]
            # deduplicate while preserving order
            seen = set()
            freq_unique = [x for x in freqs if not (x in seen or seen.add(x))]
            dfwide_list = []
            for k in freq_unique:
                df_list_k = [df for df in df_list if df.columns.get_level_values(1).freqstr[0] == k]
                dfwide_k = pd.concat(df_list_k, axis=1)
                dfwide_list.append(dfwide_k)

            dfwide = pd.concat(dfwide_list, axis=1)
            return dfwide
--------------------------------------------------------------------------------