├── .gitignore ├── JM_value_example.png ├── LICENSE ├── README.md ├── environment.yaml ├── examples └── nasdaq │ ├── data │ ├── NDX.csv │ └── NDX.pkl │ ├── example.ipynb │ ├── example.py │ ├── feature.py │ ├── get_data.py │ ├── plots │ ├── CJM_lambd-600.0_test_online.pdf │ ├── CJM_lambd-600.0_train.pdf │ ├── JM_lambd-0.0_train.pdf │ ├── JM_lambd-50.0_test_online.pdf │ ├── JM_lambd-50.0_train.pdf │ ├── SJM_lambd-50.0_max-feats-3.0_test_online.pdf │ └── SJM_lambd-50.0_max-feats-3.0_train.pdf │ └── utils_dir.py ├── jumpmodels ├── __init__.py ├── base.py ├── jump.py ├── plot.py ├── preprocess.py ├── sparse_jump.py └── utils │ ├── __init__.py │ ├── calculation.py │ ├── cluster.py │ ├── index.py │ └── validation.py └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac system trash 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 
112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 165 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 166 | 167 | # User-specific stuff 168 | .idea/**/workspace.xml 169 | .idea/**/tasks.xml 170 | .idea/**/usage.statistics.xml 171 | .idea/**/dictionaries 172 | .idea/**/shelf 173 | 174 | # AWS User-specific 175 | .idea/**/aws.xml 176 | 177 | # Generated files 178 | .idea/**/contentModel.xml 179 | 180 | # Sensitive or high-churn files 181 | .idea/**/dataSources/ 182 | .idea/**/dataSources.ids 183 | .idea/**/dataSources.local.xml 184 | .idea/**/sqlDataSources.xml 185 | .idea/**/dynamic.xml 186 | .idea/**/uiDesigner.xml 187 | .idea/**/dbnavigator.xml 188 | 189 | # Gradle 190 | .idea/**/gradle.xml 191 | .idea/**/libraries 192 | 193 | # Gradle and Maven with auto-import 194 | # When using Gradle or Maven with auto-import, you should exclude module files, 195 | # since they will be recreated, and may cause churn. Uncomment if using 196 | # auto-import. 197 | # .idea/artifacts 198 | # .idea/compiler.xml 199 | # .idea/jarRepositories.xml 200 | # .idea/modules.xml 201 | # .idea/*.iml 202 | # .idea/modules 203 | # *.iml 204 | # *.ipr 205 | 206 | # CMake 207 | cmake-build-*/ 208 | 209 | # Mongo Explorer plugin 210 | .idea/**/mongoSettings.xml 211 | 212 | # File-based project format 213 | *.iws 214 | 215 | # IntelliJ 216 | out/ 217 | 218 | # mpeltonen/sbt-idea plugin 219 | .idea_modules/ 220 | 221 | # JIRA plugin 222 | atlassian-ide-plugin.xml 223 | 224 | # Cursive Clojure plugin 225 | .idea/replstate.xml 226 | 227 | # SonarLint plugin 228 | .idea/sonarlint/ 229 | 230 | # Crashlytics plugin (for Android Studio and IntelliJ) 231 | com_crashlytics_export_strings.xml 232 | crashlytics.properties 233 | crashlytics-build.properties 234 | fabric.properties 235 | 236 | # Editor-based Rest Client 237 | .idea/httpRequests 238 | 239 | # Android studio 3.1+ serialized cache file 240 | .idea/caches/build_file_checksums.ser 241 | 242 | # Visual Studio temporary files, build results, and 243 | # files generated by popular Visual Studio add-ons. 
244 | 245 | # User-specific files 246 | .vs/ 247 | *.user 248 | *.userosscache 249 | *.suo 250 | *.userprefs 251 | *.dll.config 252 | *.pdb -------------------------------------------------------------------------------- /JM_value_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/JM_value_example.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Fitting Example](JM_value_example.png) 2 | 3 | *Note: An explanation of the application of JMs to the value factor in this figure can be found in the* [Examples](#usage-and-examples) *section.* 4 | 5 | # **jumpmodels**: Python Library for Statistical Jump Models 6 | 7 | [![JumpModels on PyPI](https://img.shields.io/pypi/v/jumpmodels.svg)](https://pypi.org/project/jumpmodels/) 8 | 9 | `jumpmodels` is a Python library offering a collection of statistical jump models (JMs), an unsupervised algorithm designed for regime identification in time series data. 10 | It includes implementations of the original discrete JM, the continuous JM (CJM), and the sparse JM (SJM) with feature selection. 11 | The library follows a [`scikit-learn`](https://github.com/scikit-learn/scikit-learn)-style API and supports `pandas` DataFrames for both input and output. 12 | 13 | 14 | 15 | --- 16 | 17 | - [Installation](#installation) 18 | - [Usage & Examples](#usage-and-examples) 19 | - [References & Citations](#references-and-citations) 20 | - [Contributing](#contributing) 21 | - [Credits & Related Repo](#credits-and-related-repo) 22 | - [License](#license) 23 | 24 | 25 | 26 | 27 | ## Installation 28 | 29 | To install the package, use the following [pip](https://pip.pypa.io/en/stable/) command: 30 | 31 | ```bash 32 | pip install jumpmodels 33 | ``` 34 | 35 | 36 | `jumpmodels` requires the following dependencies: 37 | 38 | - Python `(>=3.8)` 39 | - `numpy` 40 | - `pandas` 41 | - `scipy` 42 | - `scikit-learn` 43 | - `matplotlib` 44 | 45 | All dependencies will be installed automatically with the package. While version sensitivity is minimal, an `environment.yaml` file is provided to ensure reproducibility. 46 | 47 | To run the example notebook, you will also need the following additional dependencies: 48 | 49 | - `yfinance` 50 | - `jupyterlab` 51 | 52 | You can install these along with the package by running: 53 | 54 | ```bash 55 | pip install jumpmodels[example] 56 | ``` 57 | 58 | 59 | **Note:** In addition to the Python package dependencies listed above, the plotting functions in `plot.py` work best with LaTeX installed locally. To enable this: 60 | 61 | 1. First install a LaTeX distribution such as [MiKTeX](https://miktex.org/download). 62 | 2. Second run `pip install latex` in your Python environment. 63 | 64 | Special thanks to [@Peter](https://github.com/peter1357908) for pointing this out in [issue #3](https://github.com/Yizhan-Oliver-Shu/jump-models/issues/3). 65 | 66 | If you prefer not to install LaTeX, you can comment out the `matplotlib_setting()` function at the beginning of `plot.py`. 67 | 68 | 69 | ## Usage and Examples 70 | 71 | You can import the two core classes, `JumpModel` and `SparseJumpModel`, as follows: 72 | 73 | ```python 74 | from jumpmodels.jump import JumpModel # JM & CJM class 75 | from jumpmodels.sparse_jump import SparseJumpModel # Sparse JM class 76 | ``` 77 | 78 | We follow a `scikit-learn`-style API, with class methods such as `.fit()`, `.predict()`, `.predict_proba()`, and `.set_params()` for model fitting, state and probability prediction, and resetting model parameters. 79 | Specifically designed for time series applications, we also provide `.predict_online()` and `.predict_proba_online()` methods for online prediction. 
80 | 81 | 82 | A comprehensive demonstration of the core functionality is available in the `examples/Nasdaq/example.ipynb` notebook, which includes an analysis of the Nasdaq-100 Index using data from [Yahoo Finance](https://finance.yahoo.com/quote/%5ENDX/) (fully public source). 83 | 84 | The figure on top features an application of the sparse JM, showing the in-sample identified bull and bear market regimes for the value factor index based on its daily active returns relative to the market. 85 | Further details can be found in Shu and Mulvey (2024), as listed in the [References](#factor) section. 86 | 87 | 88 | 89 | 90 | 91 | ## References and Citations 92 | 93 | Below are articles related to the methodology and applications of JMs. 94 | If any of them assist your research, please cite the corresponding paper. 95 | 96 | ### JM Methodology 97 | 98 | - **Continuous Statistical Jump Models** (CJM): Aydınhan, A. O., Kolm, P. N., Mulvey, J. M., and Shu, Y. (2024). Identifying patterns in financial markets: Extending the statistical jump model for regime identification. *Annals of Operations Research*. To appear. [[journal](https://link.springer.com/article/10.1007/s10479-024-06035-z)] [[SSRN](https://papers.ssrn.com/abstract=4556048)] 99 | 100 | 101 | ```bibtex 102 | @article{Aydinhan2024CJM, 103 | title = {Identifying patterns in financial markets: extending the statistical jump model for regime identification}, 104 | author = {Afşar Onat Aydınhan and Petter N. Kolm and John M. Mulvey and Yizhan Shu}, 105 | journal = {Annals of Operations Research}, 106 | year = {2024}, 107 | note = {To appear}, 108 | doi = {https://doi.org/10.1007/s10479-024-06035-z}, 109 | } 110 | ``` 111 | 112 | 113 | - (Original) **Statistical Jump Models**: Nystrup, P., Lindström, E., and Madsen, H. (2020a). Learning hidden Markov models with persistent states by penalizing jumps. *Expert Systems with Applications*, 150:113307. [[journal](https://www.sciencedirect.com/science/article/abs/pii/S0957417420301329)] [[OpenAccess](https://orbit.dtu.dk/files/255194701/Learning_hidden_Markov_models_with_persistent_states_by_penalizing_jumps_ACCEPTED_ESWA.pdf)] 114 | 115 | 116 | ```bibtex 117 | @article{Nystrup2020JM, 118 | title = {Learning hidden {Markov} models with persistent states by penalizing jumps}, 119 | author = {Peter Nystrup and Erik Lindstr{\"o}m and Henrik Madsen}, 120 | journal = {Expert Systems with Applications}, 121 | year = {2020}, 122 | pages = {113307}, 123 | volume = {150}, 124 | doi = {https://doi.org/10.1016/j.eswa.2020.113307}, 125 | } 126 | ``` 127 | 128 | 129 | - **Sparse Jump Models**: Nystrup, P., Kolm, P. N., and Lindström, E. (2021). Feature selection in jump models. *Expert Systems with Applications*, 184:115558. [[journal](https://www.sciencedirect.com/science/article/pii/S0957417421009647)] [[SSRN](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3805831)] 130 | 131 | 132 | ```bibtex 133 | @article{nystrup2021SJM, 134 | title = {Feature selection in jump models}, 135 | author = {Peter Nystrup and Petter N. Kolm and Erik Lindstr{\"o}m}, 136 | journal = {Expert Systems with Applications}, 137 | volume = {184}, 138 | pages = {115558}, 139 | year = {2021}, 140 | doi = {https://doi.org/10.1016/j.eswa.2021.115558}, 141 | } 142 | ``` 143 | 144 | 145 | 146 | 147 | - **Online Inference for JMs**: Nystrup, P., Kolm, P. N., and Lindström, E. (2020b). Greedy online classification of persistent market states using realized intraday volatility features. *The Journal of Financial Data Science*, 2(3):25–39. 
[[journal](https://www.pm-research.com/content/iijjfds/2/3/25)] [[OpenAccess](https://backend.orbit.dtu.dk/ws/portalfiles/portal/242396317/Greedy_online_classification_of_persistent_market_states_using_realized_intraday_volatility_features.pdf)] 148 | 149 | ```bibtex 150 | @article{Nystrup2020onlineJM, 151 | title = {Greedy Online Classification of Persistent Market States Using Realized Intraday Volatility Features}, 152 | author = {Peter Nystrup and Petter N. Kolm and Erik Lindstr{\"o}m}, 153 | journal = {The Journal of Financial Data Science}, 154 | year = {2020}, 155 | volume = {2}, 156 | number = {3}, 157 | pages = {25--39}, 158 | doi = {https://doi.org/10.3905/jfds.2020.2.3.025}, 159 | } 160 | ``` 161 | 162 | 163 | ### JM Applications 164 | 165 | 166 | - **Downside Risk Reduction**: Shu, Y., Yu, C., and Mulvey, J. M. (2024a). Downside risk reduction using regime-switching signals: A statistical jump model approach. *Journal of Asset Management*. To appear. [[journal](https://link.springer.com/article/10.1057/s41260-024-00376-x)] [[SSRN](https://ssrn.com/abstract=4719989)] 167 | 168 | 169 | ```bibtex 170 | @article{Shu2024downside, 171 | title = {Downside Risk Reduction Using Regime-Switching Signals: A Statistical Jump Model Approach}, 172 | author = {Shu, Yizhan and Yu, Chenyu and Mulvey, John M.}, 173 | journal = {Journal of Asset Management}, 174 | year = {2024}, 175 | note = {To appear}, 176 | doi = {https://doi.org/10.1057/s41260-024-00376-x}, 177 | } 178 | ``` 179 | 180 | 181 | 182 | 183 | - **Dynamic Asset Allocation**: Shu, Y., Yu, C., and Mulvey, J. M. (2024b). Dynamic asset allocation with asset-specific regime forecasts. *Annals of Operations Research*. To appear. [[journal](https://link.springer.com/article/10.1007/s10479-024-06266-0)] [[SSRN](https://ssrn.com/abstract=4864358)] 184 | 185 | ```bibtex 186 | @article{Shu2024DAA, 187 | title = {Dynamic Asset Allocation with Asset-Specific Regime Forecasts}, 188 | author = {Shu, Yizhan and Yu, Chenyu and Mulvey, John M.}, 189 | journal = {Annals of Operations Research}, 190 | year = {2024}, 191 | note = {To appear}, 192 | doi = {https://doi.org/10.1007/s10479-024-06266-0}, 193 | } 194 | ``` 195 | 196 | 197 | 198 | 199 | - **Dynamic Factor Allocation**: Shu, Y. and Mulvey, J. M. (2024). Dynamic Factor Allocation Leveraging Regime-Switching Signals. [[SSRN](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4960484)] 200 | 201 | 202 | ```bibtex 203 | @article{Shu2024factor, 204 | title = {Dynamic Factor Allocation Leveraging Regime-Switching Signals}, 205 | author = {Shu, Yizhan and Mulvey, John M.}, 206 | journal = {SSRN}, 207 | year = {2024}, 208 | } 209 | ``` 210 | 211 | 212 | 213 | ## Contributing 214 | 215 | Pull requests and open issues are welcome. I am happy to discuss any related questions. 216 | 217 | 218 | ## Credits and Related Repo 219 | 220 | This library builds upon the open-source [code](https://www.sciencedirect.com/science/article/pii/S0957417421009647#appSB) accompanying Nystrup et al. (2021). 221 | 222 | The GitHub [Repo](https://github.com/FedericoCortese/GIC-for-SJM) by Federico P. Cortese implements the generalized information criteria (GIC) for high-dimensional SJMs, detailed in Cortese, F. P., Kolm, P. N., and Lindström, E. (2024). Generalized information criteria for high-dimensional sparse statistical jump models [[SSRN](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4774429)].
223 | 224 | The structure of this README file is inspired by the format used in [`cvxpylayers`](https://github.com/cvxgrp/cvxpylayers). 225 | 226 | 227 | 228 | 229 | 230 | ## License 231 | 232 | Our library carries an Apache 2.0 license. 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: jm-pack-base 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - brotli=1.1.0 7 | - brotli-bin=1.1.0 8 | - bzip2=1.0.8 9 | - ca-certificates=2024.8.30 10 | - certifi=2024.8.30 11 | - contourpy=1.3.0 12 | - cycler=0.12.1 13 | - fonttools=4.54.1 14 | - freetype=2.12.1 15 | - joblib=1.4.2 16 | - kiwisolver=1.4.7 17 | - lcms2=2.16 18 | - lerc=4.0.0 19 | - libblas=3.9.0 20 | - libbrotlicommon=1.1.0 21 | - libbrotlidec=1.1.0 22 | - libbrotlienc=1.1.0 23 | - libcblas=3.9.0 24 | - libcxx=19.1.1 25 | - libdeflate=1.21 26 | - libexpat=2.6.3 27 | - libffi=3.4.2 28 | - libgfortran=5.0.0 29 | - libgfortran5=13.2.0 30 | - libjpeg-turbo=3.0.0 31 | - liblapack=3.9.0 32 | - libopenblas=0.3.27 33 | - libpng=1.6.44 34 | - libsqlite=3.46.1 35 | - libtiff=4.7.0 36 | - libwebp-base=1.4.0 37 | - libxcb=1.17.0 38 | - libzlib=1.3.1 39 | - llvm-openmp=19.1.0 40 | - matplotlib=3.9.2 41 | - matplotlib-base=3.9.2 42 | - munkres=1.1.4 43 | - ncurses=6.5 44 | - numpy=2.1.1 45 | - openjpeg=2.5.2 46 | - openssl=3.3.2 47 | - packaging=24.1 48 | - pandas=2.2.3 49 | - pillow=10.4.0 50 | - pip=24.2 51 | - pthread-stubs=0.4 52 | - pyparsing=3.1.4 53 | - python=3.12.6 54 | - python-dateutil=2.9.0 55 | - python-tzdata=2024.2 56 | - python_abi=3.12 57 | - pytz=2024.1 58 | - qhull=2020.2 59 | - readline=8.2 60 | - scikit-learn=1.5.2 61 | - scipy=1.14.1 62 | - setuptools=75.1.0 63 | - six=1.16.0 64 | - threadpoolctl=3.5.0 65 | - tk=8.6.13 66 | - tornado=6.4.1 67 | - tzdata=2024a 68 | - wheel=0.44.0 69 | - xorg-libxau=1.0.11 70 | - xorg-libxdmcp=1.1.5 71 | - xz=5.2.6 72 | - zstd=1.5.6 73 | prefix: /Users/yizhan/mambaforge/envs/jm-pack-base 74 | -------------------------------------------------------------------------------- /examples/nasdaq/data/NDX.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/data/NDX.pkl -------------------------------------------------------------------------------- /examples/nasdaq/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | from utils_dir import get_curr_dir, include_home_dir 7 | include_home_dir() 8 | 9 | import pandas as pd 10 | 11 | from jumpmodels.utils import filter_date_range # useful helpers 12 | from jumpmodels.jump import JumpModel # class of JM & CJM 13 | from jumpmodels.sparse_jump import SparseJumpModel # class of Sparse JM 14 | 15 | 16 | # # Load Data & Features 17 | # 18 | # This example demonstrates the class of *statistical jump models* (JMs) and various helper functions for regime analysis provided by our package `jumpmodels`, using an application on the Nasdaq-100 Index. 19 | # The core classes, `JumpModel` and `SparseJumpModel`, implement the original JM, continuous JM (CJM), and sparse JM (SJM) with feature selection. 
20 | # These models follow the API style used in `scikit-learn` for easy integration and efficient usage. 21 | # For detailed mathematical and algorithmic explanations of these models, please refer to the literature cited in the `README`. 22 | # 23 | # Relevant helper functions will be imported as needed throughout this example. 24 | # If running this notebook in Jupyter Lab/Notebook poses any issues, there is an exported `.py` script available in this folder for convenient execution. 25 | # 26 | 27 | # ## Raw Data 28 | # 29 | # In this example, we analyze the regimes of the Nasdaq-100 Index. 30 | # The daily index price data is retrieved from [Yahoo Finance](https://finance.yahoo.com/quote/%5ENDX/) under the ticker `NDX`. 31 | # 32 | # The data retrieval is handled in the script `get_data.py`, and the dataset is already saved in the `examples/nasdaq/data/` folder in both `csv` and `pkl` formats, so there’s no need to run `get_data.py` manually. 33 | # 34 | # We work with daily frequency data using `pandas` DataFrames, where the index is of type `datetime.date`. 35 | # This format is consistent with the convention used in the `CRSP` database. 36 | # All helper functions in this package are designed to support this type of date index. 37 | 38 | # ## Feature Engineering 39 | # 40 | # Feeding the model a robust feature set is key to the successful application of any learning algorithm. 41 | # This example uses a simple feature set consisting of nine features: the exponentially weighted moving (EWM) return, downside deviation (in log scale), and Sortino ratio, each computed with three halflife values ranging from one week (5 days) to one quarter (3 months). 42 | # 43 | # Users may need to adjust the features or halflives to suit their specific applications. 44 | # The literature referenced in the `README` offers a solid foundation for further exploration. 45 | # 46 | # The computation of these features is detailed in `feature.py` in the same folder as this example, and we use the `DataLoader` class to load both the index returns and the engineered features. 47 | # The loaded data covers the period from the start of 2007 to the end of September 2024. 48 | 49 | # In[2]: 50 | 51 | 52 | from feature import DataLoader 53 | 54 | data = DataLoader(ticker="NDX", ver="v0").load(start_date="2007-1-1", end_date="2024-09-30") 55 | 56 | print("Daily returns stored in `data.ret_ser`:", "-"*50, sep="\n") 57 | print(data.ret_ser, "-"*50, sep="\n") 58 | print("Features stored in `data.X`:", "-"*50, sep="\n") 59 | print(data.X) 60 | 61 | 62 | # ## Train/Test Split and Preprocessing 63 | # 64 | # We perform a simple time-based split: data from the beginning of 2007 to the end of 2021, covering a 15-year period, is used as the training set for fitting the JMs. 65 | # The period from 2022 to late 2024 is reserved as the test set, where we apply the trained JMs to perform online regime inference. 66 | # We use the helper function `filter_date_range` to filter the start and end dates of a DataFrame.
67 | 68 | # In[3]: 69 | 70 | 71 | train_start, test_start = "2007-1-1", "2022-1-1" 72 | # filter dates 73 | X_train = filter_date_range(data.X, start_date=train_start, end_date=test_start) 74 | X_test = filter_date_range(data.X, start_date=test_start) 75 | # print time split 76 | train_start, train_end = X_train.index[[0, -1]] 77 | test_start, test_end = X_test.index[[0, -1]] 78 | print("Training starts at:", train_start, "and ends at:", train_end) 79 | print("Testing starts at:", test_start, "and ends at:", test_end) 80 | 81 | 82 | # The module `jumpmodels.preprocess` provides two classes for preprocessing: one for standardizing and one for clipping the feature data. 83 | # We first clip the data within three standard deviations for all features and then perform standardization before feeding the data into the JMs. 84 | # Both classes are first fitted on the training data and subsequently used to transform the test data. 85 | # 86 | # These classes support both `pandas` DataFrames and `numpy` arrays as direct inputs and outputs. 87 | # We prefer to retain the DataFrame type whenever possible to preserve the date index and column labels. 88 | 89 | # In[4]: 90 | 91 | 92 | # Preprocessing 93 | from jumpmodels.preprocess import StandardScalerPD, DataClipperStd 94 | clipper = DataClipperStd(mul=3.) 95 | scaler = StandardScalerPD() 96 | # fit on training data 97 | X_train_processed = scaler.fit_transform(clipper.fit_transform(X_train)) 98 | # transform the test data 99 | X_test_processed = scaler.transform(clipper.transform(X_test)) 100 | 101 | 102 | # # Original JM 103 | 104 | # ## In-Sample Fitting 105 | # 106 | # We begin by illustrating the in-sample training of the original JM. 107 | # The model parameters are set as follows: the number of components/states/regimes is 2, the jump penalty $\lambda$ is 50.0, and `cont=False`, indicating the original discrete JM that performs hard clustering. 108 | # It is important to note that the jump penalty $\lambda$ is a crucial hyperparameter that requires tuning, either through statistical criteria or cross-validation (see references for details). 109 | # 110 | # The docstring provides comprehensive documentation of all parameters and attributes (thanks to ChatGPT). 111 | 112 | # In[5]: 113 | 114 | 115 | # set the jump penalty 116 | jump_penalty=50. 117 | # initialize the JM instance 118 | jm = JumpModel(n_components=2, jump_penalty=jump_penalty, cont=False, ) 119 | 120 | 121 | # In the `.fit()` call, we pass the return series for each period to be used for sorting the states. 122 | # We specify `sort_by="cumret"`, meaning that the state labels (0 or 1) are determined by the cumulative returns under each state. The state with higher cumulative returns is denoted as $s_t=0$ (bull market), and the state with lower returns is denoted as $s_t=1$ (bear market). 123 | # 124 | 125 | # In[6]: 126 | 127 | 128 | # call .fit() 129 | jm.fit(X_train_processed, data.ret_ser, sort_by="cumret") 130 | 131 | 132 | # The cluster centroids for each state are stored in the `centers_` attribute. 133 | # While these values are scaled, making direct interpretation hard, the bull market state is clearly characterized by higher returns, lower downside deviation, and a higher Sortino ratio, with a distinct separation between the two regimes.
134 | 135 | # In[7]: 136 | 137 | 138 | print("Scaled Cluster Centroids:", pd.DataFrame(jm.centers_, index=["Bull", "Bear"], columns=X_train.columns), sep="\n" + "-"*50 + "\n") 139 | 140 | 141 | # ### Visualization 142 | # 143 | # The `jumpmodels.plot` module provides useful functions for visualizing regime identification. 144 | # We'll use the `labels_` attribute of the JM instance, which contains integers from 0 to `n_c-1`, representing the in-sample fitted regime assignment for each period. 145 | # 146 | # From the plot, we observe that the identified regimes for the Nasdaq-100 Index successfully capture several significant market downturns, including the global financial crisis, corrections in 2012, 2015-2016, 2019, and the COVID-19 crash in 2020. 147 | # These identified regimes correspond well to shifts in market fundamentals, as interpreted in hindsight. 148 | # 149 | 150 | # In[8]: 151 | 152 | 153 | from jumpmodels.plot import plot_regimes_and_cumret, savefig_plt 154 | 155 | ax, ax2 = plot_regimes_and_cumret(jm.labels_, data.ret_ser, n_c=2, start_date=train_start, end_date=train_end, ) 156 | ax.set(title=f"In-Sample Fitted Regimes by the JM ($\\lambda$={jump_penalty})") 157 | savefig_plt(f"{get_curr_dir()}/plots/JM_lambd-{jump_penalty}_train.pdf") 158 | 159 | 160 | # ### Modifying Parameters via `set_params()` 161 | # 162 | # Our model inherits from the `BaseEstimator` class provided by `scikit-learn`, enabling a wide range of utility methods. 163 | # Among these, we highlight the `.set_params()` function, which allows users to reset any input parameters without creating a new instance. 164 | # This functionality is particularly useful when the model needs to be refitted multiple times, such as when testing different jump penalties. 165 | # 166 | # As an example, we reset the jump penalty to zero, effectively reducing the model to a baseline $k$-means clustering algorithm where temporal information is ignored. 167 | # This comparison illustrates the value of applying a jump penalty to ensure temporal consistency and reduce the occurrence of unrealistic regime shifts. 168 | 169 | # In[9]: 170 | 171 | 172 | # reset jump_penalty to zero 173 | jump_penalty=0. 174 | jm.set_params(jump_penalty=jump_penalty) 175 | print("The jump penalty of the JM instance has been reset to: jm.jump_penalty =", jm.jump_penalty) 176 | 177 | 178 | # In[10]: 179 | 180 | 181 | # refit 182 | jm.fit(X_train_processed, data.ret_ser, sort_by="cumret") 183 | 184 | # plot 185 | ax, ax2 = plot_regimes_and_cumret(jm.labels_, data.ret_ser, n_c=2, start_date=train_start, end_date=train_end, ) 186 | ax.set(title=f"In-Sample Fitted Regimes by the JM ($\\lambda$={jump_penalty})") 187 | savefig_plt(f"{get_curr_dir()}/plots/JM_lambd-{jump_penalty}_train.pdf") 188 | 189 | 190 | # ## Online Inference 191 | # 192 | # After completing the in-sample training, we apply the trained models for online inference on the test period using the `predict_online()` method. 193 | # Here, *online inference* means that the regime inference for period $t$ is based solely on the data available up to the end of that period, without using any future data. 194 | # We revert the jump penalty to a reasonable value of 50.0. 195 | # 196 | # 197 | 198 | # In[11]: 199 | 200 | 201 | # refit 202 | jump_penalty=50. 
203 | jm.set_params(jump_penalty=jump_penalty).fit(X_train_processed, data.ret_ser, sort_by="cumret") 204 | # make online inference 205 | labels_test_online = jm.predict_online(X_test_processed) 206 | 207 | 208 | # From the visualization below, we observe that the JM effectively signals the bear market in 2022, driven by aggressive interest rate hikes. 209 | # This period saw a return of over $-$15% and a significant drawdown. 210 | # However, the brief bear period captured in the second half of 2024 is followed by a strong price reversal. 211 | # This latency issue constitutes a common challenge in real-time applications of regime-switching signals. 212 | # Improving the feature set or fine-tuning the jump penalty may help address this issue. 213 | 214 | # In[12]: 215 | 216 | 217 | # plot and save 218 | ax, ax2 = plot_regimes_and_cumret(labels_test_online, data.ret_ser, n_c=2, start_date=test_start, end_date=test_end, ) 219 | ax.set(title=f"Out-of-Sample Online Inferred Regimes by the JM ($\\lambda$={jump_penalty})") 220 | savefig_plt(f"{get_curr_dir()}/plots/JM_lambd-{jump_penalty}_test_online.pdf") 221 | 222 | 223 | # In contrast to online inference, the `.predict()` method performs state decoding using all test data (i.e., from 2022 to 2024) at once. 224 | # While this approach is less realistic for trading applications, we observe that, with access to the full dataset, the model avoids the reversal in late 2024 and exits the bear signal in 2023 slightly earlier than with online inference. 225 | # 226 | # Though this approach is less applicable for real-world backtesting in financial markets, it holds potential uses in other engineering fields (such as language modeling, where access to an entire sentence is available at once.) 227 | 228 | # In[13]: 229 | 230 | 231 | # make inference using all test data 232 | labels_test = jm.predict(X_test_processed) 233 | # plot 234 | ax, ax2 = plot_regimes_and_cumret(labels_test, data.ret_ser, n_c=2, start_date=test_start, end_date=test_end, ) 235 | _ = ax.set(title=f"Out-of-Sample Predicted Regimes by the JM Using All Test Data ($\\lambda$={jump_penalty})") 236 | 237 | 238 | # # CJM: Continuous Extension of the JM 239 | # 240 | # With this, we conclude a minimal overview of the core functionality of using JMs to assign regime labels to in-sample training periods and leverage trained models for out-of-sample prediction, either through online inference or by processing all data at once. 241 | # The methods -- such as `.fit()`, `.set_params()`, and `predict_online()` -- extend seamlessly to the following JM variants: CJM and SJM. 242 | # Here, we provide brief illustrations of these extensions. 243 | 244 | # ## In-Sample Fitting 245 | # 246 | # The CJM (Continuous Jump Model) uses the same `JumpModel` class as the discrete model, with the parameter `cont=True`. 247 | # 248 | # ### Parameters 249 | # 250 | # Regarding the jump penalty value, it is typically set to be 10 times larger than the $\lambda$ used in the discrete model to achieve similar fittings, so we choose $\lambda=600.0$ here. 251 | # 252 | # Additionally, CJM introduces two specialized parameters: `mode_loss` and `grid_size`, which require more nuanced understanding. 253 | # Generally, the default values are recommended for most cases. 254 | # 255 | 256 | # In[14]: 257 | 258 | 259 | jump_penalty=600. 
260 | cjm = JumpModel(n_components=2, jump_penalty=jump_penalty, cont=True) 261 | 262 | 263 | # The `proba_` attribute of the CJM instance stores the estimated probability of each period belonging to each state. 264 | # Unlike the discrete model, where the state assignment changes abruptly, CJM offers smooth probability transitions, ranging from 0% to 100%. 265 | # This probabilistic interpretation has potential applications in many domains, especially where softer regime assignments are beneficial. 266 | 267 | # In[15]: 268 | 269 | 270 | cjm.fit(X_train_processed, data.ret_ser, sort_by="cumret") 271 | 272 | # plot 273 | ax, ax2 = plot_regimes_and_cumret(cjm.proba_, data.ret_ser, n_c=2, start_date=train_start, end_date=train_end, ) 274 | ax2.set(ylabel="Regime Probability") 275 | ax.set(title=f"In-Sample Fitted Regimes by the CJM ($\\lambda$={jump_penalty})") 276 | savefig_plt(f"{get_curr_dir()}/plots/CJM_lambd-{jump_penalty}_train.pdf") 277 | 278 | 279 | # ## Online Inference 280 | # 281 | # The `.predict_proba_online()` method allows CJM to make probabilistic regime inferences online. 282 | # From the plot, we observe that the confidence in the bear market during late 2024 doesn't fully reach 100%, potentially reducing the mislabeling issue discussed earlier. 283 | # This smoother transition in probabilities may offer better regime detection in uncertain market conditions. 284 | 285 | # In[16]: 286 | 287 | 288 | # online inference 289 | proba_test_online = cjm.predict_proba_online(X_test_processed) 290 | 291 | # plot 292 | ax, ax2 = plot_regimes_and_cumret(proba_test_online, data.ret_ser, start_date=test_start, end_date=test_end, ) 293 | ax2.set(ylabel="Regime Probability") 294 | ax.set(title=f"Out-of-Sample Online Inferred Regimes by the CJM ($\\lambda$={jump_penalty})") 295 | savefig_plt(f"{get_curr_dir()}/plots/CJM_lambd-{jump_penalty}_test_online.pdf") 296 | 297 | 298 | # # SJM: Sparse JM with Feature Selection 299 | # 300 | # Finally, the Sparse Jump Model (SJM) introduces feature weighting on top of the original JM or CJM. 301 | # Features leading to better in-sample clustering effects, as measured by variance reduction, are assigned higher weights, while a LASSO-like constraint on the weight vector ensures that noisy features receive zero weight. 302 | # 303 | # ## In-Sample Fitting 304 | # 305 | # ### Parameters 306 | # 307 | # SJM is implemented in the class `SparseJumpModel`, with an additional parameter `max_feats`, which controls the number of features included. 308 | # This parameter roughly reflects the effective number of features. (In the notation of Nystrup et al. (2021), `max_feats` corresponds to $\kappa^2$.) 309 | # 310 | # The jump penalty value is of a similar magnitude to the non-sparse model. In this case, we try `max_feats=3.` and `jump_penalty=50.` 311 | 312 | # In[17]: 313 | 314 | 315 | max_feats=3. 316 | jump_penalty=50. 317 | # init sjm instance 318 | sjm = SparseJumpModel(n_components=2, max_feats=max_feats, jump_penalty=jump_penalty, ) 319 | # fit 320 | sjm.fit(X_train_processed, ret_ser=data.ret_ser, sort_by="cumret") 321 | 322 | 323 | # The feature weights are stored in the attribute `feat_weights`. 324 | # Generally, we observe that features with longer halflives receive higher weights, indicating that less smoothed features are noisier and are excluded from the model, thanks to the feature weighting mechanism.
325 | 326 | # In[18]: 327 | 328 | 329 | print("SJM Feature Weights:", "-"*50, sjm.feat_weights, sep="\n") 330 | 331 | 332 | # A comparison of the SJM-identified regimes with those identified by JM reveals that the GFC is consolidated into a single bear regime, demonstrating that short-term noise has been effectively mitigated. 333 | 334 | # In[19]: 335 | 336 | 337 | # plot 338 | ax, ax2 = plot_regimes_and_cumret(sjm.labels_, data.ret_ser, n_c=2, start_date=train_start, end_date=train_end, ) 339 | ax.set(title=f"In-Sample Fitted Regimes by the SJM ($\\lambda$={jump_penalty}, $\\kappa^2$={max_feats})") 340 | savefig_plt(f"{get_curr_dir()}/plots/SJM_lambd-{jump_penalty}_max-feats-{max_feats}_train.pdf") 341 | 342 | 343 | # ## Online Inference 344 | # 345 | # As before, the `.predict_online()` method handles online inference. 346 | # Notably, through feature selection, the previously problematic bear market signal in late 2024 is absent in the SJM's online inference, highlighting the potential benefits of feature selection. 347 | # 348 | # 349 | 350 | # In[20]: 351 | 352 | 353 | # online inference 354 | labels_test_online_sjm = sjm.predict_online(X_test_processed) 355 | 356 | # plot 357 | ax, ax2 = plot_regimes_and_cumret(labels_test_online_sjm, data.ret_ser, start_date=test_start, end_date=test_end, ) 358 | ax.set(title=f"Out-of-Sample Online Inferred Regimes by the SJM ($\\lambda$={jump_penalty}, $\\kappa^2$={max_feats})") 359 | savefig_plt(f"{get_curr_dir()}/plots/SJM_lambd-{jump_penalty}_max-feats-{max_feats}_test_online.pdf") 360 | 361 | 362 | # # Conclusion 363 | # 364 | # This concludes the introduction to the functionalities of our `jumpmodels` library. 365 | # The field of statistical jump models is still actively evolving, with ongoing research exploring new avenues. 366 | # We hope that the models and helper functions provided in this package will be useful in your own work. 367 | # Citations and credits are always appreciated. 368 | # 369 | # We welcome pull requests and open issues, and I’m happy to discuss any related questions. 370 | 371 | # In[ ]: 372 | 373 | 374 | 375 | 376 | -------------------------------------------------------------------------------- /examples/nasdaq/feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for engineering the features to be input to JMs. 3 | """ 4 | 5 | from utils_dir import * 6 | include_home_dir() 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.base import BaseEstimator 11 | 12 | from jumpmodels.utils import * 13 | 14 | ############################################ 15 | ## Feature Engineering 16 | ############################################ 17 | 18 | # reviewed 19 | def compute_ewm_DD(ret_ser: pd.Series, hl: float) -> pd.Series: 20 | """ 21 | Compute the exponentially weighted moving downside deviation (DD) for a return series. 22 | 23 | The downside deviation is calculated as the square root of the exponentially 24 | weighted second moment of negative returns. 25 | 26 | Parameters 27 | ---------- 28 | ret_ser : pd.Series 29 | The input return series. 30 | 31 | hl : float 32 | The halflife parameter for the exponentially weighted moving average. 33 | 34 | Returns 35 | ------- 36 | pd.Series 37 | The exponentially weighted moving downside deviation for the return series. 38 | """ 39 | ret_ser_neg: pd.Series = np.minimum(ret_ser, 0.) 
40 | sq_mean = ret_ser_neg.pow(2).ewm(halflife=hl).mean() 41 | return np.sqrt(sq_mean) 42 | 43 | # reviewed 44 | def feature_engineer(ret_ser: pd.Series, ver: str = "v0") -> pd.DataFrame: 45 | """ 46 | Engineer a set of features based on a return series. 47 | 48 | This function customizes the feature set according to the specified version string. 49 | 50 | Parameters 51 | ---------- 52 | ret_ser : pd.Series 53 | The input return series for feature engineering. 54 | 55 | ver : str 56 | The version of feature engineering to apply. Only supports "v0". 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | The engineered feature set. 62 | """ 63 | if ver == "v0": 64 | feat_dict = {} 65 | hls = [5, 20, 60] 66 | for hl in hls: 67 | # Feature 1: EWM-ret 68 | feat_dict[f"ret_{hl}"] = ret_ser.ewm(halflife=hl).mean() 69 | # Feature 2: log(EWM-DD) 70 | DD = compute_ewm_DD(ret_ser, hl) 71 | feat_dict[f"DD-log_{hl}"] = np.log(DD) 72 | # Feature 3: EWM-Sortino-ratio = EWM-ret/EWM-DD 73 | feat_dict[f"sortino_{hl}"] = feat_dict[f"ret_{hl}"].div(DD) 74 | return pd.DataFrame(feat_dict) 75 | 76 | # try out your favorite feature sets 77 | else: 78 | raise NotImplementedError() 79 | 80 | ############################################ 81 | ## DataLoader Class 82 | ############################################ 83 | 84 | class DataLoader(BaseEstimator): 85 | """ 86 | Class for loading the feature matrix. 87 | 88 | This class loads raw return data, computes features, and filters the data by date. 89 | 90 | Parameters 91 | ---------- 92 | ticker : str 93 | The ticker symbol for which to load data. Only supports "NDX". 94 | 95 | ver : str 96 | The version of the feature set to apply. Only supports "v0". 97 | 98 | Attributes 99 | ---------- 100 | X : pd.DataFrame 101 | The feature matrix. 102 | 103 | ret_ser : pd.Series 104 | The return series. 105 | """ 106 | def __init__(self, ticker: str = "NDX", ver: str = "v0"): 107 | self.ticker = ticker 108 | self.ver = ver 109 | 110 | # reviewed 111 | def load(self, start_date: DATE_TYPE = None, end_date: DATE_TYPE = None): 112 | """ 113 | Load the raw return data, compute features, and filter by date range. 114 | 115 | Parameters 116 | ---------- 117 | start_date : DATE_TYPE, optional 118 | The start date for filtering the data. If None, no start filtering is applied. 119 | 120 | end_date : DATE_TYPE, optional 121 | The end date for filtering the data. If None, no end filtering is applied. 122 | 123 | Returns 124 | ------- 125 | self 126 | The DataLoader instance with the feature matrix and return series stored in attributes. 127 | """ 128 | # load raw data 129 | curr_dir = get_curr_dir() 130 | ret_ser_raw = pd.read_pickle(f"{curr_dir}/data/{self.ticker}.pkl").ret.dropna() 131 | ret_ser_raw.name = self.ticker 132 | # features 133 | df_features_all = feature_engineer(ret_ser_raw, self.ver) 134 | 135 | # filter date 136 | X = filter_date_range(df_features_all, start_date, end_date) 137 | valid_no_nan(X) 138 | # save attributes 139 | self.X = X 140 | self.ret_ser = filter_date_range(ret_ser_raw, start_date, end_date) 141 | # save more useful attributes if needed 142 | return self 143 | -------------------------------------------------------------------------------- /examples/nasdaq/get_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script retrieves the daily closing price data for 3 | the Nasdaq-100 index from Yahoo Finance via its Python API. 
4 | 5 | Users do not need to run this script manually, as the return data 6 | is already saved in `example/Nasdaq/data/`. 7 | """ 8 | 9 | from utils_dir import * 10 | include_home_dir() 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import yfinance as yf 15 | 16 | from jumpmodels.utils import check_dir_exist 17 | 18 | TICKER = "NDX" # Nasdaq-100 Index 19 | 20 | def get_data(): 21 | # download closing prices 22 | close: pd.Series = yf.download("^"+TICKER, start="1985-10-01", end="2024-09-30")['Close'] 23 | # convert to ret 24 | ret = close.pct_change() 25 | # concat as df 26 | df = pd.DataFrame({"close": close.squeeze(), "ret": ret.squeeze()}, index=close.index.date) 27 | df.index.name = "date" 28 | 29 | # save 30 | curr_dir = get_curr_dir() 31 | data_dir = f"{curr_dir}/data/"; check_dir_exist(data_dir) 32 | pd.to_pickle(df, f"{data_dir}{TICKER}.pkl") 33 | np.round(df, 6).to_csv(f"{data_dir}{TICKER}.csv") 34 | print("Successfully downloaded data for ticker:", TICKER) 35 | return 36 | 37 | if __name__ == "__main__": 38 | get_data() -------------------------------------------------------------------------------- /examples/nasdaq/plots/CJM_lambd-600.0_test_online.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/CJM_lambd-600.0_test_online.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/CJM_lambd-600.0_train.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/CJM_lambd-600.0_train.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/JM_lambd-0.0_train.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/JM_lambd-0.0_train.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/JM_lambd-50.0_test_online.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/JM_lambd-50.0_test_online.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/JM_lambd-50.0_train.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/JM_lambd-50.0_train.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/SJM_lambd-50.0_max-feats-3.0_test_online.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/SJM_lambd-50.0_max-feats-3.0_test_online.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/SJM_lambd-50.0_max-feats-3.0_train.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/SJM_lambd-50.0_max-feats-3.0_train.pdf -------------------------------------------------------------------------------- /examples/nasdaq/utils_dir.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for working with file directories. 3 | 4 | Useful for all scripts/notebooks in this folder. 5 | Please ensure the file structure under `example/Nasdaq` is preserved 6 | in its original form for everything to function properly. 7 | """ 8 | 9 | import sys, os 10 | 11 | def get_curr_dir(): 12 | """ 13 | Return the current directory of this `get_data.py` file. 14 | """ 15 | return os.path.dirname(os.path.abspath(__file__)) 16 | 17 | def include_home_dir(): 18 | """ 19 | Add the project's home directory to `sys.path`. 20 | 21 | This function ensures that the home directory of the project is included in 22 | `sys.path` to allow imports from other parts of the project. For this to work 23 | correctly, the script must be placed in the `example/Nasdaq/` folder. 24 | """ 25 | curr_dir = get_curr_dir() 26 | home_dir = os.path.dirname(os.path.dirname(curr_dir)) 27 | sys.path.append(home_dir) 28 | return -------------------------------------------------------------------------------- /jumpmodels/__init__.py: -------------------------------------------------------------------------------- 1 | # global constants 2 | RANDOM_STATE = 0 -------------------------------------------------------------------------------- /jumpmodels/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for the base class used in clustering-like algorithms. 3 | 4 | This module provides helpers for parameter sorting, parameter initialization, and base class 5 | definitions for clustering-like algorithms. 6 | 7 | Depends on 8 | ---------- 9 | utils/ : Modules 10 | """ 11 | 12 | from .utils import * 13 | 14 | from sklearn.base import BaseEstimator 15 | from sklearn.utils import check_random_state 16 | from sklearn.cluster import kmeans_plusplus 17 | 18 | ################################## 19 | # Sorting 20 | ################################## 21 | 22 | # reviewed 23 | def sort_param_dict_from_idx(params: dict, idx: np.ndarray) -> None: 24 | """ 25 | Sort a dictionary of parameters according to a given index array. 26 | 27 | Expected parameter shapes: 28 | - `ret_` : (n_c,) 29 | - `vol_` : (n_c,) 30 | - `means_` : (n_c, n_f) 31 | - `centers_` : (n_c, n_f) 32 | - `transmat_` : (n_c, n_c) 33 | - `startprob_` : (n_c,) 34 | - `proba_` : (n_s, n_c) 35 | - `covars_` : (n_c, 1) 36 | 37 | Parameters 38 | ---------- 39 | params : dict 40 | A dictionary of parameters, each corresponding to a clustering result. 41 | 42 | idx : ndarray of shape (n_c,) 43 | The index array to sort the parameters by. 44 | """ 45 | # permute `axis=0` 46 | for key in ['ret_', 'vol_', 'means_', 'centers_', 'startprob_', 'covars_']: 47 | if key in params: params[key] = params[key][idx] 48 | # transmat, need to permute both `axis=0 & 1` 49 | if 'transmat_' in params: params['transmat_'] = params['transmat_'][idx][:, idx] 50 | # proba, need to permute `axis=1` 51 | if 'proba_' in params: params['proba_'] = params['proba_'][:, idx] 52 | return 53 | 54 | # reviewed 55 | def sort_param_dict(params: dict, sort_by='ret') -> None: 56 | """ 57 | Sort the states by a given criterion and permute all parameters accordingly. 
58 | Supported sorting criteria are ["cumret", "vol", "freq", "ret"], i.e. 59 | states sorted by decreasing (cumulative) return, increasing vol, decreasing frequency. 60 | 61 | `nan` values will be (ideally) sorted to the end. 62 | 63 | Parameters 64 | ---------- 65 | params : dict 66 | A dictionary of parameters, each corresponding to a clustering result. 67 | 68 | sort_by : str, optional (default='ret') 69 | The criterion to sort the parameters by. Must be one of ["cumret", "vol", "freq", "ret"]. 70 | """ 71 | if sort_by is None: return 72 | assert sort_by in ["cumret", "vol", "freq", "ret"] 73 | if "proba_" in params: freq = params["proba_"].sum(axis=0) 74 | if sort_by == 'vol': 75 | assert 'vol_' in params 76 | criterion = params['vol_'] 77 | elif sort_by == "cumret": 78 | assert "ret_" in params and "proba_" in params 79 | criterion = -params["ret_"] * freq # missing regimes will have a cumret of nan*0 = nan 80 | elif sort_by == "ret": 81 | assert "ret_" in params 82 | criterion = -params['ret_'] 83 | elif sort_by == "freq": 84 | assert "proba_" in params 85 | criterion = -freq # decreasing freq 86 | else: 87 | raise NotImplementedError() 88 | criterion = replace_inf_by_nan(criterion) 89 | idx = np.argsort(criterion) 90 | sort_param_dict_from_idx(params, idx) 91 | return 92 | 93 | # reviewed 94 | def align_and_check_ret_ser(ret_ser: SER_ARR_TYPE, X: DF_ARR_TYPE) -> np.ndarray: 95 | """ 96 | Align a return series with the input data matrix `X`, 97 | and convert it to a 1D array. 98 | 99 | Parameters 100 | ---------- 101 | ret_ser : Series or ndarray 102 | The return series to validate. 103 | 104 | X : DataFrame or ndarray 105 | The data matrix to align with. 106 | 107 | Returns 108 | ------- 109 | ndarray 110 | The aligned and validated 1D return array. 111 | """ 112 | ret_ser = align_x_with_y(ret_ser, X) 113 | return check_1d_array(ret_ser) 114 | 115 | # reviewed 116 | def sort_states_from_ret(ret_ser: Optional[SER_ARR_TYPE], 117 | X: DF_ARR_TYPE, 118 | best_res: dict, 119 | sort_by: str = "cumret") -> None: 120 | """ 121 | Sort the states in the fitted parameters stored in a dictionary according to a specified criterion. 122 | This is intended for financial applications. If not applicable, input `None` for `ret_ser`. 123 | 124 | Parameters 125 | ---------- 126 | ret_ser : Series or ndarray, optional 127 | The return series to use for computing average return and volatility within each state. 128 | If `None`, sorting is attempted by decreasing frequency (given that the `proba_` param is estimated). 129 | 130 | X : DataFrame or ndarray 131 | The data matrix to use for alignment. 132 | 133 | best_res : dict 134 | Fitted parameters of the best clustering results to sort. 135 | 136 | sort_by : str, optional (default="cumret") 137 | The criterion to use for sorting. Must be one of ["cumret", "vol", "freq", "ret"]. 138 | 139 | - If `ret_ser` is provided, it is used to compute the mean return (`ret_`) and volatility (`vol_`) 140 | within each state. Sorting by decreasing (cumulative) return and increasing volatility is possible. 141 | - If `ret_ser` is `None`, sort by frequency if the `proba_` attribute exists, otherwise 142 | don't sort anything. 
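    For intuition, the underlying `sort_param_dict` helper applies one consistent
    permutation to every fitted parameter; a minimal sketch with two hypothetical states:

        >>> params = {'ret_': np.array([0.01, -0.02]), 'vol_': np.array([0.2, 0.1])}
        >>> sort_param_dict(params, sort_by='vol')   # ascending volatility
        >>> params['vol_']
        array([0.1, 0.2])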
143 | """ 144 | if ret_ser is not None: 145 | # valid inputs 146 | ret_ser_arr = align_and_check_ret_ser(ret_ser, X) 147 | # compute mean & vol for each cluster 148 | best_res['ret_'], best_res['vol_'] = weighted_mean_std_cluster(ret_ser_arr, best_res['proba_']) 149 | # the best parameters sorted by a criterion 150 | sort_param_dict(best_res, sort_by=sort_by) 151 | elif "proba_" in best_res: 152 | sort_param_dict(best_res, sort_by="freq") 153 | return 154 | 155 | ################################## 156 | # Initialization 157 | ################################## 158 | 159 | # reviewed 160 | def init_centers_kmeans_plusplus(X: np.ndarray, n_c=2, n_init=10, random_state=None) -> list[np.ndarray]: 161 | """ 162 | Initialize the cluster centers using the K-Means++ algorithm, repeated `n_init` times. 163 | 164 | Parameters 165 | ---------- 166 | X : ndarray of shape (n_s, n_f) 167 | The data matrix. 168 | 169 | n_c : int, optional (default=2) 170 | The number of clusters. 171 | 172 | n_init : int, optional (default=10) 173 | The number of initializations to perform. 174 | 175 | random_state : int, RandomState instance, or None, optional (default=None) 176 | Controls the randomness of the center initialization. 177 | 178 | Returns 179 | ------- 180 | centers : list of ndarray 181 | A list of initialized centers for each run. 182 | """ 183 | random_state = check_random_state(random_state) 184 | centers = [kmeans_plusplus(X, n_c, random_state=random_state)[0] for _ in range(n_init)] 185 | return centers # (n_init, n_c, n_f) 186 | 187 | ################################## 188 | # Base Class 189 | ################################## 190 | 191 | class BaseClusteringAlgo(BaseEstimator): 192 | """ 193 | A base class for all clustering-like algorithms. 194 | 195 | This class provides several common methods but does not include any model fitting logic. 196 | It is intended to be inherited with specific implementations. 197 | 198 | Parameters 199 | ---------- 200 | n_components : int 201 | The number of components (clusters). 202 | 203 | n_init : int 204 | The number of initializations to perform. 205 | 206 | max_iter : int 207 | The maximum number of iterations. 208 | 209 | tol : float 210 | The tolerance for convergence. 211 | 212 | random_state : int, RandomState instance, or None 213 | Controls the randomness. 214 | 215 | verbose : int 216 | Controls the verbosity of the output. 217 | """ 218 | # reviewed 219 | def __init__(self, 220 | n_components, 221 | n_init, 222 | max_iter, 223 | tol, 224 | random_state, 225 | verbose 226 | ) -> None: 227 | self.n_components = n_components 228 | self.n_init = n_init 229 | self.max_iter = max_iter 230 | self.tol = tol 231 | self.random_state = random_state 232 | self.verbose = verbose 233 | 234 | # reviewed 235 | def is_shape_match_X_centers(self, X: DF_ARR_TYPE) -> bool: 236 | """ 237 | Check whether the shape of `X` and `centers_` matches. Useful for `predict` methods. 238 | `self` must already has the attribute `centers_`. 239 | 240 | Parameters 241 | ---------- 242 | X : DataFrame or ndarray 243 | The input data matrix. 244 | 245 | Returns 246 | ------- 247 | bool 248 | True if the shapes match, False otherwise. 249 | """ 250 | n_f = X.shape[1] 251 | return self.centers_.shape == (self.n_components, n_f) 252 | 253 | # reviewed 254 | def init_centers(self, X: np.ndarray) -> np.ndarray: 255 | """ 256 | Initialize the centers using k-Means++ for multiple initializations. 
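        Each initialization is drawn independently via `sklearn.cluster.kmeans_plusplus`;
        a minimal sketch on random data (shapes are the only thing being illustrated):

            >>> X = np.random.default_rng(0).normal(size=(100, 3))
            >>> inits = init_centers_kmeans_plusplus(X, n_c=2, n_init=5, random_state=0)
            >>> len(inits), inits[0].shape
            (5, (2, 3))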
257 | If attribute `centers_` exists and matches the shape of `X`, it will also 258 | be included as an initial value. 259 | 260 | Parameters 261 | ---------- 262 | X : ndarray of shape (n_s, n_f) 263 | The input data matrix. 264 | 265 | Returns 266 | ------- 267 | centers : ndarray 268 | The initialized centers for each run. 269 | """ 270 | centers = init_centers_kmeans_plusplus(X, self.n_components, self.n_init, self.random_state) 271 | if hasattr(self, "centers_") and self.is_shape_match_X_centers(X): 272 | centers.append(self.centers_) # use previously fitted value as one initial center value 273 | return np.array(centers) 274 | 275 | # reviewed 276 | def check_X_predict_func(self, X: DF_ARR_TYPE) -> np.ndarray: 277 | """ 278 | Check the input data matrix for `.predict` methods, ensuring it is a 2D array and 279 | matches the shape of `centers_`. 280 | 281 | Parameters 282 | ---------- 283 | X : DataFrame or ndarray 284 | The input data matrix. 285 | 286 | Returns 287 | ------- 288 | ndarray 289 | The validated 2D data array. 290 | """ 291 | X_arr = check_2d_array(X) 292 | assert self.is_shape_match_X_centers(X_arr) 293 | return X_arr 294 | -------------------------------------------------------------------------------- /jumpmodels/jump.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for statistical jump models (JMs) and continuous jump models (CJMs). 3 | 4 | This module provides utilities and helper functions for implementing and working 5 | with jump models and their continuous variants. 6 | 7 | Depends on 8 | ---------- 9 | utils/ : Modules 10 | Utility functions for validation and clustering operations. 11 | base : Module 12 | Base class for clustering-like algorithms. 13 | """ 14 | 15 | from itertools import product 16 | from scipy.spatial.distance import cdist 17 | from scipy.special import logsumexp 18 | 19 | from . import RANDOM_STATE 20 | from .utils import * 21 | from .base import * 22 | 23 | ################################# 24 | ## model helpers 25 | ################################# 26 | 27 | # reviewed 28 | def jump_penalty_to_mx(jump_penalty: float, n_c: int) -> np.ndarray: 29 | """ 30 | Convert a scalar jump penalty into a penalty matrix. 31 | 32 | Parameters 33 | ---------- 34 | jump_penalty : float 35 | The scalar value representing the jump penalty. 36 | 37 | n_c : int 38 | The number of clusters or components. 39 | 40 | Returns 41 | ------- 42 | np.ndarray 43 | A matrix of shape (n_c, n_c) where off-diagonal elements are the penalty values 44 | and diagonal elements are zero. 45 | """ 46 | # assert is_numbers(jump_penalty) 47 | return jump_penalty * (np.ones((n_c, n_c)) - np.eye(n_c)) # default dtype is float 48 | 49 | # reviewed 50 | def discretize_prob_simplex(n_c: int, grid_size: float) -> np.ndarray: 51 | """ 52 | Sample grid points on a probability simplex. This function generates all possible 53 | combinations of probabilities that sum to 1, given the grid size. 54 | NB: this operation is of combinatorial complexity. 55 | 56 | Parameters 57 | ---------- 58 | n_c : int 59 | The number of components or clusters. 60 | 61 | grid_size : float 62 | The step size for discretization of the simplex. 63 | 64 | Returns 65 | ------- 66 | np.ndarray 67 | An array of shape (n_candidates, n_c), where each row represents a point on the 68 | simplex. The number of candidates depends on the grid size. 
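        For instance, with two states and a coarse grid, the rows run from all mass
        on the first state to all mass on the last (a quick sketch):

            >>> discretize_prob_simplex(2, 0.5)
            array([[1. , 0. ],
                   [0.5, 0.5],
                   [0. , 1. ]])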
69 | """ 70 | N = int(1/grid_size) 71 | tuples = filter(lambda x: sum(x)==N, product(range(N+1), repeat = n_c)) 72 | lst = np.array(list(tuples)[::-1], dtype=float)/N # (n_candidates, n_c) 73 | return lst 74 | 75 | ################################# 76 | ## DP algo & E step 77 | ################################# 78 | 79 | # reviewed 80 | def dp(loss_mx: np.ndarray, 81 | penalty_mx: np.ndarray, 82 | return_value_mx: bool = False) -> Union[tuple[np.ndarray, float], np.ndarray]: 83 | r""" 84 | Solve the optimization problem involved in the E-step calculation (state assignment), 85 | using a dynamic programming (DP) algorithm. 86 | 87 | The objective is to minimize: 88 | 89 | $$\min \sum_{t=0}^{T-1} L(t, s_t) + \sum_{t=1}^{T-1} \Lambda(s_{t-1}, s_t).$$ 90 | 91 | If some columns of `loss_mx` contain `NaN` values, they are replaced with `inf`, 92 | making those clusters unreachable. 93 | 94 | Note: The DP algorithm cannot be easily sped up using Numba due to issues with 95 | `.min(axis=0)` in Numba. 96 | 97 | Parameters 98 | ---------- 99 | loss_mx : ndarray of shape (n_s, n_c) 100 | The loss matrix, where `L(t, k)` represents the loss for time `t` and state `k`. 101 | 102 | penalty_mx : ndarray of shape (n_c, n_c) 103 | The jump penalty matrix between states. 104 | 105 | return_value_mx : bool, optional (default=False) 106 | If `True`, compute and return the value matrix from the DP algorithm. The value at 107 | each time step `t` is based on all information up to that point, making it suitable 108 | for online inference. 109 | 110 | Returns 111 | ------- 112 | tuple[np.ndarray, float] or np.ndarray 113 | If `return_value_mx` is `False`, returns a tuple containing: 114 | - The optimal state assignments. 115 | - The optimal loss function value. 116 | 117 | If `return_value_mx` is `True`, returns the value matrix. 118 | """ 119 | # valid shape 120 | n_s, n_c = loss_mx.shape 121 | assert penalty_mx.shape == (n_c, n_c) 122 | # replace nan by inf 123 | loss_mx = replace_nan_by_inf(loss_mx) 124 | # DP algo 125 | values, assign = np.empty((n_s, n_c)), np.empty(n_s, dtype=int) 126 | # initial 127 | values[0] = loss_mx[0] 128 | # DP iteration 129 | for t in range(1, n_s): 130 | values[t] = loss_mx[t] + (values[t-1][:, np.newaxis] + penalty_mx).min(axis=0) # values[t-1][:, np.newaxis] turns the (t-1)-th row into a column 131 | # 132 | if return_value_mx: 133 | return values 134 | # find optimal path backwards 135 | assign[-1] = values[-1].argmin() 136 | value_opt = values[-1, assign[-1]] 137 | # traceback 138 | for t in range(n_s - 1, 0, -1): 139 | assign[t-1] = (values[t-1] + penalty_mx[:, assign[t]]).argmin() 140 | return assign, value_opt 141 | 142 | # reviewed 143 | def raise_JM_labels_to_proba(labels_: np.ndarray, n_c: int, prob_vecs: Optional[np.ndarray] = None) -> np.ndarray: 144 | """ 145 | Convert JM labels into a probability matrix. If `prob_vecs` is provided, 146 | the probability matrix is constructed using the probability vectors corresponding to each label. 147 | Otherwise, a hard-clustering probability matrix is created from the labels. 148 | """ 149 | return prob_vecs[labels_] if prob_vecs is not None else raise_labels_into_proba(labels_, n_c) 150 | 151 | # reviewed 152 | def raise_JM_proba_to_df(proba_: np.ndarray, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 153 | """ 154 | Convert a probability matrix into a pandas DataFrame, aligning with the index of the input 155 | data matrix `X`. 
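    As a sanity check of the DP solver `dp` defined above (hypothetical losses): a large
    jump penalty freezes the state path, while a zero penalty reduces to the per-period argmin.

        >>> loss = np.array([[0., 1.], [1., 0.], [1., 0.]])
        >>> dp(loss, jump_penalty_to_mx(10., 2))[0]   # switching is too expensive
        array([1, 1, 1])
        >>> dp(loss, jump_penalty_to_mx(0., 2))[0]    # no penalty: row-wise argmin
        array([0, 1, 1])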
156 | """ 157 | return raise_arr_to_pd_obj(proba_, X, columns_key=None, return_as_ser=False) 158 | 159 | LARGE_FLOAT = 1e100 160 | 161 | # reviewed 162 | def do_E_step(X: np.ndarray, 163 | centers_: np.ndarray, 164 | penalty_mx: np.ndarray, 165 | prob_vecs: Optional[np.ndarray] = None, 166 | return_value_mx: bool = False) -> Union[tuple[np.ndarray, np.ndarray, float], np.ndarray]: 167 | """ 168 | Perform a single E-step: compute the loss matrix and calling the solver. 169 | 170 | This function handles both hard clustering and continuous models. The `centers_` parameter 171 | can contain `NaN` values. It returns the probabilities, labels, and optimal value, where 172 | `labels_` correspond to the state space. 173 | 174 | Parameters 175 | ---------- 176 | X : ndarray of shape (n_s, n_f) 177 | The input data matrix, where `n_s` is the number of samples and `n_f` is the number of features. 178 | 179 | centers_ : ndarray of shape (n_c, n_f) 180 | The cluster centers. Can contain `NaN` values. 181 | 182 | penalty_mx : ndarray of shape (n_c, n_c) 183 | The penalty matrix representing the transition cost between states. 184 | 185 | prob_vecs : ndarray of shape (N, n_c), optional 186 | Probability vectors for the continuous model. If provided, this adjusts the loss matrix. 187 | 188 | return_value_mx : bool, optional (default=False) 189 | If `True`, return the value matrix from the DP algorithm, which can be used for online inference. 190 | 191 | Returns 192 | ------- 193 | tuple[np.ndarray, np.ndarray, float] or np.ndarray 194 | If `return_value_mx` is `False`, returns a tuple containing: 195 | - `proba_` : ndarray of shape (n_s, n_c) 196 | The probability matrix, where each row corresponds to the probabilities for a sample. 197 | - `labels_` : ndarray of shape (n_s,) 198 | The state labels assigned to each sample. 199 | - `val_` : float 200 | The optimal value of the objective function. 201 | 202 | If `return_value_mx` is `True`, returns the value matrix instead of the tuple. 203 | """ 204 | n_c = len(centers_) # (n_c, n_f) 205 | # compute loss matrix 206 | loss_mx = .5 * cdist(X, centers_, "sqeuclidean") # (n_s, n_c) 207 | # contain `nan` if `centers_` contains `nan`. 208 | if prob_vecs is not None: # cont model, (N, n_c) 209 | # replace the nan in loss_mx by a very large floating number 210 | loss_mx = np.nan_to_num(loss_mx, nan=LARGE_FLOAT, posinf=LARGE_FLOAT, neginf=LARGE_FLOAT) 211 | loss_mx = loss_mx @ prob_vecs.T # each pair of loss between period t and candidate vector, (n_s, N) 212 | if return_value_mx: return dp(loss_mx, penalty_mx, return_value_mx=True) 213 | # do a full E step 214 | labels_, val_ = dp(loss_mx, penalty_mx, return_value_mx=False) # output labels_ is of type int 215 | proba_ = raise_JM_labels_to_proba(labels_, n_c, prob_vecs) 216 | return proba_, labels_, val_ # the returned proba_ must be a valid proba arr 217 | 218 | ################################# 219 | ## feature weights 220 | ################################# 221 | 222 | # reviewed 223 | def valid_feat_weights(feat_weights: Optional[SER_ARR_TYPE]) -> None: 224 | """ 225 | Validate the input `feat_weights`, ensuring all weights are non-negative and at least 226 | one is positive. This function is called at the beginning of the method to ensure 227 | the feature weights are valid. 228 | 229 | Parameters 230 | ---------- 231 | feat_weights : Series or ndarray, optional 232 | The array of feature weights to validate. If `None`, no validation is performed. 
233 | 234 | Raises 235 | ------ 236 | AssertionError 237 | If any feature weights are negative or if no positive weights exist. 238 | """ 239 | if feat_weights is None: return 240 | feat_weights_arr = check_1d_array(feat_weights) 241 | assert (feat_weights_arr >= 0.).all(), "Feature weights must be non-negative." 242 | assert (feat_weights_arr > 0.).any(), "At least one feature weight must be positive." 243 | return 244 | 245 | # reviewed 246 | def _valid_shape_X_feat_weights(X: DF_ARR_TYPE, feat_weights: Optional[SER_ARR_TYPE]) -> None: 247 | """ 248 | Assert that the dimensions of the input data matrix `X` and feature weights match. 249 | 250 | Parameters 251 | ---------- 252 | X : DataFrame or ndarray 253 | The input data matrix. 254 | 255 | feat_weights : Series or ndarray, optional 256 | The array of feature weights. If `None`, no assertion is made. 257 | 258 | Raises 259 | ------ 260 | AssertionError 261 | If the dimensions of `X` and `feat_weights` do not match. 262 | """ 263 | if feat_weights is None: 264 | return 265 | if is_ser_df(X) and is_ser_df(feat_weights): 266 | assert (X.columns==feat_weights.index).all(), "Feature mismatch: column names do not match feature weight index." 267 | else: 268 | assert X.shape[1]==len(feat_weights) , "Feature mismatch: number of features does not match feature weights." 269 | return 270 | 271 | # reviewed 272 | def _weight_X(X: DF_ARR_TYPE, feat_weights: Optional[SER_ARR_TYPE]) -> np.ndarray: 273 | """ 274 | Apply feature weights to the input data matrix `X`. If `feat_weights` is `None`, no 275 | weights are applied. It is assumed that dimensions match. 276 | 277 | Parameters 278 | ---------- 279 | X : DataFrame or ndarray 280 | The input data matrix. 281 | 282 | feat_weights : Series or ndarray, optional 283 | The array of feature weights. If `None`, no weighting is applied. 284 | 285 | Returns 286 | ------- 287 | np.ndarray 288 | The weighted data matrix, with the same shape as `X`. 289 | """ 290 | X_arr = check_2d_array(X) 291 | if feat_weights is None: return X_arr 292 | # Apply feature weights 293 | feat_weights_arr = check_1d_array(feat_weights) 294 | return X_arr * feat_weights_arr 295 | 296 | # reviewed 297 | def check_X_with_feat_weights(X: DF_ARR_TYPE, feat_weights: Optional[SER_ARR_TYPE]) -> np.ndarray: 298 | """ 299 | Process the input data matrix `X` and feature weights, returning a weighted version of `X`. 300 | 301 | Parameters 302 | ---------- 303 | X : DataFrame or ndarray 304 | The input data matrix. 305 | 306 | feat_weights : Series or ndarray, optional 307 | The array of feature weights. If `None`, no weighting is applied. 308 | 309 | Returns 310 | ------- 311 | np.ndarray 312 | The weighted data matrix. 313 | """ 314 | # Validate that the dimensions of X and feat_weights match 315 | _valid_shape_X_feat_weights(X, feat_weights) 316 | # Apply feature weights to X 317 | return _weight_X(X, feat_weights) 318 | 319 | ################################# 320 | ## model code 321 | ################################# 322 | 323 | class JumpModel(BaseClusteringAlgo): 324 | """ 325 | Statistical jump model estimation, supporting both discrete and continuous models. 326 | 327 | This class provides methods for fitting and predicting with jump models, using coordinate 328 | descent for optimization. Both discrete and continuous models are supported, with optional 329 | feature weighting and state sorting. 330 | 331 | Parameters 332 | ---------- 333 | n_components : int, default=2 334 | The number of components (states) in the model. 
335 | 336 | jump_penalty : float, default=0. 337 | Penalty term (`lambda`) applied to state transitions in both discrete and continuous models. 338 | 339 | cont : bool, default=False 340 | If `True`, the continuous jump model is used. Otherwise, the discrete model is applied. 341 | 342 | grid_size : float, default=0.05 343 | The grid size for discretizing the probability simplex. Only relevant for the continuous model. 344 | 345 | mode_loss : bool, default=True 346 | Whether to apply the mode loss penalty. Only relevant for the continuous model. 347 | 348 | random_state : int or RandomState, optional (default=None) 349 | Random number seed for reproducibility. 350 | 351 | max_iter : int, default=1000 352 | Maximum number of iterations for the coordinate descent algorithm during model fitting. 353 | 354 | tol : float, default=1e-8 355 | Stopping tolerance for the improvement in objective value during optimization. 356 | 357 | n_init : int, default=10 358 | Number of initializations for the model fitting process. 359 | 360 | verbose : int, default=0 361 | Controls the verbosity of the output. Higher values indicate more verbose output. 362 | 363 | Attributes 364 | ---------- 365 | centers_ : ndarray of shape (n_c, n_f) 366 | The cluster centroids estimated during model fitting. 367 | 368 | labels_ : Series or ndarray 369 | In-sample fitted optimal label sequence. 370 | 371 | proba_ : DataFrame or ndarray 372 | In-sample fitted optimal probability matrix. 373 | 374 | ret_, vol_ : Series or ndarray 375 | The average return (`ret_`) and volatility (`vol_`) for each state. These attributes 376 | are available only if `ret_ser` is provided to the `.fit()` method. 377 | 378 | transmat_ : ndarray of shape (n_c, n_c) 379 | The estimated transition probability matrix between states. 380 | 381 | val_ : float 382 | The optimal value of the loss function. 383 | """ 384 | # reviewed 385 | def __init__(self, 386 | n_components: int = 2, 387 | jump_penalty: float = 0., 388 | cont: bool = False, 389 | grid_size: float = 0.05, 390 | mode_loss: bool = True, 391 | random_state = RANDOM_STATE, 392 | max_iter: int = 1000, 393 | tol: float = 1e-8, 394 | n_init: int = 10, 395 | verbose: int = 0): 396 | super().__init__(int(n_components), n_init, max_iter, tol, random_state, verbose) 397 | self.jump_penalty = jump_penalty 398 | self.cont = cont 399 | self.grid_size = grid_size 400 | self.mode_loss = mode_loss 401 | self.alpha = 2 # the power raised to the jump penalty in CJM 402 | 403 | # reviewed 404 | def check_jump_penalty_mx(self) -> np.ndarray: 405 | """ 406 | Initialize the jump penalty matrix for state transitions. 407 | 408 | - For the discrete model, the state space is {0, 1, ..., n_c - 1}, and the scalar 409 | `jump_penalty` is converted into a matrix. 410 | - For the continuous model, `jump_penalty` is multiplied by the pairwise L1 distance 411 | between probability vectors. Optionally applies a mode loss penalty. 412 | 413 | Returns 414 | ------- 415 | np.ndarray 416 | The jump penalty matrix to be used in the model. 417 | """ 418 | assert is_numbers(self.jump_penalty) 419 | if not self.cont: 420 | self.prob_vecs = None # useful in the E step to tell whether the model is continuous/discrete. 421 | jump_penalty_mx = jump_penalty_to_mx(self.jump_penalty, self.n_components) 422 | else: # continuous model 423 | self.prob_vecs = discretize_prob_simplex(self.n_components, self.grid_size) # state space. 
useful for computing L mx in E step 424 | pairwise_l1_dist = cdist(self.prob_vecs, self.prob_vecs, 'cityblock')/2 425 | jump_penalty_mx = self.jump_penalty * (pairwise_l1_dist ** self.alpha) 426 | if self.mode_loss: # adding mode loss ensures that the penalty mx has correspondence with a TPM. i.e. sum(exp(- )) of every row leads to the same value. 427 | mode_loss = logsumexp(-jump_penalty_mx, axis=1, keepdims=True) 428 | mode_loss -= mode_loss[0] # offset a constant 429 | jump_penalty_mx += mode_loss 430 | self.jump_penalty_mx = jump_penalty_mx # to be used in `.predict()` & `.predict_proba()` 431 | return jump_penalty_mx 432 | 433 | # reviewed 434 | def check_X_predict_func(self, X: DF_ARR_TYPE) -> np.ndarray: 435 | """ 436 | Validate the input data `X` for all prediction methods (but not for fitting), 437 | and apply feature weighting if applicable. Assumes that the model has already 438 | been fitted. 439 | 440 | This method overrides the superclass method. 441 | 442 | Parameters 443 | ---------- 444 | X : DataFrame or ndarray 445 | The input data matrix. 446 | 447 | Returns 448 | ------- 449 | np.ndarray 450 | The weighted input data matrix, if feature weights are provided. 451 | """ 452 | self.is_shape_match_X_centers(X) 453 | feat_weights = getattr_(self, "feat_weights") 454 | return check_X_with_feat_weights(X, feat_weights) 455 | 456 | # reviewed 457 | def fit(self, 458 | X: DF_ARR_TYPE, 459 | ret_ser: Optional[SER_ARR_TYPE] = None, 460 | feat_weights: Optional[SER_ARR_TYPE] = None, 461 | sort_by: Optional[str] = "cumret"): 462 | """ 463 | Fit the jump model using the coordinate descent algorithm. 464 | 465 | The states are sorted by the specified criterion: ["cumret", "vol", "freq", "ret"]. 466 | The Viterbi algorithm is optionally used for state assignment. This choice does 467 | not impact the final numerical results but may affect computational speed. 468 | 469 | Parameters 470 | ---------- 471 | X : DataFrame or ndarray 472 | The input data matrix. 473 | 474 | ret_ser : Series or ndarray, optional 475 | A return series used for sorting states and calculating state-specific returns 476 | and volatilities. 477 | 478 | feat_weights : Series or ndarray, optional 479 | Feature weights to apply to the input data matrix. 480 | 481 | sort_by : ["cumret", "vol", "freq", "ret"], optional (default="cumret") 482 | Criterion for sorting the states. 483 | """ 484 | # valid feat weights 485 | valid_feat_weights(feat_weights) 486 | # check X 487 | X_arr = check_X_with_feat_weights(X, feat_weights) 488 | # save valid feat weights 489 | self.feat_weights = feat_weights 490 | # get attributes 491 | n_c = self.n_components 492 | max_iter = self.max_iter 493 | tol = self.tol 494 | verbose = self.verbose 495 | # make sure the state space, and compute the penalty matrix used for the E step 496 | jump_penalty_mx = self.check_jump_penalty_mx() 497 | # init centers 498 | init_centers_values = self.init_centers(X_arr) 499 | # the best results over all initializations, compare to it in the last part of each iteration 500 | best_val = np.inf 501 | best_res = {} # store: "centers_", "proba_", "labels_". 502 | best_res['labels_'] = None # "labels_" is not always 0/1, but the labels of the state space (candidate prob vecs) 503 | # it is only used to compare whether two inits lead to the same estimation. the final `labels_` is based on `proba_.argmax(axis=1)`. 
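        # Descriptive note on the loop below: for every initialization we alternate an
        # E step (state assignment by the DP solver, given the current centers) with an
        # M step (each center recomputed as the probability-weighted mean of `X`), and
        # stop once the assignment repeats, the objective improves by less than `tol`,
        # or `max_iter` is reached.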
504 | # iter over all the initializations 505 | for n_init_, centers_ in enumerate(init_centers_values): 506 | # initialize the labels and value in the previous iteration. 507 | labels_pre, val_pre = None, np.inf 508 | # do one E step 509 | proba_, labels_, val_ = do_E_step(X_arr, centers_, jump_penalty_mx, prob_vecs=self.prob_vecs) 510 | num_iter = 0 511 | # iterate between M and E steps 512 | while (num_iter < max_iter and (not is_same_clustering(labels_, labels_pre)) and val_pre - val_ > tol): 513 | # update 514 | num_iter += 1 515 | labels_pre, val_pre = labels_, val_ 516 | # M step: update centers 517 | centers_ = weighted_mean_cluster(X_arr, proba_) 518 | # E step 519 | proba_, labels_, val_ = do_E_step(X_arr, centers_, jump_penalty_mx, prob_vecs=self.prob_vecs) 520 | if verbose: print(f"{n_init_}-th init. val: {val_}") 521 | # compare with previous initializations 522 | if (not is_same_clustering(best_res['labels_'], labels_)) and val_ < best_val: 523 | best_idx = n_init_ 524 | best_val = val_ 525 | # save model attributes 526 | best_res['centers_'] = centers_ 527 | best_res['labels_'] = labels_ # only used to compare with later iters, won't permutate 528 | best_res['proba_'] = proba_ 529 | self.val_ = best_val 530 | if verbose: print(f"{best_idx}-th init has the best value: {best_val}.") 531 | # sort states 532 | sort_states_from_ret(ret_ser, X, best_res, sort_by=sort_by) 533 | # save attributes 534 | if ret_ser is not None: 535 | self.ret_ = best_res["ret_"] 536 | self.vol_ = best_res["vol_"] 537 | self.centers_ = best_res['centers_'] # weighted centers 538 | self.proba_ = raise_JM_proba_to_df(best_res['proba_'], X) 539 | self.labels_ = reduce_proba_to_labels(self.proba_) 540 | self.transmat_ = empirical_trans_mx(self.labels_, n_components=n_c) 541 | return self 542 | 543 | # reviewed 544 | def predict_proba_online(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 545 | """ 546 | Predict the probability of each state in an online fashion, where the prediction 547 | for the i-th row is based only on data prior to that row. 548 | 549 | Parameters 550 | ---------- 551 | X : DataFrame or ndarray 552 | The input data matrix. 553 | 554 | Returns 555 | ------- 556 | DataFrame or ndarray 557 | The predicted probabilities for each state. 558 | """ 559 | X_arr = self.check_X_predict_func(X) 560 | value_mx = do_E_step(X_arr, self.centers_, self.jump_penalty_mx, self.prob_vecs, return_value_mx=True) 561 | labels_ = value_mx.argmin(axis=1) 562 | proba_ = raise_JM_labels_to_proba(labels_, self.n_components, self.prob_vecs) 563 | return raise_JM_proba_to_df(proba_, X) 564 | 565 | # reviewed 566 | def predict_online(self, X: DF_ARR_TYPE) -> SER_ARR_TYPE: 567 | """ 568 | Predict the state in an online fashion, where the prediction for the i-th row 569 | is based only on data prior to that row. 570 | 571 | Parameters 572 | ---------- 573 | X : DataFrame or ndarray 574 | The input data matrix. 575 | 576 | Returns 577 | ------- 578 | Series or ndarray 579 | The predicted state labels for each sample. 580 | """ 581 | return reduce_proba_to_labels(self.predict_proba_online(X)) 582 | 583 | # reviewed 584 | def predict_proba(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 585 | """ 586 | Predict the probability of each state, using all available data in `X`. 587 | 588 | Parameters 589 | ---------- 590 | X : DataFrame or ndarray 591 | The input data matrix. 592 | 593 | use_viterbi : bool, optional (default=True) 594 | Whether to use the Viterbi solver. 
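        Unlike `predict_proba_online`, this method runs the DP solver over the entire
        sample, so the assignment at time `t` may also use later observations. A hedged
        usage sketch (`jm` is a fitted model and `X_test` held-out features, both illustrative):

            >>> proba_smoothed = jm.predict_proba(X_test)        # uses all rows
            >>> proba_online = jm.predict_proba_online(X_test)   # row t uses rows up to t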
595 | 596 | Returns 597 | ------- 598 | DataFrame or ndarray 599 | The predicted probabilities for each state. 600 | """ 601 | X_arr = self.check_X_predict_func(X) 602 | proba_, _, _ = do_E_step(X_arr, self.centers_, self.jump_penalty_mx, self.prob_vecs) 603 | return raise_JM_proba_to_df(proba_, X) 604 | 605 | # reviewed 606 | def predict(self, X: DF_ARR_TYPE) -> SER_ARR_TYPE: 607 | """ 608 | Predict the state for each sample, using all available data in `X`. 609 | 610 | Parameters 611 | ---------- 612 | X : DataFrame or ndarray 613 | The input data matrix. 614 | 615 | use_viterbi : bool, optional (default=True) 616 | Whether to use the Viterbi solver. 617 | 618 | Returns 619 | ------- 620 | Series or ndarray 621 | The predicted state labels for each sample. 622 | """ 623 | return reduce_proba_to_labels(self.predict_proba(X)) 624 | -------------------------------------------------------------------------------- /jumpmodels/plot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for plotting functions, especially for visualizing regime identification. 3 | 4 | Depends on: 5 | ----------- 6 | utils/ : Modules 7 | """ 8 | 9 | from .utils import * 10 | 11 | import matplotlib.pyplot as plt 12 | 13 | ALPHA_LINE = .8 14 | ALPHA_FILL = .3 15 | AXES_TYPE = Optional[plt.Axes] 16 | 17 | ############################ 18 | ## matplotlib setting 19 | ############################ 20 | 21 | # reviewed 22 | def matplotlib_setting(): 23 | """ 24 | Set global rcParams for matplotlib to produce nice and large publication-quality figures. 25 | """ 26 | plt.rcParams['figure.figsize'] = (24, 12) 27 | plt.rcParams['axes.titlesize'] = 30 28 | plt.rcParams['axes.labelsize'] = 30 29 | plt.rcParams['xtick.labelsize'] = 30 30 | plt.rcParams['ytick.labelsize'] = 30 31 | plt.rcParams['legend.fontsize'] = 30 32 | plt.rcParams['font.size'] = 26 33 | plt.rcParams['font.family'] = 'cmr10' 34 | plt.rcParams['axes.formatter.use_mathtext'] = True 35 | plt.rcParams['text.usetex'] = True 36 | plt.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}' 37 | plt.rcParams["savefig.dpi"] = 300 38 | plt.rcParams["savefig.bbox"] = "tight" 39 | return 40 | 41 | # Set global matplotlib params 42 | matplotlib_setting() 43 | 44 | ######################################################## 45 | ## File I/O + Function I/O 46 | ######################################################## 47 | 48 | # reviewed 49 | def savefig_plt(filepath, close=False): 50 | """ 51 | Save the current figure to a specified path. Automatically creates the folder if it doesn't exist. 52 | 53 | Parameters 54 | ---------- 55 | filepath : str 56 | The path where the figure should be saved. 57 | 58 | close : bool, optional (default=False) 59 | Whether to close the figure after saving. 60 | """ 61 | check_dir_exist(filepath) 62 | plt.savefig(filepath) 63 | if close: plt.close() 64 | return 65 | 66 | # reviewed 67 | def check_axes(ax: AXES_TYPE = None, nrows=1, ncols=1, figsize_single=(24, 12), **kwargs) -> Union[plt.Axes, np.ndarray]: 68 | """ 69 | Create a new axes if `ax` is None; otherwise return the existing axes. 70 | 71 | Parameters 72 | ---------- 73 | ax : plt.Axes, optional 74 | An existing matplotlib Axes object. If None, a new one is created. 75 | 76 | nrows : int, optional (default=1) 77 | Number of rows for the subplots. 78 | 79 | ncols : int, optional (default=1) 80 | Number of columns for the subplots. 81 | 82 | figsize_single : tuple, optional (default=(24, 12)) 83 | The size of a single subplot. 
84 | 85 | Returns 86 | ------- 87 | plt.Axes or np.ndarray 88 | The axes object(s) for plotting. 89 | """ 90 | if ax is None: 91 | w, h = figsize_single 92 | _, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*w, nrows*h), **kwargs) 93 | return ax 94 | 95 | ######################################################## 96 | ## Plotting Cumulative Returns 97 | ######################################################## 98 | 99 | # Convert y-axis to percent format 100 | from matplotlib.ticker import FuncFormatter 101 | 102 | def convert_yaxis_to_percent(ax: plt.Axes) -> None: 103 | """ 104 | Convert the ticks on the y-axis to percent without decimals (e.g., 4.0 becomes 400%). 105 | """ 106 | def to_percent(x, position): 107 | pos_flag = x >= 0 108 | string = f"{abs(x) * 100:.0f}\\%" 109 | if pos_flag: return string 110 | return "$-$" + string 111 | ax.yaxis.set_major_formatter(FuncFormatter(to_percent)) 112 | return 113 | 114 | # reviewed 115 | def plot_cumret(ret_df: Union[PD_TYPE, dict], 116 | start_date: DATE_TYPE = None, 117 | end_date: DATE_TYPE = None, 118 | ax: AXES_TYPE = None, 119 | ylabel_ret="Cumulative Returns", 120 | ) -> plt.Axes: 121 | """ 122 | Plot the cumulative returns from a return DataFrame or dictionary. 123 | 124 | Parameters 125 | ---------- 126 | ret_df : DataFrame or dict 127 | The input return data for computing cumulative returns. 128 | 129 | start_date : str or datetime.date, optional 130 | The start date for the plot. Defaults to None. 131 | 132 | end_date : str or datetime.date, optional 133 | The end date for the plot. Defaults to None. 134 | 135 | ax : plt.Axes, optional 136 | The axes on which to plot. If None, a new one is created. 137 | 138 | ylabel_ret : str, optional (default="Cumulative Returns") 139 | The label for the y-axis. 140 | 141 | Returns 142 | ------- 143 | plt.Axes 144 | The axes object with the plotted cumulative returns. 145 | """ 146 | ax = check_axes(ax) 147 | # Process and filter the return data 148 | ret_df = filter_date_range(pd.DataFrame(ret_df), start_date, end_date) 149 | ret_df.index.name = None 150 | # plot cumret 151 | ret_df.cumsum(axis=0).plot(ax=ax) 152 | # set ax attrs 153 | ax.set(ylabel=ylabel_ret) 154 | convert_yaxis_to_percent(ax) 155 | return ax 156 | 157 | ############################ 158 | ## plot regimes 159 | ############################ 160 | 161 | # reviewed 162 | def fill_between(ser: pd.Series, 163 | start_date: DATE_TYPE = None, 164 | end_date: DATE_TYPE = None, 165 | ax: AXES_TYPE = None, 166 | color: Optional[str] = None, 167 | fill_between_label: Optional[str] = None) -> plt.Axes: 168 | """ 169 | Fill the area between a curve and the x-axis with a specified color and label. 170 | 171 | Parameters 172 | ---------- 173 | ser : pd.Series 174 | The data series to plot. 175 | 176 | start_date : str or datetime.date, optional 177 | The start date for the plot. Defaults to None. 178 | 179 | end_date : str or datetime.date, optional 180 | The end date for the plot. Defaults to None. 181 | 182 | ax : plt.Axes, optional 183 | The axes on which to plot. If None, a new one is created. 184 | 185 | color : str, optional 186 | The fill color. Defaults to None. 187 | 188 | fill_between_label : str, optional 189 | The label for the filled area. Defaults to None. 190 | 191 | Returns 192 | ------- 193 | plt.Axes 194 | The axes object with the filled area plot. 
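    A minimal usage sketch (assuming `bear_proba` is a pandas Series of bear-regime
    probabilities indexed by date; the name is illustrative):

        >>> ax = check_axes()   # new 24x12 figure/axes
        >>> fill_between(bear_proba, ax=ax, color='r', fill_between_label='Bear')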
195 | """ 196 | ax = check_axes(ax) 197 | # filter dates 198 | ser = filter_date_range(ser, start_date, end_date) 199 | # plot 200 | ax.fill_between(ser.index, ser, step="pre", alpha=ALPHA_FILL, color=color, label=fill_between_label) 201 | ax.legend() 202 | return ax 203 | 204 | # reviewed 205 | def plot_regimes(regimes: PD_TYPE, 206 | n_c: int = 2, 207 | start_date: DATE_TYPE = None, 208 | end_date: DATE_TYPE = None, 209 | ax: AXES_TYPE = None, 210 | colors_regimes: Optional[list] = ['g', 'r'], 211 | labels_regimes: Optional[list] = ['Bull', 'Bear'], 212 | ) -> plt.Axes: 213 | """ 214 | Plot regime identification based on a 1D label series or 2D probability matrix. 215 | 216 | Parameters 217 | ---------- 218 | regimes : DataFrame or Series 219 | The regime data to plot. A integer sequence from {0, 1, ..., n_c-1} if 1D input, 220 | or a probability matrix of shape (n_s, n_c) 221 | 222 | n_c : int, optional (default=2) 223 | The number of components (regimes) to plot. 224 | 225 | start_date : str or datetime.date, optional 226 | The start date for the plot. Defaults to None. 227 | 228 | end_date : str or datetime.date, optional 229 | The end date for the plot. Defaults to None. 230 | 231 | ax : plt.Axes, optional 232 | The axes on which to plot. If None, a new one is created. 233 | 234 | colors_regimes : list, optional 235 | The colors for the regimes. Defaults to ['g', 'r'] (`n_c = 2`). 236 | if `None`, colors will be automatically generated. 237 | 238 | labels_regimes : list, optional 239 | The labels for the regimes. Defaults to ['Bull', 'Bear'] (`n_c = 2`). 240 | if `None`, labels will be automatically generated. 241 | 242 | Returns 243 | ------- 244 | plt.Axes 245 | The axes object with the regime plot. 246 | """ 247 | regimes = filter_date_range(regimes, start_date, end_date) 248 | if is_ser(regimes): 249 | regimes = pd.DataFrame(raise_labels_into_proba(regimes.to_numpy(), n_c=n_c), index=regimes.index) 250 | assert regimes.shape[1]==n_c, "Mismatch between number of components and regime data shape." 251 | ax = check_axes(ax) 252 | # color list 253 | if colors_regimes is None: # generate color list 254 | color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color'] 255 | colors_regimes = [color_cycle[i % len(color_cycle)] for i in range(n_c)] 256 | else: 257 | assert len(colors_regimes) == n_c, "Mismatch between length of color list and number of components. You can input `colors_regimes = None` for colors to be generated authomatically." 258 | # labels 259 | if labels_regimes is None: 260 | labels_regimes = [f"Regime {i}" for i in range(1, n_c+1)] 261 | else: 262 | assert len(labels_regimes) == n_c, "Mismatch between length of label list and number of components. You can input `labels_regimes = None` for labels to be generated authomatically." 263 | # plot 264 | for i in range(n_c): 265 | fill_between(regimes.iloc[:, i], ax=ax, color=colors_regimes[i], fill_between_label=labels_regimes[i]) 266 | return ax 267 | 268 | # reviewed 269 | def plot_regimes_and_cumret(regimes: PD_TYPE, 270 | ret_df: Union[PD_TYPE, dict], 271 | n_c: int = 2, 272 | start_date: DATE_TYPE = None, 273 | end_date: DATE_TYPE = None, 274 | ax: AXES_TYPE = None, 275 | colors_regimes: Optional[list] = ['g', 'r'], 276 | labels_regimes: Optional[list] = ['Bull', 'Bear'], 277 | ylabel_ret="Cumulative Returns", 278 | legend_loc="upper left" 279 | ) -> tuple[plt.Axes, plt.Axes]: 280 | """ 281 | Plot cumulative returns alongside regime identification in a single figure. 
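    For example, a sketch using a fitted model `jm` and its return series (names,
    dates, and the output path are illustrative):

        >>> ax, ax2 = plot_regimes_and_cumret(jm.proba_, ret_ser, start_date="2007-01-01")
        >>> savefig_plt("plots/JM_train.pdf", close=True)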
282 | 283 | Parameters 284 | ---------- 285 | regimes : DataFrame or Series 286 | The regime data to plot. A integer sequence from {0, 1, ..., n_c-1} if 1D input, 287 | or a probability matrix of shape (n_s, n_c) 288 | 289 | ret_df : DataFrame or dict 290 | The return data to plot. 291 | 292 | n_c : int, optional (default=2) 293 | The number of regimes/components. 294 | 295 | start_date : str or datetime.date, optional 296 | The start date for the plot. Defaults to None. 297 | 298 | end_date : str or datetime.date, optional 299 | The end date for the plot. Defaults to None. 300 | 301 | ax : plt.Axes, optional 302 | The axes on which to plot. If None, a new one is created. 303 | 304 | colors_regimes : list, optional 305 | The colors for the regimes. Defaults to ['g', 'r'] (`n_c = 2`). 306 | if `None`, colors will be automatically generated. 307 | 308 | labels_regimes : list, optional 309 | The labels for the regimes. Defaults to ['Bull', 'Bear'] (`n_c = 2`). 310 | if `None`, labels will be automatically generated. 311 | 312 | ylabel_ret : str, optional 313 | The label for the cumulative return y-axis. 314 | 315 | legend_loc : str, optional 316 | The location of the legend. 317 | 318 | Returns 319 | ------- 320 | tuple 321 | The axes objects for cumulative returns and regimes. 322 | """ 323 | # plot cumret 324 | ax = plot_cumret(ret_df, start_date=start_date, end_date=end_date, ax=ax, ylabel_ret=ylabel_ret) 325 | # plot regimes 326 | ax2 = ax.twinx() 327 | ax2.set(ylabel="Regime") 328 | plot_regimes(regimes, n_c, start_date=start_date, end_date=end_date, ax=ax2, colors_regimes=colors_regimes, labels_regimes=labels_regimes) 329 | # merge legneds 330 | lines, labels = ax.get_legend_handles_labels() 331 | lines2, labels2 = ax2.get_legend_handles_labels() 332 | legend = ax2.legend(lines + lines2, labels + labels2, loc=legend_loc) 333 | return (ax, ax2) 334 | -------------------------------------------------------------------------------- /jumpmodels/preprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for data preprocessing. 3 | 4 | This module contains classes for scaling and clipping data, with a focus on 5 | handling pandas DataFrame input/output. 6 | 7 | Depends on 8 | ---------- 9 | utils/ : Modules 10 | """ 11 | 12 | from .utils import * 13 | 14 | from sklearn.base import BaseEstimator 15 | from sklearn.preprocessing import StandardScaler 16 | 17 | ############################################ 18 | ## Scaler 19 | ############################################ 20 | 21 | # reviewed 22 | class StandardScalerPD(BaseEstimator): 23 | """ 24 | Provides support for pandas DataFrame input/output with the `StandardScaler()` class. 25 | 26 | This class extends the functionality of the standard `StandardScaler` by ensuring that 27 | the input and output are handled as pandas DataFrames, preserving index and column labels. 28 | """ 29 | def init_scaler(self): 30 | """ 31 | Initialize and return the standard `StandardScaler` instance. 32 | """ 33 | return StandardScaler() 34 | 35 | def fit_transform(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 36 | """ 37 | Fit the scaler to the DataFrame and transform it in one step. 38 | 39 | Parameters 40 | ---------- 41 | X : DataFrame or ndarray 42 | The input DataFrame to be scaled. 43 | 44 | Returns 45 | ------- 46 | DataFrame or ndarray 47 | The scaled DataFrame. 48 | """ 49 | return self.fit(X).transform(X) 50 | 51 | def fit(self, X: DF_ARR_TYPE): 52 | """ 53 | Fit the scaler to the input DataFrame. 
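        The typical preprocessing chain pairs this scaler with the `DataClipperStd`
        defined below; a sketch with hypothetical train/test frames:

            >>> clipper, scaler = DataClipperStd(mul=3.), StandardScalerPD()
            >>> X_train_processed = scaler.fit_transform(clipper.fit_transform(X_train))
            >>> X_test_processed = scaler.transform(clipper.transform(X_test))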
54 | 55 | Parameters 56 | ---------- 57 | X : DataFrame or ndarray 58 | The input DataFrame to be used for fitting. 59 | 60 | Returns 61 | ------- 62 | self 63 | """ 64 | self.scaler = self.init_scaler().fit(X) 65 | return self 66 | 67 | def transform(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 68 | """ 69 | Transform the input DataFrame using the fitted scaler. 70 | 71 | Parameters 72 | ---------- 73 | X : DataFrame or ndarray 74 | The input DataFrame to be transformed. 75 | 76 | Returns 77 | ------- 78 | DataFrame or ndarray 79 | The transformed (scaled) DataFrame. 80 | """ 81 | return raise_arr_to_pd_obj(self.scaler.transform(X), X, return_as_ser=False) 82 | 83 | ############################################ 84 | ## Clipper 85 | ############################################ 86 | 87 | # reviewed 88 | class BaseDataClipper(BaseEstimator): 89 | """ 90 | Base class for data clippers. 91 | 92 | This class implements the `.transform()` and `.fit_transform()` methods, but leaves the `.fit()` 93 | method to be implemented in subclasses. It is designed to clip data values within a specified range. 94 | 95 | Should be inherited by other classes that define the clipping bounds. 96 | """ 97 | def __init__(self) -> None: 98 | self.lb = None # Lower bound, initialized as None. Must be a numpy array. 99 | self.ub = None # Upper bound, initialized as None. Must be a numpy array. 100 | 101 | def fit(self, X: DF_ARR_TYPE): 102 | raise NotImplementedError() 103 | 104 | def fit_transform(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 105 | """ 106 | Fit the clipper and transform the input data in one step. 107 | 108 | Parameters 109 | ---------- 110 | X : DataFrame or ndarray 111 | The input data to be clipped. 112 | 113 | Returns 114 | ------- 115 | DataFrame or ndarray 116 | The clipped data. 117 | """ 118 | return self.fit(X).transform(X) 119 | 120 | def transform(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 121 | """ 122 | Clip the input data using the fitted lower (`lb`) and upper (`ub`) bounds. 123 | 124 | Parameters 125 | ---------- 126 | X : DataFrame or ndarray 127 | The input data to be clipped. 128 | 129 | Returns 130 | ------- 131 | DataFrame or ndarray 132 | The clipped data. 133 | """ 134 | if self.ub is None and self.lb is None: return X 135 | return np.clip(X, self.lb, self.ub) 136 | 137 | # reviewed 138 | class DataClipperStd(BaseDataClipper): 139 | """ 140 | Data clipper based on feature standard deviation. 141 | 142 | This class performs winsorization of the data, clipping it within a specified multiple of the 143 | feature's standard deviation. The clipping bounds are defined as: 144 | 145 | lower bound = mean - (mul * std) 146 | upper bound = mean + (mul * std) 147 | 148 | Parameters 149 | ---------- 150 | mul : float, default=3. 151 | The multiple of the feature's standard deviation used for clipping. 152 | 153 | Attributes 154 | ---------- 155 | lb : ndarray 156 | The lower bound for each feature, calculated as mean - (mul * std). 157 | 158 | ub : ndarray 159 | The upper bound for each feature, calculated as mean + (mul * std). 160 | """ 161 | def __init__(self, mul: float = 3.) -> None: 162 | super().__init__() 163 | self.mul = mul 164 | 165 | def fit(self, X: DF_ARR_TYPE): 166 | """ 167 | Fit the clipper to the data by calculating the clipping bounds based on 168 | the mean and standard deviation of each feature. 169 | 170 | Parameters 171 | ---------- 172 | X : DataFrame or ndarray 173 | The input data to fit the clipper. 
174 | 175 | Returns 176 | ------- 177 | DataClipperStd 178 | The fitted clipper instance. 179 | """ 180 | mul = self.mul 181 | assert mul > 0, "The multiplier `mul` must be positive." 182 | 183 | mean, std = X.mean(axis=0), X.std(axis=0, ddof=0) 184 | if is_df(X): 185 | mean = mean.to_numpy() 186 | std = std.to_numpy() 187 | self.lb = mean - mul * std; assert isinstance(self.lb, np.ndarray) 188 | self.ub = mean + mul * std; assert isinstance(self.ub, np.ndarray) 189 | return self 190 | -------------------------------------------------------------------------------- /jumpmodels/sparse_jump.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Sparse Jump Models (SJMs). 3 | 4 | This module provides an implementation of sparse jump models, extending the jump model 5 | with additional support for feature selection through Lasso-like optimization. 6 | 7 | Depends on 8 | ---------- 9 | utils/ : Modules 10 | Utility functions for validation and clustering operations. 11 | jump : Module 12 | Discrete and continuous jump models. 13 | """ 14 | 15 | from .utils import * 16 | from .jump import * 17 | 18 | from numpy.linalg import norm 19 | 20 | ######################################################## 21 | ## Lasso Problem for Feature Weights 22 | ######################################################## 23 | 24 | # reviewed 25 | def binary_search_decrease(func, 26 | left: float, 27 | right: float, 28 | value: float, 29 | *args, 30 | tol_x: float = 1e-8, 31 | tol_y: float = 0., 32 | max_iter: int = 100, 33 | verbose: int = 0, 34 | **kwargs) -> float: 35 | """ 36 | Binary search for a decreasing function. 37 | 38 | This method performs binary search to find the point where the function `func` 39 | decreases to a specified value within given tolerances. 40 | 41 | Parameters 42 | ---------- 43 | func : callable 44 | The function to be minimized. 45 | 46 | left : float 47 | The left bound for the search. 48 | 49 | right : float 50 | The right bound for the search. 51 | 52 | value : float 53 | The target value to find. 54 | 55 | tol_x : float, optional (default=1e-8) 56 | The tolerance for the search along the x-axis. 57 | 58 | tol_y : float, optional (default=0.) 59 | The tolerance for the search along the y-axis (function value). 60 | 61 | max_iter : int, optional (default=100) 62 | Maximum number of iterations. 63 | 64 | verbose : int, optional (default=0) 65 | Verbosity level. If greater than 0, prints progress information. 66 | 67 | Returns 68 | ------- 69 | float 70 | The optimal point where the function reaches the target value. 71 | """ 72 | if value >= func(left): return left 73 | if value <= func(right): return right 74 | # 75 | gap = right-left 76 | num_iter = 0 77 | while (gap > tol_x and num_iter < max_iter): 78 | # print(f"{left}, {right}") 79 | num_iter += 1 80 | middle = (right + left) / 2 81 | func_call = func(middle, *args, **kwargs) 82 | if verbose: print("x value", middle, "y value", func_call) 83 | if func_call < value-tol_y/2: 84 | right = middle 85 | elif func_call > value+tol_y/2: 86 | left = middle 87 | else: 88 | return middle 89 | gap /= 2 90 | if num_iter < max_iter: 91 | return middle 92 | raise Exception("Non-convergence: Possible mathematical error.") 93 | 94 | # reviewed 95 | def soft_thres_l2_normalized(x: SER_ARR_TYPE, thres: float = 0.) -> SER_ARR_TYPE: 96 | """ 97 | Soft thresholding for a non-negative vector `x`, followed by L2 normalization. 
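    For example (a quick check; note that at least one entry must survive the threshold,
    otherwise the normalization assertion fails):

        >>> soft_thres_l2_normalized(np.array([3., 4., 0.]), thres=0.)
        array([0.6, 0.8, 0. ])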
98 | 99 | Parameters 100 | ---------- 101 | x : Series or ndarray 102 | The input vector to be thresholded and normalized. 103 | 104 | thres : float, optional (default=0.) 105 | The threshold for soft thresholding. 106 | 107 | Returns 108 | ------- 109 | Series or ndarray 110 | The thresholded and L2-normalized vector. 111 | """ 112 | y = np.maximum(0, x-thres) 113 | y_norm = norm(y) 114 | assert y_norm > 0 115 | return y / y_norm 116 | 117 | # reviewed 118 | def solve_lasso(a: SER_ARR_TYPE, 119 | norm_ub: float, 120 | tol: float = 1e-8) -> SER_ARR_TYPE: 121 | """ 122 | Solve the Lasso problem for feature weights. 123 | 124 | This function finds the optimal feature weights subject to the constraint that the 125 | L1-norm of the weights is bounded by `norm_ub`. 126 | 127 | Parameters 128 | ---------- 129 | a : Series or ndarray 130 | The input vector for the Lasso problem. 131 | 132 | norm_ub : float 133 | The upper bound for the L1-norm of the feature weights. 134 | Equals to `kappa` in the published articles. 135 | 136 | tol : float, optional (default=1e-8) 137 | The tolerance for the binary search. 138 | 139 | Returns 140 | ------- 141 | Series or ndarray 142 | The optimized feature weights. 143 | """ 144 | assert norm_ub >= 1. 145 | a_arr = check_1d_array(a) 146 | left, right = 0., np.unique(a_arr)[-2] # right is the second largest element of `a` 147 | if right < tol: thres_sol = 0. 148 | else: 149 | func = lambda thres: soft_thres_l2_normalized(a_arr, thres).sum() 150 | thres_sol = binary_search_decrease(func, left, right, norm_ub, tol_x=tol) 151 | # return thres_sol 152 | w = soft_thres_l2_normalized(a_arr, thres_sol) 153 | return raise_arr_to_pd_obj(w, a) 154 | 155 | # reviewed 156 | def compute_BCSS(X: DF_ARR_TYPE, 157 | proba_: DF_ARR_TYPE, 158 | centers_: Optional[np.ndarray] = None, 159 | tol: float = 1e-6) -> SER_ARR_TYPE: 160 | """ 161 | Compute the Between Cluster Sum of Squares (BCSS). 162 | 163 | The BCSS is computed based on the cluster centers and probabilities. If no centers are provided, 164 | they will be computed from probabilities. Any BCSS values below the tolerance are set to zero. 165 | 166 | Parameters 167 | ---------- 168 | X : DataFrame or ndarray 169 | The input data matrix. 170 | 171 | proba_ : DataFrame or ndarray 172 | The cluster assignment probabilities. 173 | 174 | centers_ : ndarray, optional 175 | The cluster centers. NA values are acceptable. 176 | If not provided, they are estimated from the data. 177 | 178 | tol : float, optional (default=1e-6) 179 | The tolerance for setting BCSS values to zero. 180 | 181 | Returns 182 | ------- 183 | Series or ndarray 184 | The BCSS values for each feature. 185 | """ 186 | X_arr, proba_arr = check_2d_array(X), check_2d_array(proba_) 187 | if centers_ is None: centers_ = weighted_mean_cluster(X_arr, proba_arr) 188 | # replace NAs in centers with 0. won't affect computation 189 | centers_ = np.nan_to_num(centers_, nan=0.) 190 | # assert not np.isnan(centers_).any() 191 | Ns = proba_arr.sum(axis=0) 192 | BCSS = Ns @ ((centers_ - X_arr.mean(axis=0))**2) 193 | BCSS = set_zero_arr(BCSS, tol=tol) 194 | assert not np.isnan(BCSS).any() 195 | return raise_arr_to_pd_obj(BCSS, X, index_key="columns") 196 | 197 | ############################ 198 | ## SJM 199 | ############################ 200 | 201 | class SparseJumpModel(BaseEstimator): 202 | """ 203 | Sparse Jump Model (SJM) with feature selection. 
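    Feature relevance is scored by the between-cluster sum of squares from `compute_BCSS`,
    and the weight vector follows from `solve_lasso`; inside `.fit()` below the per-iteration
    weight update is essentially (using the names from that method):

        >>> BCSS = compute_BCSS(X, jm.proba_, centers_unweighted)
        >>> w = solve_lasso(BCSS / BCSS.max(), norm_ub=np.sqrt(max_feats))
        >>> feat_weights = np.sqrt(w)   # weights actually applied to the features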
204 | 205 | This model extends the standard jump model by incorporating a Lasso-like feature 206 | selection process, where the number of selected features is controlled by `max_feats`. 207 | 208 | Parameters 209 | ---------- 210 | n_components : int, default=2 211 | Number of components (clusters). 212 | 213 | max_feats : float, default=100. 214 | Controls the number of features included. This is the square of `kappa`, and 215 | represents the effective number of features. 216 | 217 | jump_penalty : float, default=0. 218 | The jump penalty. In SJM, this penalty is scaled by 219 | `1 / sqrt(n_features)` since features are weighted. 220 | 221 | cont : bool, default=False 222 | If `True`, the continuous jump model is used. Otherwise, the discrete model is applied. 223 | 224 | grid_size : float, default=0.05 225 | The grid size for discretizing the probability simplex (only used for continuous models). 226 | 227 | mode_loss : bool, default=True 228 | Whether to apply the mode loss penalty (only relevant for continuous models). 229 | 230 | random_state : int or RandomState, optional 231 | Random number generator seed for reproducibility. 232 | 233 | max_iter : int, default=30 234 | Maximum number of iterations for the coordinate descent algorithm in feature selection. 235 | 236 | tol_w : float, default=1e-4 237 | Tolerance for stopping the optimization of feature weights. 238 | 239 | max_iter_jm : int, default=1000 240 | Maximum number of iterations for the jump model fitting process. 241 | 242 | tol_jm : float, default=1e-8 243 | Stopping tolerance for the jump model fitting. 244 | 245 | n_init_jm : int, default=10 246 | Number of initializations for the jump model. 247 | 248 | verbose : int, default=0 249 | Controls the verbosity of the output. 250 | 251 | Attributes 252 | ---------- 253 | jm_ins : JumpModel 254 | The fitted jump model instance, with feature weighting. 255 | 256 | feat_weights : ndarray 257 | The optimal feature weights. 258 | Square root of the `w` vector in the oroginal SJM formulation. 259 | 260 | labels_ : Series or ndarray 261 | In-sample optimal state assignments. 262 | 263 | proba_ : DataFrame or ndarray 264 | In-sample optimal probability matrix. 265 | 266 | ret_, vol_ : Series or ndarray 267 | Average return (`ret_`) and volatility (`vol_`) for each state, if `ret_ser` is provided. 268 | 269 | centers_ : ndarray 270 | The weighted cluster centers. 271 | """ 272 | # reviewed 273 | def __init__(self, 274 | n_components: int = 2, 275 | max_feats: float = 100., 276 | jump_penalty: float = 0., 277 | cont: bool = False, 278 | grid_size: float = 0.05, 279 | mode_loss: bool = True, 280 | random_state = RANDOM_STATE, 281 | max_iter: int = 30, 282 | tol_w: float = 1e-4, 283 | max_iter_jm: int = 1000, 284 | tol_jm: float = 1e-8, 285 | n_init_jm: int = 10, 286 | verbose: int = 0): 287 | self.n_components = int(n_components) 288 | self.max_feats = max_feats 289 | self.jump_penalty = jump_penalty 290 | self.cont = cont 291 | self.grid_size = grid_size 292 | self.mode_loss = mode_loss 293 | self.random_state = random_state 294 | self.max_iter = max_iter 295 | self.tol_w = tol_w 296 | self.max_iter_jm = max_iter_jm 297 | self.tol_jm = tol_jm 298 | self.n_init_jm = n_init_jm 299 | self.verbose = verbose 300 | 301 | # reviewed 302 | def init_jm(self): 303 | """ 304 | Initialize the jump model instance with scaled jump penalty. 
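
        The raw `jump_penalty` is divided by ``sqrt(n_features_all)``, since the
        features are weighted; for instance, with ``jump_penalty=50`` and nine
        features the inner `JumpModel` receives a penalty of 50 / 3 ≈ 16.7.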
305 | """ 306 | jump_penalty = self.jump_penalty / np.sqrt(self.n_features_all) 307 | jm = JumpModel(n_components=self.n_components, 308 | jump_penalty=jump_penalty, 309 | cont=self.cont, 310 | grid_size=self.grid_size, 311 | mode_loss=self.mode_loss, 312 | random_state=self.random_state, 313 | max_iter=self.max_iter_jm, 314 | tol=self.tol_jm, 315 | n_init=self.n_init_jm, 316 | verbose=decre_verbose(self.verbose)) 317 | self.jm_ins = jm 318 | return jm 319 | 320 | # reviewed 321 | def print_log(self, n_iter, BCSS, w): 322 | """ 323 | Print fitting logs if verbosity is enabled. 324 | """ 325 | if self.verbose: 326 | print("Iter:", n_iter) 327 | print("BCSS:\n", BCSS) #, "sum:", BCSS.sum() 328 | print("w:\n", w, "\n") 329 | return 330 | 331 | # reviewed 332 | def fit(self, 333 | X: DF_ARR_TYPE, 334 | ret_ser: Optional[SER_ARR_TYPE] = None, 335 | sort_by: Optional[str] = "cumret"): 336 | """ 337 | Fit the sparse jump model using coordinate descent. 338 | 339 | This method iteratively optimizes the feature weights and fits the jump model 340 | on the weighted data. 341 | 342 | Parameters 343 | ---------- 344 | X : DataFrame or ndarray 345 | The input data matrix. 346 | 347 | ret_ser : Series or ndarray, optional 348 | A return series used for sorting states. 349 | 350 | sort_by : ["cumret", "vol", "freq", "ret"], optional (default="cumret") 351 | Criterion for sorting states. 352 | 353 | Returns 354 | ------- 355 | SparseJumpModel 356 | The fitted sparse jump model. 357 | """ 358 | # 359 | X_arr = check_2d_array(X) 360 | self.n_features_all = X_arr.shape[1] 361 | # jm ins 362 | jm = self.init_jm() 363 | # get attrs 364 | max_iter = self.max_iter 365 | tol_w = self.tol_w 366 | norm_ub = np.sqrt(self.max_feats) 367 | # 368 | w_old = np.ones(self.n_features_all)*2 # not a valid weight, only used for entering the 1st iter 369 | w = np.ones(self.n_features_all) / np.sqrt(self.n_features_all) # initial weight # np.repeat(1/np.sqrt(self.n_features_all), self.n_features_all) 370 | n_iter = 0 371 | while (n_iter < max_iter and norm(w-w_old, 1) / norm(w_old, 1) > tol_w): 372 | # 373 | n_iter += 1 374 | w_old = w 375 | # Step 1: fix w, fit JM 376 | feat_weights = np.sqrt(w) 377 | # use the previous optimal center, weighted by the most recent w, as an initialization 378 | if n_iter > 1: jm.centers_ = centers_unweighted * feat_weights 379 | # fit JM on weighted data 380 | jm.fit(X, ret_ser=ret_ser, feat_weights=feat_weights, sort_by=sort_by) 381 | # Step 2: optimize w 382 | # update (unweighted) centers 383 | centers_unweighted = weighted_mean_cluster(X_arr, jm.proba_) 384 | # compute BCSS on the original data 385 | BCSS = compute_BCSS(X_arr, jm.proba_, centers_unweighted) 386 | if (BCSS <= 0).all(): # all in one cluster 387 | self.print_log(n_iter, BCSS, w) 388 | break 389 | w = solve_lasso(BCSS/BCSS.max(), norm_ub) 390 | self.print_log(n_iter, BCSS, w) 391 | # best res 392 | self.w = raise_arr_to_pd_obj(w, X, index_key="columns") 393 | self.feat_weights = raise_arr_to_pd_obj(jm.feat_weights, X, index_key="columns") 394 | self.centers_ = jm.centers_ # weighted centers 395 | # self.centers_ = weighted_mean_cluster(X_arr, jm.proba_, ) 396 | self.labels_ = jm.labels_ 397 | self.proba_ = jm.proba_ 398 | if ret_ser is not None: 399 | self.ret_ = jm.ret_ 400 | self.vol_ = jm.vol_ 401 | return self 402 | 403 | def predict_proba_online(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 404 | """ 405 | Predict state probabilities in an online fashion. 
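
        The call is forwarded to the fitted inner `JumpModel` instance (`jm_ins`),
        so `fit` must be called before this method.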
406 | """ 407 | return self.jm_ins.predict_proba_online(X) 408 | 409 | def predict_online(self, X: DF_ARR_TYPE) -> SER_ARR_TYPE: 410 | """ 411 | Predict states in an online fashion. 412 | """ 413 | return self.jm_ins.predict_online(X) 414 | 415 | def predict_proba(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 416 | """ 417 | Predict state probabilities using all available data. 418 | """ 419 | return self.jm_ins.predict_proba(X) 420 | 421 | def predict(self, X: DF_ARR_TYPE) -> SER_ARR_TYPE: 422 | """ 423 | Predict states using all available data. 424 | """ 425 | return self.jm_ins.predict(X) -------------------------------------------------------------------------------- /jumpmodels/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Although this import style is generally discouraged, 2 | # it works well for our codebase given the simple structure 3 | from .validation import * 4 | from .index import * 5 | from .calculation import * 6 | from .cluster import * -------------------------------------------------------------------------------- /jumpmodels/utils/calculation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for basic numerical calculations. 3 | 4 | This module focuses on numerical calculations with special attention to `numpy` behaviors 5 | involving NaN and infinity: 6 | 7 | - 0. / 0. = np.nan 8 | - 0. * np.inf = np.nan 9 | - 0. * np.nan = np.nan 10 | - 1. / 0. = np.inf 11 | - -1. / 0. = -np.inf 12 | 13 | Typically, it is rare for a statement to directly yield `np.inf`; the first two examples 14 | are the most common cases. 15 | 16 | Depends on 17 | ---------- 18 | utils.validation : Module 19 | """ 20 | 21 | from .validation import * 22 | 23 | # will not raise warnings if: divide by zero, take sqrt of nega values 24 | np.seterr(divide="ignore", invalid="ignore") 25 | 26 | # reviewed 27 | def set_zero_arr(x: np.ndarray, tol=1e-6) -> np.ndarray: 28 | """ 29 | Set elements of a numpy array that are close to zero to exactly zero. 30 | 31 | Parameters 32 | ---------- 33 | x : ndarray 34 | The input numpy array. 35 | 36 | tol : float, optional (default=1e-6) 37 | The tolerance value. Elements with absolute values smaller than `tol` 38 | are set to zero. 39 | 40 | Returns 41 | ------- 42 | ndarray 43 | A numpy array with near-zero values replaced by exact zeros. 44 | """ 45 | return np.where(np.abs(x) < tol, 0., x) 46 | 47 | # reviewed 48 | def replace_inf_by_nan(x: Union[float, np.ndarray]) -> Union[float, np.ndarray]: 49 | """ 50 | Replace both positive and negative infinity values with NaN in a float or numpy array. 51 | 52 | Parameters 53 | ---------- 54 | x : float or ndarray 55 | The input float or numpy array. 56 | 57 | Returns 58 | ------- 59 | float or ndarray 60 | A float or numpy array with infinities replaced by NaN. 61 | """ 62 | return np.where(np.isinf(x), np.nan, x) 63 | 64 | # reviewed 65 | def replace_nan_by_inf(x: Union[float, np.ndarray]) -> Union[float, np.ndarray]: 66 | """ 67 | Replace all NaN values with positive infinity in a float or numpy array. 68 | 69 | Parameters 70 | ---------- 71 | x : float or ndarray 72 | The input float or numpy array. 73 | 74 | Returns 75 | ------- 76 | float or ndarray 77 | A float or numpy array with NaN values replaced by infinity. 
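
    Examples
    --------
    A small illustrative check (relies on the module-level `numpy` import):

    >>> replace_nan_by_inf(np.array([0., np.nan])).tolist()
    [0.0, inf]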
78 | """ 79 | return np.where(np.isnan(x), np.inf, x) 80 | 81 | # reviewed 82 | def decre_verbose(verbose: int) -> int: 83 | """ 84 | Decrement a non-negative integer by 1, ensuring the result is non-negative. 85 | 86 | Parameters 87 | ---------- 88 | verbose : int 89 | A non-negative integer to decrement. 90 | 91 | Returns 92 | ------- 93 | int 94 | The decremented value, ensuring it is non-negative. 95 | """ 96 | return max(0, verbose-1) 97 | 98 | ################################# 99 | ## weighted ave 100 | ################################# 101 | 102 | # reviewed 103 | def weighted_mean_cluster(X: np.ndarray, weights: np.ndarray) -> np.ndarray: 104 | """ 105 | Compute the weighted sample average for each cluster. `X` can be a 1D or 2D array. 106 | If the total weights sum to zero (indicating no observation), return `np.nan`. 107 | No `np.inf` will appear in the result. 108 | 109 | Parameters 110 | ---------- 111 | X : ndarray of shape (n_s,) or (n_s, n_f) 112 | The data matrix, where `n_s` is the number of samples and `n_f` is the number of features. 113 | 114 | weights : ndarray of shape (n_s, n_c) 115 | The weight array for each sample and cluster. Must be all non-negative. Support for 116 | `weights` of shape (n_s,) can be added later if needed. 117 | 118 | Returns 119 | ------- 120 | ndarray of shape (n_c,) or (n_c, n_f) 121 | The weighted mean for each cluster. 122 | """ 123 | # valid X 124 | assert X.ndim in [1, 2] # (n_s,) or (n_s, n_f) 125 | X_2d = check_2d_array(X, assert_na=False) # (n_s, n_f) 126 | # valid weights 127 | weights = check_2d_array(weights, assert_na=False) # (n_s, n_c) 128 | assert len(X_2d) == len(weights) 129 | assert (weights >= 0).all() 130 | # 131 | weighted_sum = weights.T @ X_2d # (n_c, n_f) 132 | Ns = weights.sum(axis=0, keepdims=True).T # (n_c, 1) 133 | means_ = weighted_sum / Ns # (n_c, n_f) 134 | if X.ndim == 1: means_ = means_.squeeze() 135 | return means_ # (n_c,) or (n_c, n_f) 136 | 137 | # reviewed 138 | def weighted_mean_std_cluster(X: np.ndarray, weights: np.ndarray, bias=False) -> np.ndarray: 139 | """ 140 | Compute the weighted means and standard deviations for each cluster. 141 | 142 | In extreme cases leading to NaNs (otherwise, all values are normal): 143 | - No observation: both `var_` and `factor` will be NaNs, and standard deviation will also be NaN. 144 | - Only one observation: `var_` will be zero, while `factor` will be `np.inf`. When considering the debiasing 145 | factor, this results in NaN standard deviations. 146 | 147 | Parameters 148 | ---------- 149 | X : ndarray of shape (n_s,) or (n_s, n_f) 150 | The data matrix, where `n_s` is the number of samples and `n_f` is the number of features. 151 | 152 | weights : ndarray of shape (n_s, n_c) 153 | The weight array for each sample and cluster. Must be all non-negative. 154 | 155 | bias : bool, optional (default=False) 156 | If False, apply a debiasing factor to the variance calculation. 157 | 158 | Returns 159 | ------- 160 | means_ : ndarray of shape (n_c,) or (n_c, n_f) 161 | The weighted mean for each cluster. 162 | 163 | stds_ : ndarray of shape (n_c,) or (n_c, n_f) 164 | The weighted standard deviation for each cluster. 
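
    Examples
    --------
    A minimal sanity check with hard (0/1) weights and two samples per cluster;
    the debiased standard deviations match ``np.std(..., ddof=1)`` within each cluster:

    >>> X = np.array([0., 1., 2., 3.])
    >>> w = np.array([[1., 0.], [1., 0.], [0., 1.], [0., 1.]])
    >>> means_, stds_ = weighted_mean_std_cluster(X, w)
    >>> means_.tolist()
    [0.5, 2.5]
    >>> np.round(stds_, 4).tolist()
    [0.7071, 0.7071]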
165 | """ 166 | X_2d = check_2d_array(X, assert_na=False) # (n_s, n_f) 167 | means_ = weighted_mean_cluster(X_2d, weights) # (n_c, n_f) 168 | sq_means_ = weighted_mean_cluster(X_2d ** 2, weights) # (n_c, n_f) 169 | var_ = sq_means_ - means_ ** 2 # (n_c, n_f) 170 | if not bias: # debiase factor, see: https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights 171 | V1 = weights.sum(axis=0, keepdims=True) # (1, n_c) 172 | V2 = (weights**2).sum(axis=0, keepdims=True) # (1, n_c) 173 | factor = 1. / (1. - V2/V1**2) # (1, n_c) 174 | factor = factor.T # (n_c, 1) 175 | var_ *= factor # (n_c, n_f) 176 | stds_ = np.sqrt(var_) # (n_c, n_f) 177 | if X.ndim == 1: 178 | return means_.squeeze(), stds_.squeeze() 179 | return means_, stds_ 180 | -------------------------------------------------------------------------------- /jumpmodels/utils/cluster.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for numerical calculations in clustering analysis. 3 | 4 | This module provides functions to handle clustering-related tasks such as label validation, 5 | probability conversion, and transition matrix computation. 6 | 7 | Depends on 8 | ---------- 9 | utils.validation : Module 10 | """ 11 | 12 | from .validation import * 13 | 14 | # reviewed 15 | def is_valid_labels(labels_: SER_ARR_TYPE, n_c: int = 2) -> bool: 16 | """ 17 | Check whether a label array/series is a valid label sequence. The values of `labels_` must 18 | lie in the set {0, 1, ..., n_c-1}. 19 | 20 | Parameters 21 | ---------- 22 | labels_ : ndarray or Series 23 | The array or series of labels to check. 24 | 25 | n_c : int, optional (default=2) 26 | The number of clusters. Labels must lie in {0, 1, ..., n_c-1}. 27 | 28 | Returns 29 | ------- 30 | bool 31 | True if the labels are valid, False otherwise. 32 | """ 33 | labels_arr = check_1d_array(labels_) # check whether it is intrinsically 1-d 34 | return set(labels_arr).issubset(set(range(n_c))) 35 | 36 | # reviewed 37 | def is_valid_proba(proba_: DF_ARR_TYPE) -> bool: 38 | """ 39 | Check whether a probability array/series is valid, meaning all values are non-negative 40 | and all rows sum to 1. 41 | 42 | Parameters 43 | ---------- 44 | proba_ : ndarray or DataFrame 45 | The probability matrix to check. 46 | 47 | Returns 48 | ------- 49 | bool 50 | True if the probability matrix is valid, False otherwise. 51 | """ 52 | proba_arr = check_2d_array(proba_) 53 | return (proba_arr>=0).all() and np.isclose(proba_arr.sum(axis=1), 1.).all() 54 | 55 | # reviewed 56 | def raise_labels_into_proba(labels_: np.ndarray, n_c: int) -> np.ndarray: 57 | """ 58 | Convert a discrete label array into a probability matrix. The resulting matrix corresponds 59 | to hard clustering, with 0./1. values. 60 | 61 | Parameters 62 | ---------- 63 | labels_ : ndarray of shape (n_s,) 64 | The array of integer labels. 65 | 66 | n_c : int 67 | The number of clusters. 68 | 69 | Returns 70 | ------- 71 | proba_ : ndarray of shape (n_s, n_c) 72 | The probability assignment array. 73 | """ 74 | # labels_ must be ints, and smaller than n_c 75 | # don't verify inputs, for performance consideration 76 | n_s = len(labels_) 77 | proba_ = np.zeros((n_s, n_c)) 78 | proba_[range(n_s), labels_] = 1. 79 | # assert is_valid_proba(proba_) 80 | return proba_ 81 | 82 | # reviewed 83 | def reduce_proba_to_labels(proba_: DF_ARR_TYPE) -> SER_ARR_TYPE: 84 | """ 85 | Convert a probability matrix into a label series by taking the argmax of each row. 
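
    For a DataFrame input the labels are the column names of the row-wise maxima
    (via ``idxmax``); for an ndarray they are integer column positions (via ``argmax``).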
86 | 87 | Parameters 88 | ---------- 89 | proba_ : ndarray or DataFrame 90 | The probability matrix to convert. 91 | 92 | Returns 93 | ------- 94 | labels_ : ndarray or Series 95 | The label series obtained by taking the argmax of each row. 96 | """ 97 | if is_df(proba_): return proba_.idxmax(axis=1) 98 | # arr 99 | return proba_.argmax(axis=1) 100 | 101 | # reviewed 102 | def is_map_from_left_to_right(labels_left: Optional[SER_ARR_TYPE], labels_right: Optional[SER_ARR_TYPE]) -> bool: 103 | """ 104 | Check whether the map from `labels_left` to `labels_right` is valid, meaning elements with the same label 105 | in `labels_left` must have the same label in `labels_right`. If either label array is `None`, return `False`. 106 | 107 | Parameters 108 | ---------- 109 | labels_left : ndarray or Series, optional 110 | The left-side label array. 111 | 112 | labels_right : ndarray or Series, optional 113 | The right-side label array. 114 | 115 | Returns 116 | ------- 117 | bool 118 | True if the mapping is valid, False otherwise. 119 | """ 120 | if labels_left is None or labels_right is None: 121 | return False 122 | assert len(labels_left) == len(labels_right) 123 | for label in np.unique(labels_left): 124 | if len(np.unique(labels_right[labels_left==label])) != 1: 125 | return False 126 | return True 127 | 128 | # reviewed 129 | def is_same_clustering(labels1: Optional[SER_ARR_TYPE], labels2: Optional[SER_ARR_TYPE]) -> bool: 130 | """ 131 | Check whether two clustering results are the same, under permutation. If either input is `None`, return `False`. 132 | 133 | Parameters 134 | ---------- 135 | labels1 : ndarray or Series, optional 136 | The first label array. 137 | 138 | labels2 : ndarray or Series, optional 139 | The second label array. 140 | 141 | Returns 142 | ------- 143 | bool 144 | True if the two clustering results are the same, False otherwise. 145 | """ 146 | return is_map_from_left_to_right(labels1, labels2) and is_map_from_left_to_right(labels2, labels1) 147 | 148 | # reviewed 149 | def empirical_trans_mx(labels_: SER_ARR_TYPE, n_components=2, return_counts=False) -> np.ndarray: 150 | """ 151 | Compute the empirical transition count or probability matrix from a label array/series. 152 | Probability values will be `nan` if no transition from a state is observed. 153 | 154 | Parameters 155 | ---------- 156 | labels_ : ndarray or Series 157 | The label array/series with values in {0, 1, ..., n_components - 1}, of both float/int dtype. 158 | 159 | n_components : int, optional (default=2) 160 | The number of unique labels. 161 | 162 | return_counts : bool, optional (default=False) 163 | If True, return the transition counts instead of probabilities. 164 | 165 | Returns 166 | ------- 167 | ndarray 168 | The transition count or probability matrix. 169 | """ 170 | assert is_valid_labels(labels_, n_c=n_components) 171 | labels_ = check_1d_array(labels_, dtype=int) # labels must be int type, as it will be used as arr index. 172 | # count transitions 173 | count_mx = np.zeros((n_components, n_components), dtype=int) 174 | for i in range(n_components): 175 | # the next states after label==i 176 | labels_next = labels_[1:][labels_[:-1]==i] # shift label by 1 177 | # count next states 178 | states, counts = np.unique(labels_next, return_counts=True) # states must be ints. 
179 | count_mx[i, states] = counts 180 | if return_counts: return count_mx 181 | # return probability 182 | return (1.*count_mx) / count_mx.sum(axis=1, keepdims=True) 183 | 184 | # reviewed 185 | def compute_num_shifts(labels_: SER_ARR_TYPE) -> int: 186 | """ 187 | Count the number of regime shifts in a (int) label array/series. 188 | """ 189 | labels_arr = check_1d_array(labels_) 190 | return (labels_arr[:-1]!=labels_arr[1:]).sum() 191 | -------------------------------------------------------------------------------- /jumpmodels/utils/index.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for working with the index of pandas objects, typically of type `datetime.date`. 3 | 4 | This module provides functions to filter and align the index of pandas Series 5 | and DataFrames. The functionality ensures proper handling of date-based indices and 6 | alignment of pandas objects. 7 | 8 | Depends on 9 | ---------- 10 | utils.validation : Module 11 | """ 12 | 13 | from .validation import * 14 | 15 | # reviewed 16 | def filter_date_range(obj: PD_TYPE, start_date: DATE_TYPE = None, end_date: DATE_TYPE = None) -> PD_TYPE: 17 | """ 18 | Filter a pandas Series or DataFrame with a `datetime.date` index by a specified date range. 19 | Returns a copy of the filtered object for data safety. 20 | 21 | Parameters 22 | ---------- 23 | obj : Series or DataFrame 24 | The pandas object to filter, which must have an index of dtype `datetime.date`. 25 | 26 | start_date : str, datetime.date, or None, optional 27 | The start date of the range. If `None`, no start date filter is applied. 28 | 29 | end_date : str, datetime.date, or None, optional 30 | The end date of the range. If `None`, no end date filter is applied. 31 | 32 | Returns 33 | ------- 34 | Series or DataFrame 35 | A copy of the filtered pandas object. 36 | """ 37 | assert is_ser_df(obj) 38 | start_date, end_date = check_datetime_date(start_date), check_datetime_date(end_date) 39 | if start_date is not None: obj = obj.loc[start_date:] 40 | if end_date is not None: obj = obj.loc[:end_date] 41 | return obj.copy() 42 | 43 | # reviewed 44 | def align_index(x: PD_TYPE, y: PD_TYPE) -> PD_TYPE: 45 | """ 46 | Return a subset of `x` so that its index aligns with the index of `y`. 47 | Returns a copy of the subset for data safety. 48 | 49 | Parameters 50 | ---------- 51 | x : Series or DataFrame 52 | The pandas object whose index is to be aligned with `y`. 53 | 54 | y : Series or DataFrame 55 | The pandas object whose index is used for alignment. 56 | 57 | Returns 58 | ------- 59 | Series or DataFrame 60 | A copy of `x` with its index aligned to `y`. 61 | """ 62 | return x.loc[y.index].copy() # throw error if the index is not contained 63 | 64 | # reviewed 65 | def align_x_with_y(x: NUMERICAL_OBJ_TYPE, y: NUMERICAL_OBJ_TYPE) -> NUMERICAL_OBJ_TYPE: 66 | """ 67 | Align `x` with `y`. If both `x` and `y` are pandas objects, align their indices using 68 | `align_index`. If they are not both pandas objects, assert that their lengths match. 69 | Returns a copy for data safety. 70 | 71 | Parameters 72 | ---------- 73 | x : ndarray, Series, or DataFrame 74 | The first numerical object to align. 75 | 76 | y : ndarray, Series, or DataFrame 77 | The second numerical object to align. 78 | 79 | Returns 80 | ------- 81 | ndarray, Series, or DataFrame 82 | A copy of `x`, aligned with `y`. 
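
    Examples
    --------
    A brief illustration with two pandas Series (relies on the module-level `pandas` import):

    >>> x = pd.Series([1., 2., 3.], index=["a", "b", "c"])
    >>> y = pd.Series([10., 30.], index=["a", "c"])
    >>> align_x_with_y(x, y).tolist()
    [1.0, 3.0]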
83 | """ 84 | if is_ser_df(x) and is_ser_df(y): return align_index(x, y) 85 | # not all pd objects, assert that lens match 86 | assert is_same_len(x, y), "the two input arrays should be of the same length" 87 | return x.copy() 88 | -------------------------------------------------------------------------------- /jumpmodels/utils/validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module of functions to validate input/output and parameters in functions or estimators. 3 | 4 | This module provides general validation functions and does not depend on any custom modules. 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import numbers 10 | from typing import Union, Optional, Dict 11 | import datetime 12 | 13 | # custom data types 14 | PD_TYPE = Union[pd.Series, pd.DataFrame] 15 | NUMERICAL_OBJ_TYPE = Union[np.ndarray, PD_TYPE] 16 | SER_ARR_TYPE = Union[np.ndarray, pd.Series] 17 | DF_ARR_TYPE = Union[np.ndarray, pd.DataFrame] 18 | DATE_TYPE = Optional[Union[str, datetime.date]] 19 | 20 | pd.set_option('display.width', 300) 21 | 22 | ############################### 23 | ## convert input types 24 | ############################### 25 | 26 | # reviewed 27 | def is_no_nan(obj: NUMERICAL_OBJ_TYPE) -> bool: 28 | """ 29 | Check whether an object does not contain any NaN or None values. 30 | 31 | Parameters 32 | ---------- 33 | obj : Array/Series/DataFrame 34 | The input numerical object to check. It can be a numpy array, pandas Series, 35 | or pandas DataFrame. 36 | 37 | Returns 38 | ------- 39 | bool 40 | `True` if the object does not contain any NaN or None values, `False` otherwise. 41 | """ 42 | return not pd.isna(np.asarray(obj)).any() 43 | 44 | # reviewed 45 | def valid_no_nan(obj: NUMERICAL_OBJ_TYPE): 46 | """ 47 | Assert that an object does not contain any NaN or None values. 48 | 49 | Parameters 50 | ---------- 51 | obj : Array/Series/DataFrame 52 | The input numerical object to check. It can be a numpy array, pandas Series, 53 | or pandas DataFrame. 54 | 55 | Raises 56 | ------ 57 | AssertionError 58 | If the object contains NaN or None values. 59 | """ 60 | assert is_no_nan(obj), f"input numerical object contains NaNs." 61 | return 62 | 63 | # reviewed 64 | def check_2d_array(X: NUMERICAL_OBJ_TYPE, single_col=False, dtype=None, assert_na=True) -> np.ndarray: 65 | """ 66 | Convert an array-like object into a 2D array. If the input is 1D, a new axis will be appended. 67 | Only accepts 1D and 2D inputs. If `single_col` is True, the function will assert that 68 | `X.shape[1] == 1`. The function returns a copy for data safety. 69 | 70 | Parameters 71 | ---------- 72 | X : Array/Series/DataFrame 73 | Array-like object (numpy array, pandas Series, or pandas DataFrame). Raises an exception if 74 | the dimensionality is not 1 or 2. 75 | 76 | single_col : bool, optional (default=False) 77 | If True, assert that `X.shape[1] == 1`, ensuring that the input contains only one column. 78 | 79 | dtype : data-type, optional 80 | Desired numpy data type for the returned array. 81 | 82 | assert_na : bool, optional (default=True) 83 | Whether to assert that the input `X` does not contain any NA values. 84 | 85 | Returns 86 | ------- 87 | np.ndarray 88 | A 2D numpy array. 
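
    Examples
    --------
    A quick illustration of the reshaping behavior:

    >>> check_2d_array(np.array([1., 2., 3.])).shape
    (3, 1)
    >>> check_2d_array(np.ones((4, 2))).shape
    (4, 2)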
89 | """ 90 | X = np.array(X, dtype=dtype) 91 | if X.ndim == 1: X = X[:, np.newaxis] # append new axis 92 | assert X.ndim == 2 93 | if single_col: assert X.shape[1] == 1 94 | if assert_na: valid_no_nan(X) 95 | return X 96 | 97 | # reviewed 98 | def check_1d_array(X: NUMERICAL_OBJ_TYPE, dtype=None, assert_na=True) -> np.ndarray: 99 | """ 100 | Convert an array-like object into a 1D array. The function returns a copy for data safety. 101 | 102 | Parameters 103 | ---------- 104 | X : Array/Series/DataFrame 105 | Array-like object (numpy array, pandas Series, or pandas DataFrame). Raises an exception if 106 | the dimensionality after calling `.squeeze()` is not 1. 107 | 108 | dtype : data-type, optional 109 | Desired numpy data type for the returned array. 110 | 111 | assert_na : bool, optional (default=True) 112 | Whether to assert that the input `X` does not contain any NA values. 113 | 114 | Returns 115 | ------- 116 | np.ndarray 117 | A 1D numpy array. 118 | """ 119 | X = np.array(X, dtype=dtype).squeeze() 120 | assert X.ndim == 1 121 | if assert_na: valid_no_nan(X) 122 | return X 123 | 124 | # reviewed 125 | def check_datetime_date(date: DATE_TYPE) -> Optional[datetime.date]: 126 | """ 127 | Convert a date-like object into a `datetime.date` object. If the input is `None`, 128 | return `None`. 129 | 130 | Parameters 131 | ---------- 132 | date : str, datetime.date, or None 133 | The input date-like object to be converted. Can be a string, a datetime object, 134 | or `None`. 135 | 136 | Returns 137 | ------- 138 | datetime.date or None 139 | A `datetime.date` object if the input is a valid date-like object, otherwise `None`. 140 | """ 141 | if date is None: return None 142 | return pd.Timestamp(date).date() 143 | 144 | ############################### 145 | ## binary checks 146 | ############################### 147 | 148 | # reviewed 149 | def is_ser(obj) -> bool: 150 | """ 151 | Check whether the input object is a Series. 152 | """ 153 | return isinstance(obj, pd.Series) 154 | 155 | # reviewed 156 | def is_df(obj) -> bool: 157 | """ 158 | Check whether the input object is a DataFrame. 159 | """ 160 | return isinstance(obj, pd.DataFrame) 161 | 162 | # reviewed 163 | def is_ser_df(obj) -> bool: 164 | """ 165 | Check whether the input object is a Series/DataFrame. 166 | """ 167 | return isinstance(obj, PD_TYPE) 168 | 169 | # reviewed 170 | def is_numbers(x) -> bool: 171 | """ 172 | Check whether the input is a scalar number. 173 | """ 174 | return isinstance(x, numbers.Number) 175 | 176 | # reviewed 177 | def is_same_len(*args) -> bool: 178 | """ 179 | Check whether all input arguments have the same length. 180 | 181 | Parameters 182 | ---------- 183 | *args : iterable 184 | Variable number of input iterables (e.g., lists, arrays, or other iterable objects). 185 | 186 | Returns 187 | ------- 188 | bool 189 | `True` if all input arguments have the same length, `False` otherwise. 190 | """ 191 | return len(set(len(x) for x in args)) == 1 192 | 193 | # reviewed 194 | def is_same_index(*args) -> bool: 195 | """ 196 | Check whether the index of all input pandas Series or DataFrames are exactly the same. 197 | This function is typically used to verify if the date indices of different Series/DataFrames 198 | align with each other. 199 | 200 | Parameters 201 | ---------- 202 | *args : Series or DataFrame 203 | Variable number of pandas Series or DataFrame objects whose indices are to be compared. 
204 | 205 | Returns 206 | ------- 207 | bool 208 | `True` if all input Series/DataFrames have the same index, `False` otherwise. 209 | """ 210 | assert is_same_len(*args) 211 | index_this = None 212 | for item in args: 213 | # assert is_ser_df(item) 214 | if index_this is None: # the first item 215 | index_this = item.index 216 | continue 217 | index_that = item.index 218 | if not (index_this==index_that).all(): 219 | return False 220 | return True 221 | 222 | ############################### 223 | ## output cast in pd types 224 | ############################### 225 | 226 | # reviewed 227 | def getattr_(obj: object, key: Optional[str]): 228 | """ 229 | Retrieve the attribute `key` from the object `obj`. If `key` is `None`, or the object 230 | does not have the attribute `key`, return `None`. 231 | 232 | Parameters 233 | ---------- 234 | obj : object 235 | The object from which to retrieve the attribute. 236 | 237 | key : str, optional 238 | The name of the attribute to retrieve. If `None`, the function returns `None`. 239 | 240 | Returns 241 | ------- 242 | any or None 243 | The value of the attribute if it exists, otherwise `None`. 244 | """ 245 | if key is not None and hasattr(obj, key): 246 | return getattr(obj, key) 247 | else: 248 | return None 249 | 250 | # reviewed 251 | def raise_arr_to_pd_obj(arr: np.ndarray, pd_obj: NUMERICAL_OBJ_TYPE, index_key="index", columns_key="columns", return_as_ser=True) -> NUMERICAL_OBJ_TYPE: 252 | """ 253 | Convert a numpy array into a pandas Series or DataFrame, using the index and columns 254 | attributes of `pd_obj` for labeling. If `pd_obj` is not a pandas object, the function 255 | returns the array unchanged. 256 | 257 | Parameters 258 | ---------- 259 | arr : np.ndarray 260 | The array to be converted into a pandas Series or DataFrame. 261 | 262 | pd_obj : Series, DataFrame, or array-like 263 | The pandas object from which to extract the index and columns for the new pandas object. 264 | 265 | index_key : str, optional (default="index") 266 | The attribute name for retrieving the index of the output from `pd_obj`. 267 | 268 | columns_key : str, optional (default="columns") 269 | The attribute name for retrieving the columns of the output from `pd_obj`. 270 | Only useful if the parameter `return_as_ser` is set to `False`. 271 | 272 | return_as_ser : bool, optional (default=True) 273 | If `True`, the function returns a pandas Series using only the index. 274 | If `False`, it returns a pandas DataFrame using both the index and columns. 275 | 276 | Returns 277 | ------- 278 | Series, DataFrame, or np.ndarray 279 | A pandas Series or DataFrame with index and columns matching those of `pd_obj`, 280 | or the original numpy array if `pd_obj` is not a pandas object. 281 | """ 282 | if not is_ser_df(pd_obj): return arr 283 | index = getattr_(pd_obj, index_key) 284 | columns = getattr_(pd_obj, columns_key) 285 | if return_as_ser: return pd.Series(arr, index=index) 286 | return pd.DataFrame(arr, index=index, columns=columns) 287 | 288 | ############################### 289 | ## file i/o 290 | ############################### 291 | 292 | import os 293 | 294 | # reviewed 295 | def check_dir_exist(filepath): 296 | """ 297 | Check whether the directory of the specified file path exists. If it does not exist, 298 | create the directory. Handles potential race conditions where multiple processes may 299 | attempt to create the directory simultaneously. 
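
    Only the parent directory of `filepath` is created; the file itself is neither
    created nor modified.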
300 | 301 | Parameters 302 | ---------- 303 | filepath : str 304 | The file path for which the existence of the parent directory is checked. 305 | """ 306 | dirname = os.path.dirname(filepath) 307 | if dirname != "": 308 | if not os.path.exists(dirname): 309 | try: 310 | os.makedirs(dirname, exist_ok=True) 311 | print(f"Created folder: {dirname}") 312 | except FileExistsError: 313 | # The directory was created by another process between the check and creation 314 | pass 315 | return 316 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | 6 | 7 | 8 | [project] 9 | name = "jumpmodels" 10 | version = "0.1.1" 11 | authors = [ 12 | { name="Yizhan Shu", email="olivershu98@gmail.com" }, 13 | ] 14 | description = "Statistical Jump Models in Python, with scikit-learn-style APIs" 15 | readme = "README.md" 16 | requires-python = ">=3.8" 17 | dependencies = [ 18 | "numpy", 19 | "pandas", 20 | "scipy", 21 | "scikit-learn", 22 | "matplotlib", 23 | ] 24 | classifiers = [ 25 | "Programming Language :: Python :: 3", 26 | "License :: OSI Approved :: Apache Software License", 27 | "Operating System :: OS Independent", 28 | "Intended Audience :: Science/Research", 29 | "Intended Audience :: Financial and Insurance Industry", 30 | "Intended Audience :: Developers", 31 | "Topic :: Scientific/Engineering :: Mathematics", 32 | "Topic :: Scientific/Engineering :: Information Analysis", 33 | ] 34 | keywords = ["regime", "regime switching", "jump models", "clustering", "time series", "financial data"] 35 | 36 | 37 | 38 | 39 | 40 | [project.optional-dependencies] 41 | example = [ 42 | "jupyterlab", 43 | "yfinance", 44 | ] 45 | 46 | 47 | 48 | 49 | 50 | 51 | [project.urls] 52 | Homepage = "https://github.com/Yizhan-Oliver-Shu/jump-models" 53 | Issues = "https://github.com/Yizhan-Oliver-Shu/jump-models/issues" --------------------------------------------------------------------------------
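
A minimal usage sketch of the `SparseJumpModel` API defined in `jumpmodels/sparse_jump.py`. The feature matrices `X_train`/`X_test` and the return series `ret_ser` are hypothetical placeholders assumed to be prepared by the user, and the hyper-parameter values are purely illustrative:

    from jumpmodels.sparse_jump import SparseJumpModel

    # X_train, X_test: feature DataFrames; ret_ser: return Series aligned with X_train (placeholders)
    sjm = SparseJumpModel(n_components=2, max_feats=3., jump_penalty=50., cont=False)
    sjm.fit(X_train, ret_ser=ret_ser, sort_by="cumret")

    print(sjm.feat_weights)                    # learned feature weights (square root of the `w` vector)
    labels_train = sjm.labels_                 # in-sample state assignments
    labels_test = sjm.predict_online(X_test)   # online state prediction on new data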