├── .gitignore ├── JM_value_example.png ├── LICENSE ├── README.md ├── environment.yaml ├── examples └── nasdaq │ ├── data │ ├── NDX.csv │ └── NDX.pkl │ ├── example.ipynb │ ├── example.py │ ├── feature.py │ ├── get_data.py │ ├── plots │ ├── CJM_lambd-600.0_test_online.pdf │ ├── CJM_lambd-600.0_train.pdf │ ├── JM_lambd-0.0_train.pdf │ ├── JM_lambd-50.0_test_online.pdf │ ├── JM_lambd-50.0_train.pdf │ ├── SJM_lambd-50.0_max-feats-3.0_test_online.pdf │ └── SJM_lambd-50.0_max-feats-3.0_train.pdf │ └── utils_dir.py ├── jumpmodels ├── __init__.py ├── base.py ├── jump.py ├── plot.py ├── preprocess.py ├── sparse_jump.py └── utils │ ├── __init__.py │ ├── calculation.py │ ├── cluster.py │ ├── index.py │ └── validation.py └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Mac system trash 2 | .DS_Store 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 
112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 163 | #.idea/ 164 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 165 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 166 | 167 | # User-specific stuff 168 | .idea/**/workspace.xml 169 | .idea/**/tasks.xml 170 | .idea/**/usage.statistics.xml 171 | .idea/**/dictionaries 172 | .idea/**/shelf 173 | 174 | # AWS User-specific 175 | .idea/**/aws.xml 176 | 177 | # Generated files 178 | .idea/**/contentModel.xml 179 | 180 | # Sensitive or high-churn files 181 | .idea/**/dataSources/ 182 | .idea/**/dataSources.ids 183 | .idea/**/dataSources.local.xml 184 | .idea/**/sqlDataSources.xml 185 | .idea/**/dynamic.xml 186 | .idea/**/uiDesigner.xml 187 | .idea/**/dbnavigator.xml 188 | 189 | # Gradle 190 | .idea/**/gradle.xml 191 | .idea/**/libraries 192 | 193 | # Gradle and Maven with auto-import 194 | # When using Gradle or Maven with auto-import, you should exclude module files, 195 | # since they will be recreated, and may cause churn. Uncomment if using 196 | # auto-import. 197 | # .idea/artifacts 198 | # .idea/compiler.xml 199 | # .idea/jarRepositories.xml 200 | # .idea/modules.xml 201 | # .idea/*.iml 202 | # .idea/modules 203 | # *.iml 204 | # *.ipr 205 | 206 | # CMake 207 | cmake-build-*/ 208 | 209 | # Mongo Explorer plugin 210 | .idea/**/mongoSettings.xml 211 | 212 | # File-based project format 213 | *.iws 214 | 215 | # IntelliJ 216 | out/ 217 | 218 | # mpeltonen/sbt-idea plugin 219 | .idea_modules/ 220 | 221 | # JIRA plugin 222 | atlassian-ide-plugin.xml 223 | 224 | # Cursive Clojure plugin 225 | .idea/replstate.xml 226 | 227 | # SonarLint plugin 228 | .idea/sonarlint/ 229 | 230 | # Crashlytics plugin (for Android Studio and IntelliJ) 231 | com_crashlytics_export_strings.xml 232 | crashlytics.properties 233 | crashlytics-build.properties 234 | fabric.properties 235 | 236 | # Editor-based Rest Client 237 | .idea/httpRequests 238 | 239 | # Android studio 3.1+ serialized cache file 240 | .idea/caches/build_file_checksums.ser 241 | 242 | # Visual Studio temporary files, build results, and 243 | # files generated by popular Visual Studio add-ons. 
244 | 245 | # User-specific files 246 | .vs/ 247 | *.user 248 | *.userosscache 249 | *.suo 250 | *.userprefs 251 | *.dll.config 252 | *.pdb -------------------------------------------------------------------------------- /JM_value_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/JM_value_example.png -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Fitting Example](JM_value_example.png) 2 | 3 | *Note: An explanation of the application of JMs to the value factor in this figure can be found in the* [Examples](#usage-and-examples) *section.* 4 | 5 | # **jumpmodels**: Python Library for Statistical Jump Models 6 | 7 | [![JumpModels on PyPI](https://img.shields.io/pypi/v/jumpmodels.svg)](https://pypi.org/project/jumpmodels/) 8 | 9 | `jumpmodels` is a Python library offering a collection of statistical jump models (JMs), an unsupervised algorithm designed for regime identification in time series data. 10 | It includes implementations of the original discrete JM, the continuous JM (CJM), and the sparse JM (SJM) with feature selection. 11 | The library follows a [`scikit-learn`](https://github.com/scikit-learn/scikit-learn)-style API and supports `pandas` DataFrames for both input and output. 12 | 13 | 14 | 15 | --- 16 | 17 | - [Installation](#installation) 18 | - [Usage & Examples](#usage-and-examples) 19 | - [References & Citations](#references-and-citations) 20 | - [Contributing](#contributing) 21 | - [Credits & Related Repo](#credits-and-related-repo) 22 | - [License](#license) 23 | 24 | 25 | 26 | 27 | ## Installation 28 | 29 | To install the package, use the following [pip](https://pip.pypa.io/en/stable/) command: 30 | 31 | ```bash 32 | pip install jumpmodels 33 | ``` 34 | 35 | 36 | `jumpmodels` requires the following dependencies: 37 | 38 | - Python `(>=3.8)` 39 | - `numpy` 40 | - `pandas` 41 | - `scipy` 42 | - `scikit-learn` 43 | - `matplotlib` 44 | 45 | All dependencies will be installed automatically with the package. While version sensitivity is minimal, an `environment.yaml` file is provided to ensure reproducibility. 46 | 47 | To run the example notebook, you will also need the following additional dependencies: 48 | 49 | - `yfinance` 50 | - `jupyterlab` 51 | 52 | You can install these along with the package by running: 53 | 54 | ```bash 55 | pip install jumpmodels[example] 56 | ``` 57 | 58 | 59 | **Note:** In addition to the Python package dependencies listed above, the plotting functions in `plot.py` work best with LaTeX installed locally. To enable this: 60 | 61 | 1. First install a LaTeX distribution such as [MiKTeX](https://miktex.org/download). 62 | 2. Second run `pip install latex` in your Python environment. 63 | 64 | Special thanks to [@Peter](https://github.com/peter1357908) for pointing this out in [issue #3](https://github.com/Yizhan-Oliver-Shu/jump-models/issues/3). 65 | 66 | If you prefer not to install LaTeX, you can comment out the `matplotlib_setting()` function at the beginning of `plot.py`. 67 | 68 | 69 | ## Usage and Examples 70 | 71 | You can import the two core classes, `JumpModel` and `SparseJumpModel`, as follows: 72 | 73 | ```python 74 | from jumpmodels.jump import JumpModel # JM & CJM class 75 | from jumpmodels.sparse_jump import SparseJumpModel # Sparse JM class 76 | ``` 77 | 78 | We follow a `scikit-learn`-style API, with class methods such as `.fit()`, `.predict()`, `.predict_proba()`, and `.set_params()` for model fitting, state and probability prediction, and resetting model parameters. 79 | Specifically designed for time series applications, we also provide `.predict_online()` and `.predict_proba_online()` methods for online prediction. 
80 | 81 | 82 | A comprehensive demonstration of the core functionality is available in the `examples/Nasdaq/example.ipynb` notebook, which includes an analysis of the Nasdaq-100 Index using data from [Yahoo Finance](https://finance.yahoo.com/quote/%5ENDX/) (fully public source). 83 | 84 | The figure on top features an application of the sparse JM, showing the in-sample identified bull and bear market regimes for the value factor index based on its daily active returns relative to the market. 85 | Further details can be found in Shu and Mulvey (2024), as listed in the [References](#factor) section. 86 | 87 | 88 | 89 | 90 | 91 | ## References and Citations 92 | 93 | Below are articles related to the methodology and applications of JMs. 94 | If any of them assist your research, please cite the corresponding paper. 95 | 96 | ### JM Methodology 97 | 98 | - **Continuous Statistical Jump Models** (CJM): Aydınhan, A. O., Kolm, P. N., Mulvey, J. M., and Shu, Y. (2024). Identifying patterns in financial markets: Extending the statistical jump model for regime identification. *Annals of Operations Research*. To appear. [[journal](https://link.springer.com/article/10.1007/s10479-024-06035-z)] [[SSRN](https://papers.ssrn.com/abstract=4556048)] 99 | 100 | 101 | ```bibtex 102 | @article{Aydinhan2024CJM, 103 | title = {Identifying patterns in financial markets: extending the statistical jump model for regime identification}, 104 | author = {Afşar Onat Aydınhan and Petter N. Kolm and John M. Mulvey and Yizhan Shu}, 105 | journal = {Annals of Operations Research}, 106 | year = {2024}, 107 | note = {To appear}, 108 | doi = {https://doi.org/10.1007/s10479-024-06035-z}, 109 | } 110 | ``` 111 | 112 | 113 | - (Original) **Statistical Jump Models**: Nystrup, P., Lindström, E., and Madsen, H. (2020a). Learning hidden Markov models with persistent states by penalizing jumps. *Expert Systems with Applications*, 150:113307. [[journal](https://www.sciencedirect.com/science/article/abs/pii/S0957417420301329)] [[OpenAccess](https://orbit.dtu.dk/files/255194701/Learning_hidden_Markov_models_with_persistent_states_by_penalizing_jumps_ACCEPTED_ESWA.pdf)] 114 | 115 | 116 | ```bibtex 117 | @article{Nystrup2020JM, 118 | title = {Learning hidden {Markov} models with persistent states by penalizing jumps}, 119 | author = {Peter Nystrup and Erik Lindstr{\"o}m and Henrik Madsen}, 120 | journal = {Expert Systems with Applications}, 121 | year = {2020}, 122 | pages = {113307}, 123 | volume = {150}, 124 | doi = {https://doi.org/10.1016/j.eswa.2020.113307}, 125 | } 126 | ``` 127 | 128 | 129 | - **Sparse Jump Models**: Nystrup, P., Kolm, P. N., and Lindström, E. (2021). Feature selection in jump models. *Expert Systems with Applications*, 184:115558. [[journal](https://www.sciencedirect.com/science/article/pii/S0957417421009647)] [[SSRN](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3805831)] 130 | 131 | 132 | ```bibtex 133 | @article{nystrup2021SJM, 134 | title = {Feature selection in jump models}, 135 | author = {Peter Nystrup and Petter N. Kolm and Erik Lindstr{\"o}m}, 136 | journal = {Expert Systems with Applications}, 137 | volume = {184}, 138 | pages = {115558}, 139 | year = {2021}, 140 | doi = {https://doi.org/10.1016/j.eswa.2021.115558}, 141 | } 142 | ``` 143 | 144 | 145 | 146 | 147 | - **Online Inference for JMs**: Nystrup, P., Kolm, P. N., and Lindström, E. (2020b). Greedy online classification of persistent market states using realized intraday volatility features. *The Journal of Financial Data Science*, 2(3):25–39. 
[[journal](https://www.pm-research.com/content/iijjfds/2/3/25)] [[OpenAccess](https://backend.orbit.dtu.dk/ws/portalfiles/portal/242396317/Greedy_online_classification_of_persistent_market_states_using_realized_intraday_volatility_features.pdf)] 148 | 149 | ```bibtex 150 | @article{Nystrup2020onlineJM, 151 | title = {Greedy Online Classification of Persistent Market States Using Realized Intraday Volatility Features}, 152 | author = {Peter Nystrup and Petter N. Kolm and Erik Lindstr{\"o}m}, 153 | journal = {The Journal of Financial Data Science}, 154 | year = {2020}, 155 | volume = {2}, 156 | number = {3}, 157 | pages = {25--39}, 158 | doi = {https://doi.org/10.3905/jfds.2020.2.3.025}, 159 | } 160 | ``` 161 | 162 | 163 | ### JM Applications 164 | 165 | 166 | - **Downside Risk Reduction**: Shu, Y., Yu, C., and Mulvey, J. M. (2024a). Downside risk reduction using regime-switching signals: A statistical jump model approach. *Journal of Asset Management*. To appear. [[journal](https://link.springer.com/article/10.1057/s41260-024-00376-x)] [[SSRN](https://ssrn.com/abstract=4719989)] 167 | 168 | 169 | ```bibtex 170 | @article{Shu2024downside, 171 | title = {Downside Risk Reduction Using Regime-Switching Signals: A Statistical Jump Model Approach}, 172 | author = {Shu, Yizhan and Yu, Chenyu and Mulvey, John M.}, 173 | journal = {Journal of Asset Management}, 174 | year = {2024}, 175 | note = {To appear}, 176 | doi = {https://doi.org/10.1057/s41260-024-00376-x}, 177 | } 178 | ``` 179 | 180 | 181 | 182 | 183 | - **Dynamic Asset Allocation**: Shu, Y., Yu, C., and Mulvey, J. M. (2024b). Dynamic asset allocation with asset-specific regime forecasts. *Annals of Operations Research*. To appear. [[journal](https://link.springer.com/article/10.1007/s10479-024-06266-0)] [[SSRN](https://ssrn.com/abstract=4864358)] 184 | 185 | ```bibtex 186 | @article{Shu2024DAA, 187 | title = {Dynamic Asset Allocation with Asset-Specific Regime Forecasts}, 188 | author = {Shu, Yizhan and Yu, Chenyu and Mulvey, John M.}, 189 | journal = {Annals of Operations Research}, 190 | year = {2024}, 191 | note = {To appear}, 192 | doi = {https://doi.org/10.1007/s10479-024-06266-0}, 193 | } 194 | ``` 195 | 196 | 197 | 198 | 199 | - **Dynamic Factor Allocation**: Shu, Y. and Mulvey, J. M. (2024). Dynamic Factor Allocation Leveraging Regime-Switching Signals. [[SSRN](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4960484)] 200 | 201 | 202 | ```bibtex 203 | @article{Shu2024factor, 204 | title = {Dynamic Factor Allocation Leveraging Regime-Switching Signals}, 205 | author = {Shu, Yizhan and Mulvey, John M.}, 206 | journal = {SSRN}, 207 | year = {2024}, 208 | } 209 | ``` 210 | 211 | 212 | 213 | ## Contributing 214 | 215 | Pull requests and open issues are welcome. I am happy to discuss any related questions. 216 | 217 | 218 | ## Credits and Related Repo 219 | 220 | This library builds upon the open-source [code](https://www.sciencedirect.com/science/article/pii/S0957417421009647#appSB) accompanying Nystrup et al. (2021). 221 | 222 | The GitHub [Repo](https://github.com/FedericoCortese/GIC-for-SJM) by Federico P. Cortese implements the generalized information criteria (GIC) for high-dimensional SJMs, detailed in Cortese, F. P., Kolm, P. N., and Lindström, E. (2024). Generalized information criteria for high-dimensional sparse statistical jump models [[SSRN](https://papers.ssrn.com/sol3/papers.cfm?abstract_id=4774429)].
223 | 224 | The structure of this README file is inspired by the format used in [`cvxpylayers`](https://github.com/cvxgrp/cvxpylayers). 225 | 226 | 227 | 228 | 229 | 230 | ## License 231 | 232 | Our library carries an Apache 2.0 license. 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: jm-pack-base 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - brotli=1.1.0 7 | - brotli-bin=1.1.0 8 | - bzip2=1.0.8 9 | - ca-certificates=2024.8.30 10 | - certifi=2024.8.30 11 | - contourpy=1.3.0 12 | - cycler=0.12.1 13 | - fonttools=4.54.1 14 | - freetype=2.12.1 15 | - joblib=1.4.2 16 | - kiwisolver=1.4.7 17 | - lcms2=2.16 18 | - lerc=4.0.0 19 | - libblas=3.9.0 20 | - libbrotlicommon=1.1.0 21 | - libbrotlidec=1.1.0 22 | - libbrotlienc=1.1.0 23 | - libcblas=3.9.0 24 | - libcxx=19.1.1 25 | - libdeflate=1.21 26 | - libexpat=2.6.3 27 | - libffi=3.4.2 28 | - libgfortran=5.0.0 29 | - libgfortran5=13.2.0 30 | - libjpeg-turbo=3.0.0 31 | - liblapack=3.9.0 32 | - libopenblas=0.3.27 33 | - libpng=1.6.44 34 | - libsqlite=3.46.1 35 | - libtiff=4.7.0 36 | - libwebp-base=1.4.0 37 | - libxcb=1.17.0 38 | - libzlib=1.3.1 39 | - llvm-openmp=19.1.0 40 | - matplotlib=3.9.2 41 | - matplotlib-base=3.9.2 42 | - munkres=1.1.4 43 | - ncurses=6.5 44 | - numpy=2.1.1 45 | - openjpeg=2.5.2 46 | - openssl=3.3.2 47 | - packaging=24.1 48 | - pandas=2.2.3 49 | - pillow=10.4.0 50 | - pip=24.2 51 | - pthread-stubs=0.4 52 | - pyparsing=3.1.4 53 | - python=3.12.6 54 | - python-dateutil=2.9.0 55 | - python-tzdata=2024.2 56 | - python_abi=3.12 57 | - pytz=2024.1 58 | - qhull=2020.2 59 | - readline=8.2 60 | - scikit-learn=1.5.2 61 | - scipy=1.14.1 62 | - setuptools=75.1.0 63 | - six=1.16.0 64 | - threadpoolctl=3.5.0 65 | - tk=8.6.13 66 | - tornado=6.4.1 67 | - tzdata=2024a 68 | - wheel=0.44.0 69 | - xorg-libxau=1.0.11 70 | - xorg-libxdmcp=1.1.5 71 | - xz=5.2.6 72 | - zstd=1.5.6 73 | prefix: /Users/yizhan/mambaforge/envs/jm-pack-base 74 | -------------------------------------------------------------------------------- /examples/nasdaq/data/NDX.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/data/NDX.pkl -------------------------------------------------------------------------------- /examples/nasdaq/example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | from utils_dir import get_curr_dir, include_home_dir 7 | include_home_dir() 8 | 9 | import pandas as pd 10 | 11 | from jumpmodels.utils import filter_date_range # useful helpers 12 | from jumpmodels.jump import JumpModel # class of JM & CJM 13 | from jumpmodels.sparse_jump import SparseJumpModel # class of Sparse JM 14 | 15 | 16 | # # Load Data & Features 17 | # 18 | # This example demonstrates the class of *statistical jump models* (JMs) and various helper functions for regime analysis provided by our package `jumpmodels`, using an application on the Nasdaq-100 Index. 19 | # The core classes, `JumpModel` and `SparseJumpModel`, implement the original JM, continuous JM (CJM), and sparse JM (SJM) with feature selection. 
20 | # These models follow the API style used in `scikit-learn` for easy integration and efficient usage. 21 | # For detailed mathematical and algorithmic explanations of these models, please refer to the literature cited in the `README`. 22 | # 23 | # Relevant helper functions will be imported as needed throughout this example. 24 | # If running this notebook in Jupyter Lab/Notebook poses any issues, there is an exported `.py` script available in this folder for convenient execution. 25 | # 26 | 27 | # ## Raw Data 28 | # 29 | # In this example, we analyze the regimes of the Nasdaq-100 Index. 30 | # The daily index price data is retrieved from [Yahoo Finance](https://finance.yahoo.com/quote/%5ENDX/) under the ticker `NDX`. 31 | # 32 | # The data retrieval is handled in the script `get_data.py`, and the dataset is already saved in the `examples/nasdaq/data/` folder in both `csv` and `pkl` formats, so there’s no need to run `get_data.py` manually. 33 | # 34 | # We work with daily frequency data using `pandas` DataFrames, where the index is of type `datetime.date`. 35 | # This format is consistent with the convention used in the `CRSP` database. 36 | # All helper functions in this package are designed to support this type of date index. 37 | 38 | # ## Feature Engineering 39 | # 40 | # Feeding the model a robust feature set is key to the successful application of any learning algorithm. 41 | # This example uses a simple feature set consisting of nine features: the exponentially weighted moving (EWM) return, downside deviation (in log scale), and Sortino ratio, each computed with three halflife values ranging from one week (5 days) to one quarter (3 months). 42 | # 43 | # Users may need to adjust the features or halflives to suit their specific applications. 44 | # The literature referenced in the `README` offers a solid foundation for further exploration. 45 | # 46 | # The computation of these features is detailed in `feature.py` in the same folder as this example, and we use the `DataLoader` class to load both the index returns and the engineered features. 47 | # The loaded data covers the period from the start of 2007 to the end of September 2024. 48 | 49 | # In[2]: 50 | 51 | 52 | from feature import DataLoader 53 | 54 | data = DataLoader(ticker="NDX", ver="v0").load(start_date="2007-1-1", end_date="2024-09-30") 55 | 56 | print("Daily returns stored in `data.ret_ser`:", "-"*50, sep="\n") 57 | print(data.ret_ser, "-"*50, sep="\n") 58 | print("Features stored in `data.X`:", "-"*50, sep="\n") 59 | print(data.X) 60 | 61 | 62 | # ## Train/Test Split and Preprocessing 63 | # 64 | # We perform a simple time-based split: data from the beginning of 2007 to the end of 2021, covering a 15-year period, is used as the training set for fitting the JMs. 65 | # The period from 2022 to late 2024 is reserved as the test set, where we apply the trained JMs to perform online regime inference. 66 | # We use the helper function `filter_date_range` to filter the start and end dates of a DataFrame.
67 | 68 | # In[3]: 69 | 70 | 71 | train_start, test_start = "2007-1-1", "2022-1-1" 72 | # filter dates 73 | X_train = filter_date_range(data.X, start_date=train_start, end_date=test_start) 74 | X_test = filter_date_range(data.X, start_date=test_start) 75 | # print time split 76 | train_start, train_end = X_train.index[[0, -1]] 77 | test_start, test_end = X_test.index[[0, -1]] 78 | print("Training starts at:", train_start, "and ends at:", train_end) 79 | print("Testing starts at:", test_start, "and ends at:", test_end) 80 | 81 | 82 | # The module `jumpmodels.preprocess` provides two classes for preprocessing: one for standardizing and one for clipping the feature data. 83 | # We first clip the data within three standard deviations for all features and then perform standardization before feeding the data into the JMs. 84 | # Both classes are first fitted on the training data and subsequently used to transform the test data. 85 | # 86 | # These classes support both `pandas` DataFrames and `numpy` arrays as direct inputs and outputs. 87 | # We prefer to retain the DataFrame type whenever possible to preserve the date index and column labels. 88 | 89 | # In[4]: 90 | 91 | 92 | # Preprocessing 93 | from jumpmodels.preprocess import StandardScalerPD, DataClipperStd 94 | clipper = DataClipperStd(mul=3.) 95 | scaler = StandardScalerPD() 96 | # fit on training data 97 | X_train_processed = scaler.fit_transform(clipper.fit_transform(X_train)) 98 | # transform the test data 99 | X_test_processed = scaler.transform(clipper.transform(X_test)) 100 | 101 | 102 | # # Original JM 103 | 104 | # ## In-Sample Fitting 105 | # 106 | # We begin by illustrating the in-sample training of the original JM. 107 | # The model parameters are set as follows: the number of components/states/regimes is 2, the jump penalty $\lambda$ is 50.0, and `cont=False`, indicating the original discrete JM that performs hard clustering. 108 | # It is important to note that the jump penalty $\lambda$ is a crucial hyperparameter that requires tuning, either through statistical criteria or cross-validation (see references for details). 109 | # 110 | # The docstring provides comprehensive documentation of all parameters and attributes (thanks to ChatGPT). 111 | 112 | # In[5]: 113 | 114 | 115 | # set the jump penalty 116 | jump_penalty=50. 117 | # initialize the JM instance 118 | jm = JumpModel(n_components=2, jump_penalty=jump_penalty, cont=False, ) 119 | 120 | 121 | # In the `.fit()` call, we pass the return series for each period to be used for sorting the states. 122 | # We specify `sort_by="cumret"`, meaning that the state labels (0 or 1) are determined by the cumulative returns under each state. The state with higher cumulative returns is denoted as $s_t=0$ (bull market), and the state with lower returns is denoted as $s_t=1$ (bear market). 123 | # 124 | 125 | # In[6]: 126 | 127 | 128 | # call .fit() 129 | jm.fit(X_train_processed, data.ret_ser, sort_by="cumret") 130 | 131 | 132 | # The cluster centroids for each state are stored in the `centers_` attribute. 133 | # While these values are scaled, making direct interpretation hard, the bull market state is clearly characterized by higher returns, lower downside deviation, and a higher Sortino ratio, with a distinct separation between the two regimes.
134 | 135 | # In[7]: 136 | 137 | 138 | print("Scaled Cluster Centroids:", pd.DataFrame(jm.centers_, index=["Bull", "Bear"], columns=X_train.columns), sep="\n" + "-"*50 + "\n") 139 | 140 | 141 | # ### Visualization 142 | # 143 | # The `jumpmodels.plot` module provides useful functions for visualizing regime identification. 144 | # We'll use the `labels_` attribute of the JM instance, which contains integers from 0 to `n_c-1`, representing the in-sample fitted regime assignment for each period. 145 | # 146 | # From the plot, we observe that the identified regimes for the Nasdaq-100 Index successfully capture several significant market downturns, including the global financial crisis, corrections in 2012, 2015-2016, 2019, and the COVID-19 crash in 2020. 147 | # These identified regimes correspond well to shifts in market fundamentals, as interpreted in hindsight. 148 | # 149 | 150 | # In[8]: 151 | 152 | 153 | from jumpmodels.plot import plot_regimes_and_cumret, savefig_plt 154 | 155 | ax, ax2 = plot_regimes_and_cumret(jm.labels_, data.ret_ser, n_c=2, start_date=train_start, end_date=train_end, ) 156 | ax.set(title=f"In-Sample Fitted Regimes by the JM ($\\lambda$={jump_penalty})") 157 | savefig_plt(f"{get_curr_dir()}/plots/JM_lambd-{jump_penalty}_train.pdf") 158 | 159 | 160 | # ### Modifying Parameters via `set_params()` 161 | # 162 | # Our model inherits from the `BaseEstimator` class provided by `scikit-learn`, enabling a wide range of utility methods. 163 | # Among these, we highlight the `.set_params()` function, which allows users to reset any input parameters without creating a new instance. 164 | # This functionality is particularly useful when the model needs to be refitted multiple times, such as when testing different jump penalties. 165 | # 166 | # As an example, we reset the jump penalty to zero, effectively reducing the model to a baseline $k$-means clustering algorithm where temporal information is ignored. 167 | # This comparison illustrates the value of applying a jump penalty to ensure temporal consistency and reduce the occurrence of unrealistic regime shifts. 168 | 169 | # In[9]: 170 | 171 | 172 | # reset jump_penalty to zero 173 | jump_penalty=0. 174 | jm.set_params(jump_penalty=jump_penalty) 175 | print("The jump penalty of the JM instance has been reset to: jm.jump_penalty =", jm.jump_penalty) 176 | 177 | 178 | # In[10]: 179 | 180 | 181 | # refit 182 | jm.fit(X_train_processed, data.ret_ser, sort_by="cumret") 183 | 184 | # plot 185 | ax, ax2 = plot_regimes_and_cumret(jm.labels_, data.ret_ser, n_c=2, start_date=train_start, end_date=train_end, ) 186 | ax.set(title=f"In-Sample Fitted Regimes by the JM ($\\lambda$={jump_penalty})") 187 | savefig_plt(f"{get_curr_dir()}/plots/JM_lambd-{jump_penalty}_train.pdf") 188 | 189 | 190 | # ## Online Inference 191 | # 192 | # After completing the in-sample training, we apply the trained models for online inference on the test period using the `predict_online()` method. 193 | # Here, *online inference* means that the regime inference for period $t$ is based solely on the data available up to the end of that period, without using any future data. 194 | # We revert the jump penalty to a reasonable value of 50.0. 195 | # 196 | # 197 | 198 | # In[11]: 199 | 200 | 201 | # refit 202 | jump_penalty=50. 
203 | jm.set_params(jump_penalty=jump_penalty).fit(X_train_processed, data.ret_ser, sort_by="cumret") 204 | # make online inference 205 | labels_test_online = jm.predict_online(X_test_processed) 206 | 207 | 208 | # From the visualization below, we observe that the JM effectively signals the bear market in 2022, driven by aggressive interest rate hikes. 209 | # This period saw a return of over $-$15% and a significant drawdown. 210 | # However, the brief bear period captured in the second half of 2024 is followed by a strong price reversal. 211 | # This latency issue constitutes a common challenge in real-time applications of regime-switching signals. 212 | # Improving the feature set or fine-tuning the jump penalty may help address this issue. 213 | 214 | # In[12]: 215 | 216 | 217 | # plot and save 218 | ax, ax2 = plot_regimes_and_cumret(labels_test_online, data.ret_ser, n_c=2, start_date=test_start, end_date=test_end, ) 219 | ax.set(title=f"Out-of-Sample Online Inferred Regimes by the JM ($\\lambda$={jump_penalty})") 220 | savefig_plt(f"{get_curr_dir()}/plots/JM_lambd-{jump_penalty}_test_online.pdf") 221 | 222 | 223 | # In contrast to online inference, the `.predict()` method performs state decoding using all test data (i.e., from 2022 to 2024) at once. 224 | # While this approach is less realistic for trading applications, we observe that, with access to the full dataset, the model avoids the reversal in late 2024 and exits the bear signal in 2023 slightly earlier than with online inference. 225 | # 226 | # Though this approach is less applicable for real-world backtesting in financial markets, it holds potential uses in other engineering fields (such as language modeling, where access to an entire sentence is available at once.) 227 | 228 | # In[13]: 229 | 230 | 231 | # make inference using all test data 232 | labels_test = jm.predict(X_test_processed) 233 | # plot 234 | ax, ax2 = plot_regimes_and_cumret(labels_test, data.ret_ser, n_c=2, start_date=test_start, end_date=test_end, ) 235 | _ = ax.set(title=f"Out-of-Sample Predicted Regimes by the JM Using All Test Data ($\\lambda$={jump_penalty})") 236 | 237 | 238 | # # CJM: Continuous Extension of the JM 239 | # 240 | # With this, we conclude a minimal overview of the core functionality of using JMs to assign regime labels to in-sample training periods and leverage trained models for out-of-sample prediction, either through online inference or by processing all data at once. 241 | # The methods -- such as `.fit()`, `.set_params()`, and `predict_online()` -- extend seamlessly to the following JM variants: CJM and SJM. 242 | # Here, we provide brief illustrations of these extensions. 243 | 244 | # ## In-Sample Fitting 245 | # 246 | # The CJM (Continuous Jump Model) uses the same `JumpModel` class as the discrete model, with the parameter `cont=True`. 247 | # 248 | # ### Parameters 249 | # 250 | # Regarding the jump penalty value, it is typically set to be 10 times larger than the $\lambda$ used in the discrete model to achieve similar fittings, so we choose $\lambda=600.0$ here. 251 | # 252 | # Additionally, CJM introduces two specialized parameters: `mode_loss` and `grid_size`, which require more nuanced understanding. 253 | # Generally, the default values are recommended for most cases. 254 | # 255 | 256 | # In[14]: 257 | 258 | 259 | jump_penalty=600. 
260 | cjm = JumpModel(n_components=2, jump_penalty=jump_penalty, cont=True) 261 | 262 | 263 | # The `proba_` attribute of the CJM instance stores the estimated probability of each period belonging to each state. 264 | # Unlike the discrete model, where the state assignment changes abruptly, CJM offers smooth probability transitions, ranging from 0% to 100%. 265 | # This probabilistic interpretation has potential applications in many domains, especially where softer regime assignments are beneficial. 266 | 267 | # In[15]: 268 | 269 | 270 | cjm.fit(X_train_processed, data.ret_ser, sort_by="cumret") 271 | 272 | # plot 273 | ax, ax2 = plot_regimes_and_cumret(cjm.proba_, data.ret_ser, n_c=2, start_date=train_start, end_date=train_end, ) 274 | ax2.set(ylabel="Regime Probability") 275 | ax.set(title=f"In-Sample Fitted Regimes by the CJM ($\\lambda$={jump_penalty})") 276 | savefig_plt(f"{get_curr_dir()}/plots/CJM_lambd-{jump_penalty}_train.pdf") 277 | 278 | 279 | # ## Online Inference 280 | # 281 | # The `.predict_proba_online()` method allows CJM to make probabilistic regime inferences online. 282 | # From the plot, we observe that the confidence in the bear market during late 2024 doesn't fully reach 100%, potentially reducing the mislabeling issue discussed earlier. 283 | # This smoother transition in probabilities may offer better regime detection in uncertain market conditions. 284 | 285 | # In[16]: 286 | 287 | 288 | # online inference 289 | proba_test_online = cjm.predict_proba_online(X_test_processed) 290 | 291 | # plot 292 | ax, ax2 = plot_regimes_and_cumret(proba_test_online, data.ret_ser, start_date=test_start, end_date=test_end, ) 293 | ax2.set(ylabel="Regime Probability") 294 | ax.set(title=f"Out-of-Sample Online Inferred Regimes by the CJM ($\\lambda$={jump_penalty})") 295 | savefig_plt(f"{get_curr_dir()}/plots/CJM_lambd-{jump_penalty}_test_online.pdf") 296 | 297 | 298 | # # SJM: Sparse JM with Feature Selection 299 | # 300 | # Finally, the Sparse Jump Model (SJM) introduces feature weighting on top of the original JM or CJM. 301 | # Features leading to better in-sample clustering effects, as measured by variance reduction, are assigned higher weights, while a LASSO-like constraint on the weight vector ensures that noisy features receive zero weight. 302 | # 303 | # ## In-Sample Fitting 304 | # 305 | # ### Parameters 306 | # 307 | # SJM is implemented in the class `SparseJumpModel`, with an additional parameter `max_feats`, which controls the number of features included. 308 | # This parameter roughly reflects the effective number of features. (In the notation of Nystrup et al. (2021), `max_feats` corresponds to $\kappa^2$.) 309 | # 310 | # The jump penalty value is of a similar magnitude to the non-sparse model. In this case, we try `max_feats=3.` and `jump_penalty=50.` 311 | 312 | # In[17]: 313 | 314 | 315 | max_feats=3. 316 | jump_penalty=50. 317 | # init sjm instance 318 | sjm = SparseJumpModel(n_components=2, max_feats=max_feats, jump_penalty=jump_penalty, ) 319 | # fit 320 | sjm.fit(X_train_processed, ret_ser=data.ret_ser, sort_by="cumret") 321 | 322 | 323 | # The feature weights are stored in the attribute `feat_weights`. 324 | # Generally, we observe that features with longer halflives receive higher weights, indicating that less smoothed features are noisier and are excluded from the model, thanks to the feature weighting mechanism.
325 | 326 | # In[18]: 327 | 328 | 329 | print("SJM Feature Weights:", "-"*50, sjm.feat_weights, sep="\n") 330 | 331 | 332 | # A comparison of the SJM-identified regimes with those identified by JM reveals that the GFC is consolidated into a single bear regime, demonstrating that short-term noise has been effectively mitigated. 333 | 334 | # In[19]: 335 | 336 | 337 | # plot 338 | ax, ax2 = plot_regimes_and_cumret(sjm.labels_, data.ret_ser, n_c=2, start_date=train_start, end_date=train_end, ) 339 | ax.set(title=f"In-Sample Fitted Regimes by the SJM ($\\lambda$={jump_penalty}, $\\kappa^2$={max_feats})") 340 | savefig_plt(f"{get_curr_dir()}/plots/SJM_lambd-{jump_penalty}_max-feats-{max_feats}_train.pdf") 341 | 342 | 343 | # ## Online Inference 344 | # 345 | # As before, the `.predict_online()` method handles online inference. 346 | # Notably, through feature selection, the previously problematic bear market signal in late 2024 is absent in the SJM's online inference, highlighting the potential benefits of feature selection. 347 | # 348 | # 349 | 350 | # In[20]: 351 | 352 | 353 | # online inference 354 | labels_test_online_sjm = sjm.predict_online(X_test_processed) 355 | 356 | # plot 357 | ax, ax2 = plot_regimes_and_cumret(labels_test_online_sjm, data.ret_ser, start_date=test_start, end_date=test_end, ) 358 | ax.set(title=f"Out-of-Sample Online Inferred Regimes by the SJM ($\\lambda$={jump_penalty}, $\\kappa^2$={max_feats})") 359 | savefig_plt(f"{get_curr_dir()}/plots/SJM_lambd-{jump_penalty}_max-feats-{max_feats}_test_online.pdf") 360 | 361 | 362 | # # Conclusion 363 | # 364 | # This concludes the introduction to the functionalities of our `jumpmodels` library. 365 | # The field of statistical jump models is still actively evolving, with ongoing research exploring new avenues. 366 | # We hope that the models and helper functions provided in this package will be useful in your own work. 367 | # Citations and credits are always appreciated. 368 | # 369 | # We welcome pull requests and open issues, and I’m happy to discuss any related questions. 370 | 371 | # In[ ]: 372 | 373 | 374 | 375 | 376 | -------------------------------------------------------------------------------- /examples/nasdaq/feature.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for engineering the features to be input to JMs. 3 | """ 4 | 5 | from utils_dir import * 6 | include_home_dir() 7 | 8 | import numpy as np 9 | import pandas as pd 10 | from sklearn.base import BaseEstimator 11 | 12 | from jumpmodels.utils import * 13 | 14 | ############################################ 15 | ## Feature Engineering 16 | ############################################ 17 | 18 | # reviewed 19 | def compute_ewm_DD(ret_ser: pd.Series, hl: float) -> pd.Series: 20 | """ 21 | Compute the exponentially weighted moving downside deviation (DD) for a return series. 22 | 23 | The downside deviation is calculated as the square root of the exponentially 24 | weighted second moment of negative returns. 25 | 26 | Parameters 27 | ---------- 28 | ret_ser : pd.Series 29 | The input return series. 30 | 31 | hl : float 32 | The halflife parameter for the exponentially weighted moving average. 33 | 34 | Returns 35 | ------- 36 | pd.Series 37 | The exponentially weighted moving downside deviation for the return series. 38 | """ 39 | ret_ser_neg: pd.Series = np.minimum(ret_ser, 0.) 
40 | sq_mean = ret_ser_neg.pow(2).ewm(halflife=hl).mean() 41 | return np.sqrt(sq_mean) 42 | 43 | # reviewed 44 | def feature_engineer(ret_ser: pd.Series, ver: str = "v0") -> pd.DataFrame: 45 | """ 46 | Engineer a set of features based on a return series. 47 | 48 | This function customizes the feature set according to the specified version string. 49 | 50 | Parameters 51 | ---------- 52 | ret_ser : pd.Series 53 | The input return series for feature engineering. 54 | 55 | ver : str 56 | The version of feature engineering to apply. Only supports "v0". 57 | 58 | Returns 59 | ------- 60 | pd.DataFrame 61 | The engineered feature set. 62 | """ 63 | if ver == "v0": 64 | feat_dict = {} 65 | hls = [5, 20, 60] 66 | for hl in hls: 67 | # Feature 1: EWM-ret 68 | feat_dict[f"ret_{hl}"] = ret_ser.ewm(halflife=hl).mean() 69 | # Feature 2: log(EWM-DD) 70 | DD = compute_ewm_DD(ret_ser, hl) 71 | feat_dict[f"DD-log_{hl}"] = np.log(DD) 72 | # Feature 3: EWM-Sortino-ratio = EWM-ret/EWM-DD 73 | feat_dict[f"sortino_{hl}"] = feat_dict[f"ret_{hl}"].div(DD) 74 | return pd.DataFrame(feat_dict) 75 | 76 | # try out your favorite feature sets 77 | else: 78 | raise NotImplementedError() 79 | 80 | ############################################ 81 | ## DataLoader Class 82 | ############################################ 83 | 84 | class DataLoader(BaseEstimator): 85 | """ 86 | Class for loading the feature matrix. 87 | 88 | This class loads raw return data, computes features, and filters the data by date. 89 | 90 | Parameters 91 | ---------- 92 | ticker : str 93 | The ticker symbol for which to load data. Only supports "NDX". 94 | 95 | ver : str 96 | The version of the feature set to apply. Only supports "v0". 97 | 98 | Attributes 99 | ---------- 100 | X : pd.DataFrame 101 | The feature matrix. 102 | 103 | ret_ser : pd.Series 104 | The return series. 105 | """ 106 | def __init__(self, ticker: str = "NDX", ver: str = "v0"): 107 | self.ticker = ticker 108 | self.ver = ver 109 | 110 | # reviewed 111 | def load(self, start_date: DATE_TYPE = None, end_date: DATE_TYPE = None): 112 | """ 113 | Load the raw return data, compute features, and filter by date range. 114 | 115 | Parameters 116 | ---------- 117 | start_date : DATE_TYPE, optional 118 | The start date for filtering the data. If None, no start filtering is applied. 119 | 120 | end_date : DATE_TYPE, optional 121 | The end date for filtering the data. If None, no end filtering is applied. 122 | 123 | Returns 124 | ------- 125 | self 126 | The DataLoader instance with the feature matrix and return series stored in attributes. 127 | """ 128 | # load raw data 129 | curr_dir = get_curr_dir() 130 | ret_ser_raw = pd.read_pickle(f"{curr_dir}/data/{self.ticker}.pkl").ret.dropna() 131 | ret_ser_raw.name = self.ticker 132 | # features 133 | df_features_all = feature_engineer(ret_ser_raw, self.ver) 134 | 135 | # filter date 136 | X = filter_date_range(df_features_all, start_date, end_date) 137 | valid_no_nan(X) 138 | # save attributes 139 | self.X = X 140 | self.ret_ser = filter_date_range(ret_ser_raw, start_date, end_date) 141 | # save more useful attributes if needed 142 | return self 143 | -------------------------------------------------------------------------------- /examples/nasdaq/get_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script retrieves the daily closing price data for 3 | the Nasdaq-100 index from Yahoo Finance via its Python API. 
4 | 5 | Users do not need to run this script manually, as the return data 6 | is already saved in `example/Nasdaq/data/`. 7 | """ 8 | 9 | from utils_dir import * 10 | include_home_dir() 11 | 12 | import numpy as np 13 | import pandas as pd 14 | import yfinance as yf 15 | 16 | from jumpmodels.utils import check_dir_exist 17 | 18 | TICKER = "NDX" # Nasdaq-100 Index 19 | 20 | def get_data(): 21 | # download closing prices 22 | close: pd.Series = yf.download("^"+TICKER, start="1985-10-01", end="2024-09-30")['Close'] 23 | # convert to ret 24 | ret = close.pct_change() 25 | # concat as df 26 | df = pd.DataFrame({"close": close.squeeze(), "ret": ret.squeeze()}, index=close.index.date) 27 | df.index.name = "date" 28 | 29 | # save 30 | curr_dir = get_curr_dir() 31 | data_dir = f"{curr_dir}/data/"; check_dir_exist(data_dir) 32 | pd.to_pickle(df, f"{data_dir}{TICKER}.pkl") 33 | np.round(df, 6).to_csv(f"{data_dir}{TICKER}.csv") 34 | print("Successfully downloaded data for ticker:", TICKER) 35 | return 36 | 37 | if __name__ == "__main__": 38 | get_data() -------------------------------------------------------------------------------- /examples/nasdaq/plots/CJM_lambd-600.0_test_online.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/CJM_lambd-600.0_test_online.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/CJM_lambd-600.0_train.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/CJM_lambd-600.0_train.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/JM_lambd-0.0_train.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/JM_lambd-0.0_train.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/JM_lambd-50.0_test_online.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/JM_lambd-50.0_test_online.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/JM_lambd-50.0_train.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/JM_lambd-50.0_train.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/SJM_lambd-50.0_max-feats-3.0_test_online.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/SJM_lambd-50.0_max-feats-3.0_test_online.pdf -------------------------------------------------------------------------------- /examples/nasdaq/plots/SJM_lambd-50.0_max-feats-3.0_train.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Yizhan-Oliver-Shu/jump-models/d0fa00ce10126791695a259a45c5ddd41fbced80/examples/nasdaq/plots/SJM_lambd-50.0_max-feats-3.0_train.pdf -------------------------------------------------------------------------------- /examples/nasdaq/utils_dir.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for working with file directories. 3 | 4 | Useful for all scripts/notebooks in this folder. 5 | Please ensure the file structure under `example/Nasdaq` is preserved 6 | in its original form for everything to function properly. 7 | """ 8 | 9 | import sys, os 10 | 11 | def get_curr_dir(): 12 | """ 13 | Return the current directory of this `get_data.py` file. 14 | """ 15 | return os.path.dirname(os.path.abspath(__file__)) 16 | 17 | def include_home_dir(): 18 | """ 19 | Add the project's home directory to `sys.path`. 20 | 21 | This function ensures that the home directory of the project is included in 22 | `sys.path` to allow imports from other parts of the project. For this to work 23 | correctly, the script must be placed in the `example/Nasdaq/` folder. 24 | """ 25 | curr_dir = get_curr_dir() 26 | home_dir = os.path.dirname(os.path.dirname(curr_dir)) 27 | sys.path.append(home_dir) 28 | return -------------------------------------------------------------------------------- /jumpmodels/__init__.py: -------------------------------------------------------------------------------- 1 | # global constants 2 | RANDOM_STATE = 0 -------------------------------------------------------------------------------- /jumpmodels/base.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for the base class used in clustering-like algorithms. 3 | 4 | This module provides helpers for parameter sorting, parameter initialization, and base class 5 | definitions for clustering-like algorithms. 6 | 7 | Depends on 8 | ---------- 9 | utils/ : Modules 10 | """ 11 | 12 | from .utils import * 13 | 14 | from sklearn.base import BaseEstimator 15 | from sklearn.utils import check_random_state 16 | from sklearn.cluster import kmeans_plusplus 17 | 18 | ################################## 19 | # Sorting 20 | ################################## 21 | 22 | # reviewed 23 | def sort_param_dict_from_idx(params: dict, idx: np.ndarray) -> None: 24 | """ 25 | Sort a dictionary of parameters according to a given index array. 26 | 27 | Expected parameter shapes: 28 | - `ret_` : (n_c,) 29 | - `vol_` : (n_c,) 30 | - `means_` : (n_c, n_f) 31 | - `centers_` : (n_c, n_f) 32 | - `transmat_` : (n_c, n_c) 33 | - `startprob_` : (n_c,) 34 | - `proba_` : (n_s, n_c) 35 | - `covars_` : (n_c, 1) 36 | 37 | Parameters 38 | ---------- 39 | params : dict 40 | A dictionary of parameters, each corresponding to a clustering result. 41 | 42 | idx : ndarray of shape (n_c,) 43 | The index array to sort the parameters by. 44 | """ 45 | # permute `axis=0` 46 | for key in ['ret_', 'vol_', 'means_', 'centers_', 'startprob_', 'covars_']: 47 | if key in params: params[key] = params[key][idx] 48 | # transmat, need to permute both `axis=0 & 1` 49 | if 'transmat_' in params: params['transmat_'] = params['transmat_'][idx][:, idx] 50 | # proba, need to permute `axis=1` 51 | if 'proba_' in params: params['proba_'] = params['proba_'][:, idx] 52 | return 53 | 54 | # reviewed 55 | def sort_param_dict(params: dict, sort_by='ret') -> None: 56 | """ 57 | Sort the states by a given criterion and permute all parameters accordingly. 
58 | Supported sorting criteria are ["cumret", "vol", "freq", "ret"], i.e. 59 | states sorted by decreasing (cumulative) return, increasing vol, decreasing frequency. 60 | 61 | `nan` values will be (ideally) sorted to the end. 62 | 63 | Parameters 64 | ---------- 65 | params : dict 66 | A dictionary of parameters, each corresponding to a clustering result. 67 | 68 | sort_by : str, optional (default='ret') 69 | The criterion to sort the parameters by. Must be one of ["cumret", "vol", "freq", "ret"]. 70 | """ 71 | if sort_by is None: return 72 | assert sort_by in ["cumret", "vol", "freq", "ret"] 73 | if "proba_" in params: freq = params["proba_"].sum(axis=0) 74 | if sort_by == 'vol': 75 | assert 'vol_' in params 76 | criterion = params['vol_'] 77 | elif sort_by == "cumret": 78 | assert "ret_" in params and "proba_" in params 79 | criterion = -params["ret_"] * freq # missing regimes will have a cumret of nan*0 = nan 80 | elif sort_by == "ret": 81 | assert "ret_" in params 82 | criterion = -params['ret_'] 83 | elif sort_by == "freq": 84 | assert "proba_" in params 85 | criterion = -freq # decreasing freq 86 | else: 87 | raise NotImplementedError() 88 | criterion = replace_inf_by_nan(criterion) 89 | idx = np.argsort(criterion) 90 | sort_param_dict_from_idx(params, idx) 91 | return 92 | 93 | # reviewed 94 | def align_and_check_ret_ser(ret_ser: SER_ARR_TYPE, X: DF_ARR_TYPE) -> np.ndarray: 95 | """ 96 | Align a return series with the input data matrix `X`, 97 | and convert it to a 1D array. 98 | 99 | Parameters 100 | ---------- 101 | ret_ser : Series or ndarray 102 | The return series to validate. 103 | 104 | X : DataFrame or ndarray 105 | The data matrix to align with. 106 | 107 | Returns 108 | ------- 109 | ndarray 110 | The aligned and validated 1D return array. 111 | """ 112 | ret_ser = align_x_with_y(ret_ser, X) 113 | return check_1d_array(ret_ser) 114 | 115 | # reviewed 116 | def sort_states_from_ret(ret_ser: Optional[SER_ARR_TYPE], 117 | X: DF_ARR_TYPE, 118 | best_res: dict, 119 | sort_by: str = "cumret") -> None: 120 | """ 121 | Sort the states in the fitted parameters stored in a dictionary according to a specified criterion. 122 | This is intended for financial applications. If not applicable, input `None` for `ret_ser`. 123 | 124 | Parameters 125 | ---------- 126 | ret_ser : Series or ndarray, optional 127 | The return series to use for computing average return and volatility within each state. 128 | If `None`, sorting is attempted by decreasing frequency (given that the `proba_` param is estimated). 129 | 130 | X : DataFrame or ndarray 131 | The data matrix to use for alignment. 132 | 133 | best_res : dict 134 | Fitted parameters of the best clustering results to sort. 135 | 136 | sort_by : str, optional (default="cumret") 137 | The criterion to use for sorting. Must be one of ["cumret", "vol", "freq", "ret"]. 138 | 139 | - If `ret_ser` is provided, it is used to compute the mean return (`ret_`) and volatility (`vol_`) 140 | within each state. Sorting by decreasing (cumulative) return and increasing volatility is possible. 141 | - If `ret_ser` is `None`, sort by frequency if the `proba_` attribute exists, otherwise 142 | don't sort anything. 
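    For intuition, the underlying `sort_param_dict` helper applies one consistent
    permutation to every fitted parameter; a minimal sketch with two hypothetical states:

        >>> params = {'ret_': np.array([0.01, -0.02]), 'vol_': np.array([0.2, 0.1])}
        >>> sort_param_dict(params, sort_by='vol')   # ascending volatility
        >>> params['vol_']
        array([0.1, 0.2])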
143 | """ 144 | if ret_ser is not None: 145 | # valid inputs 146 | ret_ser_arr = align_and_check_ret_ser(ret_ser, X) 147 | # compute mean & vol for each cluster 148 | best_res['ret_'], best_res['vol_'] = weighted_mean_std_cluster(ret_ser_arr, best_res['proba_']) 149 | # the best parameters sorted by a criterion 150 | sort_param_dict(best_res, sort_by=sort_by) 151 | elif "proba_" in best_res: 152 | sort_param_dict(best_res, sort_by="freq") 153 | return 154 | 155 | ################################## 156 | # Initialization 157 | ################################## 158 | 159 | # reviewed 160 | def init_centers_kmeans_plusplus(X: np.ndarray, n_c=2, n_init=10, random_state=None) -> list[np.ndarray]: 161 | """ 162 | Initialize the cluster centers using the K-Means++ algorithm, repeated `n_init` times. 163 | 164 | Parameters 165 | ---------- 166 | X : ndarray of shape (n_s, n_f) 167 | The data matrix. 168 | 169 | n_c : int, optional (default=2) 170 | The number of clusters. 171 | 172 | n_init : int, optional (default=10) 173 | The number of initializations to perform. 174 | 175 | random_state : int, RandomState instance, or None, optional (default=None) 176 | Controls the randomness of the center initialization. 177 | 178 | Returns 179 | ------- 180 | centers : list of ndarray 181 | A list of initialized centers for each run. 182 | """ 183 | random_state = check_random_state(random_state) 184 | centers = [kmeans_plusplus(X, n_c, random_state=random_state)[0] for _ in range(n_init)] 185 | return centers # (n_init, n_c, n_f) 186 | 187 | ################################## 188 | # Base Class 189 | ################################## 190 | 191 | class BaseClusteringAlgo(BaseEstimator): 192 | """ 193 | A base class for all clustering-like algorithms. 194 | 195 | This class provides several common methods but does not include any model fitting logic. 196 | It is intended to be inherited with specific implementations. 197 | 198 | Parameters 199 | ---------- 200 | n_components : int 201 | The number of components (clusters). 202 | 203 | n_init : int 204 | The number of initializations to perform. 205 | 206 | max_iter : int 207 | The maximum number of iterations. 208 | 209 | tol : float 210 | The tolerance for convergence. 211 | 212 | random_state : int, RandomState instance, or None 213 | Controls the randomness. 214 | 215 | verbose : int 216 | Controls the verbosity of the output. 217 | """ 218 | # reviewed 219 | def __init__(self, 220 | n_components, 221 | n_init, 222 | max_iter, 223 | tol, 224 | random_state, 225 | verbose 226 | ) -> None: 227 | self.n_components = n_components 228 | self.n_init = n_init 229 | self.max_iter = max_iter 230 | self.tol = tol 231 | self.random_state = random_state 232 | self.verbose = verbose 233 | 234 | # reviewed 235 | def is_shape_match_X_centers(self, X: DF_ARR_TYPE) -> bool: 236 | """ 237 | Check whether the shape of `X` and `centers_` matches. Useful for `predict` methods. 238 | `self` must already has the attribute `centers_`. 239 | 240 | Parameters 241 | ---------- 242 | X : DataFrame or ndarray 243 | The input data matrix. 244 | 245 | Returns 246 | ------- 247 | bool 248 | True if the shapes match, False otherwise. 249 | """ 250 | n_f = X.shape[1] 251 | return self.centers_.shape == (self.n_components, n_f) 252 | 253 | # reviewed 254 | def init_centers(self, X: np.ndarray) -> np.ndarray: 255 | """ 256 | Initialize the centers using k-Means++ for multiple initializations. 
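        Each initialization is drawn independently via `sklearn.cluster.kmeans_plusplus`;
        a minimal sketch on random data (shapes are the only thing being illustrated):

            >>> X = np.random.default_rng(0).normal(size=(100, 3))
            >>> inits = init_centers_kmeans_plusplus(X, n_c=2, n_init=5, random_state=0)
            >>> len(inits), inits[0].shape
            (5, (2, 3))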
257 | If attribute `centers_` exists and matches the shape of `X`, it will also 258 | be included as an initial value. 259 | 260 | Parameters 261 | ---------- 262 | X : ndarray of shape (n_s, n_f) 263 | The input data matrix. 264 | 265 | Returns 266 | ------- 267 | centers : ndarray 268 | The initialized centers for each run. 269 | """ 270 | centers = init_centers_kmeans_plusplus(X, self.n_components, self.n_init, self.random_state) 271 | if hasattr(self, "centers_") and self.is_shape_match_X_centers(X): 272 | centers.append(self.centers_) # use previously fitted value as one initial center value 273 | return np.array(centers) 274 | 275 | # reviewed 276 | def check_X_predict_func(self, X: DF_ARR_TYPE) -> np.ndarray: 277 | """ 278 | Check the input data matrix for `.predict` methods, ensuring it is a 2D array and 279 | matches the shape of `centers_`. 280 | 281 | Parameters 282 | ---------- 283 | X : DataFrame or ndarray 284 | The input data matrix. 285 | 286 | Returns 287 | ------- 288 | ndarray 289 | The validated 2D data array. 290 | """ 291 | X_arr = check_2d_array(X) 292 | assert self.is_shape_match_X_centers(X_arr) 293 | return X_arr 294 | -------------------------------------------------------------------------------- /jumpmodels/jump.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for statistical jump models (JMs) and continuous jump models (CJMs). 3 | 4 | This module provides utilities and helper functions for implementing and working 5 | with jump models and their continuous variants. 6 | 7 | Depends on 8 | ---------- 9 | utils/ : Modules 10 | Utility functions for validation and clustering operations. 11 | base : Module 12 | Base class for clustering-like algorithms. 13 | """ 14 | 15 | from itertools import product 16 | from scipy.spatial.distance import cdist 17 | from scipy.special import logsumexp 18 | 19 | from . import RANDOM_STATE 20 | from .utils import * 21 | from .base import * 22 | 23 | ################################# 24 | ## model helpers 25 | ################################# 26 | 27 | # reviewed 28 | def jump_penalty_to_mx(jump_penalty: float, n_c: int) -> np.ndarray: 29 | """ 30 | Convert a scalar jump penalty into a penalty matrix. 31 | 32 | Parameters 33 | ---------- 34 | jump_penalty : float 35 | The scalar value representing the jump penalty. 36 | 37 | n_c : int 38 | The number of clusters or components. 39 | 40 | Returns 41 | ------- 42 | np.ndarray 43 | A matrix of shape (n_c, n_c) where off-diagonal elements are the penalty values 44 | and diagonal elements are zero. 45 | """ 46 | # assert is_numbers(jump_penalty) 47 | return jump_penalty * (np.ones((n_c, n_c)) - np.eye(n_c)) # default dtype is float 48 | 49 | # reviewed 50 | def discretize_prob_simplex(n_c: int, grid_size: float) -> np.ndarray: 51 | """ 52 | Sample grid points on a probability simplex. This function generates all possible 53 | combinations of probabilities that sum to 1, given the grid size. 54 | NB: this operation is of combinatorial complexity. 55 | 56 | Parameters 57 | ---------- 58 | n_c : int 59 | The number of components or clusters. 60 | 61 | grid_size : float 62 | The step size for discretization of the simplex. 63 | 64 | Returns 65 | ------- 66 | np.ndarray 67 | An array of shape (n_candidates, n_c), where each row represents a point on the 68 | simplex. The number of candidates depends on the grid size. 
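        For instance, with two states and a coarse grid, the rows run from all mass
        on the first state to all mass on the last (a quick sketch):

            >>> discretize_prob_simplex(2, 0.5)
            array([[1. , 0. ],
                   [0.5, 0.5],
                   [0. , 1. ]])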
69 | """ 70 | N = int(1/grid_size) 71 | tuples = filter(lambda x: sum(x)==N, product(range(N+1), repeat = n_c)) 72 | lst = np.array(list(tuples)[::-1], dtype=float)/N # (n_candidates, n_c) 73 | return lst 74 | 75 | ################################# 76 | ## DP algo & E step 77 | ################################# 78 | 79 | # reviewed 80 | def dp(loss_mx: np.ndarray, 81 | penalty_mx: np.ndarray, 82 | return_value_mx: bool = False) -> Union[tuple[np.ndarray, float], np.ndarray]: 83 | r""" 84 | Solve the optimization problem involved in the E-step calculation (state assignment), 85 | using a dynamic programming (DP) algorithm. 86 | 87 | The objective is to minimize: 88 | 89 | $$\min \sum_{t=0}^{T-1} L(t, s_t) + \sum_{t=1}^{T-1} \Lambda(s_{t-1}, s_t).$$ 90 | 91 | If some columns of `loss_mx` contain `NaN` values, they are replaced with `inf`, 92 | making those clusters unreachable. 93 | 94 | Note: The DP algorithm cannot be easily sped up using Numba due to issues with 95 | `.min(axis=0)` in Numba. 96 | 97 | Parameters 98 | ---------- 99 | loss_mx : ndarray of shape (n_s, n_c) 100 | The loss matrix, where `L(t, k)` represents the loss for time `t` and state `k`. 101 | 102 | penalty_mx : ndarray of shape (n_c, n_c) 103 | The jump penalty matrix between states. 104 | 105 | return_value_mx : bool, optional (default=False) 106 | If `True`, compute and return the value matrix from the DP algorithm. The value at 107 | each time step `t` is based on all information up to that point, making it suitable 108 | for online inference. 109 | 110 | Returns 111 | ------- 112 | tuple[np.ndarray, float] or np.ndarray 113 | If `return_value_mx` is `False`, returns a tuple containing: 114 | - The optimal state assignments. 115 | - The optimal loss function value. 116 | 117 | If `return_value_mx` is `True`, returns the value matrix. 118 | """ 119 | # valid shape 120 | n_s, n_c = loss_mx.shape 121 | assert penalty_mx.shape == (n_c, n_c) 122 | # replace nan by inf 123 | loss_mx = replace_nan_by_inf(loss_mx) 124 | # DP algo 125 | values, assign = np.empty((n_s, n_c)), np.empty(n_s, dtype=int) 126 | # initial 127 | values[0] = loss_mx[0] 128 | # DP iteration 129 | for t in range(1, n_s): 130 | values[t] = loss_mx[t] + (values[t-1][:, np.newaxis] + penalty_mx).min(axis=0) # values[t-1][:, np.newaxis] turns the (t-1)-th row into a column 131 | # 132 | if return_value_mx: 133 | return values 134 | # find optimal path backwards 135 | assign[-1] = values[-1].argmin() 136 | value_opt = values[-1, assign[-1]] 137 | # traceback 138 | for t in range(n_s - 1, 0, -1): 139 | assign[t-1] = (values[t-1] + penalty_mx[:, assign[t]]).argmin() 140 | return assign, value_opt 141 | 142 | # reviewed 143 | def raise_JM_labels_to_proba(labels_: np.ndarray, n_c: int, prob_vecs: Optional[np.ndarray] = None) -> np.ndarray: 144 | """ 145 | Convert JM labels into a probability matrix. If `prob_vecs` is provided, 146 | the probability matrix is constructed using the probability vectors corresponding to each label. 147 | Otherwise, a hard-clustering probability matrix is created from the labels. 148 | """ 149 | return prob_vecs[labels_] if prob_vecs is not None else raise_labels_into_proba(labels_, n_c) 150 | 151 | # reviewed 152 | def raise_JM_proba_to_df(proba_: np.ndarray, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 153 | """ 154 | Convert a probability matrix into a pandas DataFrame, aligning with the index of the input 155 | data matrix `X`. 
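    As a sanity check of the DP solver `dp` defined above (hypothetical losses): a large
    jump penalty freezes the state path, while a zero penalty reduces to the per-period argmin.

        >>> loss = np.array([[0., 1.], [1., 0.], [1., 0.]])
        >>> dp(loss, jump_penalty_to_mx(10., 2))[0]   # switching is too expensive
        array([1, 1, 1])
        >>> dp(loss, jump_penalty_to_mx(0., 2))[0]    # no penalty: row-wise argmin
        array([0, 1, 1])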
156 | """ 157 | return raise_arr_to_pd_obj(proba_, X, columns_key=None, return_as_ser=False) 158 | 159 | LARGE_FLOAT = 1e100 160 | 161 | # reviewed 162 | def do_E_step(X: np.ndarray, 163 | centers_: np.ndarray, 164 | penalty_mx: np.ndarray, 165 | prob_vecs: Optional[np.ndarray] = None, 166 | return_value_mx: bool = False) -> Union[tuple[np.ndarray, np.ndarray, float], np.ndarray]: 167 | """ 168 | Perform a single E-step: compute the loss matrix and calling the solver. 169 | 170 | This function handles both hard clustering and continuous models. The `centers_` parameter 171 | can contain `NaN` values. It returns the probabilities, labels, and optimal value, where 172 | `labels_` correspond to the state space. 173 | 174 | Parameters 175 | ---------- 176 | X : ndarray of shape (n_s, n_f) 177 | The input data matrix, where `n_s` is the number of samples and `n_f` is the number of features. 178 | 179 | centers_ : ndarray of shape (n_c, n_f) 180 | The cluster centers. Can contain `NaN` values. 181 | 182 | penalty_mx : ndarray of shape (n_c, n_c) 183 | The penalty matrix representing the transition cost between states. 184 | 185 | prob_vecs : ndarray of shape (N, n_c), optional 186 | Probability vectors for the continuous model. If provided, this adjusts the loss matrix. 187 | 188 | return_value_mx : bool, optional (default=False) 189 | If `True`, return the value matrix from the DP algorithm, which can be used for online inference. 190 | 191 | Returns 192 | ------- 193 | tuple[np.ndarray, np.ndarray, float] or np.ndarray 194 | If `return_value_mx` is `False`, returns a tuple containing: 195 | - `proba_` : ndarray of shape (n_s, n_c) 196 | The probability matrix, where each row corresponds to the probabilities for a sample. 197 | - `labels_` : ndarray of shape (n_s,) 198 | The state labels assigned to each sample. 199 | - `val_` : float 200 | The optimal value of the objective function. 201 | 202 | If `return_value_mx` is `True`, returns the value matrix instead of the tuple. 203 | """ 204 | n_c = len(centers_) # (n_c, n_f) 205 | # compute loss matrix 206 | loss_mx = .5 * cdist(X, centers_, "sqeuclidean") # (n_s, n_c) 207 | # contain `nan` if `centers_` contains `nan`. 208 | if prob_vecs is not None: # cont model, (N, n_c) 209 | # replace the nan in loss_mx by a very large floating number 210 | loss_mx = np.nan_to_num(loss_mx, nan=LARGE_FLOAT, posinf=LARGE_FLOAT, neginf=LARGE_FLOAT) 211 | loss_mx = loss_mx @ prob_vecs.T # each pair of loss between period t and candidate vector, (n_s, N) 212 | if return_value_mx: return dp(loss_mx, penalty_mx, return_value_mx=True) 213 | # do a full E step 214 | labels_, val_ = dp(loss_mx, penalty_mx, return_value_mx=False) # output labels_ is of type int 215 | proba_ = raise_JM_labels_to_proba(labels_, n_c, prob_vecs) 216 | return proba_, labels_, val_ # the returned proba_ must be a valid proba arr 217 | 218 | ################################# 219 | ## feature weights 220 | ################################# 221 | 222 | # reviewed 223 | def valid_feat_weights(feat_weights: Optional[SER_ARR_TYPE]) -> None: 224 | """ 225 | Validate the input `feat_weights`, ensuring all weights are non-negative and at least 226 | one is positive. This function is called at the beginning of the method to ensure 227 | the feature weights are valid. 228 | 229 | Parameters 230 | ---------- 231 | feat_weights : Series or ndarray, optional 232 | The array of feature weights to validate. If `None`, no validation is performed. 
233 | 234 | Raises 235 | ------ 236 | AssertionError 237 | If any feature weights are negative or if no positive weights exist. 238 | """ 239 | if feat_weights is None: return 240 | feat_weights_arr = check_1d_array(feat_weights) 241 | assert (feat_weights_arr >= 0.).all(), "Feature weights must be non-negative." 242 | assert (feat_weights_arr > 0.).any(), "At least one feature weight must be positive." 243 | return 244 | 245 | # reviewed 246 | def _valid_shape_X_feat_weights(X: DF_ARR_TYPE, feat_weights: Optional[SER_ARR_TYPE]) -> None: 247 | """ 248 | Assert that the dimensions of the input data matrix `X` and feature weights match. 249 | 250 | Parameters 251 | ---------- 252 | X : DataFrame or ndarray 253 | The input data matrix. 254 | 255 | feat_weights : Series or ndarray, optional 256 | The array of feature weights. If `None`, no assertion is made. 257 | 258 | Raises 259 | ------ 260 | AssertionError 261 | If the dimensions of `X` and `feat_weights` do not match. 262 | """ 263 | if feat_weights is None: 264 | return 265 | if is_ser_df(X) and is_ser_df(feat_weights): 266 | assert (X.columns==feat_weights.index).all(), "Feature mismatch: column names do not match feature weight index." 267 | else: 268 | assert X.shape[1]==len(feat_weights) , "Feature mismatch: number of features does not match feature weights." 269 | return 270 | 271 | # reviewed 272 | def _weight_X(X: DF_ARR_TYPE, feat_weights: Optional[SER_ARR_TYPE]) -> np.ndarray: 273 | """ 274 | Apply feature weights to the input data matrix `X`. If `feat_weights` is `None`, no 275 | weights are applied. It is assumed that dimensions match. 276 | 277 | Parameters 278 | ---------- 279 | X : DataFrame or ndarray 280 | The input data matrix. 281 | 282 | feat_weights : Series or ndarray, optional 283 | The array of feature weights. If `None`, no weighting is applied. 284 | 285 | Returns 286 | ------- 287 | np.ndarray 288 | The weighted data matrix, with the same shape as `X`. 289 | """ 290 | X_arr = check_2d_array(X) 291 | if feat_weights is None: return X_arr 292 | # Apply feature weights 293 | feat_weights_arr = check_1d_array(feat_weights) 294 | return X_arr * feat_weights_arr 295 | 296 | # reviewed 297 | def check_X_with_feat_weights(X: DF_ARR_TYPE, feat_weights: Optional[SER_ARR_TYPE]) -> np.ndarray: 298 | """ 299 | Process the input data matrix `X` and feature weights, returning a weighted version of `X`. 300 | 301 | Parameters 302 | ---------- 303 | X : DataFrame or ndarray 304 | The input data matrix. 305 | 306 | feat_weights : Series or ndarray, optional 307 | The array of feature weights. If `None`, no weighting is applied. 308 | 309 | Returns 310 | ------- 311 | np.ndarray 312 | The weighted data matrix. 313 | """ 314 | # Validate that the dimensions of X and feat_weights match 315 | _valid_shape_X_feat_weights(X, feat_weights) 316 | # Apply feature weights to X 317 | return _weight_X(X, feat_weights) 318 | 319 | ################################# 320 | ## model code 321 | ################################# 322 | 323 | class JumpModel(BaseClusteringAlgo): 324 | """ 325 | Statistical jump model estimation, supporting both discrete and continuous models. 326 | 327 | This class provides methods for fitting and predicting with jump models, using coordinate 328 | descent for optimization. Both discrete and continuous models are supported, with optional 329 | feature weighting and state sorting. 330 | 331 | Parameters 332 | ---------- 333 | n_components : int, default=2 334 | The number of components (states) in the model. 
335 | 336 | jump_penalty : float, default=0. 337 | Penalty term (`lambda`) applied to state transitions in both discrete and continuous models. 338 | 339 | cont : bool, default=False 340 | If `True`, the continuous jump model is used. Otherwise, the discrete model is applied. 341 | 342 | grid_size : float, default=0.05 343 | The grid size for discretizing the probability simplex. Only relevant for the continuous model. 344 | 345 | mode_loss : bool, default=True 346 | Whether to apply the mode loss penalty. Only relevant for the continuous model. 347 | 348 | random_state : int or RandomState, optional (default=None) 349 | Random number seed for reproducibility. 350 | 351 | max_iter : int, default=1000 352 | Maximum number of iterations for the coordinate descent algorithm during model fitting. 353 | 354 | tol : float, default=1e-8 355 | Stopping tolerance for the improvement in objective value during optimization. 356 | 357 | n_init : int, default=10 358 | Number of initializations for the model fitting process. 359 | 360 | verbose : int, default=0 361 | Controls the verbosity of the output. Higher values indicate more verbose output. 362 | 363 | Attributes 364 | ---------- 365 | centers_ : ndarray of shape (n_c, n_f) 366 | The cluster centroids estimated during model fitting. 367 | 368 | labels_ : Series or ndarray 369 | In-sample fitted optimal label sequence. 370 | 371 | proba_ : DataFrame or ndarray 372 | In-sample fitted optimal probability matrix. 373 | 374 | ret_, vol_ : Series or ndarray 375 | The average return (`ret_`) and volatility (`vol_`) for each state. These attributes 376 | are available only if `ret_ser` is provided to the `.fit()` method. 377 | 378 | transmat_ : ndarray of shape (n_c, n_c) 379 | The estimated transition probability matrix between states. 380 | 381 | val_ : float 382 | The optimal value of the loss function. 383 | """ 384 | # reviewed 385 | def __init__(self, 386 | n_components: int = 2, 387 | jump_penalty: float = 0., 388 | cont: bool = False, 389 | grid_size: float = 0.05, 390 | mode_loss: bool = True, 391 | random_state = RANDOM_STATE, 392 | max_iter: int = 1000, 393 | tol: float = 1e-8, 394 | n_init: int = 10, 395 | verbose: int = 0): 396 | super().__init__(int(n_components), n_init, max_iter, tol, random_state, verbose) 397 | self.jump_penalty = jump_penalty 398 | self.cont = cont 399 | self.grid_size = grid_size 400 | self.mode_loss = mode_loss 401 | self.alpha = 2 # the power raised to the jump penalty in CJM 402 | 403 | # reviewed 404 | def check_jump_penalty_mx(self) -> np.ndarray: 405 | """ 406 | Initialize the jump penalty matrix for state transitions. 407 | 408 | - For the discrete model, the state space is {0, 1, ..., n_c - 1}, and the scalar 409 | `jump_penalty` is converted into a matrix. 410 | - For the continuous model, `jump_penalty` is multiplied by the pairwise L1 distance 411 | between probability vectors. Optionally applies a mode loss penalty. 412 | 413 | Returns 414 | ------- 415 | np.ndarray 416 | The jump penalty matrix to be used in the model. 417 | """ 418 | assert is_numbers(self.jump_penalty) 419 | if not self.cont: 420 | self.prob_vecs = None # useful in the E step to tell whether the model is continuous/discrete. 421 | jump_penalty_mx = jump_penalty_to_mx(self.jump_penalty, self.n_components) 422 | else: # continuous model 423 | self.prob_vecs = discretize_prob_simplex(self.n_components, self.grid_size) # state space. 
useful for computing L mx in E step 424 | pairwise_l1_dist = cdist(self.prob_vecs, self.prob_vecs, 'cityblock')/2 425 | jump_penalty_mx = self.jump_penalty * (pairwise_l1_dist ** self.alpha) 426 | if self.mode_loss: # adding mode loss ensures that the penalty mx has correspondence with a TPM. i.e. sum(exp(- )) of every row leads to the same value. 427 | mode_loss = logsumexp(-jump_penalty_mx, axis=1, keepdims=True) 428 | mode_loss -= mode_loss[0] # offset a constant 429 | jump_penalty_mx += mode_loss 430 | self.jump_penalty_mx = jump_penalty_mx # to be used in `.predict()` & `.predict_proba()` 431 | return jump_penalty_mx 432 | 433 | # reviewed 434 | def check_X_predict_func(self, X: DF_ARR_TYPE) -> np.ndarray: 435 | """ 436 | Validate the input data `X` for all prediction methods (but not for fitting), 437 | and apply feature weighting if applicable. Assumes that the model has already 438 | been fitted. 439 | 440 | This method overrides the superclass method. 441 | 442 | Parameters 443 | ---------- 444 | X : DataFrame or ndarray 445 | The input data matrix. 446 | 447 | Returns 448 | ------- 449 | np.ndarray 450 | The weighted input data matrix, if feature weights are provided. 451 | """ 452 | self.is_shape_match_X_centers(X) 453 | feat_weights = getattr_(self, "feat_weights") 454 | return check_X_with_feat_weights(X, feat_weights) 455 | 456 | # reviewed 457 | def fit(self, 458 | X: DF_ARR_TYPE, 459 | ret_ser: Optional[SER_ARR_TYPE] = None, 460 | feat_weights: Optional[SER_ARR_TYPE] = None, 461 | sort_by: Optional[str] = "cumret"): 462 | """ 463 | Fit the jump model using the coordinate descent algorithm. 464 | 465 | The states are sorted by the specified criterion: ["cumret", "vol", "freq", "ret"]. 466 | The Viterbi algorithm is optionally used for state assignment. This choice does 467 | not impact the final numerical results but may affect computational speed. 468 | 469 | Parameters 470 | ---------- 471 | X : DataFrame or ndarray 472 | The input data matrix. 473 | 474 | ret_ser : Series or ndarray, optional 475 | A return series used for sorting states and calculating state-specific returns 476 | and volatilities. 477 | 478 | feat_weights : Series or ndarray, optional 479 | Feature weights to apply to the input data matrix. 480 | 481 | sort_by : ["cumret", "vol", "freq", "ret"], optional (default="cumret") 482 | Criterion for sorting the states. 483 | """ 484 | # valid feat weights 485 | valid_feat_weights(feat_weights) 486 | # check X 487 | X_arr = check_X_with_feat_weights(X, feat_weights) 488 | # save valid feat weights 489 | self.feat_weights = feat_weights 490 | # get attributes 491 | n_c = self.n_components 492 | max_iter = self.max_iter 493 | tol = self.tol 494 | verbose = self.verbose 495 | # make sure the state space, and compute the penalty matrix used for the E step 496 | jump_penalty_mx = self.check_jump_penalty_mx() 497 | # init centers 498 | init_centers_values = self.init_centers(X_arr) 499 | # the best results over all initializations, compare to it in the last part of each iteration 500 | best_val = np.inf 501 | best_res = {} # store: "centers_", "proba_", "labels_". 502 | best_res['labels_'] = None # "labels_" is not always 0/1, but the labels of the state space (candidate prob vecs) 503 | # it is only used to compare whether two inits lead to the same estimation. the final `labels_` is based on `proba_.argmax(axis=1)`. 
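        # Descriptive note on the loop below: for every initialization we alternate an
        # E step (state assignment by the DP solver, given the current centers) with an
        # M step (each center recomputed as the probability-weighted mean of `X`), and
        # stop once the assignment repeats, the objective improves by less than `tol`,
        # or `max_iter` is reached.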
504 | # iter over all the initializations 505 | for n_init_, centers_ in enumerate(init_centers_values): 506 | # initialize the labels and value in the previous iteration. 507 | labels_pre, val_pre = None, np.inf 508 | # do one E step 509 | proba_, labels_, val_ = do_E_step(X_arr, centers_, jump_penalty_mx, prob_vecs=self.prob_vecs) 510 | num_iter = 0 511 | # iterate between M and E steps 512 | while (num_iter < max_iter and (not is_same_clustering(labels_, labels_pre)) and val_pre - val_ > tol): 513 | # update 514 | num_iter += 1 515 | labels_pre, val_pre = labels_, val_ 516 | # M step: update centers 517 | centers_ = weighted_mean_cluster(X_arr, proba_) 518 | # E step 519 | proba_, labels_, val_ = do_E_step(X_arr, centers_, jump_penalty_mx, prob_vecs=self.prob_vecs) 520 | if verbose: print(f"{n_init_}-th init. val: {val_}") 521 | # compare with previous initializations 522 | if (not is_same_clustering(best_res['labels_'], labels_)) and val_ < best_val: 523 | best_idx = n_init_ 524 | best_val = val_ 525 | # save model attributes 526 | best_res['centers_'] = centers_ 527 | best_res['labels_'] = labels_ # only used to compare with later iters, won't permutate 528 | best_res['proba_'] = proba_ 529 | self.val_ = best_val 530 | if verbose: print(f"{best_idx}-th init has the best value: {best_val}.") 531 | # sort states 532 | sort_states_from_ret(ret_ser, X, best_res, sort_by=sort_by) 533 | # save attributes 534 | if ret_ser is not None: 535 | self.ret_ = best_res["ret_"] 536 | self.vol_ = best_res["vol_"] 537 | self.centers_ = best_res['centers_'] # weighted centers 538 | self.proba_ = raise_JM_proba_to_df(best_res['proba_'], X) 539 | self.labels_ = reduce_proba_to_labels(self.proba_) 540 | self.transmat_ = empirical_trans_mx(self.labels_, n_components=n_c) 541 | return self 542 | 543 | # reviewed 544 | def predict_proba_online(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 545 | """ 546 | Predict the probability of each state in an online fashion, where the prediction 547 | for the i-th row is based only on data prior to that row. 548 | 549 | Parameters 550 | ---------- 551 | X : DataFrame or ndarray 552 | The input data matrix. 553 | 554 | Returns 555 | ------- 556 | DataFrame or ndarray 557 | The predicted probabilities for each state. 558 | """ 559 | X_arr = self.check_X_predict_func(X) 560 | value_mx = do_E_step(X_arr, self.centers_, self.jump_penalty_mx, self.prob_vecs, return_value_mx=True) 561 | labels_ = value_mx.argmin(axis=1) 562 | proba_ = raise_JM_labels_to_proba(labels_, self.n_components, self.prob_vecs) 563 | return raise_JM_proba_to_df(proba_, X) 564 | 565 | # reviewed 566 | def predict_online(self, X: DF_ARR_TYPE) -> SER_ARR_TYPE: 567 | """ 568 | Predict the state in an online fashion, where the prediction for the i-th row 569 | is based only on data prior to that row. 570 | 571 | Parameters 572 | ---------- 573 | X : DataFrame or ndarray 574 | The input data matrix. 575 | 576 | Returns 577 | ------- 578 | Series or ndarray 579 | The predicted state labels for each sample. 580 | """ 581 | return reduce_proba_to_labels(self.predict_proba_online(X)) 582 | 583 | # reviewed 584 | def predict_proba(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 585 | """ 586 | Predict the probability of each state, using all available data in `X`. 587 | 588 | Parameters 589 | ---------- 590 | X : DataFrame or ndarray 591 | The input data matrix. 592 | 593 | use_viterbi : bool, optional (default=True) 594 | Whether to use the Viterbi solver. 
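        Unlike `predict_proba_online`, this method runs the DP solver over the entire
        sample, so the assignment at time `t` may also use later observations. A hedged
        usage sketch (`jm` is a fitted model and `X_test` held-out features, both illustrative):

            >>> proba_smoothed = jm.predict_proba(X_test)        # uses all rows
            >>> proba_online = jm.predict_proba_online(X_test)   # row t uses rows up to t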
595 | 596 | Returns 597 | ------- 598 | DataFrame or ndarray 599 | The predicted probabilities for each state. 600 | """ 601 | X_arr = self.check_X_predict_func(X) 602 | proba_, _, _ = do_E_step(X_arr, self.centers_, self.jump_penalty_mx, self.prob_vecs) 603 | return raise_JM_proba_to_df(proba_, X) 604 | 605 | # reviewed 606 | def predict(self, X: DF_ARR_TYPE) -> SER_ARR_TYPE: 607 | """ 608 | Predict the state for each sample, using all available data in `X`. 609 | 610 | Parameters 611 | ---------- 612 | X : DataFrame or ndarray 613 | The input data matrix. 614 | 615 | use_viterbi : bool, optional (default=True) 616 | Whether to use the Viterbi solver. 617 | 618 | Returns 619 | ------- 620 | Series or ndarray 621 | The predicted state labels for each sample. 622 | """ 623 | return reduce_proba_to_labels(self.predict_proba(X)) 624 | -------------------------------------------------------------------------------- /jumpmodels/plot.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for plotting functions, especially for visualizing regime identification. 3 | 4 | Depends on: 5 | ----------- 6 | utils/ : Modules 7 | """ 8 | 9 | from .utils import * 10 | 11 | import matplotlib.pyplot as plt 12 | 13 | ALPHA_LINE = .8 14 | ALPHA_FILL = .3 15 | AXES_TYPE = Optional[plt.Axes] 16 | 17 | ############################ 18 | ## matplotlib setting 19 | ############################ 20 | 21 | # reviewed 22 | def matplotlib_setting(): 23 | """ 24 | Set global rcParams for matplotlib to produce nice and large publication-quality figures. 25 | """ 26 | plt.rcParams['figure.figsize'] = (24, 12) 27 | plt.rcParams['axes.titlesize'] = 30 28 | plt.rcParams['axes.labelsize'] = 30 29 | plt.rcParams['xtick.labelsize'] = 30 30 | plt.rcParams['ytick.labelsize'] = 30 31 | plt.rcParams['legend.fontsize'] = 30 32 | plt.rcParams['font.size'] = 26 33 | plt.rcParams['font.family'] = 'cmr10' 34 | plt.rcParams['axes.formatter.use_mathtext'] = True 35 | plt.rcParams['text.usetex'] = True 36 | plt.rcParams['text.latex.preamble'] = r'\usepackage{amsmath}' 37 | plt.rcParams["savefig.dpi"] = 300 38 | plt.rcParams["savefig.bbox"] = "tight" 39 | return 40 | 41 | # Set global matplotlib params 42 | matplotlib_setting() 43 | 44 | ######################################################## 45 | ## File I/O + Function I/O 46 | ######################################################## 47 | 48 | # reviewed 49 | def savefig_plt(filepath, close=False): 50 | """ 51 | Save the current figure to a specified path. Automatically creates the folder if it doesn't exist. 52 | 53 | Parameters 54 | ---------- 55 | filepath : str 56 | The path where the figure should be saved. 57 | 58 | close : bool, optional (default=False) 59 | Whether to close the figure after saving. 60 | """ 61 | check_dir_exist(filepath) 62 | plt.savefig(filepath) 63 | if close: plt.close() 64 | return 65 | 66 | # reviewed 67 | def check_axes(ax: AXES_TYPE = None, nrows=1, ncols=1, figsize_single=(24, 12), **kwargs) -> Union[plt.Axes, np.ndarray]: 68 | """ 69 | Create a new axes if `ax` is None; otherwise return the existing axes. 70 | 71 | Parameters 72 | ---------- 73 | ax : plt.Axes, optional 74 | An existing matplotlib Axes object. If None, a new one is created. 75 | 76 | nrows : int, optional (default=1) 77 | Number of rows for the subplots. 78 | 79 | ncols : int, optional (default=1) 80 | Number of columns for the subplots. 81 | 82 | figsize_single : tuple, optional (default=(24, 12)) 83 | The size of a single subplot. 
84 | 85 | Returns 86 | ------- 87 | plt.Axes or np.ndarray 88 | The axes object(s) for plotting. 89 | """ 90 | if ax is None: 91 | w, h = figsize_single 92 | _, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=(ncols*w, nrows*h), **kwargs) 93 | return ax 94 | 95 | ######################################################## 96 | ## Plotting Cumulative Returns 97 | ######################################################## 98 | 99 | # Convert y-axis to percent format 100 | from matplotlib.ticker import FuncFormatter 101 | 102 | def convert_yaxis_to_percent(ax: plt.Axes) -> None: 103 | """ 104 | Convert the ticks on the y-axis to percent without decimals (e.g., 4.0 becomes 400%). 105 | """ 106 | def to_percent(x, position): 107 | pos_flag = x >= 0 108 | string = f"{abs(x) * 100:.0f}\\%" 109 | if pos_flag: return string 110 | return "$-$" + string 111 | ax.yaxis.set_major_formatter(FuncFormatter(to_percent)) 112 | return 113 | 114 | # reviewed 115 | def plot_cumret(ret_df: Union[PD_TYPE, dict], 116 | start_date: DATE_TYPE = None, 117 | end_date: DATE_TYPE = None, 118 | ax: AXES_TYPE = None, 119 | ylabel_ret="Cumulative Returns", 120 | ) -> plt.Axes: 121 | """ 122 | Plot the cumulative returns from a return DataFrame or dictionary. 123 | 124 | Parameters 125 | ---------- 126 | ret_df : DataFrame or dict 127 | The input return data for computing cumulative returns. 128 | 129 | start_date : str or datetime.date, optional 130 | The start date for the plot. Defaults to None. 131 | 132 | end_date : str or datetime.date, optional 133 | The end date for the plot. Defaults to None. 134 | 135 | ax : plt.Axes, optional 136 | The axes on which to plot. If None, a new one is created. 137 | 138 | ylabel_ret : str, optional (default="Cumulative Returns") 139 | The label for the y-axis. 140 | 141 | Returns 142 | ------- 143 | plt.Axes 144 | The axes object with the plotted cumulative returns. 145 | """ 146 | ax = check_axes(ax) 147 | # Process and filter the return data 148 | ret_df = filter_date_range(pd.DataFrame(ret_df), start_date, end_date) 149 | ret_df.index.name = None 150 | # plot cumret 151 | ret_df.cumsum(axis=0).plot(ax=ax) 152 | # set ax attrs 153 | ax.set(ylabel=ylabel_ret) 154 | convert_yaxis_to_percent(ax) 155 | return ax 156 | 157 | ############################ 158 | ## plot regimes 159 | ############################ 160 | 161 | # reviewed 162 | def fill_between(ser: pd.Series, 163 | start_date: DATE_TYPE = None, 164 | end_date: DATE_TYPE = None, 165 | ax: AXES_TYPE = None, 166 | color: Optional[str] = None, 167 | fill_between_label: Optional[str] = None) -> plt.Axes: 168 | """ 169 | Fill the area between a curve and the x-axis with a specified color and label. 170 | 171 | Parameters 172 | ---------- 173 | ser : pd.Series 174 | The data series to plot. 175 | 176 | start_date : str or datetime.date, optional 177 | The start date for the plot. Defaults to None. 178 | 179 | end_date : str or datetime.date, optional 180 | The end date for the plot. Defaults to None. 181 | 182 | ax : plt.Axes, optional 183 | The axes on which to plot. If None, a new one is created. 184 | 185 | color : str, optional 186 | The fill color. Defaults to None. 187 | 188 | fill_between_label : str, optional 189 | The label for the filled area. Defaults to None. 190 | 191 | Returns 192 | ------- 193 | plt.Axes 194 | The axes object with the filled area plot. 
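    A minimal usage sketch (assuming `bear_proba` is a pandas Series of bear-regime
    probabilities indexed by date; the name is illustrative):

        >>> ax = check_axes()   # new 24x12 figure/axes
        >>> fill_between(bear_proba, ax=ax, color='r', fill_between_label='Bear')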
195 | """ 196 | ax = check_axes(ax) 197 | # filter dates 198 | ser = filter_date_range(ser, start_date, end_date) 199 | # plot 200 | ax.fill_between(ser.index, ser, step="pre", alpha=ALPHA_FILL, color=color, label=fill_between_label) 201 | ax.legend() 202 | return ax 203 | 204 | # reviewed 205 | def plot_regimes(regimes: PD_TYPE, 206 | n_c: int = 2, 207 | start_date: DATE_TYPE = None, 208 | end_date: DATE_TYPE = None, 209 | ax: AXES_TYPE = None, 210 | colors_regimes: Optional[list] = ['g', 'r'], 211 | labels_regimes: Optional[list] = ['Bull', 'Bear'], 212 | ) -> plt.Axes: 213 | """ 214 | Plot regime identification based on a 1D label series or 2D probability matrix. 215 | 216 | Parameters 217 | ---------- 218 | regimes : DataFrame or Series 219 | The regime data to plot. A integer sequence from {0, 1, ..., n_c-1} if 1D input, 220 | or a probability matrix of shape (n_s, n_c) 221 | 222 | n_c : int, optional (default=2) 223 | The number of components (regimes) to plot. 224 | 225 | start_date : str or datetime.date, optional 226 | The start date for the plot. Defaults to None. 227 | 228 | end_date : str or datetime.date, optional 229 | The end date for the plot. Defaults to None. 230 | 231 | ax : plt.Axes, optional 232 | The axes on which to plot. If None, a new one is created. 233 | 234 | colors_regimes : list, optional 235 | The colors for the regimes. Defaults to ['g', 'r'] (`n_c = 2`). 236 | if `None`, colors will be automatically generated. 237 | 238 | labels_regimes : list, optional 239 | The labels for the regimes. Defaults to ['Bull', 'Bear'] (`n_c = 2`). 240 | if `None`, labels will be automatically generated. 241 | 242 | Returns 243 | ------- 244 | plt.Axes 245 | The axes object with the regime plot. 246 | """ 247 | regimes = filter_date_range(regimes, start_date, end_date) 248 | if is_ser(regimes): 249 | regimes = pd.DataFrame(raise_labels_into_proba(regimes.to_numpy(), n_c=n_c), index=regimes.index) 250 | assert regimes.shape[1]==n_c, "Mismatch between number of components and regime data shape." 251 | ax = check_axes(ax) 252 | # color list 253 | if colors_regimes is None: # generate color list 254 | color_cycle = plt.rcParams['axes.prop_cycle'].by_key()['color'] 255 | colors_regimes = [color_cycle[i % len(color_cycle)] for i in range(n_c)] 256 | else: 257 | assert len(colors_regimes) == n_c, "Mismatch between length of color list and number of components. You can input `colors_regimes = None` for colors to be generated authomatically." 258 | # labels 259 | if labels_regimes is None: 260 | labels_regimes = [f"Regime {i}" for i in range(1, n_c+1)] 261 | else: 262 | assert len(labels_regimes) == n_c, "Mismatch between length of label list and number of components. You can input `labels_regimes = None` for labels to be generated authomatically." 263 | # plot 264 | for i in range(n_c): 265 | fill_between(regimes.iloc[:, i], ax=ax, color=colors_regimes[i], fill_between_label=labels_regimes[i]) 266 | return ax 267 | 268 | # reviewed 269 | def plot_regimes_and_cumret(regimes: PD_TYPE, 270 | ret_df: Union[PD_TYPE, dict], 271 | n_c: int = 2, 272 | start_date: DATE_TYPE = None, 273 | end_date: DATE_TYPE = None, 274 | ax: AXES_TYPE = None, 275 | colors_regimes: Optional[list] = ['g', 'r'], 276 | labels_regimes: Optional[list] = ['Bull', 'Bear'], 277 | ylabel_ret="Cumulative Returns", 278 | legend_loc="upper left" 279 | ) -> tuple[plt.Axes, plt.Axes]: 280 | """ 281 | Plot cumulative returns alongside regime identification in a single figure. 
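    For example, a sketch using a fitted model `jm` and its return series (names,
    dates, and the output path are illustrative):

        >>> ax, ax2 = plot_regimes_and_cumret(jm.proba_, ret_ser, start_date="2007-01-01")
        >>> savefig_plt("plots/JM_train.pdf", close=True)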
282 | 283 | Parameters 284 | ---------- 285 | regimes : DataFrame or Series 286 | The regime data to plot. A integer sequence from {0, 1, ..., n_c-1} if 1D input, 287 | or a probability matrix of shape (n_s, n_c) 288 | 289 | ret_df : DataFrame or dict 290 | The return data to plot. 291 | 292 | n_c : int, optional (default=2) 293 | The number of regimes/components. 294 | 295 | start_date : str or datetime.date, optional 296 | The start date for the plot. Defaults to None. 297 | 298 | end_date : str or datetime.date, optional 299 | The end date for the plot. Defaults to None. 300 | 301 | ax : plt.Axes, optional 302 | The axes on which to plot. If None, a new one is created. 303 | 304 | colors_regimes : list, optional 305 | The colors for the regimes. Defaults to ['g', 'r'] (`n_c = 2`). 306 | if `None`, colors will be automatically generated. 307 | 308 | labels_regimes : list, optional 309 | The labels for the regimes. Defaults to ['Bull', 'Bear'] (`n_c = 2`). 310 | if `None`, labels will be automatically generated. 311 | 312 | ylabel_ret : str, optional 313 | The label for the cumulative return y-axis. 314 | 315 | legend_loc : str, optional 316 | The location of the legend. 317 | 318 | Returns 319 | ------- 320 | tuple 321 | The axes objects for cumulative returns and regimes. 322 | """ 323 | # plot cumret 324 | ax = plot_cumret(ret_df, start_date=start_date, end_date=end_date, ax=ax, ylabel_ret=ylabel_ret) 325 | # plot regimes 326 | ax2 = ax.twinx() 327 | ax2.set(ylabel="Regime") 328 | plot_regimes(regimes, n_c, start_date=start_date, end_date=end_date, ax=ax2, colors_regimes=colors_regimes, labels_regimes=labels_regimes) 329 | # merge legneds 330 | lines, labels = ax.get_legend_handles_labels() 331 | lines2, labels2 = ax2.get_legend_handles_labels() 332 | legend = ax2.legend(lines + lines2, labels + labels2, loc=legend_loc) 333 | return (ax, ax2) 334 | -------------------------------------------------------------------------------- /jumpmodels/preprocess.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for data preprocessing. 3 | 4 | This module contains classes for scaling and clipping data, with a focus on 5 | handling pandas DataFrame input/output. 6 | 7 | Depends on 8 | ---------- 9 | utils/ : Modules 10 | """ 11 | 12 | from .utils import * 13 | 14 | from sklearn.base import BaseEstimator 15 | from sklearn.preprocessing import StandardScaler 16 | 17 | ############################################ 18 | ## Scaler 19 | ############################################ 20 | 21 | # reviewed 22 | class StandardScalerPD(BaseEstimator): 23 | """ 24 | Provides support for pandas DataFrame input/output with the `StandardScaler()` class. 25 | 26 | This class extends the functionality of the standard `StandardScaler` by ensuring that 27 | the input and output are handled as pandas DataFrames, preserving index and column labels. 28 | """ 29 | def init_scaler(self): 30 | """ 31 | Initialize and return the standard `StandardScaler` instance. 32 | """ 33 | return StandardScaler() 34 | 35 | def fit_transform(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 36 | """ 37 | Fit the scaler to the DataFrame and transform it in one step. 38 | 39 | Parameters 40 | ---------- 41 | X : DataFrame or ndarray 42 | The input DataFrame to be scaled. 43 | 44 | Returns 45 | ------- 46 | DataFrame or ndarray 47 | The scaled DataFrame. 48 | """ 49 | return self.fit(X).transform(X) 50 | 51 | def fit(self, X: DF_ARR_TYPE): 52 | """ 53 | Fit the scaler to the input DataFrame. 
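        The typical preprocessing chain pairs this scaler with the `DataClipperStd`
        defined below; a sketch with hypothetical train/test frames:

            >>> clipper, scaler = DataClipperStd(mul=3.), StandardScalerPD()
            >>> X_train_processed = scaler.fit_transform(clipper.fit_transform(X_train))
            >>> X_test_processed = scaler.transform(clipper.transform(X_test))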
54 | 55 | Parameters 56 | ---------- 57 | X : DataFrame or ndarray 58 | The input DataFrame to be used for fitting. 59 | 60 | Returns 61 | ------- 62 | self 63 | """ 64 | self.scaler = self.init_scaler().fit(X) 65 | return self 66 | 67 | def transform(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 68 | """ 69 | Transform the input DataFrame using the fitted scaler. 70 | 71 | Parameters 72 | ---------- 73 | X : DataFrame or ndarray 74 | The input DataFrame to be transformed. 75 | 76 | Returns 77 | ------- 78 | DataFrame or ndarray 79 | The transformed (scaled) DataFrame. 80 | """ 81 | return raise_arr_to_pd_obj(self.scaler.transform(X), X, return_as_ser=False) 82 | 83 | ############################################ 84 | ## Clipper 85 | ############################################ 86 | 87 | # reviewed 88 | class BaseDataClipper(BaseEstimator): 89 | """ 90 | Base class for data clippers. 91 | 92 | This class implements the `.transform()` and `.fit_transform()` methods, but leaves the `.fit()` 93 | method to be implemented in subclasses. It is designed to clip data values within a specified range. 94 | 95 | Should be inherited by other classes that define the clipping bounds. 96 | """ 97 | def __init__(self) -> None: 98 | self.lb = None # Lower bound, initialized as None. Must be a numpy array. 99 | self.ub = None # Upper bound, initialized as None. Must be a numpy array. 100 | 101 | def fit(self, X: DF_ARR_TYPE): 102 | raise NotImplementedError() 103 | 104 | def fit_transform(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 105 | """ 106 | Fit the clipper and transform the input data in one step. 107 | 108 | Parameters 109 | ---------- 110 | X : DataFrame or ndarray 111 | The input data to be clipped. 112 | 113 | Returns 114 | ------- 115 | DataFrame or ndarray 116 | The clipped data. 117 | """ 118 | return self.fit(X).transform(X) 119 | 120 | def transform(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 121 | """ 122 | Clip the input data using the fitted lower (`lb`) and upper (`ub`) bounds. 123 | 124 | Parameters 125 | ---------- 126 | X : DataFrame or ndarray 127 | The input data to be clipped. 128 | 129 | Returns 130 | ------- 131 | DataFrame or ndarray 132 | The clipped data. 133 | """ 134 | if self.ub is None and self.lb is None: return X 135 | return np.clip(X, self.lb, self.ub) 136 | 137 | # reviewed 138 | class DataClipperStd(BaseDataClipper): 139 | """ 140 | Data clipper based on feature standard deviation. 141 | 142 | This class performs winsorization of the data, clipping it within a specified multiple of the 143 | feature's standard deviation. The clipping bounds are defined as: 144 | 145 | lower bound = mean - (mul * std) 146 | upper bound = mean + (mul * std) 147 | 148 | Parameters 149 | ---------- 150 | mul : float, default=3. 151 | The multiple of the feature's standard deviation used for clipping. 152 | 153 | Attributes 154 | ---------- 155 | lb : ndarray 156 | The lower bound for each feature, calculated as mean - (mul * std). 157 | 158 | ub : ndarray 159 | The upper bound for each feature, calculated as mean + (mul * std). 160 | """ 161 | def __init__(self, mul: float = 3.) -> None: 162 | super().__init__() 163 | self.mul = mul 164 | 165 | def fit(self, X: DF_ARR_TYPE): 166 | """ 167 | Fit the clipper to the data by calculating the clipping bounds based on 168 | the mean and standard deviation of each feature. 169 | 170 | Parameters 171 | ---------- 172 | X : DataFrame or ndarray 173 | The input data to fit the clipper. 
174 | 175 | Returns 176 | ------- 177 | DataClipperStd 178 | The fitted clipper instance. 179 | """ 180 | mul = self.mul 181 | assert mul > 0, "The multiplier `mul` must be positive." 182 | 183 | mean, std = X.mean(axis=0), X.std(axis=0, ddof=0) 184 | if is_df(X): 185 | mean = mean.to_numpy() 186 | std = std.to_numpy() 187 | self.lb = mean - mul * std; assert isinstance(self.lb, np.ndarray) 188 | self.ub = mean + mul * std; assert isinstance(self.ub, np.ndarray) 189 | return self 190 | -------------------------------------------------------------------------------- /jumpmodels/sparse_jump.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module for Sparse Jump Models (SJMs). 3 | 4 | This module provides an implementation of sparse jump models, extending the jump model 5 | with additional support for feature selection through Lasso-like optimization. 6 | 7 | Depends on 8 | ---------- 9 | utils/ : Modules 10 | Utility functions for validation and clustering operations. 11 | jump : Module 12 | Discrete and continuous jump models. 13 | """ 14 | 15 | from .utils import * 16 | from .jump import * 17 | 18 | from numpy.linalg import norm 19 | 20 | ######################################################## 21 | ## Lasso Problem for Feature Weights 22 | ######################################################## 23 | 24 | # reviewed 25 | def binary_search_decrease(func, 26 | left: float, 27 | right: float, 28 | value: float, 29 | *args, 30 | tol_x: float = 1e-8, 31 | tol_y: float = 0., 32 | max_iter: int = 100, 33 | verbose: int = 0, 34 | **kwargs) -> float: 35 | """ 36 | Binary search for a decreasing function. 37 | 38 | This method performs binary search to find the point where the function `func` 39 | decreases to a specified value within given tolerances. 40 | 41 | Parameters 42 | ---------- 43 | func : callable 44 | The function to be minimized. 45 | 46 | left : float 47 | The left bound for the search. 48 | 49 | right : float 50 | The right bound for the search. 51 | 52 | value : float 53 | The target value to find. 54 | 55 | tol_x : float, optional (default=1e-8) 56 | The tolerance for the search along the x-axis. 57 | 58 | tol_y : float, optional (default=0.) 59 | The tolerance for the search along the y-axis (function value). 60 | 61 | max_iter : int, optional (default=100) 62 | Maximum number of iterations. 63 | 64 | verbose : int, optional (default=0) 65 | Verbosity level. If greater than 0, prints progress information. 66 | 67 | Returns 68 | ------- 69 | float 70 | The optimal point where the function reaches the target value. 71 | """ 72 | if value >= func(left): return left 73 | if value <= func(right): return right 74 | # 75 | gap = right-left 76 | num_iter = 0 77 | while (gap > tol_x and num_iter < max_iter): 78 | # print(f"{left}, {right}") 79 | num_iter += 1 80 | middle = (right + left) / 2 81 | func_call = func(middle, *args, **kwargs) 82 | if verbose: print("x value", middle, "y value", func_call) 83 | if func_call < value-tol_y/2: 84 | right = middle 85 | elif func_call > value+tol_y/2: 86 | left = middle 87 | else: 88 | return middle 89 | gap /= 2 90 | if num_iter < max_iter: 91 | return middle 92 | raise Exception("Non-convergence: Possible mathematical error.") 93 | 94 | # reviewed 95 | def soft_thres_l2_normalized(x: SER_ARR_TYPE, thres: float = 0.) -> SER_ARR_TYPE: 96 | """ 97 | Soft thresholding for a non-negative vector `x`, followed by L2 normalization. 
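    For example (a quick check; note that at least one entry must survive the threshold,
    otherwise the normalization assertion fails):

        >>> soft_thres_l2_normalized(np.array([3., 4., 0.]), thres=0.)
        array([0.6, 0.8, 0. ])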
98 | 99 | Parameters 100 | ---------- 101 | x : Series or ndarray 102 | The input vector to be thresholded and normalized. 103 | 104 | thres : float, optional (default=0.) 105 | The threshold for soft thresholding. 106 | 107 | Returns 108 | ------- 109 | Series or ndarray 110 | The thresholded and L2-normalized vector. 111 | """ 112 | y = np.maximum(0, x-thres) 113 | y_norm = norm(y) 114 | assert y_norm > 0 115 | return y / y_norm 116 | 117 | # reviewed 118 | def solve_lasso(a: SER_ARR_TYPE, 119 | norm_ub: float, 120 | tol: float = 1e-8) -> SER_ARR_TYPE: 121 | """ 122 | Solve the Lasso problem for feature weights. 123 | 124 | This function finds the optimal feature weights subject to the constraint that the 125 | L1-norm of the weights is bounded by `norm_ub`. 126 | 127 | Parameters 128 | ---------- 129 | a : Series or ndarray 130 | The input vector for the Lasso problem. 131 | 132 | norm_ub : float 133 | The upper bound for the L1-norm of the feature weights. 134 | Equals to `kappa` in the published articles. 135 | 136 | tol : float, optional (default=1e-8) 137 | The tolerance for the binary search. 138 | 139 | Returns 140 | ------- 141 | Series or ndarray 142 | The optimized feature weights. 143 | """ 144 | assert norm_ub >= 1. 145 | a_arr = check_1d_array(a) 146 | left, right = 0., np.unique(a_arr)[-2] # right is the second largest element of `a` 147 | if right < tol: thres_sol = 0. 148 | else: 149 | func = lambda thres: soft_thres_l2_normalized(a_arr, thres).sum() 150 | thres_sol = binary_search_decrease(func, left, right, norm_ub, tol_x=tol) 151 | # return thres_sol 152 | w = soft_thres_l2_normalized(a_arr, thres_sol) 153 | return raise_arr_to_pd_obj(w, a) 154 | 155 | # reviewed 156 | def compute_BCSS(X: DF_ARR_TYPE, 157 | proba_: DF_ARR_TYPE, 158 | centers_: Optional[np.ndarray] = None, 159 | tol: float = 1e-6) -> SER_ARR_TYPE: 160 | """ 161 | Compute the Between Cluster Sum of Squares (BCSS). 162 | 163 | The BCSS is computed based on the cluster centers and probabilities. If no centers are provided, 164 | they will be computed from probabilities. Any BCSS values below the tolerance are set to zero. 165 | 166 | Parameters 167 | ---------- 168 | X : DataFrame or ndarray 169 | The input data matrix. 170 | 171 | proba_ : DataFrame or ndarray 172 | The cluster assignment probabilities. 173 | 174 | centers_ : ndarray, optional 175 | The cluster centers. NA values are acceptable. 176 | If not provided, they are estimated from the data. 177 | 178 | tol : float, optional (default=1e-6) 179 | The tolerance for setting BCSS values to zero. 180 | 181 | Returns 182 | ------- 183 | Series or ndarray 184 | The BCSS values for each feature. 185 | """ 186 | X_arr, proba_arr = check_2d_array(X), check_2d_array(proba_) 187 | if centers_ is None: centers_ = weighted_mean_cluster(X_arr, proba_arr) 188 | # replace NAs in centers with 0. won't affect computation 189 | centers_ = np.nan_to_num(centers_, nan=0.) 190 | # assert not np.isnan(centers_).any() 191 | Ns = proba_arr.sum(axis=0) 192 | BCSS = Ns @ ((centers_ - X_arr.mean(axis=0))**2) 193 | BCSS = set_zero_arr(BCSS, tol=tol) 194 | assert not np.isnan(BCSS).any() 195 | return raise_arr_to_pd_obj(BCSS, X, index_key="columns") 196 | 197 | ############################ 198 | ## SJM 199 | ############################ 200 | 201 | class SparseJumpModel(BaseEstimator): 202 | """ 203 | Sparse Jump Model (SJM) with feature selection. 
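    Feature relevance is scored by the between-cluster sum of squares from `compute_BCSS`,
    and the weight vector follows from `solve_lasso`; inside `.fit()` below the per-iteration
    weight update is essentially (using the names from that method):

        >>> BCSS = compute_BCSS(X, jm.proba_, centers_unweighted)
        >>> w = solve_lasso(BCSS / BCSS.max(), norm_ub=np.sqrt(max_feats))
        >>> feat_weights = np.sqrt(w)   # weights actually applied to the features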
204 | 205 | This model extends the standard jump model by incorporating a Lasso-like feature 206 | selection process, where the number of selected features is controlled by `max_feats`. 207 | 208 | Parameters 209 | ---------- 210 | n_components : int, default=2 211 | Number of components (clusters). 212 | 213 | max_feats : float, default=100. 214 | Controls the number of features included. This is the square of `kappa`, and 215 | represents the effective number of features. 216 | 217 | jump_penalty : float, default=0. 218 | The jump penalty. In SJM, this penalty is scaled by 219 | `1 / sqrt(n_features)` since features are weighted. 220 | 221 | cont : bool, default=False 222 | If `True`, the continuous jump model is used. Otherwise, the discrete model is applied. 223 | 224 | grid_size : float, default=0.05 225 | The grid size for discretizing the probability simplex (only used for continuous models). 226 | 227 | mode_loss : bool, default=True 228 | Whether to apply the mode loss penalty (only relevant for continuous models). 229 | 230 | random_state : int or RandomState, optional 231 | Random number generator seed for reproducibility. 232 | 233 | max_iter : int, default=30 234 | Maximum number of iterations for the coordinate descent algorithm in feature selection. 235 | 236 | tol_w : float, default=1e-4 237 | Tolerance for stopping the optimization of feature weights. 238 | 239 | max_iter_jm : int, default=1000 240 | Maximum number of iterations for the jump model fitting process. 241 | 242 | tol_jm : float, default=1e-8 243 | Stopping tolerance for the jump model fitting. 244 | 245 | n_init_jm : int, default=10 246 | Number of initializations for the jump model. 247 | 248 | verbose : int, default=0 249 | Controls the verbosity of the output. 250 | 251 | Attributes 252 | ---------- 253 | jm_ins : JumpModel 254 | The fitted jump model instance, with feature weighting. 255 | 256 | feat_weights : ndarray 257 | The optimal feature weights. 258 | Square root of the `w` vector in the oroginal SJM formulation. 259 | 260 | labels_ : Series or ndarray 261 | In-sample optimal state assignments. 262 | 263 | proba_ : DataFrame or ndarray 264 | In-sample optimal probability matrix. 265 | 266 | ret_, vol_ : Series or ndarray 267 | Average return (`ret_`) and volatility (`vol_`) for each state, if `ret_ser` is provided. 268 | 269 | centers_ : ndarray 270 | The weighted cluster centers. 271 | """ 272 | # reviewed 273 | def __init__(self, 274 | n_components: int = 2, 275 | max_feats: float = 100., 276 | jump_penalty: float = 0., 277 | cont: bool = False, 278 | grid_size: float = 0.05, 279 | mode_loss: bool = True, 280 | random_state = RANDOM_STATE, 281 | max_iter: int = 30, 282 | tol_w: float = 1e-4, 283 | max_iter_jm: int = 1000, 284 | tol_jm: float = 1e-8, 285 | n_init_jm: int = 10, 286 | verbose: int = 0): 287 | self.n_components = int(n_components) 288 | self.max_feats = max_feats 289 | self.jump_penalty = jump_penalty 290 | self.cont = cont 291 | self.grid_size = grid_size 292 | self.mode_loss = mode_loss 293 | self.random_state = random_state 294 | self.max_iter = max_iter 295 | self.tol_w = tol_w 296 | self.max_iter_jm = max_iter_jm 297 | self.tol_jm = tol_jm 298 | self.n_init_jm = n_init_jm 299 | self.verbose = verbose 300 | 301 | # reviewed 302 | def init_jm(self): 303 | """ 304 | Initialize the jump model instance with scaled jump penalty. 
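
        The raw `jump_penalty` is divided by ``sqrt(n_features_all)``, since the
        features are weighted; for instance, with ``jump_penalty=50`` and nine
        features the inner `JumpModel` receives a penalty of 50 / 3 ≈ 16.7.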
305 | """ 306 | jump_penalty = self.jump_penalty / np.sqrt(self.n_features_all) 307 | jm = JumpModel(n_components=self.n_components, 308 | jump_penalty=jump_penalty, 309 | cont=self.cont, 310 | grid_size=self.grid_size, 311 | mode_loss=self.mode_loss, 312 | random_state=self.random_state, 313 | max_iter=self.max_iter_jm, 314 | tol=self.tol_jm, 315 | n_init=self.n_init_jm, 316 | verbose=decre_verbose(self.verbose)) 317 | self.jm_ins = jm 318 | return jm 319 | 320 | # reviewed 321 | def print_log(self, n_iter, BCSS, w): 322 | """ 323 | Print fitting logs if verbosity is enabled. 324 | """ 325 | if self.verbose: 326 | print("Iter:", n_iter) 327 | print("BCSS:\n", BCSS) #, "sum:", BCSS.sum() 328 | print("w:\n", w, "\n") 329 | return 330 | 331 | # reviewed 332 | def fit(self, 333 | X: DF_ARR_TYPE, 334 | ret_ser: Optional[SER_ARR_TYPE] = None, 335 | sort_by: Optional[str] = "cumret"): 336 | """ 337 | Fit the sparse jump model using coordinate descent. 338 | 339 | This method iteratively optimizes the feature weights and fits the jump model 340 | on the weighted data. 341 | 342 | Parameters 343 | ---------- 344 | X : DataFrame or ndarray 345 | The input data matrix. 346 | 347 | ret_ser : Series or ndarray, optional 348 | A return series used for sorting states. 349 | 350 | sort_by : ["cumret", "vol", "freq", "ret"], optional (default="cumret") 351 | Criterion for sorting states. 352 | 353 | Returns 354 | ------- 355 | SparseJumpModel 356 | The fitted sparse jump model. 357 | """ 358 | # 359 | X_arr = check_2d_array(X) 360 | self.n_features_all = X_arr.shape[1] 361 | # jm ins 362 | jm = self.init_jm() 363 | # get attrs 364 | max_iter = self.max_iter 365 | tol_w = self.tol_w 366 | norm_ub = np.sqrt(self.max_feats) 367 | # 368 | w_old = np.ones(self.n_features_all)*2 # not a valid weight, only used for entering the 1st iter 369 | w = np.ones(self.n_features_all) / np.sqrt(self.n_features_all) # initial weight # np.repeat(1/np.sqrt(self.n_features_all), self.n_features_all) 370 | n_iter = 0 371 | while (n_iter < max_iter and norm(w-w_old, 1) / norm(w_old, 1) > tol_w): 372 | # 373 | n_iter += 1 374 | w_old = w 375 | # Step 1: fix w, fit JM 376 | feat_weights = np.sqrt(w) 377 | # use the previous optimal center, weighted by the most recent w, as an initialization 378 | if n_iter > 1: jm.centers_ = centers_unweighted * feat_weights 379 | # fit JM on weighted data 380 | jm.fit(X, ret_ser=ret_ser, feat_weights=feat_weights, sort_by=sort_by) 381 | # Step 2: optimize w 382 | # update (unweighted) centers 383 | centers_unweighted = weighted_mean_cluster(X_arr, jm.proba_) 384 | # compute BCSS on the original data 385 | BCSS = compute_BCSS(X_arr, jm.proba_, centers_unweighted) 386 | if (BCSS <= 0).all(): # all in one cluster 387 | self.print_log(n_iter, BCSS, w) 388 | break 389 | w = solve_lasso(BCSS/BCSS.max(), norm_ub) 390 | self.print_log(n_iter, BCSS, w) 391 | # best res 392 | self.w = raise_arr_to_pd_obj(w, X, index_key="columns") 393 | self.feat_weights = raise_arr_to_pd_obj(jm.feat_weights, X, index_key="columns") 394 | self.centers_ = jm.centers_ # weighted centers 395 | # self.centers_ = weighted_mean_cluster(X_arr, jm.proba_, ) 396 | self.labels_ = jm.labels_ 397 | self.proba_ = jm.proba_ 398 | if ret_ser is not None: 399 | self.ret_ = jm.ret_ 400 | self.vol_ = jm.vol_ 401 | return self 402 | 403 | def predict_proba_online(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 404 | """ 405 | Predict state probabilities in an online fashion. 
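
        The call is forwarded to the fitted inner `JumpModel` instance (`jm_ins`),
        so `fit` must be called before this method.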
406 | """ 407 | return self.jm_ins.predict_proba_online(X) 408 | 409 | def predict_online(self, X: DF_ARR_TYPE) -> SER_ARR_TYPE: 410 | """ 411 | Predict states in an online fashion. 412 | """ 413 | return self.jm_ins.predict_online(X) 414 | 415 | def predict_proba(self, X: DF_ARR_TYPE) -> DF_ARR_TYPE: 416 | """ 417 | Predict state probabilities using all available data. 418 | """ 419 | return self.jm_ins.predict_proba(X) 420 | 421 | def predict(self, X: DF_ARR_TYPE) -> SER_ARR_TYPE: 422 | """ 423 | Predict states using all available data. 424 | """ 425 | return self.jm_ins.predict(X) -------------------------------------------------------------------------------- /jumpmodels/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Although this import style is generally discouraged, 2 | # it works well for our codebase given the simple structure 3 | from .validation import * 4 | from .index import * 5 | from .calculation import * 6 | from .cluster import * -------------------------------------------------------------------------------- /jumpmodels/utils/calculation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for basic numerical calculations. 3 | 4 | This module focuses on numerical calculations with special attention to `numpy` behaviors 5 | involving NaN and infinity: 6 | 7 | - 0. / 0. = np.nan 8 | - 0. * np.inf = np.nan 9 | - 0. * np.nan = np.nan 10 | - 1. / 0. = np.inf 11 | - -1. / 0. = -np.inf 12 | 13 | Typically, it is rare for a statement to directly yield `np.inf`; the first two examples 14 | are the most common cases. 15 | 16 | Depends on 17 | ---------- 18 | utils.validation : Module 19 | """ 20 | 21 | from .validation import * 22 | 23 | # will not raise warnings if: divide by zero, take sqrt of nega values 24 | np.seterr(divide="ignore", invalid="ignore") 25 | 26 | # reviewed 27 | def set_zero_arr(x: np.ndarray, tol=1e-6) -> np.ndarray: 28 | """ 29 | Set elements of a numpy array that are close to zero to exactly zero. 30 | 31 | Parameters 32 | ---------- 33 | x : ndarray 34 | The input numpy array. 35 | 36 | tol : float, optional (default=1e-6) 37 | The tolerance value. Elements with absolute values smaller than `tol` 38 | are set to zero. 39 | 40 | Returns 41 | ------- 42 | ndarray 43 | A numpy array with near-zero values replaced by exact zeros. 44 | """ 45 | return np.where(np.abs(x) < tol, 0., x) 46 | 47 | # reviewed 48 | def replace_inf_by_nan(x: Union[float, np.ndarray]) -> Union[float, np.ndarray]: 49 | """ 50 | Replace both positive and negative infinity values with NaN in a float or numpy array. 51 | 52 | Parameters 53 | ---------- 54 | x : float or ndarray 55 | The input float or numpy array. 56 | 57 | Returns 58 | ------- 59 | float or ndarray 60 | A float or numpy array with infinities replaced by NaN. 61 | """ 62 | return np.where(np.isinf(x), np.nan, x) 63 | 64 | # reviewed 65 | def replace_nan_by_inf(x: Union[float, np.ndarray]) -> Union[float, np.ndarray]: 66 | """ 67 | Replace all NaN values with positive infinity in a float or numpy array. 68 | 69 | Parameters 70 | ---------- 71 | x : float or ndarray 72 | The input float or numpy array. 73 | 74 | Returns 75 | ------- 76 | float or ndarray 77 | A float or numpy array with NaN values replaced by infinity. 
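
    Examples
    --------
    A small illustrative check (relies on the module-level `numpy` import):

    >>> replace_nan_by_inf(np.array([0., np.nan])).tolist()
    [0.0, inf]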
78 | """ 79 | return np.where(np.isnan(x), np.inf, x) 80 | 81 | # reviewed 82 | def decre_verbose(verbose: int) -> int: 83 | """ 84 | Decrement a non-negative integer by 1, ensuring the result is non-negative. 85 | 86 | Parameters 87 | ---------- 88 | verbose : int 89 | A non-negative integer to decrement. 90 | 91 | Returns 92 | ------- 93 | int 94 | The decremented value, ensuring it is non-negative. 95 | """ 96 | return max(0, verbose-1) 97 | 98 | ################################# 99 | ## weighted ave 100 | ################################# 101 | 102 | # reviewed 103 | def weighted_mean_cluster(X: np.ndarray, weights: np.ndarray) -> np.ndarray: 104 | """ 105 | Compute the weighted sample average for each cluster. `X` can be a 1D or 2D array. 106 | If the total weights sum to zero (indicating no observation), return `np.nan`. 107 | No `np.inf` will appear in the result. 108 | 109 | Parameters 110 | ---------- 111 | X : ndarray of shape (n_s,) or (n_s, n_f) 112 | The data matrix, where `n_s` is the number of samples and `n_f` is the number of features. 113 | 114 | weights : ndarray of shape (n_s, n_c) 115 | The weight array for each sample and cluster. Must be all non-negative. Support for 116 | `weights` of shape (n_s,) can be added later if needed. 117 | 118 | Returns 119 | ------- 120 | ndarray of shape (n_c,) or (n_c, n_f) 121 | The weighted mean for each cluster. 122 | """ 123 | # valid X 124 | assert X.ndim in [1, 2] # (n_s,) or (n_s, n_f) 125 | X_2d = check_2d_array(X, assert_na=False) # (n_s, n_f) 126 | # valid weights 127 | weights = check_2d_array(weights, assert_na=False) # (n_s, n_c) 128 | assert len(X_2d) == len(weights) 129 | assert (weights >= 0).all() 130 | # 131 | weighted_sum = weights.T @ X_2d # (n_c, n_f) 132 | Ns = weights.sum(axis=0, keepdims=True).T # (n_c, 1) 133 | means_ = weighted_sum / Ns # (n_c, n_f) 134 | if X.ndim == 1: means_ = means_.squeeze() 135 | return means_ # (n_c,) or (n_c, n_f) 136 | 137 | # reviewed 138 | def weighted_mean_std_cluster(X: np.ndarray, weights: np.ndarray, bias=False) -> np.ndarray: 139 | """ 140 | Compute the weighted means and standard deviations for each cluster. 141 | 142 | In extreme cases leading to NaNs (otherwise, all values are normal): 143 | - No observation: both `var_` and `factor` will be NaNs, and standard deviation will also be NaN. 144 | - Only one observation: `var_` will be zero, while `factor` will be `np.inf`. When considering the debiasing 145 | factor, this results in NaN standard deviations. 146 | 147 | Parameters 148 | ---------- 149 | X : ndarray of shape (n_s,) or (n_s, n_f) 150 | The data matrix, where `n_s` is the number of samples and `n_f` is the number of features. 151 | 152 | weights : ndarray of shape (n_s, n_c) 153 | The weight array for each sample and cluster. Must be all non-negative. 154 | 155 | bias : bool, optional (default=False) 156 | If False, apply a debiasing factor to the variance calculation. 157 | 158 | Returns 159 | ------- 160 | means_ : ndarray of shape (n_c,) or (n_c, n_f) 161 | The weighted mean for each cluster. 162 | 163 | stds_ : ndarray of shape (n_c,) or (n_c, n_f) 164 | The weighted standard deviation for each cluster. 
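
    Examples
    --------
    A minimal sanity check with hard (0/1) weights and two samples per cluster;
    the debiased standard deviations match ``np.std(..., ddof=1)`` within each cluster:

    >>> X = np.array([0., 1., 2., 3.])
    >>> w = np.array([[1., 0.], [1., 0.], [0., 1.], [0., 1.]])
    >>> means_, stds_ = weighted_mean_std_cluster(X, w)
    >>> means_.tolist()
    [0.5, 2.5]
    >>> np.round(stds_, 4).tolist()
    [0.7071, 0.7071]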
165 | """ 166 | X_2d = check_2d_array(X, assert_na=False) # (n_s, n_f) 167 | means_ = weighted_mean_cluster(X_2d, weights) # (n_c, n_f) 168 | sq_means_ = weighted_mean_cluster(X_2d ** 2, weights) # (n_c, n_f) 169 | var_ = sq_means_ - means_ ** 2 # (n_c, n_f) 170 | if not bias: # debiase factor, see: https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights 171 | V1 = weights.sum(axis=0, keepdims=True) # (1, n_c) 172 | V2 = (weights**2).sum(axis=0, keepdims=True) # (1, n_c) 173 | factor = 1. / (1. - V2/V1**2) # (1, n_c) 174 | factor = factor.T # (n_c, 1) 175 | var_ *= factor # (n_c, n_f) 176 | stds_ = np.sqrt(var_) # (n_c, n_f) 177 | if X.ndim == 1: 178 | return means_.squeeze(), stds_.squeeze() 179 | return means_, stds_ 180 | -------------------------------------------------------------------------------- /jumpmodels/utils/cluster.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for numerical calculations in clustering analysis. 3 | 4 | This module provides functions to handle clustering-related tasks such as label validation, 5 | probability conversion, and transition matrix computation. 6 | 7 | Depends on 8 | ---------- 9 | utils.validation : Module 10 | """ 11 | 12 | from .validation import * 13 | 14 | # reviewed 15 | def is_valid_labels(labels_: SER_ARR_TYPE, n_c: int = 2) -> bool: 16 | """ 17 | Check whether a label array/series is a valid label sequence. The values of `labels_` must 18 | lie in the set {0, 1, ..., n_c-1}. 19 | 20 | Parameters 21 | ---------- 22 | labels_ : ndarray or Series 23 | The array or series of labels to check. 24 | 25 | n_c : int, optional (default=2) 26 | The number of clusters. Labels must lie in {0, 1, ..., n_c-1}. 27 | 28 | Returns 29 | ------- 30 | bool 31 | True if the labels are valid, False otherwise. 32 | """ 33 | labels_arr = check_1d_array(labels_) # check whether it is intrinsically 1-d 34 | return set(labels_arr).issubset(set(range(n_c))) 35 | 36 | # reviewed 37 | def is_valid_proba(proba_: DF_ARR_TYPE) -> bool: 38 | """ 39 | Check whether a probability array/series is valid, meaning all values are non-negative 40 | and all rows sum to 1. 41 | 42 | Parameters 43 | ---------- 44 | proba_ : ndarray or DataFrame 45 | The probability matrix to check. 46 | 47 | Returns 48 | ------- 49 | bool 50 | True if the probability matrix is valid, False otherwise. 51 | """ 52 | proba_arr = check_2d_array(proba_) 53 | return (proba_arr>=0).all() and np.isclose(proba_arr.sum(axis=1), 1.).all() 54 | 55 | # reviewed 56 | def raise_labels_into_proba(labels_: np.ndarray, n_c: int) -> np.ndarray: 57 | """ 58 | Convert a discrete label array into a probability matrix. The resulting matrix corresponds 59 | to hard clustering, with 0./1. values. 60 | 61 | Parameters 62 | ---------- 63 | labels_ : ndarray of shape (n_s,) 64 | The array of integer labels. 65 | 66 | n_c : int 67 | The number of clusters. 68 | 69 | Returns 70 | ------- 71 | proba_ : ndarray of shape (n_s, n_c) 72 | The probability assignment array. 73 | """ 74 | # labels_ must be ints, and smaller than n_c 75 | # don't verify inputs, for performance consideration 76 | n_s = len(labels_) 77 | proba_ = np.zeros((n_s, n_c)) 78 | proba_[range(n_s), labels_] = 1. 79 | # assert is_valid_proba(proba_) 80 | return proba_ 81 | 82 | # reviewed 83 | def reduce_proba_to_labels(proba_: DF_ARR_TYPE) -> SER_ARR_TYPE: 84 | """ 85 | Convert a probability matrix into a label series by taking the argmax of each row. 
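
    For a DataFrame input the labels are the column names of the row-wise maxima
    (via ``idxmax``); for an ndarray they are integer column positions (via ``argmax``).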
86 | 87 | Parameters 88 | ---------- 89 | proba_ : ndarray or DataFrame 90 | The probability matrix to convert. 91 | 92 | Returns 93 | ------- 94 | labels_ : ndarray or Series 95 | The label series obtained by taking the argmax of each row. 96 | """ 97 | if is_df(proba_): return proba_.idxmax(axis=1) 98 | # arr 99 | return proba_.argmax(axis=1) 100 | 101 | # reviewed 102 | def is_map_from_left_to_right(labels_left: Optional[SER_ARR_TYPE], labels_right: Optional[SER_ARR_TYPE]) -> bool: 103 | """ 104 | Check whether the map from `labels_left` to `labels_right` is valid, meaning elements with the same label 105 | in `labels_left` must have the same label in `labels_right`. If either label array is `None`, return `False`. 106 | 107 | Parameters 108 | ---------- 109 | labels_left : ndarray or Series, optional 110 | The left-side label array. 111 | 112 | labels_right : ndarray or Series, optional 113 | The right-side label array. 114 | 115 | Returns 116 | ------- 117 | bool 118 | True if the mapping is valid, False otherwise. 119 | """ 120 | if labels_left is None or labels_right is None: 121 | return False 122 | assert len(labels_left) == len(labels_right) 123 | for label in np.unique(labels_left): 124 | if len(np.unique(labels_right[labels_left==label])) != 1: 125 | return False 126 | return True 127 | 128 | # reviewed 129 | def is_same_clustering(labels1: Optional[SER_ARR_TYPE], labels2: Optional[SER_ARR_TYPE]) -> bool: 130 | """ 131 | Check whether two clustering results are the same, under permutation. If either input is `None`, return `False`. 132 | 133 | Parameters 134 | ---------- 135 | labels1 : ndarray or Series, optional 136 | The first label array. 137 | 138 | labels2 : ndarray or Series, optional 139 | The second label array. 140 | 141 | Returns 142 | ------- 143 | bool 144 | True if the two clustering results are the same, False otherwise. 145 | """ 146 | return is_map_from_left_to_right(labels1, labels2) and is_map_from_left_to_right(labels2, labels1) 147 | 148 | # reviewed 149 | def empirical_trans_mx(labels_: SER_ARR_TYPE, n_components=2, return_counts=False) -> np.ndarray: 150 | """ 151 | Compute the empirical transition count or probability matrix from a label array/series. 152 | Probability values will be `nan` if no transition from a state is observed. 153 | 154 | Parameters 155 | ---------- 156 | labels_ : ndarray or Series 157 | The label array/series with values in {0, 1, ..., n_components - 1}, of both float/int dtype. 158 | 159 | n_components : int, optional (default=2) 160 | The number of unique labels. 161 | 162 | return_counts : bool, optional (default=False) 163 | If True, return the transition counts instead of probabilities. 164 | 165 | Returns 166 | ------- 167 | ndarray 168 | The transition count or probability matrix. 169 | """ 170 | assert is_valid_labels(labels_, n_c=n_components) 171 | labels_ = check_1d_array(labels_, dtype=int) # labels must be int type, as it will be used as arr index. 172 | # count transitions 173 | count_mx = np.zeros((n_components, n_components), dtype=int) 174 | for i in range(n_components): 175 | # the next states after label==i 176 | labels_next = labels_[1:][labels_[:-1]==i] # shift label by 1 177 | # count next states 178 | states, counts = np.unique(labels_next, return_counts=True) # states must be ints. 
179 | count_mx[i, states] = counts 180 | if return_counts: return count_mx 181 | # return probability 182 | return (1.*count_mx) / count_mx.sum(axis=1, keepdims=True) 183 | 184 | # reviewed 185 | def compute_num_shifts(labels_: SER_ARR_TYPE) -> int: 186 | """ 187 | Count the number of regime shifts in a (int) label array/series. 188 | """ 189 | labels_arr = check_1d_array(labels_) 190 | return (labels_arr[:-1]!=labels_arr[1:]).sum() 191 | -------------------------------------------------------------------------------- /jumpmodels/utils/index.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for working with the index of pandas objects, typically of type `datetime.date`. 3 | 4 | This module provides functions to filter and align the index of pandas Series 5 | and DataFrames. The functionality ensures proper handling of date-based indices and 6 | alignment of pandas objects. 7 | 8 | Depends on 9 | ---------- 10 | utils.validation : Module 11 | """ 12 | 13 | from .validation import * 14 | 15 | # reviewed 16 | def filter_date_range(obj: PD_TYPE, start_date: DATE_TYPE = None, end_date: DATE_TYPE = None) -> PD_TYPE: 17 | """ 18 | Filter a pandas Series or DataFrame with a `datetime.date` index by a specified date range. 19 | Returns a copy of the filtered object for data safety. 20 | 21 | Parameters 22 | ---------- 23 | obj : Series or DataFrame 24 | The pandas object to filter, which must have an index of dtype `datetime.date`. 25 | 26 | start_date : str, datetime.date, or None, optional 27 | The start date of the range. If `None`, no start date filter is applied. 28 | 29 | end_date : str, datetime.date, or None, optional 30 | The end date of the range. If `None`, no end date filter is applied. 31 | 32 | Returns 33 | ------- 34 | Series or DataFrame 35 | A copy of the filtered pandas object. 36 | """ 37 | assert is_ser_df(obj) 38 | start_date, end_date = check_datetime_date(start_date), check_datetime_date(end_date) 39 | if start_date is not None: obj = obj.loc[start_date:] 40 | if end_date is not None: obj = obj.loc[:end_date] 41 | return obj.copy() 42 | 43 | # reviewed 44 | def align_index(x: PD_TYPE, y: PD_TYPE) -> PD_TYPE: 45 | """ 46 | Return a subset of `x` so that its index aligns with the index of `y`. 47 | Returns a copy of the subset for data safety. 48 | 49 | Parameters 50 | ---------- 51 | x : Series or DataFrame 52 | The pandas object whose index is to be aligned with `y`. 53 | 54 | y : Series or DataFrame 55 | The pandas object whose index is used for alignment. 56 | 57 | Returns 58 | ------- 59 | Series or DataFrame 60 | A copy of `x` with its index aligned to `y`. 61 | """ 62 | return x.loc[y.index].copy() # throw error if the index is not contained 63 | 64 | # reviewed 65 | def align_x_with_y(x: NUMERICAL_OBJ_TYPE, y: NUMERICAL_OBJ_TYPE) -> NUMERICAL_OBJ_TYPE: 66 | """ 67 | Align `x` with `y`. If both `x` and `y` are pandas objects, align their indices using 68 | `align_index`. If they are not both pandas objects, assert that their lengths match. 69 | Returns a copy for data safety. 70 | 71 | Parameters 72 | ---------- 73 | x : ndarray, Series, or DataFrame 74 | The first numerical object to align. 75 | 76 | y : ndarray, Series, or DataFrame 77 | The second numerical object to align. 78 | 79 | Returns 80 | ------- 81 | ndarray, Series, or DataFrame 82 | A copy of `x`, aligned with `y`. 
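
    Examples
    --------
    A brief illustration with two pandas Series (relies on the module-level `pandas` import):

    >>> x = pd.Series([1., 2., 3.], index=["a", "b", "c"])
    >>> y = pd.Series([10., 30.], index=["a", "c"])
    >>> align_x_with_y(x, y).tolist()
    [1.0, 3.0]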
83 | """ 84 | if is_ser_df(x) and is_ser_df(y): return align_index(x, y) 85 | # not all pd objects, assert that lens match 86 | assert is_same_len(x, y), "the two input arrays should be of the same length" 87 | return x.copy() 88 | -------------------------------------------------------------------------------- /jumpmodels/utils/validation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Module of functions to validate input/output and parameters in functions or estimators. 3 | 4 | This module provides general validation functions and does not depend on any custom modules. 5 | """ 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import numbers 10 | from typing import Union, Optional, Dict 11 | import datetime 12 | 13 | # custom data types 14 | PD_TYPE = Union[pd.Series, pd.DataFrame] 15 | NUMERICAL_OBJ_TYPE = Union[np.ndarray, PD_TYPE] 16 | SER_ARR_TYPE = Union[np.ndarray, pd.Series] 17 | DF_ARR_TYPE = Union[np.ndarray, pd.DataFrame] 18 | DATE_TYPE = Optional[Union[str, datetime.date]] 19 | 20 | pd.set_option('display.width', 300) 21 | 22 | ############################### 23 | ## convert input types 24 | ############################### 25 | 26 | # reviewed 27 | def is_no_nan(obj: NUMERICAL_OBJ_TYPE) -> bool: 28 | """ 29 | Check whether an object does not contain any NaN or None values. 30 | 31 | Parameters 32 | ---------- 33 | obj : Array/Series/DataFrame 34 | The input numerical object to check. It can be a numpy array, pandas Series, 35 | or pandas DataFrame. 36 | 37 | Returns 38 | ------- 39 | bool 40 | `True` if the object does not contain any NaN or None values, `False` otherwise. 41 | """ 42 | return not pd.isna(np.asarray(obj)).any() 43 | 44 | # reviewed 45 | def valid_no_nan(obj: NUMERICAL_OBJ_TYPE): 46 | """ 47 | Assert that an object does not contain any NaN or None values. 48 | 49 | Parameters 50 | ---------- 51 | obj : Array/Series/DataFrame 52 | The input numerical object to check. It can be a numpy array, pandas Series, 53 | or pandas DataFrame. 54 | 55 | Raises 56 | ------ 57 | AssertionError 58 | If the object contains NaN or None values. 59 | """ 60 | assert is_no_nan(obj), f"input numerical object contains NaNs." 61 | return 62 | 63 | # reviewed 64 | def check_2d_array(X: NUMERICAL_OBJ_TYPE, single_col=False, dtype=None, assert_na=True) -> np.ndarray: 65 | """ 66 | Convert an array-like object into a 2D array. If the input is 1D, a new axis will be appended. 67 | Only accepts 1D and 2D inputs. If `single_col` is True, the function will assert that 68 | `X.shape[1] == 1`. The function returns a copy for data safety. 69 | 70 | Parameters 71 | ---------- 72 | X : Array/Series/DataFrame 73 | Array-like object (numpy array, pandas Series, or pandas DataFrame). Raises an exception if 74 | the dimensionality is not 1 or 2. 75 | 76 | single_col : bool, optional (default=False) 77 | If True, assert that `X.shape[1] == 1`, ensuring that the input contains only one column. 78 | 79 | dtype : data-type, optional 80 | Desired numpy data type for the returned array. 81 | 82 | assert_na : bool, optional (default=True) 83 | Whether to assert that the input `X` does not contain any NA values. 84 | 85 | Returns 86 | ------- 87 | np.ndarray 88 | A 2D numpy array. 
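
    Examples
    --------
    A quick illustration of the reshaping behavior:

    >>> check_2d_array(np.array([1., 2., 3.])).shape
    (3, 1)
    >>> check_2d_array(np.ones((4, 2))).shape
    (4, 2)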
89 | """ 90 | X = np.array(X, dtype=dtype) 91 | if X.ndim == 1: X = X[:, np.newaxis] # append new axis 92 | assert X.ndim == 2 93 | if single_col: assert X.shape[1] == 1 94 | if assert_na: valid_no_nan(X) 95 | return X 96 | 97 | # reviewed 98 | def check_1d_array(X: NUMERICAL_OBJ_TYPE, dtype=None, assert_na=True) -> np.ndarray: 99 | """ 100 | Convert an array-like object into a 1D array. The function returns a copy for data safety. 101 | 102 | Parameters 103 | ---------- 104 | X : Array/Series/DataFrame 105 | Array-like object (numpy array, pandas Series, or pandas DataFrame). Raises an exception if 106 | the dimensionality after calling `.squeeze()` is not 1. 107 | 108 | dtype : data-type, optional 109 | Desired numpy data type for the returned array. 110 | 111 | assert_na : bool, optional (default=True) 112 | Whether to assert that the input `X` does not contain any NA values. 113 | 114 | Returns 115 | ------- 116 | np.ndarray 117 | A 1D numpy array. 118 | """ 119 | X = np.array(X, dtype=dtype).squeeze() 120 | assert X.ndim == 1 121 | if assert_na: valid_no_nan(X) 122 | return X 123 | 124 | # reviewed 125 | def check_datetime_date(date: DATE_TYPE) -> Optional[datetime.date]: 126 | """ 127 | Convert a date-like object into a `datetime.date` object. If the input is `None`, 128 | return `None`. 129 | 130 | Parameters 131 | ---------- 132 | date : str, datetime.date, or None 133 | The input date-like object to be converted. Can be a string, a datetime object, 134 | or `None`. 135 | 136 | Returns 137 | ------- 138 | datetime.date or None 139 | A `datetime.date` object if the input is a valid date-like object, otherwise `None`. 140 | """ 141 | if date is None: return None 142 | return pd.Timestamp(date).date() 143 | 144 | ############################### 145 | ## binary checks 146 | ############################### 147 | 148 | # reviewed 149 | def is_ser(obj) -> bool: 150 | """ 151 | Check whether the input object is a Series. 152 | """ 153 | return isinstance(obj, pd.Series) 154 | 155 | # reviewed 156 | def is_df(obj) -> bool: 157 | """ 158 | Check whether the input object is a DataFrame. 159 | """ 160 | return isinstance(obj, pd.DataFrame) 161 | 162 | # reviewed 163 | def is_ser_df(obj) -> bool: 164 | """ 165 | Check whether the input object is a Series/DataFrame. 166 | """ 167 | return isinstance(obj, PD_TYPE) 168 | 169 | # reviewed 170 | def is_numbers(x) -> bool: 171 | """ 172 | Check whether the input is a scalar number. 173 | """ 174 | return isinstance(x, numbers.Number) 175 | 176 | # reviewed 177 | def is_same_len(*args) -> bool: 178 | """ 179 | Check whether all input arguments have the same length. 180 | 181 | Parameters 182 | ---------- 183 | *args : iterable 184 | Variable number of input iterables (e.g., lists, arrays, or other iterable objects). 185 | 186 | Returns 187 | ------- 188 | bool 189 | `True` if all input arguments have the same length, `False` otherwise. 190 | """ 191 | return len(set(len(x) for x in args)) == 1 192 | 193 | # reviewed 194 | def is_same_index(*args) -> bool: 195 | """ 196 | Check whether the index of all input pandas Series or DataFrames are exactly the same. 197 | This function is typically used to verify if the date indices of different Series/DataFrames 198 | align with each other. 199 | 200 | Parameters 201 | ---------- 202 | *args : Series or DataFrame 203 | Variable number of pandas Series or DataFrame objects whose indices are to be compared. 
204 | 205 | Returns 206 | ------- 207 | bool 208 | `True` if all input Series/DataFrames have the same index, `False` otherwise. 209 | """ 210 | assert is_same_len(*args) 211 | index_this = None 212 | for item in args: 213 | # assert is_ser_df(item) 214 | if index_this is None: # the first item 215 | index_this = item.index 216 | continue 217 | index_that = item.index 218 | if not (index_this==index_that).all(): 219 | return False 220 | return True 221 | 222 | ############################### 223 | ## output cast in pd types 224 | ############################### 225 | 226 | # reviewed 227 | def getattr_(obj: object, key: Optional[str]): 228 | """ 229 | Retrieve the attribute `key` from the object `obj`. If `key` is `None`, or the object 230 | does not have the attribute `key`, return `None`. 231 | 232 | Parameters 233 | ---------- 234 | obj : object 235 | The object from which to retrieve the attribute. 236 | 237 | key : str, optional 238 | The name of the attribute to retrieve. If `None`, the function returns `None`. 239 | 240 | Returns 241 | ------- 242 | any or None 243 | The value of the attribute if it exists, otherwise `None`. 244 | """ 245 | if key is not None and hasattr(obj, key): 246 | return getattr(obj, key) 247 | else: 248 | return None 249 | 250 | # reviewed 251 | def raise_arr_to_pd_obj(arr: np.ndarray, pd_obj: NUMERICAL_OBJ_TYPE, index_key="index", columns_key="columns", return_as_ser=True) -> NUMERICAL_OBJ_TYPE: 252 | """ 253 | Convert a numpy array into a pandas Series or DataFrame, using the index and columns 254 | attributes of `pd_obj` for labeling. If `pd_obj` is not a pandas object, the function 255 | returns the array unchanged. 256 | 257 | Parameters 258 | ---------- 259 | arr : np.ndarray 260 | The array to be converted into a pandas Series or DataFrame. 261 | 262 | pd_obj : Series, DataFrame, or array-like 263 | The pandas object from which to extract the index and columns for the new pandas object. 264 | 265 | index_key : str, optional (default="index") 266 | The attribute name for retrieving the index of the output from `pd_obj`. 267 | 268 | columns_key : str, optional (default="columns") 269 | The attribute name for retrieving the columns of the output from `pd_obj`. 270 | Only useful if the parameter `return_as_ser` is set to `False`. 271 | 272 | return_as_ser : bool, optional (default=True) 273 | If `True`, the function returns a pandas Series using only the index. 274 | If `False`, it returns a pandas DataFrame using both the index and columns. 275 | 276 | Returns 277 | ------- 278 | Series, DataFrame, or np.ndarray 279 | A pandas Series or DataFrame with index and columns matching those of `pd_obj`, 280 | or the original numpy array if `pd_obj` is not a pandas object. 281 | """ 282 | if not is_ser_df(pd_obj): return arr 283 | index = getattr_(pd_obj, index_key) 284 | columns = getattr_(pd_obj, columns_key) 285 | if return_as_ser: return pd.Series(arr, index=index) 286 | return pd.DataFrame(arr, index=index, columns=columns) 287 | 288 | ############################### 289 | ## file i/o 290 | ############################### 291 | 292 | import os 293 | 294 | # reviewed 295 | def check_dir_exist(filepath): 296 | """ 297 | Check whether the directory of the specified file path exists. If it does not exist, 298 | create the directory. Handles potential race conditions where multiple processes may 299 | attempt to create the directory simultaneously. 
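
    Only the parent directory of `filepath` is created; the file itself is neither
    created nor modified.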
300 | 301 | Parameters 302 | ---------- 303 | filepath : str 304 | The file path for which the existence of the parent directory is checked. 305 | """ 306 | dirname = os.path.dirname(filepath) 307 | if dirname != "": 308 | if not os.path.exists(dirname): 309 | try: 310 | os.makedirs(dirname, exist_ok=True) 311 | print(f"Created folder: {dirname}") 312 | except FileExistsError: 313 | # The directory was created by another process between the check and creation 314 | pass 315 | return 316 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | 6 | 7 | 8 | [project] 9 | name = "jumpmodels" 10 | version = "0.1.1" 11 | authors = [ 12 | { name="Yizhan Shu", email="olivershu98@gmail.com" }, 13 | ] 14 | description = "Statistical Jump Models in Python, with scikit-learn-style APIs" 15 | readme = "README.md" 16 | requires-python = ">=3.8" 17 | dependencies = [ 18 | "numpy", 19 | "pandas", 20 | "scipy", 21 | "scikit-learn", 22 | "matplotlib", 23 | ] 24 | classifiers = [ 25 | "Programming Language :: Python :: 3", 26 | "License :: OSI Approved :: Apache Software License", 27 | "Operating System :: OS Independent", 28 | "Intended Audience :: Science/Research", 29 | "Intended Audience :: Financial and Insurance Industry", 30 | "Intended Audience :: Developers", 31 | "Topic :: Scientific/Engineering :: Mathematics", 32 | "Topic :: Scientific/Engineering :: Information Analysis", 33 | ] 34 | keywords = ["regime", "regime switching", "jump models", "clustering", "time series", "financial data"] 35 | 36 | 37 | 38 | 39 | 40 | [project.optional-dependencies] 41 | example = [ 42 | "jupyterlab", 43 | "yfinance", 44 | ] 45 | 46 | 47 | 48 | 49 | 50 | 51 | [project.urls] 52 | Homepage = "https://github.com/Yizhan-Oliver-Shu/jump-models" 53 | Issues = "https://github.com/Yizhan-Oliver-Shu/jump-models/issues" --------------------------------------------------------------------------------
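
A minimal usage sketch of the `SparseJumpModel` API defined in `jumpmodels/sparse_jump.py`. The feature matrices `X_train`/`X_test` and the return series `ret_ser` are hypothetical placeholders assumed to be prepared by the user, and the hyper-parameter values are purely illustrative:

    from jumpmodels.sparse_jump import SparseJumpModel

    # X_train, X_test: feature DataFrames; ret_ser: return Series aligned with X_train (placeholders)
    sjm = SparseJumpModel(n_components=2, max_feats=3., jump_penalty=50., cont=False)
    sjm.fit(X_train, ret_ser=ret_ser, sort_by="cumret")

    print(sjm.feat_weights)                    # learned feature weights (square root of the `w` vector)
    labels_train = sjm.labels_                 # in-sample state assignments
    labels_test = sjm.predict_online(X_test)   # online state prediction on new data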